From 02e3d8dcf6794405d6376ba671ccc632f412c3d0 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Tue, 8 Jun 2021 16:36:16 -0400
Subject: [PATCH 01/46] prelim

---
 cpp/src/arrow/util/bitmap.h | 122 ++++++++++++++++++++++++++++++++++++
 1 file changed, 122 insertions(+)
diff --git a/cpp/src/arrow/util/bitmap.h b/cpp/src/arrow/util/bitmap.h
index 8562c55e3d5..575e51cf956 100644
--- a/cpp/src/arrow/util/bitmap.h
+++ b/cpp/src/arrow/util/bitmap.h
@@ -225,6 +225,128 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
     return min_offset;
   }
 
+  /// \brief Visit words of bits from each bitmap as array<Word, N>
+  ///
+  /// All bitmaps must have identical length. The first bit in a visited bitmap
+  /// may be offset within the first visited word, but words will otherwise contain
+  /// densely packed bits loaded from the bitmap. That offset within the first word is
+  /// returned.
+  ///
+  /// TODO(bkietz) allow for early termination
+  // NOTE: this function is efficient on 3+ sufficiently large bitmaps.
+  // It also has a large prolog / epilog overhead and should be used
+  // carefully in other cases.
+  // For 2 bitmaps or less, and/or smaller bitmaps, see also VisitTwoBitBlocksVoid
+  // and BitmapUInt64Reader.
+  template <size_t N, typename Visitor,
+            typename Word = typename std::decay<
+                internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
+  static int64_t VisitWordsNew(const std::array<Bitmap, N>& bitmaps_arg,
+                               Visitor&& visitor, Bitmap* out_bitmap_arg) {
+    constexpr int64_t kBitWidth = sizeof(Word) * 8;
+
+    // local, mutable variables which will be sliced/decremented to represent consumption:
+    std::array<Bitmap, N + 1> bitmaps;
+    std::array<int64_t, N + 1> offsets;
+    int64_t bit_length = BitLength(bitmaps_arg, N+ 1);
+    std::array<View<Word>, N + 1> words;
+
+    for (size_t i = 0; i < N; ++i) {
+      bitmaps[i] = bitmaps_arg[i];
+      offsets[i] = bitmaps[i].template word_offset<Word>();
+      assert(offsets[i] >= 0 && offsets[i] < kBitWidth);
+      words[i] = bitmaps[i].template words<Word>();
+    }
+    bitmaps[N] = *out_bitmap_arg;
+    offsets[N] = bitmaps[N].template word_offset<Word>();
+    assert(offsets[N] >= 0 && offsets[N] < kBitWidth);
+    words[N] = bitmaps[N].template words<Word>();
+
+    auto consume = [&](int64_t consumed_bits) {
+      for (size_t i = 0; i < N; ++i) {
+        bitmaps[i] = bitmaps[i].Slice(consumed_bits, bit_length - consumed_bits);
+        offsets[i] = bitmaps[i].template word_offset<Word>();
+        assert(offsets[i] >= 0 && offsets[i] < kBitWidth);
+        words[i] = bitmaps[i].template words<Word>();
+      }
+      bit_length -= consumed_bits;
+    };
+
+    std::array<Word, N> visited_words;
+    visited_words.fill(0);
+
+    if (bit_length <= kBitWidth * 2) {
+      // bitmaps fit into one or two words so don't bother with optimization
+      while (bit_length > 0) {
+        auto leading_bits = std::min(bit_length, kBitWidth);
+        SafeLoadWords(bitmaps, 0, leading_bits, false, &visited_words);
+        visitor(visited_words);
+        consume(leading_bits);
+      }
+      return 0;
+    }
+
+    int64_t max_offset = *std::max_element(offsets, offsets + N);
+    int64_t min_offset = *std::min_element(offsets, offsets + N);
+    if (max_offset > 0) {
+      // consume leading bits
+      auto leading_bits = kBitWidth - min_offset;
+      SafeLoadWords(bitmaps, 0, leading_bits, true, &visited_words);
+      visitor(visited_words);
+      consume(leading_bits);
+    }
+    assert(*std::min_element(offsets, offsets + N) == 0);
+
+    int64_t whole_word_count = bit_length / kBitWidth;
+    assert(whole_word_count >= 1);
+
+    if (min_offset == max_offset) {
+      // all offsets were identical, all leading bits have been consumed
+      assert(
+          std::all_of(offsets, offsets + N, [](int64_t offset) { return offset == 0; }));
+
+      for (int64_t word_i = 0; word_i < whole_word_count; ++word_i) {
+        for (size_t i = 0; i < N; ++i) {
+          visited_words[i] = words[i][word_i];
+        }
+        visitor(visited_words);
+      }
+      consume(whole_word_count * kBitWidth);
+    } else {
+      // leading bits from potentially incomplete words have been consumed
+
+      // word_i such that words[i][word_i] and words[i][word_i + 1] are lie entirely
+      // within the bitmap for all i
+      for (int64_t word_i = 0; word_i < whole_word_count - 1; ++word_i) {
+        for (size_t i = 0; i < N; ++i) {
+          if (offsets[i] == 0) {
+            visited_words[i] = words[i][word_i];
+          } else {
+            auto words0 = BitUtil::ToLittleEndian(words[i][word_i]);
+            auto words1 = BitUtil::ToLittleEndian(words[i][word_i + 1]);
+            visited_words[i] = BitUtil::FromLittleEndian(
+                (words0 >> offsets[i]) | (words1 << (kBitWidth - offsets[i])));
+          }
+        }
+        visitor(visited_words);
+      }
+      consume((whole_word_count - 1) * kBitWidth);
+
+      SafeLoadWords(bitmaps, 0, kBitWidth, false, &visited_words);
+
+      visitor(visited_words);
+      consume(kBitWidth);
+    }
+
+    // load remaining bits
+    if (bit_length > 0) {
+      SafeLoadWords(bitmaps, 0, bit_length, false, &visited_words);
+      visitor(visited_words);
+    }
+
+    return min_offset;
+  }
+
   const std::shared_ptr<Buffer>& buffer() const { return buffer_; }
 
   /// offset of first bit relative to buffer().data()

From fad03833ef4e4a524939baf061f48f3a9828f3cb Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Wed, 9 Jun 2021 00:17:39 -0400
Subject: [PATCH 02/46] working - not tested properly. requires clean up

---
 .../arrow/compute/kernels/scalar_if_else.cc   | 106 ++++++++++-------
 cpp/src/arrow/util/bitmap.h                   | 112 +++++++++++++-----
 2 files changed, 145 insertions(+), 73 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
index 7a0defaccd6..83e5501a0f1 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
@@ -76,15 +76,6 @@ Status PromoteNullsVisitor(KernelContext* ctx, const Datum& cond_d, const Datum&
   // duplicated (probably elided) access to cond_data
   const Bitmap& _ = cond_data;
 
-  // lambda function that will be used inside the visitor
-  uint64_t* out_validity = nullptr;
-  int64_t i = 0;
-  auto apply = [&](uint64_t c_valid, uint64_t c_data, uint64_t l_valid,
-                   uint64_t r_valid) {
-    out_validity[i] = c_valid & ((c_data & l_valid) | (~c_data & r_valid));
-    i++;
-  };
-
   // cond.valid & (cond.data & left.valid | ~cond.data & right.valid)
   // In the following cases, we dont need to allocate out_valid bitmap
 
@@ -110,72 +101,103 @@ Status PromoteNullsVisitor(KernelContext* ctx, const Datum& cond_d, const Datum&
 
   // following cases requires a separate out_valid buffer
   ARROW_ASSIGN_OR_RAISE(output->buffers[0], ctx->AllocateBitmap(cond.length));
-  out_validity = output->GetMutableValues<uint64_t>(0);
+
+  // lambda function that will be used inside the visitor
+  auto apply = [&](uint64_t c_valid, uint64_t c_data, uint64_t l_valid,
+                   uint64_t r_valid) {
+    return c_valid & ((c_data & l_valid) | (~c_data & r_valid));
+  };
+
+  Bitmap out_bitmap(output->buffers[0], 0, cond.length);
 
   enum { C_VALID, C_DATA, L_VALID, R_VALID };
 
   switch (flag) {
     case COND_CONST | LEFT_CONST | RIGHT_CONST: {
-      Bitmap bitmaps[] = {_, cond_data, _, _};
-      Bitmap::VisitWords(bitmaps, [&](std::array<uint64_t, 4> words) {
-        apply(*cond_const, words[C_DATA], *left_const, *right_const);
-      });
+      std::array<Bitmap, 4> bitmaps{_, cond_data, _, _};
+      Bitmap::VisitWordsAndWrite(
+          bitmaps,
+          [&](std::array<uint64_t, 4> words) {
+            return apply(*cond_const, words[C_DATA], *left_const, *right_const);
+          },
+          &out_bitmap);
       break;
     }
     case LEFT_CONST | RIGHT_CONST: {
-      Bitmap bitmaps[] = {cond_valid, cond_data, _, _};
-      Bitmap::VisitWords(bitmaps, [&](std::array<uint64_t, 4> words) {
-        apply(words[C_VALID], words[C_DATA], *left_const, *right_const);
-      });
+      std::array<Bitmap, 4> bitmaps{cond_valid, cond_data, _, _};
+      Bitmap::VisitWordsAndWrite(
+          bitmaps,
+          [&](std::array<uint64_t, 4> words) {
+            return apply(words[C_VALID], words[C_DATA], *left_const, *right_const);
+          },
+          &out_bitmap);
       break;
     }
     case COND_CONST | RIGHT_CONST: {
       // bitmaps[C_VALID], bitmaps[R_VALID] might be null; override to make it safe for
       // Visit()
-      Bitmap bitmaps[] = {_, cond_data, left_valid, _};
-      Bitmap::VisitWords(bitmaps, [&](std::array<uint64_t, 4> words) {
-        apply(*cond_const, words[C_DATA], words[L_VALID], *right_const);
-      });
+      std::array<Bitmap, 4> bitmaps{_, cond_data, left_valid, _};
+      Bitmap::VisitWordsAndWrite(
+          bitmaps,
+          [&](std::array<uint64_t, 4> words) {
+            return apply(*cond_const, words[C_DATA], words[L_VALID], *right_const);
+          },
+          &out_bitmap);
       break;
     }
     case RIGHT_CONST: {
       // bitmaps[R_VALID] might be null; override to make it safe for Visit()
-      Bitmap bitmaps[] = {cond_valid, cond_data, left_valid, _};
-      Bitmap::VisitWords(bitmaps, [&](std::array<uint64_t, 4> words) {
-        apply(words[C_VALID], words[C_DATA], words[L_VALID], *right_const);
-      });
+      std::array<Bitmap, 4> bitmaps{cond_valid, cond_data, left_valid, _};
+      Bitmap::VisitWordsAndWrite(
+          bitmaps,
+          [&](std::array<uint64_t, 4> words) {
+            return apply(words[C_VALID], words[C_DATA], words[L_VALID], *right_const);
+          },
+          &out_bitmap);
       break;
     }
     case COND_CONST | LEFT_CONST: {
       // bitmaps[C_VALID], bitmaps[L_VALID] might be null; override to make it safe for
       // Visit()
-      Bitmap bitmaps[] = {_, cond_data, _, right_valid};
-      Bitmap::VisitWords(bitmaps, [&](std::array<uint64_t, 4> words) {
-        apply(*cond_const, words[C_DATA], *left_const, words[R_VALID]);
-      });
+      std::array<Bitmap, 4> bitmaps{_, cond_data, _, right_valid};
+      Bitmap::VisitWordsAndWrite(
+          bitmaps,
+          [&](std::array<uint64_t, 4> words) {
+            return apply(*cond_const, words[C_DATA], *left_const, words[R_VALID]);
+          },
+          &out_bitmap);
       break;
     }
     case LEFT_CONST: {
       // bitmaps[L_VALID] might be null; override to make it safe for Visit()
-      Bitmap bitmaps[] = {cond_valid, cond_data, _, right_valid};
-      Bitmap::VisitWords(bitmaps, [&](std::array<uint64_t, 4> words) {
-        apply(words[C_VALID], words[C_DATA], *left_const, words[R_VALID]);
-      });
+      std::array<Bitmap, 4> bitmaps{cond_valid, cond_data, _, right_valid};
+      Bitmap::VisitWordsAndWrite(
+          bitmaps,
+          [&](std::array<uint64_t, 4> words) {
+            return apply(words[C_VALID], words[C_DATA], *left_const, words[R_VALID]);
+          },
+          &out_bitmap);
       break;
     }
     case COND_CONST: {
       // bitmaps[C_VALID] might be null; override to make it safe for Visit()
-      Bitmap bitmaps[] = {_, cond_data, left_valid, right_valid};
-      Bitmap::VisitWords(bitmaps, [&](std::array<uint64_t, 4> words) {
-        apply(*cond_const, words[C_DATA], words[L_VALID], words[R_VALID]);
-      });
+      std::array<Bitmap, 4> bitmaps{_, cond_data, left_valid, right_valid};
+      Bitmap::VisitWordsAndWrite(
+          bitmaps,
+          [&](std::array<uint64_t, 4> words) {
+            return apply(*cond_const, words[C_DATA], words[L_VALID], words[R_VALID]);
+          },
+          &out_bitmap);
       break;
     }
     case 0: {
-      Bitmap bitmaps[] = {cond_valid, cond_data, left_valid, right_valid};
-      Bitmap::VisitWords(bitmaps, [&](std::array<uint64_t, 4> words) {
-        apply(words[C_VALID], words[C_DATA], words[L_VALID], words[R_VALID]);
-      });
+      std::array<Bitmap, 4> bitmaps{cond_valid, cond_data, left_valid, right_valid};
+      Bitmap::VisitWordsAndWrite(
+          bitmaps,
+          [&](std::array<uint64_t, 4> words) {
+            return apply(words[C_VALID], words[C_DATA], words[L_VALID], words[R_VALID]);
+          },
+          &out_bitmap);
       break;
     }
   }
diff --git a/cpp/src/arrow/util/bitmap.h b/cpp/src/arrow/util/bitmap.h
index 575e51cf956..b73bb3f8c94 100644
--- a/cpp/src/arrow/util/bitmap.h
+++ b/cpp/src/arrow/util/bitmap.h
@@ -29,6 +29,7 @@
 
 #include "arrow/buffer.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
 #include "arrow/util/compare.h"
 #include "arrow/util/endian.h"
 #include "arrow/util/functional.h"
@@ -241,34 +242,55 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
   template <size_t N, typename Visitor,
             typename Word = typename std::decay<
                 internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
-  static int64_t VisitWordsNew(const std::array<Bitmap, N>& bitmaps_arg,
-                               Visitor&& visitor, Bitmap* out_bitmap_arg) {
+  static int64_t VisitWordsAndWrite(const std::array<Bitmap, N>& bitmaps_arg,
+                                    Visitor&& visitor, Bitmap* out_bitmap_arg) {
     constexpr int64_t kBitWidth = sizeof(Word) * 8;
 
     // local, mutable variables which will be sliced/decremented to represent consumption:
-    std::array<Bitmap, N + 1> bitmaps;
-    std::array<int64_t, N + 1> offsets;
-    int64_t bit_length = BitLength(bitmaps_arg, N+ 1);
-    std::array<View<Word>, N + 1> words;
+    // todo use std::array here
+    Bitmap bitmaps[N];
+    int64_t word_offsets[N];
+    int64_t bit_length = BitLength(bitmaps_arg);
+    View<Word> words[N];
+
+    struct BitmapHolder {
+      explicit BitmapHolder(Bitmap bitmap_)
+          : bitmap(std::move(bitmap_)),
+            word_offset(BitmapHolder::bitmap.template word_offset<Word>()),
+            words(BitmapHolder::bitmap.template words<Word>()) {
+        assert(BitmapHolder::word_offset >= 0 && BitmapHolder::word_offset < kBitWidth);
+      }
+
+      void SliceAndUpdate(int64_t _offset, int64_t _length) {
+        BitmapHolder::bitmap = bitmap.Slice(_offset, _length);
+        BitmapHolder::word_offset = bitmap.template word_offset<Word>();
+        assert(BitmapHolder::word_offset >= 0 && BitmapHolder::word_offset < kBitWidth);
+        BitmapHolder::words = bitmap.template words<Word>();
+      }
+
+      Bitmap bitmap;
+      int64_t word_offset;
+      View<Word> words;
+    };
 
     for (size_t i = 0; i < N; ++i) {
       bitmaps[i] = bitmaps_arg[i];
-      offsets[i] = bitmaps[i].template word_offset<Word>();
-      assert(offsets[i] >= 0 && offsets[i] < kBitWidth);
+      word_offsets[i] = bitmaps[i].template word_offset<Word>();
+      assert(word_offsets[i] >= 0 && word_offsets[i] < kBitWidth);
       words[i] = bitmaps[i].template words<Word>();
     }
-    bitmaps[N] = *out_bitmap_arg;
-    offsets[N] = bitmaps[N].template word_offset<Word>();
-    assert(offsets[N] >= 0 && offsets[N] < kBitWidth);
-    words[N] = bitmaps[N].template words<Word>();
+
+    BitmapHolder out_bitmap(*out_bitmap_arg);
 
     auto consume = [&](int64_t consumed_bits) {
       for (size_t i = 0; i < N; ++i) {
         bitmaps[i] = bitmaps[i].Slice(consumed_bits, bit_length - consumed_bits);
-        offsets[i] = bitmaps[i].template word_offset<Word>();
-        assert(offsets[i] >= 0 && offsets[i] < kBitWidth);
+        word_offsets[i] = bitmaps[i].template word_offset<Word>();
+        assert(word_offsets[i] >= 0 && word_offsets[i] < kBitWidth);
         words[i] = bitmaps[i].template words<Word>();
       }
+      out_bitmap.SliceAndUpdate(consumed_bits, bit_length - consumed_bits);
+
       bit_length -= consumed_bits;
     };
 
@@ -280,37 +302,50 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
       while (bit_length > 0) {
         auto leading_bits = std::min(bit_length, kBitWidth);
         SafeLoadWords(bitmaps, 0, leading_bits, false, &visited_words);
-        visitor(visited_words);
+        Word visit_out = visitor(visited_words);  // outputs a word/ partial word
+        CopyBitmap(reinterpret_cast<uint8_t*>(&visit_out), 0, leading_bits,
+                   out_bitmap.bitmap.buffer_->mutable_data(), out_bitmap.bitmap.offset_);
         consume(leading_bits);
       }
       return 0;
     }
 
-    int64_t max_offset = *std::max_element(offsets, offsets + N);
-    int64_t min_offset = *std::min_element(offsets, offsets + N);
-    if (max_offset > 0) {
+    int64_t max_word_offset = *std::max_element(word_offsets, word_offsets + N);
+    int64_t min_word_offset = *std::min_element(word_offsets, word_offsets + N);
+    if (max_word_offset > 0) {
       // consume leading bits
-      auto leading_bits = kBitWidth - min_offset;
+      auto leading_bits = kBitWidth - min_word_offset;
       SafeLoadWords(bitmaps, 0, leading_bits, true, &visited_words);
-      visitor(visited_words);
+      Word visit_out = visitor(visited_words);
+      CopyBitmap(reinterpret_cast<uint8_t*>(&visit_out), sizeof(Word) * 8 - leading_bits,
+                 leading_bits, out_bitmap.bitmap.buffer_->mutable_data(),
+                 out_bitmap.bitmap.offset_);
       consume(leading_bits);
     }
-    assert(*std::min_element(offsets, offsets + N) == 0);
+    assert(*std::min_element(word_offsets, word_offsets + N) == 0);
+    assert(out_bitmap.word_offset == 0);
 
     int64_t whole_word_count = bit_length / kBitWidth;
     assert(whole_word_count >= 1);
 
-    if (min_offset == max_offset) {
+    std::vector<Word> visit_outs;
+    visit_outs.reserve(whole_word_count);
+
+    if (min_word_offset == max_word_offset) {
       // all offsets were identical, all leading bits have been consumed
-      assert(
-          std::all_of(offsets, offsets + N, [](int64_t offset) { return offset == 0; }));
+      assert(std::all_of(word_offsets, word_offsets + N,
+                         [](int64_t offset) { return offset == 0; }));
+      assert(out_bitmap.word_offset == 0);
 
       for (int64_t word_i = 0; word_i < whole_word_count; ++word_i) {
         for (size_t i = 0; i < N; ++i) {
           visited_words[i] = words[i][word_i];
         }
-        visitor(visited_words);
+        visit_outs.template emplace_back(visitor(visited_words));
       }
+      CopyBitmap(reinterpret_cast<const uint8_t*>(visit_outs.data()), 0,
+                 whole_word_count * kBitWidth, out_bitmap.bitmap.buffer_->mutable_data(),
+                 out_bitmap.bitmap.offset_);
       consume(whole_word_count * kBitWidth);
     } else {
       // leading bits from potentially incomplete words have been consumed
@@ -319,32 +354,39 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
       // within the bitmap for all i
       for (int64_t word_i = 0; word_i < whole_word_count - 1; ++word_i) {
         for (size_t i = 0; i < N; ++i) {
-          if (offsets[i] == 0) {
+          if (word_offsets[i] == 0) {
             visited_words[i] = words[i][word_i];
           } else {
             auto words0 = BitUtil::ToLittleEndian(words[i][word_i]);
             auto words1 = BitUtil::ToLittleEndian(words[i][word_i + 1]);
             visited_words[i] = BitUtil::FromLittleEndian(
-                (words0 >> offsets[i]) | (words1 << (kBitWidth - offsets[i])));
+                (words0 >> word_offsets[i]) | (words1 << (kBitWidth - word_offsets[i])));
           }
         }
-        visitor(visited_words);
+        visit_outs.template emplace_back(visitor(visited_words));
       }
+      CopyBitmap(reinterpret_cast<const uint8_t*>(visit_outs.data()), 0,
+                 (whole_word_count - 1) * kBitWidth,
+                 out_bitmap.bitmap.buffer_->mutable_data(), out_bitmap.bitmap.offset_);
       consume((whole_word_count - 1) * kBitWidth);
 
       SafeLoadWords(bitmaps, 0, kBitWidth, false, &visited_words);
 
-      visitor(visited_words);
+      Word visit_out = visitor(visited_words);  // outputs a word/ partial word
+      CopyBitmap(reinterpret_cast<uint8_t*>(&visit_out), 0, kBitWidth,
+                 out_bitmap.bitmap.buffer_->mutable_data(), out_bitmap.bitmap.offset_);
       consume(kBitWidth);
     }
 
     // load remaining bits
     if (bit_length > 0) {
       SafeLoadWords(bitmaps, 0, bit_length, false, &visited_words);
-      visitor(visited_words);
+      Word visit_out = visitor(visited_words);
+      CopyBitmap(reinterpret_cast<uint8_t*>(&visit_out), 0, bit_length,
+                 out_bitmap.bitmap.buffer_->mutable_data(), out_bitmap.bitmap.offset_);
     }
 
-    return min_offset;
+    return min_word_offset;
   }
 
   const std::shared_ptr<Buffer>& buffer() const { return buffer_; }
@@ -423,6 +465,14 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
   /// assert bitmaps have identical length and return that length
   static int64_t BitLength(const Bitmap* bitmaps, size_t N);
 
+  template <size_t N>
+  static int64_t BitLength(const std::array<Bitmap, N>& bitmaps) {
+    for (size_t i = 1; i < bitmaps.size(); ++i) {
+      DCHECK_EQ(bitmaps[i].length(), bitmaps[0].length());
+    }
+    return bitmaps[0].length();
+  }
+
   std::shared_ptr<Buffer> buffer_;
   int64_t offset_ = 0, length_ = 0;
 };

From 1ffce3fce469ddf4e8fd92c6b37c364d7a1e55d7 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Wed, 9 Jun 2021 09:36:17 -0400
Subject: [PATCH 03/46] adding striding

---
 cpp/src/arrow/util/bitmap.h | 57 ++++++++++++++++++++++++-------------
 1 file changed, 37 insertions(+), 20 deletions(-)

diff --git a/cpp/src/arrow/util/bitmap.h b/cpp/src/arrow/util/bitmap.h
index b73bb3f8c94..d3cca8e46c4 100644
--- a/cpp/src/arrow/util/bitmap.h
+++ b/cpp/src/arrow/util/bitmap.h
@@ -74,6 +74,11 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
     return Bitmap(buffer_, offset_ + offset, length);
   }
 
+  void Stride(int64_t stride) {
+    this->offset_ += stride;
+    this->length_ -= stride;
+  }
+
   std::string ToString() const;
 
   bool Equals(const Bitmap& other) const;
@@ -254,21 +259,34 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
     View<Word> words[N];
 
     struct BitmapHolder {
-      explicit BitmapHolder(Bitmap bitmap_)
-          : bitmap(std::move(bitmap_)),
-            word_offset(BitmapHolder::bitmap.template word_offset<Word>()),
-            words(BitmapHolder::bitmap.template words<Word>()) {
+      explicit BitmapHolder(Bitmap* bitmap_)
+          : bitmap(bitmap_),
+            word_offset(bitmap_->template word_offset<Word>()),
+            words(bitmap_->template words<Word>()) {
         assert(BitmapHolder::word_offset >= 0 && BitmapHolder::word_offset < kBitWidth);
       }
 
-      void SliceAndUpdate(int64_t _offset, int64_t _length) {
-        BitmapHolder::bitmap = bitmap.Slice(_offset, _length);
-        BitmapHolder::word_offset = bitmap.template word_offset<Word>();
+      //      void SliceAndUpdate(int64_t _offset, int64_t _length) {
+      //        BitmapHolder::bitmap = bitmap.Slice(_offset, _length);
+      //        BitmapHolder::word_offset = bitmap.template word_offset<Word>();
+      //        assert(BitmapHolder::word_offset >= 0 && BitmapHolder::word_offset <
+      //        kBitWidth); BitmapHolder::words = bitmap.template words<Word>();
+      //      }
+
+      void StrideAndUpdate(int64_t _stride) {
+        BitmapHolder::bitmap->Stride(_stride);
+        BitmapHolder::word_offset = bitmap->template word_offset<Word>();
         assert(BitmapHolder::word_offset >= 0 && BitmapHolder::word_offset < kBitWidth);
-        BitmapHolder::words = bitmap.template words<Word>();
+        BitmapHolder::words = bitmap->template words<Word>();
       }
 
-      Bitmap bitmap;
+      inline int64_t offset() const { return bitmap->offset_; }
+
+      inline const uint8_t* data() const { return bitmap->buffer_->data(); }
+
+      inline uint8_t* mutable_data() { return bitmap->buffer_->mutable_data(); }
+
+      Bitmap* bitmap;
       int64_t word_offset;
       View<Word> words;
     };
@@ -280,7 +298,7 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
       words[i] = bitmaps[i].template words<Word>();
     }
 
-    BitmapHolder out_bitmap(*out_bitmap_arg);
+    BitmapHolder out_bitmap(out_bitmap_arg);
 
     auto consume = [&](int64_t consumed_bits) {
       for (size_t i = 0; i < N; ++i) {
@@ -289,7 +307,7 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
         assert(word_offsets[i] >= 0 && word_offsets[i] < kBitWidth);
         words[i] = bitmaps[i].template words<Word>();
       }
-      out_bitmap.SliceAndUpdate(consumed_bits, bit_length - consumed_bits);
+      out_bitmap.StrideAndUpdate(consumed_bits);
 
       bit_length -= consumed_bits;
     };
@@ -304,7 +322,7 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
         SafeLoadWords(bitmaps, 0, leading_bits, false, &visited_words);
         Word visit_out = visitor(visited_words);  // outputs a word/ partial word
         CopyBitmap(reinterpret_cast<uint8_t*>(&visit_out), 0, leading_bits,
-                   out_bitmap.bitmap.buffer_->mutable_data(), out_bitmap.bitmap.offset_);
+                   out_bitmap.mutable_data(), out_bitmap.offset());
         consume(leading_bits);
       }
       return 0;
@@ -318,8 +336,7 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
       SafeLoadWords(bitmaps, 0, leading_bits, true, &visited_words);
       Word visit_out = visitor(visited_words);
       CopyBitmap(reinterpret_cast<uint8_t*>(&visit_out), sizeof(Word) * 8 - leading_bits,
-                 leading_bits, out_bitmap.bitmap.buffer_->mutable_data(),
-                 out_bitmap.bitmap.offset_);
+                 leading_bits, out_bitmap.mutable_data(), out_bitmap.offset());
       consume(leading_bits);
     }
     assert(*std::min_element(word_offsets, word_offsets + N) == 0);
@@ -344,8 +361,8 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
         visit_outs.template emplace_back(visitor(visited_words));
       }
       CopyBitmap(reinterpret_cast<const uint8_t*>(visit_outs.data()), 0,
-                 whole_word_count * kBitWidth, out_bitmap.bitmap.buffer_->mutable_data(),
-                 out_bitmap.bitmap.offset_);
+                 whole_word_count * kBitWidth, out_bitmap.mutable_data(),
+                 out_bitmap.offset());
       consume(whole_word_count * kBitWidth);
     } else {
       // leading bits from potentially incomplete words have been consumed
@@ -366,15 +383,15 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
         visit_outs.template emplace_back(visitor(visited_words));
       }
       CopyBitmap(reinterpret_cast<const uint8_t*>(visit_outs.data()), 0,
-                 (whole_word_count - 1) * kBitWidth,
-                 out_bitmap.bitmap.buffer_->mutable_data(), out_bitmap.bitmap.offset_);
+                 (whole_word_count - 1) * kBitWidth, out_bitmap.mutable_data(),
+                 out_bitmap.offset());
       consume((whole_word_count - 1) * kBitWidth);
 
       SafeLoadWords(bitmaps, 0, kBitWidth, false, &visited_words);
 
       Word visit_out = visitor(visited_words);  // outputs a word/ partial word
       CopyBitmap(reinterpret_cast<uint8_t*>(&visit_out), 0, kBitWidth,
-                 out_bitmap.bitmap.buffer_->mutable_data(), out_bitmap.bitmap.offset_);
+                 out_bitmap.mutable_data(), out_bitmap.offset());
       consume(kBitWidth);
     }
 
@@ -383,7 +400,7 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
       SafeLoadWords(bitmaps, 0, bit_length, false, &visited_words);
       Word visit_out = visitor(visited_words);
       CopyBitmap(reinterpret_cast<uint8_t*>(&visit_out), 0, bit_length,
-                 out_bitmap.bitmap.buffer_->mutable_data(), out_bitmap.bitmap.offset_);
+                 out_bitmap.mutable_data(), out_bitmap.offset());
     }
 
     return min_word_offset;

From 35f61788e50bebf7802cceced55f12a0c3af46bd Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Wed, 9 Jun 2021 15:41:53 -0400
Subject: [PATCH 04/46] adding tests

---
 cpp/src/arrow/util/CMakeLists.txt |   1 +
 cpp/src/arrow/util/bitmap.h       | 110 +++++++++++++------------
 cpp/src/arrow/util/bitmap_test.cc | 132 ++++++++++++++++++++++++++++++
 3 files changed, 191 insertions(+), 52 deletions(-)
 create mode 100644 cpp/src/arrow/util/bitmap_test.cc

diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt
index e26a17120cd..571834dfca6 100644
--- a/cpp/src/arrow/util/CMakeLists.txt
+++ b/cpp/src/arrow/util/CMakeLists.txt
@@ -44,6 +44,7 @@ add_arrow_test(utility-test
                async_generator_test.cc
                bit_block_counter_test.cc
                bit_util_test.cc
+               bitmap_test.cc
                cache_test.cc
                checked_cast_test.cc
                compression_test.cc
diff --git a/cpp/src/arrow/util/bitmap.h b/cpp/src/arrow/util/bitmap.h
index d3cca8e46c4..877811afd31 100644
--- a/cpp/src/arrow/util/bitmap.h
+++ b/cpp/src/arrow/util/bitmap.h
@@ -115,6 +115,21 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
     }
   }
 
+  /// \brief Visit bits from each bitmap as bitset<N>
+  ///
+  /// All bitmaps must have identical length.
+  template <size_t N, typename Visitor>
+  static void VisitBits(const std::array<Bitmap, N>& bitmaps, Visitor&& visitor) {
+    int64_t bit_length = BitLength(bitmaps);
+    std::bitset<N> bits;
+    for (int64_t bit_i = 0; bit_i < bit_length; ++bit_i) {
+      for (size_t i = 0; i < N; ++i) {
+        bits[i] = bitmaps[i].GetBit(bit_i);
+      }
+      visitor(bits);
+    }
+  }
+
   /// \brief Visit words of bits from each bitmap as array<Word, N>
   ///
   /// All bitmaps must have identical length. The first bit in a visited bitmap
@@ -252,13 +267,11 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
     constexpr int64_t kBitWidth = sizeof(Word) * 8;
 
     // local, mutable variables which will be sliced/decremented to represent consumption:
-    // todo use std::array here
-    Bitmap bitmaps[N];
-    int64_t word_offsets[N];
+    Bitmap bitmaps[N];  // todo use std::array here
     int64_t bit_length = BitLength(bitmaps_arg);
-    View<Word> words[N];
 
     struct BitmapHolder {
+      BitmapHolder() = default;
       explicit BitmapHolder(Bitmap* bitmap_)
           : bitmap(bitmap_),
             word_offset(bitmap_->template word_offset<Word>()),
@@ -266,48 +279,31 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
         assert(BitmapHolder::word_offset >= 0 && BitmapHolder::word_offset < kBitWidth);
       }
 
-      //      void SliceAndUpdate(int64_t _offset, int64_t _length) {
-      //        BitmapHolder::bitmap = bitmap.Slice(_offset, _length);
-      //        BitmapHolder::word_offset = bitmap.template word_offset<Word>();
-      //        assert(BitmapHolder::word_offset >= 0 && BitmapHolder::word_offset <
-      //        kBitWidth); BitmapHolder::words = bitmap.template words<Word>();
-      //      }
-
-      void StrideAndUpdate(int64_t _stride) {
+      inline void StrideAndUpdate(int64_t _stride) {
         BitmapHolder::bitmap->Stride(_stride);
         BitmapHolder::word_offset = bitmap->template word_offset<Word>();
         assert(BitmapHolder::word_offset >= 0 && BitmapHolder::word_offset < kBitWidth);
         BitmapHolder::words = bitmap->template words<Word>();
       }
 
-      inline int64_t offset() const { return bitmap->offset_; }
-
-      inline const uint8_t* data() const { return bitmap->buffer_->data(); }
-
-      inline uint8_t* mutable_data() { return bitmap->buffer_->mutable_data(); }
-
-      Bitmap* bitmap;
-      int64_t word_offset;
+      Bitmap* bitmap{};
+      int64_t word_offset = 0;
       View<Word> words;
     };
 
+    std::array<BitmapHolder, N> in_bitmaps;
+    Bitmap out_bitmap = *out_bitmap_arg;  // make a copy
+
     for (size_t i = 0; i < N; ++i) {
-      bitmaps[i] = bitmaps_arg[i];
-      word_offsets[i] = bitmaps[i].template word_offset<Word>();
-      assert(word_offsets[i] >= 0 && word_offsets[i] < kBitWidth);
-      words[i] = bitmaps[i].template words<Word>();
+      bitmaps[i] = bitmaps_arg[i];  // make a copy
+      in_bitmaps[i] = BitmapHolder(&bitmaps[i]);
     }
 
-    BitmapHolder out_bitmap(out_bitmap_arg);
-
     auto consume = [&](int64_t consumed_bits) {
       for (size_t i = 0; i < N; ++i) {
-        bitmaps[i] = bitmaps[i].Slice(consumed_bits, bit_length - consumed_bits);
-        word_offsets[i] = bitmaps[i].template word_offset<Word>();
-        assert(word_offsets[i] >= 0 && word_offsets[i] < kBitWidth);
-        words[i] = bitmaps[i].template words<Word>();
+        in_bitmaps[i].StrideAndUpdate(consumed_bits);
       }
-      out_bitmap.StrideAndUpdate(consumed_bits);
+      out_bitmap.Stride(consumed_bits);
 
       bit_length -= consumed_bits;
     };
@@ -322,25 +318,33 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
         SafeLoadWords(bitmaps, 0, leading_bits, false, &visited_words);
         Word visit_out = visitor(visited_words);  // outputs a word/ partial word
         CopyBitmap(reinterpret_cast<uint8_t*>(&visit_out), 0, leading_bits,
-                   out_bitmap.mutable_data(), out_bitmap.offset());
+                   out_bitmap.buffer_->mutable_data(), out_bitmap.offset());
         consume(leading_bits);
       }
       return 0;
     }
 
-    int64_t max_word_offset = *std::max_element(word_offsets, word_offsets + N);
-    int64_t min_word_offset = *std::min_element(word_offsets, word_offsets + N);
+    auto word_offset_comp = [](const BitmapHolder& l, const BitmapHolder& r) {
+      return l.word_offset < r.word_offset;
+    };
+
+    int64_t max_word_offset =
+        (*std::max_element(in_bitmaps.begin(), in_bitmaps.end(), word_offset_comp))
+            .word_offset;
+    int64_t min_word_offset =
+        (*std::min_element(in_bitmaps.begin(), in_bitmaps.end(), word_offset_comp))
+            .word_offset;
     if (max_word_offset > 0) {
       // consume leading bits
       auto leading_bits = kBitWidth - min_word_offset;
       SafeLoadWords(bitmaps, 0, leading_bits, true, &visited_words);
       Word visit_out = visitor(visited_words);
       CopyBitmap(reinterpret_cast<uint8_t*>(&visit_out), sizeof(Word) * 8 - leading_bits,
-                 leading_bits, out_bitmap.mutable_data(), out_bitmap.offset());
+                 leading_bits, out_bitmap.buffer_->mutable_data(), out_bitmap.offset());
       consume(leading_bits);
     }
-    assert(*std::min_element(word_offsets, word_offsets + N) == 0);
-    assert(out_bitmap.word_offset == 0);
+    assert((*std::min_element(in_bitmaps.begin(), in_bitmaps.end(), word_offset_comp))
+               .word_offset == 0);
 
     int64_t whole_word_count = bit_length / kBitWidth;
     assert(whole_word_count >= 1);
@@ -350,18 +354,18 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
 
     if (min_word_offset == max_word_offset) {
       // all offsets were identical, all leading bits have been consumed
-      assert(std::all_of(word_offsets, word_offsets + N,
-                         [](int64_t offset) { return offset == 0; }));
-      assert(out_bitmap.word_offset == 0);
+      assert(std::all_of(
+          in_bitmaps.begin(), in_bitmaps.end(),
+          [](const BitmapHolder& holder) { return holder.word_offset == 0; }));
 
       for (int64_t word_i = 0; word_i < whole_word_count; ++word_i) {
         for (size_t i = 0; i < N; ++i) {
-          visited_words[i] = words[i][word_i];
+          visited_words[i] = in_bitmaps[i].words[word_i];
         }
         visit_outs.template emplace_back(visitor(visited_words));
       }
       CopyBitmap(reinterpret_cast<const uint8_t*>(visit_outs.data()), 0,
-                 whole_word_count * kBitWidth, out_bitmap.mutable_data(),
+                 whole_word_count * kBitWidth, out_bitmap.buffer_->mutable_data(),
                  out_bitmap.offset());
       consume(whole_word_count * kBitWidth);
     } else {
@@ -371,19 +375,21 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
       // within the bitmap for all i
       for (int64_t word_i = 0; word_i < whole_word_count - 1; ++word_i) {
         for (size_t i = 0; i < N; ++i) {
-          if (word_offsets[i] == 0) {
-            visited_words[i] = words[i][word_i];
+          const auto ith_words = in_bitmaps[i].words;
+          const auto ith_word_offset = in_bitmaps[i].word_offset;
+          if (ith_word_offset == 0) {
+            visited_words[i] = ith_words[word_i];
           } else {
-            auto words0 = BitUtil::ToLittleEndian(words[i][word_i]);
-            auto words1 = BitUtil::ToLittleEndian(words[i][word_i + 1]);
+            auto words0 = BitUtil::ToLittleEndian(ith_words[word_i]);
+            auto words1 = BitUtil::ToLittleEndian(ith_words[word_i + 1]);
             visited_words[i] = BitUtil::FromLittleEndian(
-                (words0 >> word_offsets[i]) | (words1 << (kBitWidth - word_offsets[i])));
+                (words0 >> ith_word_offset) | (words1 << (kBitWidth - ith_word_offset)));
           }
         }
         visit_outs.template emplace_back(visitor(visited_words));
       }
       CopyBitmap(reinterpret_cast<const uint8_t*>(visit_outs.data()), 0,
-                 (whole_word_count - 1) * kBitWidth, out_bitmap.mutable_data(),
+                 (whole_word_count - 1) * kBitWidth, out_bitmap.buffer_->mutable_data(),
                  out_bitmap.offset());
       consume((whole_word_count - 1) * kBitWidth);
 
@@ -391,7 +397,7 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
 
       Word visit_out = visitor(visited_words);  // outputs a word/ partial word
       CopyBitmap(reinterpret_cast<uint8_t*>(&visit_out), 0, kBitWidth,
-                 out_bitmap.mutable_data(), out_bitmap.offset());
+                 out_bitmap.buffer_->mutable_data(), out_bitmap.offset());
       consume(kBitWidth);
     }
 
@@ -400,7 +406,7 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
       SafeLoadWords(bitmaps, 0, bit_length, false, &visited_words);
       Word visit_out = visitor(visited_words);
       CopyBitmap(reinterpret_cast<uint8_t*>(&visit_out), 0, bit_length,
-                 out_bitmap.mutable_data(), out_bitmap.offset());
+                 out_bitmap.buffer_->mutable_data(), out_bitmap.offset());
     }
 
     return min_word_offset;
@@ -484,8 +490,8 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
 
   template <size_t N>
   static int64_t BitLength(const std::array<Bitmap, N>& bitmaps) {
-    for (size_t i = 1; i < bitmaps.size(); ++i) {
-      DCHECK_EQ(bitmaps[i].length(), bitmaps[0].length());
+    for (size_t i = 1; i < N; ++i) {
+      assert(bitmaps[i].length() == bitmaps[0].length());
     }
     return bitmaps[0].length();
   }
diff --git a/cpp/src/arrow/util/bitmap_test.cc b/cpp/src/arrow/util/bitmap_test.cc
new file mode 100644
index 00000000000..4dc6d5c0cee
--- /dev/null
+++ b/cpp/src/arrow/util/bitmap_test.cc
@@ -0,0 +1,132 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/bitmap.h"
+
+#include <arrow/array/builder_primitive.h>
+#include <arrow/testing/gtest_util.h>
+#include <gtest/gtest.h>
+
+#include <numeric>
+#include <random>
+
+#include "arrow/buffer.h"
+
+namespace arrow {
+namespace internal {
+
+void random_bool_vector(std::vector<bool>& vec, int64_t size, double p = 0.5) {
+  vec.reserve(size);
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::bernoulli_distribution d(p);
+
+  for (int n = 0; n < size; ++n) {
+    vec.push_back(d(gen));
+  }
+}
+
+void VerifyBoolOutput(const Bitmap& bitmap, const std::vector<bool>& expected) {
+  arrow::BooleanBuilder boolean_builder;
+  ASSERT_OK(boolean_builder.AppendValues(expected));
+  ASSERT_OK_AND_ASSIGN(auto arr, boolean_builder.Finish());
+
+  ASSERT_TRUE(BitmapEquals(bitmap.buffer()->data(), bitmap.offset(),
+                           arr->data()->buffers[1]->data(), 0, expected.size()));
+}
+
+class TestBitmapVisit : public ::testing::Test {};
+
+TEST_F(TestBitmapVisit, OutputZeroOffset) {
+  int64_t bits = 1000, part = bits / 4;
+
+  std::vector<bool> data;
+  random_bool_vector(data, bits);
+
+  arrow::BooleanBuilder boolean_builder;
+  ASSERT_OK(boolean_builder.AppendValues(data));
+  ASSERT_OK_AND_ASSIGN(auto arrow_data, boolean_builder.Finish());
+
+  std::shared_ptr<Buffer>& arrow_buffer = arrow_data->data()->buffers[1];
+
+  Bitmap bm0(arrow_buffer, 0, part);
+  Bitmap bm1 = bm0.Slice(part * 1, part);  // this goes beyond bm0's len
+  Bitmap bm2 = bm0.Slice(part * 2, part);  // this goes beyond bm0's len
+
+  ASSERT_OK_AND_ASSIGN(auto out, AllocateBitmap(part));
+  Bitmap out_bm(out, 0, part);
+
+  // (bm0 & bm1) | bm2
+  std::array<Bitmap, 3> bms{bm0, bm1, bm2};
+  Bitmap::VisitWordsAndWrite(
+      bms,
+      [](std::array<uint64_t, 3>& words) { return (words[0] & words[1]) | words[2]; },
+      &out_bm);
+
+  std::vector<bool> v0(data.begin(), data.begin() + part);
+  std::vector<bool> v1(data.begin() + part * 1, data.begin() + part * 2);
+  std::vector<bool> v2(data.begin() + part * 2, data.begin() + part * 3);
+  std::vector<bool> v3(part);
+  // v3 = v0 & v1
+  std::transform(v0.begin(), v0.end(), v1.begin(), v3.begin(), std::logical_and<bool>());
+  // v3 |= v2
+  std::transform(v3.begin(), v3.end(), v2.begin(), v3.begin(), std::logical_or<bool>());
+
+  VerifyBoolOutput(out_bm, v3);
+}
+
+TEST_F(TestBitmapVisit, OutputNonZeroOffset) {
+  int64_t bits = 1000, part = bits / 4;
+
+  std::vector<bool> data;
+  random_bool_vector(data, bits);
+
+  arrow::BooleanBuilder boolean_builder;
+  ASSERT_OK(boolean_builder.AppendValues(data));
+  ASSERT_OK_AND_ASSIGN(auto arrow_data, boolean_builder.Finish());
+
+  std::shared_ptr<Buffer>& arrow_buffer = arrow_data->data()->buffers[1];
+
+  Bitmap bm0(arrow_buffer, 0, part);
+  Bitmap bm1 = bm0.Slice(part * 1, part);  // this goes beyond bm0's len
+  Bitmap bm2 = bm0.Slice(part * 2, part);  // this goes beyond bm0's len
+
+  // allocate a larger buffer but only use the last `part` bits
+  ASSERT_OK_AND_ASSIGN(auto out, AllocateBitmap(part * 2));
+  Bitmap out_bm(out, part, part);
+
+  // (bm0 & bm1) | bm2
+  std::array<Bitmap, 3> bms{bm0, bm1, bm2};
+  Bitmap::VisitWordsAndWrite(
+      bms,
+      [](std::array<uint64_t, 3>& words) { return (words[0] & words[1]) | words[2]; },
+      &out_bm);
+
+  std::vector<bool> v0(data.begin(), data.begin() + part);
+  std::vector<bool> v1(data.begin() + part * 1, data.begin() + part * 2);
+  std::vector<bool> v2(data.begin() + part * 2, data.begin() + part * 3);
+  std::vector<bool> v3(part);
+  // v3 = v0 & v1
+  std::transform(v0.begin(), v0.end(), v1.begin(), v3.begin(), std::logical_and<bool>());
+  // v3 |= v2
+  std::transform(v3.begin(), v3.end(), v2.begin(), v3.begin(), std::logical_or<bool>());
+
+  VerifyBoolOutput(out_bm, v3);
+}
+
+}  // namespace internal
+}  // namespace arrow
\ No newline at end of file

From f0f3c83a47fc373a4fac0ac5e50e76bbe62b3f65 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Wed, 9 Jun 2021 17:10:10 -0400
Subject: [PATCH 05/46] moving BitmapWordReader and BitmapWordWriter to header
 files

---
 cpp/src/arrow/util/bitmap_ops.cc   | 218 -----------------------------
 cpp/src/arrow/util/bitmap_reader.h | 113 ++++++++++++++-
 cpp/src/arrow/util/bitmap_writer.h | 101 +++++++++++++
 3 files changed, 213 insertions(+), 219 deletions(-)

diff --git a/cpp/src/arrow/util/bitmap_ops.cc b/cpp/src/arrow/util/bitmap_ops.cc
index a27a61cadf3..63c8b008f4a 100644
--- a/cpp/src/arrow/util/bitmap_ops.cc
+++ b/cpp/src/arrow/util/bitmap_ops.cc
@@ -28,9 +28,7 @@
 #include "arrow/util/bit_util.h"
 #include "arrow/util/bitmap_reader.h"
 #include "arrow/util/bitmap_writer.h"
-#include "arrow/util/endian.h"
 #include "arrow/util/logging.h"
-#include "arrow/util/ubsan.h"
 
 namespace arrow {
 namespace internal {
@@ -85,222 +83,6 @@ int64_t CountSetBits(const uint8_t* data, int64_t bit_offset, int64_t length) {
   return count;
 }
 
-namespace {
-
-// BitmapWordReader here is faster than BitmapUInt64Reader (in bitmap_reader.h)
-// on sufficiently large inputs.  However, it has a larger prolog / epilog overhead
-// and should probably not be used for small bitmaps.
-
-template <typename Word>
-class BitmapWordReader {
- public:
-  BitmapWordReader(const uint8_t* bitmap, int64_t offset, int64_t length) {
-    bitmap_ = bitmap + offset / 8;
-    offset_ = offset % 8;
-    bitmap_end_ = bitmap_ + BitUtil::BytesForBits(offset_ + length);
-
-    // decrement word count by one as we may touch two adjacent words in one iteration
-    nwords_ = length / (sizeof(Word) * 8) - 1;
-    if (nwords_ < 0) {
-      nwords_ = 0;
-    }
-    trailing_bits_ = static_cast<int>(length - nwords_ * sizeof(Word) * 8);
-    trailing_bytes_ = static_cast<int>(BitUtil::BytesForBits(trailing_bits_));
-
-    if (nwords_ > 0) {
-      current_word_ = load<Word>(bitmap_);
-    } else if (length > 0) {
-      current_byte_ = load<uint8_t>(bitmap_);
-    }
-  }
-
-  Word NextWord() {
-    bitmap_ += sizeof(Word);
-    const Word next_word = load<Word>(bitmap_);
-    Word word = current_word_;
-    if (offset_) {
-      // combine two adjacent words into one word
-      // |<------ next ----->|<---- current ---->|
-      // +-------------+-----+-------------+-----+
-      // |     ---     |  A  |      B      | --- |
-      // +-------------+-----+-------------+-----+
-      //                  |         |       offset
-      //                  v         v
-      //               +-----+-------------+
-      //               |  A  |      B      |
-      //               +-----+-------------+
-      //               |<------ word ----->|
-      word >>= offset_;
-      word |= next_word << (sizeof(Word) * 8 - offset_);
-    }
-    current_word_ = next_word;
-    return word;
-  }
-
-  uint8_t NextTrailingByte(int& valid_bits) {
-    uint8_t byte;
-    DCHECK_GT(trailing_bits_, 0);
-
-    if (trailing_bits_ <= 8) {
-      // last byte
-      valid_bits = trailing_bits_;
-      trailing_bits_ = 0;
-      byte = 0;
-      internal::BitmapReader reader(bitmap_, offset_, valid_bits);
-      for (int i = 0; i < valid_bits; ++i) {
-        byte >>= 1;
-        if (reader.IsSet()) {
-          byte |= 0x80;
-        }
-        reader.Next();
-      }
-      byte >>= (8 - valid_bits);
-    } else {
-      ++bitmap_;
-      const uint8_t next_byte = load<uint8_t>(bitmap_);
-      byte = current_byte_;
-      if (offset_) {
-        byte >>= offset_;
-        byte |= next_byte << (8 - offset_);
-      }
-      current_byte_ = next_byte;
-      trailing_bits_ -= 8;
-      valid_bits = 8;
-    }
-    return byte;
-  }
-
-  int64_t words() const { return nwords_; }
-  int trailing_bytes() const { return trailing_bytes_; }
-
- private:
-  int64_t offset_;
-  const uint8_t* bitmap_;
-
-  const uint8_t* bitmap_end_;
-  int64_t nwords_;
-  int trailing_bits_;
-  int trailing_bytes_;
-  union {
-    Word current_word_;
-    struct {
-#if ARROW_LITTLE_ENDIAN == 0
-      uint8_t padding_bytes_[sizeof(Word) - 1];
-#endif
-      uint8_t current_byte_;
-    };
-  };
-
-  template <typename DType>
-  DType load(const uint8_t* bitmap) {
-    DCHECK_LE(bitmap + sizeof(DType), bitmap_end_);
-    return BitUtil::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
-  }
-};
-
-template <typename Word>
-class BitmapWordWriter {
- public:
-  BitmapWordWriter(uint8_t* bitmap, int64_t offset, int64_t length) {
-    bitmap_ = bitmap + offset / 8;
-    offset_ = offset % 8;
-    bitmap_end_ = bitmap_ + BitUtil::BytesForBits(offset_ + length);
-    mask_ = (1U << offset_) - 1;
-
-    if (offset_) {
-      if (length >= static_cast<int>(sizeof(Word) * 8)) {
-        current_word_ = load<Word>(bitmap_);
-      } else if (length > 0) {
-        current_byte_ = load<uint8_t>(bitmap_);
-      }
-    }
-  }
-
-  void PutNextWord(Word word) {
-    if (offset_) {
-      // split one word into two adjacent words, don't touch unused bits
-      //               |<------ word ----->|
-      //               +-----+-------------+
-      //               |  A  |      B      |
-      //               +-----+-------------+
-      //                  |         |
-      //                  v         v       offset
-      // +-------------+-----+-------------+-----+
-      // |     ---     |  A  |      B      | --- |
-      // +-------------+-----+-------------+-----+
-      // |<------ next ----->|<---- current ---->|
-      word = (word << offset_) | (word >> (sizeof(Word) * 8 - offset_));
-      Word next_word = load<Word>(bitmap_ + sizeof(Word));
-      current_word_ = (current_word_ & mask_) | (word & ~mask_);
-      next_word = (next_word & ~mask_) | (word & mask_);
-      store<Word>(bitmap_, current_word_);
-      store<Word>(bitmap_ + sizeof(Word), next_word);
-      current_word_ = next_word;
-    } else {
-      store<Word>(bitmap_, word);
-    }
-    bitmap_ += sizeof(Word);
-  }
-
-  void PutNextTrailingByte(uint8_t byte, int valid_bits) {
-    if (valid_bits == 8) {
-      if (offset_) {
-        byte = (byte << offset_) | (byte >> (8 - offset_));
-        uint8_t next_byte = load<uint8_t>(bitmap_ + 1);
-        current_byte_ = (current_byte_ & mask_) | (byte & ~mask_);
-        next_byte = (next_byte & ~mask_) | (byte & mask_);
-        store<uint8_t>(bitmap_, current_byte_);
-        store<uint8_t>(bitmap_ + 1, next_byte);
-        current_byte_ = next_byte;
-      } else {
-        store<uint8_t>(bitmap_, byte);
-      }
-      ++bitmap_;
-    } else {
-      DCHECK_GT(valid_bits, 0);
-      DCHECK_LT(valid_bits, 8);
-      DCHECK_LE(bitmap_ + BitUtil::BytesForBits(offset_ + valid_bits), bitmap_end_);
-      internal::BitmapWriter writer(bitmap_, offset_, valid_bits);
-      for (int i = 0; i < valid_bits; ++i) {
-        (byte & 0x01) ? writer.Set() : writer.Clear();
-        writer.Next();
-        byte >>= 1;
-      }
-      writer.Finish();
-    }
-  }
-
- private:
-  int64_t offset_;
-  uint8_t* bitmap_;
-
-  const uint8_t* bitmap_end_;
-  uint64_t mask_;
-  union {
-    Word current_word_;
-    struct {
-#if ARROW_LITTLE_ENDIAN == 0
-      uint8_t padding_bytes_[sizeof(Word) - 1];
-#endif
-      uint8_t current_byte_;
-    };
-  };
-
-  template <typename DType>
-  DType load(const uint8_t* bitmap) {
-    DCHECK_LE(bitmap + sizeof(DType), bitmap_end_);
-    return BitUtil::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
-  }
-
-  template <typename DType>
-  void store(uint8_t* bitmap, DType data) {
-    DCHECK_LE(bitmap + sizeof(DType), bitmap_end_);
-    util::SafeStore(bitmap, BitUtil::FromLittleEndian(data));
-  }
-};
-
-}  // namespace
-
 enum class TransferMode : bool { Copy, Invert };
 
 template <TransferMode mode>
diff --git a/cpp/src/arrow/util/bitmap_reader.h b/cpp/src/arrow/util/bitmap_reader.h
index cf4f5e7db8b..66c0df35cff 100644
--- a/cpp/src/arrow/util/bitmap_reader.h
+++ b/cpp/src/arrow/util/bitmap_reader.h
@@ -142,6 +142,117 @@ class BitmapUInt64Reader {
   uint64_t carry_bits_;
 };
 
+// BitmapWordReader is faster than BitmapUInt64Reader (defined above in this file)
+// on sufficiently large inputs.  However, it has a larger prolog / epilog overhead
+// and should probably not be used for small bitmaps.
+
+template <typename Word>
+class BitmapWordReader {
+ public:
+  BitmapWordReader(const uint8_t* bitmap, int64_t offset, int64_t length) {
+    bitmap_ = bitmap + offset / 8;
+    offset_ = offset % 8;
+    bitmap_end_ = bitmap_ + BitUtil::BytesForBits(offset_ + length);
+
+    // decrement word count by one as we may touch two adjacent words in one iteration
+    nwords_ = length / (sizeof(Word) * 8) - 1;
+    if (nwords_ < 0) {
+      nwords_ = 0;
+    }
+    trailing_bits_ = static_cast<int>(length - nwords_ * sizeof(Word) * 8);
+    trailing_bytes_ = static_cast<int>(BitUtil::BytesForBits(trailing_bits_));
+
+    if (nwords_ > 0) {
+      current_word_ = load<Word>(bitmap_);
+    } else if (length > 0) {
+      current_byte_ = load<uint8_t>(bitmap_);
+    }
+  }
+
+  Word NextWord() {
+    bitmap_ += sizeof(Word);
+    const Word next_word = load<Word>(bitmap_);
+    Word word = current_word_;
+    if (offset_) {
+      // combine two adjacent words into one word
+      // |<------ next ----->|<---- current ---->|
+      // +-------------+-----+-------------+-----+
+      // |     ---     |  A  |      B      | --- |
+      // +-------------+-----+-------------+-----+
+      //                  |         |       offset
+      //                  v         v
+      //               +-----+-------------+
+      //               |  A  |      B      |
+      //               +-----+-------------+
+      //               |<------ word ----->|
+      word >>= offset_;
+      word |= next_word << (sizeof(Word) * 8 - offset_);
+    }
+    current_word_ = next_word;
+    return word;
+  }
+
+  uint8_t NextTrailingByte(int& valid_bits) {
+    uint8_t byte;
+    assert(trailing_bits_ > 0);
+
+    if (trailing_bits_ <= 8) {
+      // last byte
+      valid_bits = trailing_bits_;
+      trailing_bits_ = 0;
+      byte = 0;
+      internal::BitmapReader reader(bitmap_, offset_, valid_bits);
+      for (int i = 0; i < valid_bits; ++i) {
+        byte >>= 1;
+        if (reader.IsSet()) {
+          byte |= 0x80;
+        }
+        reader.Next();
+      }
+      byte >>= (8 - valid_bits);
+    } else {
+      ++bitmap_;
+      const uint8_t next_byte = load<uint8_t>(bitmap_);
+      byte = current_byte_;
+      if (offset_) {
+        byte >>= offset_;
+        byte |= next_byte << (8 - offset_);
+      }
+      current_byte_ = next_byte;
+      trailing_bits_ -= 8;
+      valid_bits = 8;
+    }
+    return byte;
+  }
+
+  int64_t words() const { return nwords_; }
+  int trailing_bytes() const { return trailing_bytes_; }
+
+ private:
+  int64_t offset_;
+  const uint8_t* bitmap_;
+
+  const uint8_t* bitmap_end_;
+  int64_t nwords_;
+  int trailing_bits_;
+  int trailing_bytes_;
+  union {
+    Word current_word_;
+    struct {
+#if ARROW_LITTLE_ENDIAN == 0
+      uint8_t padding_bytes_[sizeof(Word) - 1];
+#endif
+      uint8_t current_byte_;
+    };
+  };
+
+  template <typename DType>
+  DType load(const uint8_t* bitmap) {
+    assert(bitmap + sizeof(DType) <= bitmap_end_);
+    return BitUtil::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
+  }
+};
+
 /// \brief Index into a possibly non-existent bitmap
 struct OptionalBitIndexer {
   const uint8_t* bitmap;
@@ -151,7 +262,7 @@ struct OptionalBitIndexer {
       : bitmap(buffer == NULLPTR ? NULLPTR : buffer->data()), offset(offset) {}
 
   bool operator[](int64_t i) const {
-    return bitmap == NULLPTR ? true : BitUtil::GetBit(bitmap, offset + i);
+    return bitmap == NULLPTR || BitUtil::GetBit(bitmap, offset + i);
   }
 };
 
diff --git a/cpp/src/arrow/util/bitmap_writer.h b/cpp/src/arrow/util/bitmap_writer.h
index d4f02f37a41..afe0dcea35f 100644
--- a/cpp/src/arrow/util/bitmap_writer.h
+++ b/cpp/src/arrow/util/bitmap_writer.h
@@ -180,5 +180,106 @@ class FirstTimeBitmapWriter {
   int64_t byte_offset_;
 };
 
+template <typename Word>
+class BitmapWordWriter {
+ public:
+  BitmapWordWriter(uint8_t* bitmap, int64_t offset, int64_t length) {
+    bitmap_ = bitmap + offset / 8;
+    offset_ = offset % 8;
+    bitmap_end_ = bitmap_ + BitUtil::BytesForBits(offset_ + length);
+    mask_ = (1U << offset_) - 1;
+
+    if (offset_) {
+      if (length >= static_cast<int>(sizeof(Word) * 8)) {
+        current_word_ = load<Word>(bitmap_);
+      } else if (length > 0) {
+        current_byte_ = load<uint8_t>(bitmap_);
+      }
+    }
+  }
+
+  void PutNextWord(Word word) {
+    if (offset_) {
+      // split one word into two adjacent words, don't touch unused bits
+      //               |<------ word ----->|
+      //               +-----+-------------+
+      //               |  A  |      B      |
+      //               +-----+-------------+
+      //                  |         |
+      //                  v         v       offset
+      // +-------------+-----+-------------+-----+
+      // |     ---     |  A  |      B      | --- |
+      // +-------------+-----+-------------+-----+
+      // |<------ next ----->|<---- current ---->|
+      word = (word << offset_) | (word >> (sizeof(Word) * 8 - offset_));
+      Word next_word = load<Word>(bitmap_ + sizeof(Word));
+      current_word_ = (current_word_ & mask_) | (word & ~mask_);
+      next_word = (next_word & ~mask_) | (word & mask_);
+      store<Word>(bitmap_, current_word_);
+      store<Word>(bitmap_ + sizeof(Word), next_word);
+      current_word_ = next_word;
+    } else {
+      store<Word>(bitmap_, word);
+    }
+    bitmap_ += sizeof(Word);
+  }
+
+  void PutNextTrailingByte(uint8_t byte, int valid_bits) {
+    if (valid_bits == 8) {
+      if (offset_) {
+        byte = (byte << offset_) | (byte >> (8 - offset_));
+        uint8_t next_byte = load<uint8_t>(bitmap_ + 1);
+        current_byte_ = (current_byte_ & mask_) | (byte & ~mask_);
+        next_byte = (next_byte & ~mask_) | (byte & mask_);
+        store<uint8_t>(bitmap_, current_byte_);
+        store<uint8_t>(bitmap_ + 1, next_byte);
+        current_byte_ = next_byte;
+      } else {
+        store<uint8_t>(bitmap_, byte);
+      }
+      ++bitmap_;
+    } else {
+      assert(valid_bits > 0);
+      assert(valid_bits < 8);
+      assert(bitmap_ + BitUtil::BytesForBits(offset_ + valid_bits) <= bitmap_end_);
+      internal::BitmapWriter writer(bitmap_, offset_, valid_bits);
+      for (int i = 0; i < valid_bits; ++i) {
+        (byte & 0x01) ? writer.Set() : writer.Clear();
+        writer.Next();
+        byte >>= 1;
+      }
+      writer.Finish();
+    }
+  }
+
+ private:
+  int64_t offset_;
+  uint8_t* bitmap_;
+
+  const uint8_t* bitmap_end_;
+  uint64_t mask_;
+  union {
+    Word current_word_;
+    struct {
+#if ARROW_LITTLE_ENDIAN == 0
+      uint8_t padding_bytes_[sizeof(Word) - 1];
+#endif
+      uint8_t current_byte_;
+    };
+  };
+
+  template <typename DType>
+  DType load(const uint8_t* bitmap) {
+    assert(bitmap + sizeof(DType) <= bitmap_end_);
+    return BitUtil::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
+  }
+
+  template <typename DType>
+  void store(uint8_t* bitmap, DType data) {
+    assert(bitmap + sizeof(DType) <= bitmap_end_);
+    util::SafeStore(bitmap, BitUtil::FromLittleEndian(data));
+  }
+};
+
 }  // namespace internal
 }  // namespace arrow

From 7c6a4ef29067e0bde006e59f768bb3de0c2ffe6b Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Thu, 10 Jun 2021 20:49:08 -0400
Subject: [PATCH 06/46] adding multiple writers and testing w/ offsets

---
 cpp/src/arrow/util/bitmap.h        | 115 +++++++++++++++++++++
 cpp/src/arrow/util/bitmap_reader.h |   4 +-
 cpp/src/arrow/util/bitmap_test.cc  | 156 ++++++++++++++++++++++++++---
 cpp/src/arrow/util/bitmap_writer.h |  22 +++-
 4 files changed, 282 insertions(+), 15 deletions(-)

diff --git a/cpp/src/arrow/util/bitmap.h b/cpp/src/arrow/util/bitmap.h
index 877811afd31..7174923fca4 100644
--- a/cpp/src/arrow/util/bitmap.h
+++ b/cpp/src/arrow/util/bitmap.h
@@ -30,6 +30,8 @@
 #include "arrow/buffer.h"
 #include "arrow/util/bit_util.h"
 #include "arrow/util/bitmap_ops.h"
+#include "arrow/util/bitmap_reader.h"
+#include "arrow/util/bitmap_writer.h"
 #include "arrow/util/compare.h"
 #include "arrow/util/endian.h"
 #include "arrow/util/functional.h"
@@ -412,6 +414,119 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
     return min_word_offset;
   }
 
+  template <size_t N, size_t M, typename Word>  // callback: N input words, M outputs
+  using MultiOutputVisitor = std::function<void(const std::array<Word, N>& in_words,
+                                                std::array<Word, M>& out_words)>;
+
+  template <size_t N, size_t M, typename Word>  // visit N bitmaps, write M bitmaps
+  static void VisitWordsAndWrite(const std::array<Bitmap, N>& bitmaps_arg,
+                                 MultiOutputVisitor<N, M, Word>&& visitor,
+                                 std::array<Bitmap, M>& out_bitmaps_arg) {
+    constexpr int64_t kBitWidth = sizeof(Word) * 8;
+
+    int64_t bit_length = BitLength(bitmaps_arg);  // bits still to be processed
+    assert(bit_length == BitLength(out_bitmaps_arg));
+
+    std::array<BitmapWordReader<Word>, N> readers;
+    for (size_t i = 0; i < N; ++i) {
+      readers[i] = BitmapWordReader<Word>(bitmaps_arg[i].buffer_->data(),
+                                          bitmaps_arg[i].offset_, bitmaps_arg[i].length_);
+    }
+
+    std::array<BitmapWordWriter<Word>, M> writers;
+    for (size_t i = 0; i < M; ++i) {
+      writers[i] =
+          BitmapWordWriter<Word>(out_bitmaps_arg[i].buffer_->mutable_data(),
+                                 out_bitmaps_arg[i].offset_, out_bitmaps_arg[i].length_);
+    }
+
+    std::array<Word, N> visited_words;
+    visited_words.fill(0);
+    std::array<Word, M> output_words;
+    output_words.fill(0);
+
+    // Every reader yields the same number of full words, since all bitmaps have the
+    // same length. TODO: this is inefficient when an offset is not Word-aligned,
+    // because every Word then has to be assembled from 2 adjoining Words.
+    auto n_words = readers[0].words();
+    while (n_words--) {
+      // First collect one word from every reader into visited_words.
+      for (size_t i = 0; i < N; i++) {
+        visited_words[i] = readers[i].NextWord();
+      }
+
+      visitor(visited_words, output_words);
+
+      for (size_t i = 0; i < M; i++) {
+        writers[i].PutNextWord(output_words[i]);
+      }
+
+      bit_length -= kBitWidth;
+    }
+
+    // Every reader has the same number of trailing bytes, for the same reason.
+    // TODO: when the issue above is resolved, the following logic must be fixed too.
+    // The trailing portion can exceed one word (ref: BitmapWordReader constructor).
+    assert(static_cast<size_t>(bit_length) < kBitWidth * 2);
+    if (bit_length / kBitWidth) {
+      // There is one full word in the trailing portion. NextWord() can't be used here
+      // because it doesn't update the trailing metadata.
+      for (size_t i = 0; i < N; i++) {
+        visited_words[i] = 0;
+        for (size_t b = 0; b < sizeof(Word); b++) {
+          int dummy;
+          auto byte = static_cast<Word>(readers[i].NextTrailingByte(dummy));
+          visited_words[i] |= byte << (b * 8);
+        }
+      }
+
+      visitor(visited_words, output_words);
+
+      for (size_t i = 0; i < M; i++) {
+        writers[i].PutNextWord(output_words[i]);
+      }
+
+      bit_length -= kBitWidth;
+    }
+
+    // Clean up: visit and write the last partial word, if any bits remain.
+    if (bit_length) {
+      output_words.fill(0);
+      for (size_t i = 0; i < N; i++) {
+        visited_words[i] = 0;
+        int n_byte = readers[i].trailing_bytes();
+        for (int b = 0; b < n_byte; b++) {
+          int valid_bits;
+          auto byte = static_cast<Word>(readers[i].NextTrailingByte(valid_bits));
+          visited_words[i] |= (byte << b * 8);
+        }
+      }
+
+      visitor(visited_words, output_words);
+
+      for (size_t i = 0; i < M; i++) {
+        writers[i].PutNextWord(output_words[i], bit_length);
+      }
+    }
+  }
+
+  template <size_t N, typename Word>  // callback: N input words, one output word
+  using SingleOutputVisitor =
+      std::function<void(const std::array<Word, N>& in_words, Word& out_words)>;
+
+  template <size_t N, typename Word>  // single-output convenience overload
+  static void VisitWordsAndWrite(const std::array<Bitmap, N>& bitmaps_arg,
+                                 SingleOutputVisitor<N, Word>&& visitor,
+                                 Bitmap& out_bitmap_arg) {
+    std::array<Bitmap, 1> out_bitmaps{out_bitmap_arg};  // wrap for the M-output form
+    VisitWordsAndWrite<N, 1, Word>(
+        bitmaps_arg,
+        [&](const std::array<Word, N>& in_words, std::array<Word, 1>& out_words) {
+          visitor(in_words, out_words[0]);  // adapt array<Word, 1>& to Word&
+        },
+        out_bitmaps);
+  }
+
   const std::shared_ptr<Buffer>& buffer() const { return buffer_; }
 
   /// offset of first bit relative to buffer().data()
diff --git a/cpp/src/arrow/util/bitmap_reader.h b/cpp/src/arrow/util/bitmap_reader.h
index 66c0df35cff..a562e9a1294 100644
--- a/cpp/src/arrow/util/bitmap_reader.h
+++ b/cpp/src/arrow/util/bitmap_reader.h
@@ -149,10 +149,11 @@ class BitmapUInt64Reader {
 template <typename Word>
 class BitmapWordReader {
  public:
+  BitmapWordReader() = default;
   BitmapWordReader(const uint8_t* bitmap, int64_t offset, int64_t length) {
     bitmap_ = bitmap + offset / 8;
     offset_ = offset % 8;
-    bitmap_end_ = bitmap_ + BitUtil::BytesForBits(offset_ + length);
+    bitmap_end_ = bitmap_ + BitUtil::BytesForBits(offset + length);
 
     // decrement word count by one as we may touch two adjacent words in one iteration
     nwords_ = length / (sizeof(Word) * 8) - 1;
@@ -220,6 +221,7 @@ class BitmapWordReader {
       }
       current_byte_ = next_byte;
       trailing_bits_ -= 8;
+      trailing_bytes_--;
       valid_bits = 8;
     }
     return byte;
diff --git a/cpp/src/arrow/util/bitmap_test.cc b/cpp/src/arrow/util/bitmap_test.cc
index 4dc6d5c0cee..0db0fa5854c 100644
--- a/cpp/src/arrow/util/bitmap_test.cc
+++ b/cpp/src/arrow/util/bitmap_test.cc
@@ -40,19 +40,29 @@ void random_bool_vector(std::vector<bool>& vec, int64_t size, double p = 0.5) {
   }
 }
 
+// Render bits as '0'/'1', space-separated into groups of 8; "" for an empty vector.
+std::string VectorToString(const std::vector<bool>& v) {
+  // (size - 1) / 8 separators; guard empty input against unsigned wrap-around
+  std::string out(v.empty() ? 0 : v.size() + (v.size() - 1) / 8, ' ');
+  for (size_t i = 0; i < v.size(); ++i) out[i + i / 8] = v[i] ? '1' : '0';
+  return out;
+}
+
 void VerifyBoolOutput(const Bitmap& bitmap, const std::vector<bool>& expected) {
   arrow::BooleanBuilder boolean_builder;
   ASSERT_OK(boolean_builder.AppendValues(expected));
   ASSERT_OK_AND_ASSIGN(auto arr, boolean_builder.Finish());
 
   ASSERT_TRUE(BitmapEquals(bitmap.buffer()->data(), bitmap.offset(),
-                           arr->data()->buffers[1]->data(), 0, expected.size()));
+                           arr->data()->buffers[1]->data(), 0, expected.size()))
+      << "exp: " << VectorToString(expected) << "\ngot: " << bitmap.ToString();
 }
 
 class TestBitmapVisit : public ::testing::Test {};
 
-TEST_F(TestBitmapVisit, OutputZeroOffset) {
-  int64_t bits = 1000, part = bits / 4;
+TEST_F(TestBitmapVisit, SingleWriterOutputZeroOffset) {
+  // choosing part = 199, a prime, so that shifts are falling in-between bytes
+  int64_t part = 199, bits = part * 4;
 
   std::vector<bool> data;
   random_bool_vector(data, bits);
@@ -70,12 +80,14 @@ TEST_F(TestBitmapVisit, OutputZeroOffset) {
   ASSERT_OK_AND_ASSIGN(auto out, AllocateBitmap(part));
   Bitmap out_bm(out, 0, part);
 
+  auto visitor = [](const std::array<uint64_t, 3>& in_words, uint64_t& out_words) {
+    out_words = (in_words[0] & in_words[1]) | in_words[2];
+  };
+
   // (bm0 & bm1) | bm2
-  std::array<Bitmap, 3> bms{bm0, bm1, bm2};
   Bitmap::VisitWordsAndWrite(
-      bms,
-      [](std::array<uint64_t, 3>& words) { return (words[0] & words[1]) | words[2]; },
-      &out_bm);
+      {bm0, bm1, bm2}, std::forward<Bitmap::SingleOutputVisitor<3, uint64_t>>(visitor),
+      out_bm);
 
   std::vector<bool> v0(data.begin(), data.begin() + part);
   std::vector<bool> v1(data.begin() + part * 1, data.begin() + part * 2);
@@ -89,8 +101,9 @@ TEST_F(TestBitmapVisit, OutputZeroOffset) {
   VerifyBoolOutput(out_bm, v3);
 }
 
-TEST_F(TestBitmapVisit, OutputNonZeroOffset) {
-  int64_t bits = 1000, part = bits / 4;
+TEST_F(TestBitmapVisit, SingleWriterOutputNonZeroOffset) {
+  // choosing part = 199, a prime
+  int64_t part = 199, bits = part * 4;
 
   std::vector<bool> data;
   random_bool_vector(data, bits);
@@ -109,12 +122,14 @@ TEST_F(TestBitmapVisit, OutputNonZeroOffset) {
   ASSERT_OK_AND_ASSIGN(auto out, AllocateBitmap(part * 2));
   Bitmap out_bm(out, part, part);
 
+  auto visitor = [](const std::array<uint64_t, 3>& in_words, uint64_t& out_words) {
+    out_words = (in_words[0] & in_words[1]) | in_words[2];
+  };
+
   // (bm0 & bm1) | bm2
-  std::array<Bitmap, 3> bms{bm0, bm1, bm2};
   Bitmap::VisitWordsAndWrite(
-      bms,
-      [](std::array<uint64_t, 3>& words) { return (words[0] & words[1]) | words[2]; },
-      &out_bm);
+      {bm0, bm1, bm2}, std::forward<Bitmap::SingleOutputVisitor<3, uint64_t>>(visitor),
+      out_bm);
 
   std::vector<bool> v0(data.begin(), data.begin() + part);
   std::vector<bool> v1(data.begin() + part * 1, data.begin() + part * 2);
@@ -128,5 +143,120 @@ TEST_F(TestBitmapVisit, OutputNonZeroOffset) {
   VerifyBoolOutput(out_bm, v3);
 }
 
+TEST_F(TestBitmapVisit, MultiWriterOutputZeroOffset) {
+  // part = 199 is prime, so the slices below start at non-byte-aligned bit offsets
+  int64_t part = 199, bits = part * 4;
+
+  std::vector<bool> data;
+  random_bool_vector(data, bits);
+
+  arrow::BooleanBuilder boolean_builder;
+  ASSERT_OK(boolean_builder.AppendValues(data));
+  ASSERT_OK_AND_ASSIGN(auto arrow_data, boolean_builder.Finish());
+
+  std::shared_ptr<Buffer>& arrow_buffer = arrow_data->data()->buffers[1];
+
+  Bitmap bm0(arrow_buffer, 0, part);
+  Bitmap bm1 = bm0.Slice(part * 1, part);  // this goes beyond bm0's len
+  Bitmap bm2 = bm0.Slice(part * 2, part);  // this goes beyond bm0's len
+
+  std::array<Bitmap, 2> out_bms;
+  ASSERT_OK_AND_ASSIGN(auto out0, AllocateBitmap(part));
+  ASSERT_OK_AND_ASSIGN(auto out1, AllocateBitmap(part));
+  out_bms[0] = Bitmap(out0, 0, part);
+  out_bms[1] = Bitmap(out1, 0, part);
+
+  std::vector<bool> v0(data.begin(), data.begin() + part);
+  std::vector<bool> v1(data.begin() + part * 1, data.begin() + part * 2);
+  std::vector<bool> v2(data.begin() + part * 2, data.begin() + part * 3);
+
+  // out0 = bm0 & bm1, out1 = bm0 | bm2
+  auto visitor_func = [](const std::array<uint64_t, 3>& in,
+                         std::array<uint64_t, 2>& out) {
+    out[0] = in[0] & in[1];
+    out[1] = in[0] | in[2];
+  };
+
+  Bitmap::VisitWordsAndWrite(
+      {bm0, bm1, bm2},
+      std::forward<Bitmap::MultiOutputVisitor<3, 2, uint64_t>>(visitor_func), out_bms);
+
+  std::vector<bool> out_v0(part);
+  std::vector<bool> out_v1(part);
+  // expected first output: out_v0 = v0 & v1
+  std::transform(v0.begin(), v0.end(), v1.begin(), out_v0.begin(),
+                 std::logical_and<bool>());
+  // expected second output: out_v1 = v0 | v2
+  std::transform(v0.begin(), v0.end(), v2.begin(), out_v1.begin(),
+                 std::logical_or<bool>());
+
+  //  std::cout << "v0: " << VectorToString(v0)<< "\n";
+  //  std::cout << "b0: " << bm0.ToString()<< "\n";
+  //  std::cout << "v1: " << VectorToString(v1)<< "\n";
+  //  std::cout << "b1: " << bm1.ToString()<< "\n";
+  //  std::cout << "v2: " << VectorToString(v2) << "\n";
+  //  std::cout << "b2: " << bm2.ToString() << "\n";
+
+  VerifyBoolOutput(out_bms[0], out_v0);
+  VerifyBoolOutput(out_bms[1], out_v1);
+}
+
+TEST_F(TestBitmapVisit, MultiWriterOutputNonZeroOffset) {
+  // part = 199 is prime, so the slices below start at non-byte-aligned bit offsets
+  int64_t part = 199, bits = part * 4;
+
+  std::vector<bool> data;
+  random_bool_vector(data, bits);
+
+  arrow::BooleanBuilder boolean_builder;
+  ASSERT_OK(boolean_builder.AppendValues(data));
+  ASSERT_OK_AND_ASSIGN(auto arrow_data, boolean_builder.Finish());
+
+  std::shared_ptr<Buffer>& arrow_buffer = arrow_data->data()->buffers[1];
+
+  Bitmap bm0(arrow_buffer, 0, part);
+  Bitmap bm1 = bm0.Slice(part * 1, part);  // this goes beyond bm0's len
+  Bitmap bm2 = bm0.Slice(part * 2, part);  // this goes beyond bm0's len
+
+  std::array<Bitmap, 2> out_bms;
+  ASSERT_OK_AND_ASSIGN(auto out, AllocateBitmap(part * 4));
+  out_bms[0] = Bitmap(out, part, part);
+  out_bms[1] = Bitmap(out, part * 2, part);
+
+  std::vector<bool> v0(data.begin(), data.begin() + part);
+  std::vector<bool> v1(data.begin() + part * 1, data.begin() + part * 2);
+  std::vector<bool> v2(data.begin() + part * 2, data.begin() + part * 3);
+
+  //  std::cout << "v0: " << VectorToString(v0)<< "\n";
+  //  std::cout << "b0: " << bm0.ToString() << "\n";
+  //  std::cout << "v1: " << VectorToString(v1) << "\n";
+  //  std::cout << "b1: " << bm1.ToString() << "\n";
+  //  std::cout << "v2: " << VectorToString(v2) << "\n";
+  //  std::cout << "b2: " << bm2.ToString() << "\n";
+
+  // out0 = bm0 & bm1, out1 = bm0 | bm2
+  auto visitor_func = [](const std::array<uint64_t, 3>& in,
+                         std::array<uint64_t, 2>& out) {
+    out[0] = in[0] & in[1];
+    out[1] = in[0] | in[2];
+  };
+
+  Bitmap::VisitWordsAndWrite(
+      {bm0, bm1, bm2},
+      std::forward<Bitmap::MultiOutputVisitor<3, 2, uint64_t>>(visitor_func), out_bms);
+
+  std::vector<bool> out_v0(part);
+  std::vector<bool> out_v1(part);
+  // expected first output: out_v0 = v0 & v1
+  std::transform(v0.begin(), v0.end(), v1.begin(), out_v0.begin(),
+                 std::logical_and<bool>());
+  // expected second output: out_v1 = v0 | v2
+  std::transform(v0.begin(), v0.end(), v2.begin(), out_v1.begin(),
+                 std::logical_or<bool>());
+
+  VerifyBoolOutput(out_bms[0], out_v0);
+  VerifyBoolOutput(out_bms[1], out_v1);
+}
+
 }  // namespace internal
 }  // namespace arrow
\ No newline at end of file
diff --git a/cpp/src/arrow/util/bitmap_writer.h b/cpp/src/arrow/util/bitmap_writer.h
index afe0dcea35f..ca75abbf15c 100644
--- a/cpp/src/arrow/util/bitmap_writer.h
+++ b/cpp/src/arrow/util/bitmap_writer.h
@@ -183,10 +183,11 @@ class FirstTimeBitmapWriter {
 template <typename Word>
 class BitmapWordWriter {
  public:
+  BitmapWordWriter() = default;
   BitmapWordWriter(uint8_t* bitmap, int64_t offset, int64_t length) {
     bitmap_ = bitmap + offset / 8;
     offset_ = offset % 8;
-    bitmap_end_ = bitmap_ + BitUtil::BytesForBits(offset_ + length);
+    bitmap_end_ = bitmap_ + BitUtil::BytesForBits(offset + length);
     mask_ = (1U << offset_) - 1;
 
     if (offset_) {
@@ -224,6 +225,25 @@ class BitmapWordWriter {
     bitmap_ += sizeof(Word);
   }
 
+  /// Write only the low `valid_bits` bits of `word` (0 <= valid_bits <= word width),
+  /// least-significant byte first, independent of the host byte order.
+  void PutNextWord(Word word, int valid_bits) {
+    constexpr int kWordBits = static_cast<int>(sizeof(Word) * 8);
+    assert(valid_bits >= 0 && valid_bits <= kWordBits);
+    if (ARROW_PREDICT_FALSE(valid_bits == 0)) return;
+    if (ARROW_PREDICT_FALSE(valid_bits == kWordBits)) return PutNextWord(word);
+
+    int i = 0;
+    for (; i < valid_bits / 8; i++) {
+      // shift instead of aliasing &word: in-memory byte order differs on big-endian
+      PutNextTrailingByte(static_cast<uint8_t>(word >> (i * 8)), 8);
+    }
+    // final partial byte (1..7 bits), taken from the next-higher byte of `word`
+    if (const int remainder = valid_bits % 8) {
+      PutNextTrailingByte(static_cast<uint8_t>(word >> (i * 8)), remainder);
+    }
+  }
+
   void PutNextTrailingByte(uint8_t byte, int valid_bits) {
     if (valid_bits == 8) {
       if (offset_) {

From 1e223011f8d5a970be0a2aa14160b12f4e029af2 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Fri, 11 Jun 2021 01:10:15 +0000
Subject: [PATCH 07/46] Autoformat/render all the things [automated commit]

---
 r/man/ChunkedArray.Rd           | 22 --------------------
 r/man/Field.Rd                  |  5 -----
 r/man/FileFormat.Rd             | 15 -------------
 r/man/ParquetFileReader.Rd      | 12 -----------
 r/man/RecordBatch.Rd            | 11 ----------
 r/man/RecordBatchReader.Rd      | 37 ---------------------------------
 r/man/RecordBatchWriter.Rd      | 37 ---------------------------------
 r/man/Scalar.Rd                 | 17 ---------------
 r/man/Schema.Rd                 |  9 --------
 r/man/Table.Rd                  | 11 ----------
 r/man/buffer.Rd                 |  9 --------
 r/man/call_function.Rd          | 10 ---------
 r/man/codec_is_available.Rd     |  5 -----
 r/man/copy_files.Rd             | 10 ---------
 r/man/data-type.Rd              |  8 -------
 r/man/hive_partition.Rd         |  5 -----
 r/man/list_compute_functions.Rd |  7 -------
 r/man/load_flight_server.Rd     |  5 -----
 r/man/match_arrow.Rd            | 25 ----------------------
 r/man/read_delim_arrow.Rd       | 11 ----------
 r/man/read_feather.Rd           | 11 ----------
 r/man/read_json_arrow.Rd        | 12 -----------
 r/man/read_parquet.Rd           |  9 --------
 r/man/s3_bucket.Rd              |  5 -----
 r/man/type.Rd                   | 10 ---------
 r/man/unify_schemas.Rd          |  7 -------
 r/man/value_counts.Rd           |  6 ------
 r/man/write_csv_arrow.Rd        |  7 -------
 r/man/write_feather.Rd          |  7 -------
 r/man/write_ipc_stream.Rd       |  7 -------
 r/man/write_parquet.Rd          | 12 -----------
 r/man/write_to_raw.Rd           |  7 -------
 32 files changed, 371 deletions(-)

diff --git a/r/man/ChunkedArray.Rd b/r/man/ChunkedArray.Rd
index eaae0b3d4b8..486b6222af7 100644
--- a/r/man/ChunkedArray.Rd
+++ b/r/man/ChunkedArray.Rd
@@ -53,28 +53,6 @@ within the array's internal data. This can be an expensive check, potentially \c
 }
 }
 
-\examples{
-\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-# Pass items into chunked_array as separate objects to create chunks
-class_scores <- chunked_array(c(87, 88, 89), c(94, 93, 92), c(71, 72, 73))
-class_scores$num_chunks
-
-# When taking a Slice from a chunked_array, chunks are preserved
-class_scores$Slice(2, length = 5)
-
-# You can combine Take and SortIndices to return a ChunkedArray with 1 chunk 
-# containing all values, ordered.
-class_scores$Take(class_scores$SortIndices(descending = TRUE))
-
-# If you pass a list into chunked_array, you get a list of length 1
-list_scores <- chunked_array(list(c(9.9, 9.6, 9.5), c(8.2, 8.3, 8.4), c(10.0, 9.9, 9.8)))
-list_scores$num_chunks
-
-# When constructing a ChunkedArray, the first chunk is used to infer type.
-doubles <- chunked_array(c(1, 2, 3), c(5L, 6L, 7L))
-doubles$type
-\dontshow{\}) # examplesIf}
-}
 \seealso{
 \link{Array}
 }
diff --git a/r/man/Field.Rd b/r/man/Field.Rd
index 77d31fa637a..03dffd11ca9 100644
--- a/r/man/Field.Rd
+++ b/r/man/Field.Rd
@@ -28,8 +28,3 @@ field(name, type, metadata)
 }
 }
 
-\examples{
-\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-field("x", int32())
-\dontshow{\}) # examplesIf}
-}
diff --git a/r/man/FileFormat.Rd b/r/man/FileFormat.Rd
index 5bc9475b408..b8d4dc01bad 100644
--- a/r/man/FileFormat.Rd
+++ b/r/man/FileFormat.Rd
@@ -51,18 +51,3 @@ From \link{CsvFragmentScanOptions} (these values can be overridden at scan time)
 It returns the appropriate subclass of \code{FileFormat} (e.g. \code{ParquetFileFormat})
 }
 
-\examples{
-\dontshow{if (arrow_with_dataset()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-## Semi-colon delimited files
-# Set up directory for examples
-tf <- tempfile()
-dir.create(tf)
-on.exit(unlink(tf))
-write.table(mtcars, file.path(tf, "file1.txt"), sep = ";", row.names = FALSE)
-
-# Create FileFormat object
-format <- FileFormat$create(format = "text", delimiter = ";")
-
-open_dataset(tf, format = format)
-\dontshow{\}) # examplesIf}
-}
diff --git a/r/man/ParquetFileReader.Rd b/r/man/ParquetFileReader.Rd
index 39146919768..e97cf80ee7a 100644
--- a/r/man/ParquetFileReader.Rd
+++ b/r/man/ParquetFileReader.Rd
@@ -44,15 +44,3 @@ The optional \verb{column_indices=} argument is a 0-based integer vector indicat
 }
 }
 
-\examples{
-\dontshow{if (arrow_with_parquet()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-f <- system.file("v0.7.1.parquet", package="arrow")
-pq <- ParquetFileReader$create(f)
-pq$GetSchema()
-if (codec_is_available("snappy")) {
-  # This file has compressed data columns
-  tab <- pq$ReadTable()
-  tab$schema
-}
-\dontshow{\}) # examplesIf}
-}
diff --git a/r/man/RecordBatch.Rd b/r/man/RecordBatch.Rd
index ff08c215853..e3024b91b7a 100644
--- a/r/man/RecordBatch.Rd
+++ b/r/man/RecordBatch.Rd
@@ -79,14 +79,3 @@ All list elements are coerced to string. See \code{schema()} for more informatio
 }
 }
 
-\examples{
-\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-batch <- record_batch(name = rownames(mtcars), mtcars)
-dim(batch)
-dim(head(batch))
-names(batch)
-batch$mpg
-batch[["cyl"]]
-as.data.frame(batch[4:8, c("gear", "hp", "wt")])
-\dontshow{\}) # examplesIf}
-}
diff --git a/r/man/RecordBatchReader.Rd b/r/man/RecordBatchReader.Rd
index 90c796a6693..a206c30c8fb 100644
--- a/r/man/RecordBatchReader.Rd
+++ b/r/man/RecordBatchReader.Rd
@@ -43,43 +43,6 @@ are in the file.
 }
 }
 
-\examples{
-\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-tf <- tempfile()
-on.exit(unlink(tf))
-
-batch <- record_batch(chickwts)
-
-# This opens a connection to the file in Arrow
-file_obj <- FileOutputStream$create(tf)
-# Pass that to a RecordBatchWriter to write data conforming to a schema
-writer <- RecordBatchFileWriter$create(file_obj, batch$schema)
-writer$write(batch)
-# You may write additional batches to the stream, provided that they have
-# the same schema.
-# Call "close" on the writer to indicate end-of-file/stream
-writer$close()
-# Then, close the connection--closing the IPC message does not close the file
-file_obj$close()
-
-# Now, we have a file we can read from. Same pattern: open file connection,
-# then pass it to a RecordBatchReader
-read_file_obj <- ReadableFile$create(tf)
-reader <- RecordBatchFileReader$create(read_file_obj)
-# RecordBatchFileReader knows how many batches it has (StreamReader does not)
-reader$num_record_batches
-# We could consume the Reader by calling $read_next_batch() until all are,
-# consumed, or we can call $read_table() to pull them all into a Table
-tab <- reader$read_table()
-# Call as.data.frame to turn that Table into an R data.frame
-df <- as.data.frame(tab)
-# This should be the same data we sent
-all.equal(df, chickwts, check.attributes = FALSE)
-# Unlike the Writers, we don't have to close RecordBatchReaders,
-# but we do still need to close the file connection
-read_file_obj$close()
-\dontshow{\}) # examplesIf}
-}
 \seealso{
 \code{\link[=read_ipc_stream]{read_ipc_stream()}} and \code{\link[=read_feather]{read_feather()}} provide a much simpler interface
 for reading data from these formats and are sufficient for many use cases.
diff --git a/r/man/RecordBatchWriter.Rd b/r/man/RecordBatchWriter.Rd
index 219c150e6a4..cc6d2feb3ac 100644
--- a/r/man/RecordBatchWriter.Rd
+++ b/r/man/RecordBatchWriter.Rd
@@ -45,43 +45,6 @@ to be closed separately.
 }
 }
 
-\examples{
-\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-tf <- tempfile()
-on.exit(unlink(tf))
-
-batch <- record_batch(chickwts)
-
-# This opens a connection to the file in Arrow
-file_obj <- FileOutputStream$create(tf)
-# Pass that to a RecordBatchWriter to write data conforming to a schema
-writer <- RecordBatchFileWriter$create(file_obj, batch$schema)
-writer$write(batch)
-# You may write additional batches to the stream, provided that they have
-# the same schema.
-# Call "close" on the writer to indicate end-of-file/stream
-writer$close()
-# Then, close the connection--closing the IPC message does not close the file
-file_obj$close()
-
-# Now, we have a file we can read from. Same pattern: open file connection,
-# then pass it to a RecordBatchReader
-read_file_obj <- ReadableFile$create(tf)
-reader <- RecordBatchFileReader$create(read_file_obj)
-# RecordBatchFileReader knows how many batches it has (StreamReader does not)
-reader$num_record_batches
-# We could consume the Reader by calling $read_next_batch() until all are,
-# consumed, or we can call $read_table() to pull them all into a Table
-tab <- reader$read_table()
-# Call as.data.frame to turn that Table into an R data.frame
-df <- as.data.frame(tab)
-# This should be the same data we sent
-all.equal(df, chickwts, check.attributes = FALSE)
-# Unlike the Writers, we don't have to close RecordBatchReaders,
-# but we do still need to close the file connection
-read_file_obj$close()
-\dontshow{\}) # examplesIf}
-}
 \seealso{
 \code{\link[=write_ipc_stream]{write_ipc_stream()}} and \code{\link[=write_feather]{write_feather()}} provide a much simpler
 interface for writing data to these formats and are sufficient for many use
diff --git a/r/man/Scalar.Rd b/r/man/Scalar.Rd
index 21e04c12e08..9128988d11c 100644
--- a/r/man/Scalar.Rd
+++ b/r/man/Scalar.Rd
@@ -19,20 +19,3 @@ A \code{Scalar} holds a single value of an Arrow type.
 \verb{$type}: Scalar type
 }
 
-\examples{
-\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-Scalar$create(pi)
-Scalar$create(404)
-# If you pass a vector into Scalar$create, you get a list containing your items
-Scalar$create(c(1, 2, 3))
-
-# Comparisons
-my_scalar <- Scalar$create(99)
-my_scalar$ApproxEquals(Scalar$create(99.00001)) # FALSE
-my_scalar$ApproxEquals(Scalar$create(99.000009)) # TRUE
-my_scalar$Equals(Scalar$create(99.000009)) # FALSE
-my_scalar$Equals(Scalar$create(99L)) # FALSE (types don't match)
-
-my_scalar$ToString()
-\dontshow{\}) # examplesIf}
-}
diff --git a/r/man/Schema.Rd b/r/man/Schema.Rd
index 6e385bb804e..0c66e5c2a42 100644
--- a/r/man/Schema.Rd
+++ b/r/man/Schema.Rd
@@ -74,12 +74,3 @@ Files with compressed metadata are readable by older versions of arrow, but
 the metadata is dropped.
 }
 
-\examples{
-\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-df <- data.frame(col1 = 2:4, col2 = c(0.1, 0.3, 0.5))
-tab1 <- Table$create(df)
-tab1$schema
-tab2 <- Table$create(df, schema = schema(col1 = int8(), col2 = float32()))
-tab2$schema
-\dontshow{\}) # examplesIf}
-}
diff --git a/r/man/Table.Rd b/r/man/Table.Rd
index 2675943e572..d955b0f5a29 100644
--- a/r/man/Table.Rd
+++ b/r/man/Table.Rd
@@ -79,14 +79,3 @@ All list elements are coerced to string. See \code{schema()} for more informatio
 }
 }
 
-\examples{
-\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-tab <- Table$create(name = rownames(mtcars), mtcars)
-dim(tab)
-dim(head(tab))
-names(tab)
-tab$mpg
-tab[["cyl"]]
-as.data.frame(tab[4:8, c("gear", "hp", "wt")])
-\dontshow{\}) # examplesIf}
-}
diff --git a/r/man/buffer.Rd b/r/man/buffer.Rd
index a3ca1fc2fcb..99b636da3c7 100644
--- a/r/man/buffer.Rd
+++ b/r/man/buffer.Rd
@@ -33,12 +33,3 @@ contiguous memory with a particular size.
 }
 }
 
-\examples{
-\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-my_buffer <- buffer(c(1, 2, 3, 4))
-my_buffer$is_mutable
-my_buffer$ZeroPadding()
-my_buffer$size
-my_buffer$capacity
-\dontshow{\}) # examplesIf}
-}
diff --git a/r/man/call_function.Rd b/r/man/call_function.Rd
index f63038442dc..790c4237518 100644
--- a/r/man/call_function.Rd
+++ b/r/man/call_function.Rd
@@ -35,16 +35,6 @@ are callable with an \code{arrow_} prefix.
 When passing indices in \code{...}, \code{args}, or \code{options}, express them as
 0-based integers (consistent with C++).
 }
-\examples{
-\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-a <- Array$create(c(1L, 2L, 3L, NA, 5L))
-s <- Scalar$create(4L)
-call_function("fill_null", a, s)
-
-a <- Array$create(rnorm(10000))
-call_function("quantile", a, options = list(q = seq(0, 1, 0.25)))
-\dontshow{\}) # examplesIf}
-}
 \seealso{
 \href{https://arrow.apache.org/docs/cpp/compute.html}{Arrow C++ documentation} for the functions and their respective options.
 }
diff --git a/r/man/codec_is_available.Rd b/r/man/codec_is_available.Rd
index b3238ff1dca..1b5e8278fa9 100644
--- a/r/man/codec_is_available.Rd
+++ b/r/man/codec_is_available.Rd
@@ -18,8 +18,3 @@ Support for compression libraries depends on the build-time settings of
 the Arrow C++ library. This function lets you know which are available for
 use.
 }
-\examples{
-\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-codec_is_available("gzip")
-\dontshow{\}) # examplesIf}
-}
diff --git a/r/man/copy_files.Rd b/r/man/copy_files.Rd
index 1b83703f19f..75cc4405d8a 100644
--- a/r/man/copy_files.Rd
+++ b/r/man/copy_files.Rd
@@ -23,13 +23,3 @@ Nothing: called for side effects in the file system
 \description{
 Copy files between FileSystems
 }
-\examples{
-\dontshow{if (FALSE) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-# Copy an S3 bucket's files to a local directory:
-copy_files("s3://your-bucket-name", "local-directory")
-# Using a FileSystem object
-copy_files(s3_bucket("your-bucket-name"), "local-directory")
-# Or go the other way, from local to S3
-copy_files("local-directory", s3_bucket("your-bucket-name"))
-\dontshow{\}) # examplesIf}
-}
diff --git a/r/man/data-type.Rd b/r/man/data-type.Rd
index a0631897573..101702a2fb2 100644
--- a/r/man/data-type.Rd
+++ b/r/man/data-type.Rd
@@ -150,14 +150,6 @@ are translated to R objects, \code{uint32} and \code{uint64} are converted to \c
 types, this conversion can be disabled (so that \code{int64} always yields a
 \code{bit64::integer64} object) by setting \code{options(arrow.int64_downcast = FALSE)}.
 }
-\examples{
-\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-bool()
-struct(a = int32(), b = double())
-timestamp("ms", timezone = "CEST")
-time64("ns")
-\dontshow{\}) # examplesIf}
-}
 \seealso{
 \code{\link[=dictionary]{dictionary()}} for creating a dictionary (factor-like) type.
 }
diff --git a/r/man/hive_partition.Rd b/r/man/hive_partition.Rd
index eef9f9157ea..39d5d8d0ae2 100644
--- a/r/man/hive_partition.Rd
+++ b/r/man/hive_partition.Rd
@@ -28,8 +28,3 @@ Hive partitioning embeds field names and values in path segments, such as
 Because fields are named in the path segments, order of fields passed to
 \code{hive_partition()} does not matter.
 }
-\examples{
-\dontshow{if (arrow_with_dataset()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-hive_partition(year = int16(), month = int8())
-\dontshow{\}) # examplesIf}
-}
diff --git a/r/man/list_compute_functions.Rd b/r/man/list_compute_functions.Rd
index 668e090c0ca..ba17688d833 100644
--- a/r/man/list_compute_functions.Rd
+++ b/r/man/list_compute_functions.Rd
@@ -37,10 +37,3 @@ The package includes Arrow methods for many base R functions that can
 be called directly on Arrow objects, as well as some tidyverse-flavored versions
 available inside \code{dplyr} verbs.
 }
-\examples{
-\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-list_compute_functions() 
-list_compute_functions(pattern = "^UTF8", ignore.case = TRUE)
-list_compute_functions(pattern = "^is", invert = TRUE)
-\dontshow{\}) # examplesIf}
-}
diff --git a/r/man/load_flight_server.Rd b/r/man/load_flight_server.Rd
index 66d30f39147..7e2000a9ca2 100644
--- a/r/man/load_flight_server.Rd
+++ b/r/man/load_flight_server.Rd
@@ -15,8 +15,3 @@ to look in the \verb{inst/} directory for included modules.}
 \description{
 Load a Python Flight server
 }
-\examples{
-\dontshow{if (FALSE) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-load_flight_server("demo_flight_server")
-\dontshow{\}) # examplesIf}
-}
diff --git a/r/man/match_arrow.Rd b/r/man/match_arrow.Rd
index d63ef3eed87..21481af4c6b 100644
--- a/r/man/match_arrow.Rd
+++ b/r/man/match_arrow.Rd
@@ -26,28 +26,3 @@ per element of \code{x} it it is present in \code{table}.
 \code{base::match()} is not a generic, so we can't just define Arrow methods for
 it. This function exposes the analogous functions in the Arrow C++ library.
 }
-\examples{
-\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-# note that the returned value is 0-indexed
-cars_tbl <- Table$create(name = rownames(mtcars), mtcars)
-match_arrow(Scalar$create("Mazda RX4 Wag"), cars_tbl$name)
-
-is_in(Array$create("Mazda RX4 Wag"), cars_tbl$name)
-
-# Although there are multiple matches, you are returned the index of the first 
-# match, as with the base R equivalent
-match(4, mtcars$cyl) # 1-indexed
-match_arrow(Scalar$create(4), cars_tbl$cyl) # 0-indexed
-
-# If `x` contains multiple values, you are returned the indices of the first 
-# match for each value.
-match(c(4, 6, 8), mtcars$cyl)
-match_arrow(Array$create(c(4, 6, 8)), cars_tbl$cyl)
-
-# Return type matches type of `x`
-is_in(c(4, 6, 8), mtcars$cyl) # returns vector
-is_in(Scalar$create(4), mtcars$cyl) # returns Scalar
-is_in(Array$create(c(4, 6, 8)), cars_tbl$cyl) # returns Array
-is_in(ChunkedArray$create(c(4, 6), 8), cars_tbl$cyl) # returns ChunkedArray
-\dontshow{\}) # examplesIf}
-}
diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd
index 71394e547c9..d9c80306931 100644
--- a/r/man/read_delim_arrow.Rd
+++ b/r/man/read_delim_arrow.Rd
@@ -205,14 +205,3 @@ Note that if you are specifying column names, whether by \code{schema} or
 to idenfity column names, you'll need to add \code{skip = 1} to skip that row.
 }
 
-\examples{
-\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-  tf <- tempfile()
-  on.exit(unlink(tf))
-  write.csv(mtcars, file = tf)
-  df <- read_csv_arrow(tf)
-  dim(df)
-  # Can select columns
-  df <- read_csv_arrow(tf, col_select = starts_with("d"))
-\dontshow{\}) # examplesIf}
-}
diff --git a/r/man/read_feather.Rd b/r/man/read_feather.Rd
index 95f4d1d12c6..fa18e3f7844 100644
--- a/r/man/read_feather.Rd
+++ b/r/man/read_feather.Rd
@@ -34,17 +34,6 @@ and to make sharing data across data analysis languages easy.
 This function reads both the original, limited specification of the format
 and the version 2 specification, which is the Apache Arrow IPC file format.
 }
-\examples{
-\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-tf <- tempfile()
-on.exit(unlink(tf))
-write_feather(mtcars, tf)
-df <- read_feather(tf)
-dim(df)
-# Can select columns
-df <- read_feather(tf, col_select = starts_with("d"))
-\dontshow{\}) # examplesIf}
-}
 \seealso{
 \link{FeatherReader} and \link{RecordBatchReader} for lower-level access to reading Arrow IPC data.
 }
diff --git a/r/man/read_json_arrow.Rd b/r/man/read_json_arrow.Rd
index 4806b4ad1f0..476c99fe4de 100644
--- a/r/man/read_json_arrow.Rd
+++ b/r/man/read_json_arrow.Rd
@@ -38,15 +38,3 @@ A \code{data.frame}, or a Table if \code{as_data_frame = FALSE}.
 \description{
 Using \link{JsonTableReader}
 }
-\examples{
-\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-  tf <- tempfile()
-  on.exit(unlink(tf))
-  writeLines('
-    { "hello": 3.5, "world": false, "yo": "thing" }
-    { "hello": 3.25, "world": null }
-    { "hello": 0.0, "world": true, "yo": null }
-  ', tf, useBytes=TRUE)
-  df <- read_json_arrow(tf)
-\dontshow{\}) # examplesIf}
-}
diff --git a/r/man/read_parquet.Rd b/r/man/read_parquet.Rd
index 056e8644747..ffb2cf7109f 100644
--- a/r/man/read_parquet.Rd
+++ b/r/man/read_parquet.Rd
@@ -39,12 +39,3 @@ A \link[=Table]{arrow::Table}, or a \code{data.frame} if \code{as_data_frame} is
 '\href{https://parquet.apache.org/}{Parquet}' is a columnar storage file format.
 This function enables you to read Parquet files into R.
 }
-\examples{
-\dontshow{if (arrow_with_parquet()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-tf <- tempfile()
-on.exit(unlink(tf))
-write_parquet(mtcars, tf)
-df <- read_parquet(tf, col_select = starts_with("d"))
-head(df)
-\dontshow{\}) # examplesIf}
-}
diff --git a/r/man/s3_bucket.Rd b/r/man/s3_bucket.Rd
index 95a086deae5..78d527a56c4 100644
--- a/r/man/s3_bucket.Rd
+++ b/r/man/s3_bucket.Rd
@@ -21,8 +21,3 @@ are authorized to access the bucket's contents.
 that automatically detects the bucket's AWS region and holding onto the its
 relative path.
 }
-\examples{
-\dontshow{if (arrow_with_s3()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-bucket <- s3_bucket("ursa-labs-taxi-data")
-\dontshow{\}) # examplesIf}
-}
diff --git a/r/man/type.Rd b/r/man/type.Rd
index d55bbe24bd5..2f85e4a6ac6 100644
--- a/r/man/type.Rd
+++ b/r/man/type.Rd
@@ -15,13 +15,3 @@ an arrow logical type
 \description{
 infer the arrow Array type from an R vector
 }
-\examples{
-\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-type(1:10)
-type(1L:10L)
-type(c(1, 1.5, 2))
-type(c("A", "B", "C"))
-type(mtcars)
-type(Sys.Date())
-\dontshow{\}) # examplesIf}
-}
diff --git a/r/man/unify_schemas.Rd b/r/man/unify_schemas.Rd
index 50c80c2dda9..709e33a5e74 100644
--- a/r/man/unify_schemas.Rd
+++ b/r/man/unify_schemas.Rd
@@ -18,10 +18,3 @@ A \code{Schema} with the union of fields contained in the inputs, or
 \description{
 Combine and harmonize schemas
 }
-\examples{
-\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-a <- schema(b = double(), c = bool())
-z <- schema(b = double(), k = utf8())
-unify_schemas(a, z)
-\dontshow{\}) # examplesIf}
-}
diff --git a/r/man/value_counts.Rd b/r/man/value_counts.Rd
index 6ef77cd4727..139af8edc63 100644
--- a/r/man/value_counts.Rd
+++ b/r/man/value_counts.Rd
@@ -16,9 +16,3 @@ A \code{StructArray} containing "values" (same type as \code{x}) and "counts"
 \description{
 This function tabulates the values in the array and returns a table of counts.
 }
-\examples{
-\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-cyl_vals <- Array$create(mtcars$cyl)
-value_counts(cyl_vals)
-\dontshow{\}) # examplesIf}
-}
diff --git a/r/man/write_csv_arrow.Rd b/r/man/write_csv_arrow.Rd
index 55a239ca998..d6df2bcd08e 100644
--- a/r/man/write_csv_arrow.Rd
+++ b/r/man/write_csv_arrow.Rd
@@ -23,10 +23,3 @@ the stream will be left open.
 \description{
 Write CSV file to disk
 }
-\examples{
-\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-tf <- tempfile()
-on.exit(unlink(tf))
-write_csv_arrow(mtcars, tf)
-\dontshow{\}) # examplesIf}
-}
diff --git a/r/man/write_feather.Rd b/r/man/write_feather.Rd
index c6273b61be8..0cc8c591369 100644
--- a/r/man/write_feather.Rd
+++ b/r/man/write_feather.Rd
@@ -47,13 +47,6 @@ and to make sharing data across data analysis languages easy.
 This function writes both the original, limited specification of the format
 and the version 2 specification, which is the Apache Arrow IPC file format.
 }
-\examples{
-\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-tf <- tempfile()
-on.exit(unlink(tf))
-write_feather(mtcars, tf)
-\dontshow{\}) # examplesIf}
-}
 \seealso{
 \link{RecordBatchWriter} for lower-level access to writing Arrow IPC data.
 
diff --git a/r/man/write_ipc_stream.Rd b/r/man/write_ipc_stream.Rd
index 888d947eb99..4f742ce9178 100644
--- a/r/man/write_ipc_stream.Rd
+++ b/r/man/write_ipc_stream.Rd
@@ -31,13 +31,6 @@ with some nonstandard behavior, is deprecated. You should explicitly choose
 the function that will write the desired IPC format (stream or file) since
 either can be written to a file or \code{OutputStream}.
 }
-\examples{
-\dontshow{if (arrow_available() ) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-tf <- tempfile()
-on.exit(unlink(tf))
-write_ipc_stream(mtcars, tf)
-\dontshow{\}) # examplesIf}
-}
 \seealso{
 \code{\link[=write_feather]{write_feather()}} for writing IPC files. \code{\link[=write_to_raw]{write_to_raw()}} to
 serialize data to a buffer.
diff --git a/r/man/write_parquet.Rd b/r/man/write_parquet.Rd
index d7147f7e8e6..823a6038e84 100644
--- a/r/man/write_parquet.Rd
+++ b/r/man/write_parquet.Rd
@@ -94,15 +94,3 @@ The default "snappy" is used if available, otherwise "uncompressed". To
 disable compression, set \code{compression = "uncompressed"}.
 Note that "uncompressed" columns may still have dictionary encoding.
 }
-\examples{
-\dontshow{if (arrow_with_parquet()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-tf1 <- tempfile(fileext = ".parquet")
-write_parquet(data.frame(x = 1:5), tf1)
-
-# using compression
-if (codec_is_available("gzip")) {
-  tf2 <- tempfile(fileext = ".gz.parquet")
-  write_parquet(data.frame(x = 1:5), tf2, compression = "gzip", compression_level = 5)
-}
-\dontshow{\}) # examplesIf}
-}
diff --git a/r/man/write_to_raw.Rd b/r/man/write_to_raw.Rd
index 1f507e384c3..46af09a96e8 100644
--- a/r/man/write_to_raw.Rd
+++ b/r/man/write_to_raw.Rd
@@ -20,10 +20,3 @@ the data (\code{data.frame}, \code{RecordBatch}, or \code{Table}) they were give
 This function wraps those so that you can serialize data to a buffer and
 access that buffer as a \code{raw} vector in R.
 }
-\examples{
-\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-# The default format is "stream"
-write_to_raw(mtcars)
-write_to_raw(mtcars, format = "file")
-\dontshow{\}) # examplesIf}
-}

From 15bdd4b95c299c6f8639f3df6d292bc6aaada1be Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Thu, 10 Jun 2021 22:17:19 -0400
Subject: [PATCH 08/46] Revert "Autoformat/render all the things [automated
 commit]"

This reverts commit 1c13c4ca
---
 r/man/ChunkedArray.Rd           | 22 ++++++++++++++++++++
 r/man/Field.Rd                  |  5 +++++
 r/man/FileFormat.Rd             | 15 +++++++++++++
 r/man/ParquetFileReader.Rd      | 12 +++++++++++
 r/man/RecordBatch.Rd            | 11 ++++++++++
 r/man/RecordBatchReader.Rd      | 37 +++++++++++++++++++++++++++++++++
 r/man/RecordBatchWriter.Rd      | 37 +++++++++++++++++++++++++++++++++
 r/man/Scalar.Rd                 | 17 +++++++++++++++
 r/man/Schema.Rd                 |  9 ++++++++
 r/man/Table.Rd                  | 11 ++++++++++
 r/man/buffer.Rd                 |  9 ++++++++
 r/man/call_function.Rd          | 10 +++++++++
 r/man/codec_is_available.Rd     |  5 +++++
 r/man/copy_files.Rd             | 10 +++++++++
 r/man/data-type.Rd              |  8 +++++++
 r/man/hive_partition.Rd         |  5 +++++
 r/man/list_compute_functions.Rd |  7 +++++++
 r/man/load_flight_server.Rd     |  5 +++++
 r/man/match_arrow.Rd            | 25 ++++++++++++++++++++++
 r/man/read_delim_arrow.Rd       | 11 ++++++++++
 r/man/read_feather.Rd           | 11 ++++++++++
 r/man/read_json_arrow.Rd        | 12 +++++++++++
 r/man/read_parquet.Rd           |  9 ++++++++
 r/man/s3_bucket.Rd              |  5 +++++
 r/man/type.Rd                   | 10 +++++++++
 r/man/unify_schemas.Rd          |  7 +++++++
 r/man/value_counts.Rd           |  6 ++++++
 r/man/write_csv_arrow.Rd        |  7 +++++++
 r/man/write_feather.Rd          |  7 +++++++
 r/man/write_ipc_stream.Rd       |  7 +++++++
 r/man/write_parquet.Rd          | 12 +++++++++++
 r/man/write_to_raw.Rd           |  7 +++++++
 32 files changed, 371 insertions(+)

diff --git a/r/man/ChunkedArray.Rd b/r/man/ChunkedArray.Rd
index 486b6222af7..eaae0b3d4b8 100644
--- a/r/man/ChunkedArray.Rd
+++ b/r/man/ChunkedArray.Rd
@@ -53,6 +53,28 @@ within the array's internal data. This can be an expensive check, potentially \c
 }
 }
 
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+# Pass items into chunked_array as separate objects to create chunks
+class_scores <- chunked_array(c(87, 88, 89), c(94, 93, 92), c(71, 72, 73))
+class_scores$num_chunks
+
+# When taking a Slice from a chunked_array, chunks are preserved
+class_scores$Slice(2, length = 5)
+
+# You can combine Take and SortIndices to return a ChunkedArray with 1 chunk 
+# containing all values, ordered.
+class_scores$Take(class_scores$SortIndices(descending = TRUE))
+
+# If you pass a list into chunked_array, you get a list of length 1
+list_scores <- chunked_array(list(c(9.9, 9.6, 9.5), c(8.2, 8.3, 8.4), c(10.0, 9.9, 9.8)))
+list_scores$num_chunks
+
+# When constructing a ChunkedArray, the first chunk is used to infer type.
+doubles <- chunked_array(c(1, 2, 3), c(5L, 6L, 7L))
+doubles$type
+\dontshow{\}) # examplesIf}
+}
 \seealso{
 \link{Array}
 }
diff --git a/r/man/Field.Rd b/r/man/Field.Rd
index 03dffd11ca9..77d31fa637a 100644
--- a/r/man/Field.Rd
+++ b/r/man/Field.Rd
@@ -28,3 +28,8 @@ field(name, type, metadata)
 }
 }
 
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+field("x", int32())
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/man/FileFormat.Rd b/r/man/FileFormat.Rd
index b8d4dc01bad..5bc9475b408 100644
--- a/r/man/FileFormat.Rd
+++ b/r/man/FileFormat.Rd
@@ -51,3 +51,18 @@ From \link{CsvFragmentScanOptions} (these values can be overridden at scan time)
 It returns the appropriate subclass of \code{FileFormat} (e.g. \code{ParquetFileFormat})
 }
 
+\examples{
+\dontshow{if (arrow_with_dataset()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+## Semi-colon delimited files
+# Set up directory for examples
+tf <- tempfile()
+dir.create(tf)
+on.exit(unlink(tf))
+write.table(mtcars, file.path(tf, "file1.txt"), sep = ";", row.names = FALSE)
+
+# Create FileFormat object
+format <- FileFormat$create(format = "text", delimiter = ";")
+
+open_dataset(tf, format = format)
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/man/ParquetFileReader.Rd b/r/man/ParquetFileReader.Rd
index e97cf80ee7a..39146919768 100644
--- a/r/man/ParquetFileReader.Rd
+++ b/r/man/ParquetFileReader.Rd
@@ -44,3 +44,15 @@ The optional \verb{column_indices=} argument is a 0-based integer vector indicat
 }
 }
 
+\examples{
+\dontshow{if (arrow_with_parquet()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+f <- system.file("v0.7.1.parquet", package="arrow")
+pq <- ParquetFileReader$create(f)
+pq$GetSchema()
+if (codec_is_available("snappy")) {
+  # This file has compressed data columns
+  tab <- pq$ReadTable()
+  tab$schema
+}
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/man/RecordBatch.Rd b/r/man/RecordBatch.Rd
index e3024b91b7a..ff08c215853 100644
--- a/r/man/RecordBatch.Rd
+++ b/r/man/RecordBatch.Rd
@@ -79,3 +79,14 @@ All list elements are coerced to string. See \code{schema()} for more informatio
 }
 }
 
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+batch <- record_batch(name = rownames(mtcars), mtcars)
+dim(batch)
+dim(head(batch))
+names(batch)
+batch$mpg
+batch[["cyl"]]
+as.data.frame(batch[4:8, c("gear", "hp", "wt")])
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/man/RecordBatchReader.Rd b/r/man/RecordBatchReader.Rd
index a206c30c8fb..90c796a6693 100644
--- a/r/man/RecordBatchReader.Rd
+++ b/r/man/RecordBatchReader.Rd
@@ -43,6 +43,43 @@ are in the file.
 }
 }
 
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+tf <- tempfile()
+on.exit(unlink(tf))
+
+batch <- record_batch(chickwts)
+
+# This opens a connection to the file in Arrow
+file_obj <- FileOutputStream$create(tf)
+# Pass that to a RecordBatchWriter to write data conforming to a schema
+writer <- RecordBatchFileWriter$create(file_obj, batch$schema)
+writer$write(batch)
+# You may write additional batches to the stream, provided that they have
+# the same schema.
+# Call "close" on the writer to indicate end-of-file/stream
+writer$close()
+# Then, close the connection--closing the IPC message does not close the file
+file_obj$close()
+
+# Now, we have a file we can read from. Same pattern: open file connection,
+# then pass it to a RecordBatchReader
+read_file_obj <- ReadableFile$create(tf)
+reader <- RecordBatchFileReader$create(read_file_obj)
+# RecordBatchFileReader knows how many batches it has (StreamReader does not)
+reader$num_record_batches
+# We could consume the Reader by calling $read_next_batch() until all are
+# consumed, or we can call $read_table() to pull them all into a Table
+tab <- reader$read_table()
+# Call as.data.frame to turn that Table into an R data.frame
+df <- as.data.frame(tab)
+# This should be the same data we sent
+all.equal(df, chickwts, check.attributes = FALSE)
+# Unlike the Writers, we don't have to close RecordBatchReaders,
+# but we do still need to close the file connection
+read_file_obj$close()
+\dontshow{\}) # examplesIf}
+}
 \seealso{
 \code{\link[=read_ipc_stream]{read_ipc_stream()}} and \code{\link[=read_feather]{read_feather()}} provide a much simpler interface
 for reading data from these formats and are sufficient for many use cases.
diff --git a/r/man/RecordBatchWriter.Rd b/r/man/RecordBatchWriter.Rd
index cc6d2feb3ac..219c150e6a4 100644
--- a/r/man/RecordBatchWriter.Rd
+++ b/r/man/RecordBatchWriter.Rd
@@ -45,6 +45,43 @@ to be closed separately.
 }
 }
 
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+tf <- tempfile()
+on.exit(unlink(tf))
+
+batch <- record_batch(chickwts)
+
+# This opens a connection to the file in Arrow
+file_obj <- FileOutputStream$create(tf)
+# Pass that to a RecordBatchWriter to write data conforming to a schema
+writer <- RecordBatchFileWriter$create(file_obj, batch$schema)
+writer$write(batch)
+# You may write additional batches to the stream, provided that they have
+# the same schema.
+# Call "close" on the writer to indicate end-of-file/stream
+writer$close()
+# Then, close the connection--closing the IPC message does not close the file
+file_obj$close()
+
+# Now, we have a file we can read from. Same pattern: open file connection,
+# then pass it to a RecordBatchReader
+read_file_obj <- ReadableFile$create(tf)
+reader <- RecordBatchFileReader$create(read_file_obj)
+# RecordBatchFileReader knows how many batches it has (StreamReader does not)
+reader$num_record_batches
+# We could consume the Reader by calling $read_next_batch() until all are
+# consumed, or we can call $read_table() to pull them all into a Table
+tab <- reader$read_table()
+# Call as.data.frame to turn that Table into an R data.frame
+df <- as.data.frame(tab)
+# This should be the same data we sent
+all.equal(df, chickwts, check.attributes = FALSE)
+# Unlike the Writers, we don't have to close RecordBatchReaders,
+# but we do still need to close the file connection
+read_file_obj$close()
+\dontshow{\}) # examplesIf}
+}
 \seealso{
 \code{\link[=write_ipc_stream]{write_ipc_stream()}} and \code{\link[=write_feather]{write_feather()}} provide a much simpler
 interface for writing data to these formats and are sufficient for many use
diff --git a/r/man/Scalar.Rd b/r/man/Scalar.Rd
index 9128988d11c..21e04c12e08 100644
--- a/r/man/Scalar.Rd
+++ b/r/man/Scalar.Rd
@@ -19,3 +19,20 @@ A \code{Scalar} holds a single value of an Arrow type.
 \verb{$type}: Scalar type
 }
 
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+Scalar$create(pi)
+Scalar$create(404)
+# If you pass a vector into Scalar$create, you get a list containing your items
+Scalar$create(c(1, 2, 3))
+
+# Comparisons
+my_scalar <- Scalar$create(99)
+my_scalar$ApproxEquals(Scalar$create(99.00001)) # FALSE
+my_scalar$ApproxEquals(Scalar$create(99.000009)) # TRUE
+my_scalar$Equals(Scalar$create(99.000009)) # FALSE
+my_scalar$Equals(Scalar$create(99L)) # FALSE (types don't match)
+
+my_scalar$ToString()
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/man/Schema.Rd b/r/man/Schema.Rd
index 0c66e5c2a42..6e385bb804e 100644
--- a/r/man/Schema.Rd
+++ b/r/man/Schema.Rd
@@ -74,3 +74,12 @@ Files with compressed metadata are readable by older versions of arrow, but
 the metadata is dropped.
 }
 
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+df <- data.frame(col1 = 2:4, col2 = c(0.1, 0.3, 0.5))
+tab1 <- Table$create(df)
+tab1$schema
+tab2 <- Table$create(df, schema = schema(col1 = int8(), col2 = float32()))
+tab2$schema
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/man/Table.Rd b/r/man/Table.Rd
index d955b0f5a29..2675943e572 100644
--- a/r/man/Table.Rd
+++ b/r/man/Table.Rd
@@ -79,3 +79,14 @@ All list elements are coerced to string. See \code{schema()} for more informatio
 }
 }
 
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+tab <- Table$create(name = rownames(mtcars), mtcars)
+dim(tab)
+dim(head(tab))
+names(tab)
+tab$mpg
+tab[["cyl"]]
+as.data.frame(tab[4:8, c("gear", "hp", "wt")])
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/man/buffer.Rd b/r/man/buffer.Rd
index 99b636da3c7..a3ca1fc2fcb 100644
--- a/r/man/buffer.Rd
+++ b/r/man/buffer.Rd
@@ -33,3 +33,12 @@ contiguous memory with a particular size.
 }
 }
 
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+my_buffer <- buffer(c(1, 2, 3, 4))
+my_buffer$is_mutable
+my_buffer$ZeroPadding()
+my_buffer$size
+my_buffer$capacity
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/man/call_function.Rd b/r/man/call_function.Rd
index 790c4237518..f63038442dc 100644
--- a/r/man/call_function.Rd
+++ b/r/man/call_function.Rd
@@ -35,6 +35,16 @@ are callable with an \code{arrow_} prefix.
 When passing indices in \code{...}, \code{args}, or \code{options}, express them as
 0-based integers (consistent with C++).
 }
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+a <- Array$create(c(1L, 2L, 3L, NA, 5L))
+s <- Scalar$create(4L)
+call_function("fill_null", a, s)
+
+a <- Array$create(rnorm(10000))
+call_function("quantile", a, options = list(q = seq(0, 1, 0.25)))
+\dontshow{\}) # examplesIf}
+}
 \seealso{
 \href{https://arrow.apache.org/docs/cpp/compute.html}{Arrow C++ documentation} for the functions and their respective options.
 }
diff --git a/r/man/codec_is_available.Rd b/r/man/codec_is_available.Rd
index 1b5e8278fa9..b3238ff1dca 100644
--- a/r/man/codec_is_available.Rd
+++ b/r/man/codec_is_available.Rd
@@ -18,3 +18,8 @@ Support for compression libraries depends on the build-time settings of
 the Arrow C++ library. This function lets you know which are available for
 use.
 }
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+codec_is_available("gzip")
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/man/copy_files.Rd b/r/man/copy_files.Rd
index 75cc4405d8a..1b83703f19f 100644
--- a/r/man/copy_files.Rd
+++ b/r/man/copy_files.Rd
@@ -23,3 +23,13 @@ Nothing: called for side effects in the file system
 \description{
 Copy files between FileSystems
 }
+\examples{
+\dontshow{if (FALSE) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+# Copy an S3 bucket's files to a local directory:
+copy_files("s3://your-bucket-name", "local-directory")
+# Using a FileSystem object
+copy_files(s3_bucket("your-bucket-name"), "local-directory")
+# Or go the other way, from local to S3
+copy_files("local-directory", s3_bucket("your-bucket-name"))
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/man/data-type.Rd b/r/man/data-type.Rd
index 101702a2fb2..a0631897573 100644
--- a/r/man/data-type.Rd
+++ b/r/man/data-type.Rd
@@ -150,6 +150,14 @@ are translated to R objects, \code{uint32} and \code{uint64} are converted to \c
 types, this conversion can be disabled (so that \code{int64} always yields a
 \code{bit64::integer64} object) by setting \code{options(arrow.int64_downcast = FALSE)}.
 }
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+bool()
+struct(a = int32(), b = double())
+timestamp("ms", timezone = "CEST")
+time64("ns")
+\dontshow{\}) # examplesIf}
+}
 \seealso{
 \code{\link[=dictionary]{dictionary()}} for creating a dictionary (factor-like) type.
 }
diff --git a/r/man/hive_partition.Rd b/r/man/hive_partition.Rd
index 39d5d8d0ae2..eef9f9157ea 100644
--- a/r/man/hive_partition.Rd
+++ b/r/man/hive_partition.Rd
@@ -28,3 +28,8 @@ Hive partitioning embeds field names and values in path segments, such as
 Because fields are named in the path segments, order of fields passed to
 \code{hive_partition()} does not matter.
 }
+\examples{
+\dontshow{if (arrow_with_dataset()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+hive_partition(year = int16(), month = int8())
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/man/list_compute_functions.Rd b/r/man/list_compute_functions.Rd
index ba17688d833..668e090c0ca 100644
--- a/r/man/list_compute_functions.Rd
+++ b/r/man/list_compute_functions.Rd
@@ -37,3 +37,10 @@ The package includes Arrow methods for many base R functions that can
 be called directly on Arrow objects, as well as some tidyverse-flavored versions
 available inside \code{dplyr} verbs.
 }
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+list_compute_functions() 
+list_compute_functions(pattern = "^UTF8", ignore.case = TRUE)
+list_compute_functions(pattern = "^is", invert = TRUE)
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/man/load_flight_server.Rd b/r/man/load_flight_server.Rd
index 7e2000a9ca2..66d30f39147 100644
--- a/r/man/load_flight_server.Rd
+++ b/r/man/load_flight_server.Rd
@@ -15,3 +15,8 @@ to look in the \verb{inst/} directory for included modules.}
 \description{
 Load a Python Flight server
 }
+\examples{
+\dontshow{if (FALSE) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+load_flight_server("demo_flight_server")
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/man/match_arrow.Rd b/r/man/match_arrow.Rd
index 21481af4c6b..d63ef3eed87 100644
--- a/r/man/match_arrow.Rd
+++ b/r/man/match_arrow.Rd
@@ -26,3 +26,28 @@ per element of \code{x} it it is present in \code{table}.
 \code{base::match()} is not a generic, so we can't just define Arrow methods for
 it. This function exposes the analogous functions in the Arrow C++ library.
 }
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+# note that the returned value is 0-indexed
+cars_tbl <- Table$create(name = rownames(mtcars), mtcars)
+match_arrow(Scalar$create("Mazda RX4 Wag"), cars_tbl$name)
+
+is_in(Array$create("Mazda RX4 Wag"), cars_tbl$name)
+
+# Although there are multiple matches, you are returned the index of the first 
+# match, as with the base R equivalent
+match(4, mtcars$cyl) # 1-indexed
+match_arrow(Scalar$create(4), cars_tbl$cyl) # 0-indexed
+
+# If `x` contains multiple values, you are returned the indices of the first 
+# match for each value.
+match(c(4, 6, 8), mtcars$cyl)
+match_arrow(Array$create(c(4, 6, 8)), cars_tbl$cyl)
+
+# Return type matches type of `x`
+is_in(c(4, 6, 8), mtcars$cyl) # returns vector
+is_in(Scalar$create(4), mtcars$cyl) # returns Scalar
+is_in(Array$create(c(4, 6, 8)), cars_tbl$cyl) # returns Array
+is_in(ChunkedArray$create(c(4, 6), 8), cars_tbl$cyl) # returns ChunkedArray
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd
index d9c80306931..71394e547c9 100644
--- a/r/man/read_delim_arrow.Rd
+++ b/r/man/read_delim_arrow.Rd
@@ -205,3 +205,14 @@ Note that if you are specifying column names, whether by \code{schema} or
 to idenfity column names, you'll need to add \code{skip = 1} to skip that row.
 }
 
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+  tf <- tempfile()
+  on.exit(unlink(tf))
+  write.csv(mtcars, file = tf)
+  df <- read_csv_arrow(tf)
+  dim(df)
+  # Can select columns
+  df <- read_csv_arrow(tf, col_select = starts_with("d"))
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/man/read_feather.Rd b/r/man/read_feather.Rd
index fa18e3f7844..95f4d1d12c6 100644
--- a/r/man/read_feather.Rd
+++ b/r/man/read_feather.Rd
@@ -34,6 +34,17 @@ and to make sharing data across data analysis languages easy.
 This function reads both the original, limited specification of the format
 and the version 2 specification, which is the Apache Arrow IPC file format.
 }
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+tf <- tempfile()
+on.exit(unlink(tf))
+write_feather(mtcars, tf)
+df <- read_feather(tf)
+dim(df)
+# Can select columns
+df <- read_feather(tf, col_select = starts_with("d"))
+\dontshow{\}) # examplesIf}
+}
 \seealso{
 \link{FeatherReader} and \link{RecordBatchReader} for lower-level access to reading Arrow IPC data.
 }
diff --git a/r/man/read_json_arrow.Rd b/r/man/read_json_arrow.Rd
index 476c99fe4de..4806b4ad1f0 100644
--- a/r/man/read_json_arrow.Rd
+++ b/r/man/read_json_arrow.Rd
@@ -38,3 +38,15 @@ A \code{data.frame}, or a Table if \code{as_data_frame = FALSE}.
 \description{
 Using \link{JsonTableReader}
 }
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+  tf <- tempfile()
+  on.exit(unlink(tf))
+  writeLines('
+    { "hello": 3.5, "world": false, "yo": "thing" }
+    { "hello": 3.25, "world": null }
+    { "hello": 0.0, "world": true, "yo": null }
+  ', tf, useBytes=TRUE)
+  df <- read_json_arrow(tf)
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/man/read_parquet.Rd b/r/man/read_parquet.Rd
index ffb2cf7109f..056e8644747 100644
--- a/r/man/read_parquet.Rd
+++ b/r/man/read_parquet.Rd
@@ -39,3 +39,12 @@ A \link[=Table]{arrow::Table}, or a \code{data.frame} if \code{as_data_frame} is
 '\href{https://parquet.apache.org/}{Parquet}' is a columnar storage file format.
 This function enables you to read Parquet files into R.
 }
+\examples{
+\dontshow{if (arrow_with_parquet()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+tf <- tempfile()
+on.exit(unlink(tf))
+write_parquet(mtcars, tf)
+df <- read_parquet(tf, col_select = starts_with("d"))
+head(df)
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/man/s3_bucket.Rd b/r/man/s3_bucket.Rd
index 78d527a56c4..95a086deae5 100644
--- a/r/man/s3_bucket.Rd
+++ b/r/man/s3_bucket.Rd
@@ -21,3 +21,8 @@ are authorized to access the bucket's contents.
 that automatically detects the bucket's AWS region and holding onto the its
 relative path.
 }
+\examples{
+\dontshow{if (arrow_with_s3()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+bucket <- s3_bucket("ursa-labs-taxi-data")
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/man/type.Rd b/r/man/type.Rd
index 2f85e4a6ac6..d55bbe24bd5 100644
--- a/r/man/type.Rd
+++ b/r/man/type.Rd
@@ -15,3 +15,13 @@ an arrow logical type
 \description{
 infer the arrow Array type from an R vector
 }
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+type(1:10)
+type(1L:10L)
+type(c(1, 1.5, 2))
+type(c("A", "B", "C"))
+type(mtcars)
+type(Sys.Date())
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/man/unify_schemas.Rd b/r/man/unify_schemas.Rd
index 709e33a5e74..50c80c2dda9 100644
--- a/r/man/unify_schemas.Rd
+++ b/r/man/unify_schemas.Rd
@@ -18,3 +18,10 @@ A \code{Schema} with the union of fields contained in the inputs, or
 \description{
 Combine and harmonize schemas
 }
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+a <- schema(b = double(), c = bool())
+z <- schema(b = double(), k = utf8())
+unify_schemas(a, z)
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/man/value_counts.Rd b/r/man/value_counts.Rd
index 139af8edc63..6ef77cd4727 100644
--- a/r/man/value_counts.Rd
+++ b/r/man/value_counts.Rd
@@ -16,3 +16,9 @@ A \code{StructArray} containing "values" (same type as \code{x}) and "counts"
 \description{
 This function tabulates the values in the array and returns a table of counts.
 }
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+cyl_vals <- Array$create(mtcars$cyl)
+value_counts(cyl_vals)
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/man/write_csv_arrow.Rd b/r/man/write_csv_arrow.Rd
index d6df2bcd08e..55a239ca998 100644
--- a/r/man/write_csv_arrow.Rd
+++ b/r/man/write_csv_arrow.Rd
@@ -23,3 +23,10 @@ the stream will be left open.
 \description{
 Write CSV file to disk
 }
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+tf <- tempfile()
+on.exit(unlink(tf))
+write_csv_arrow(mtcars, tf)
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/man/write_feather.Rd b/r/man/write_feather.Rd
index 0cc8c591369..c6273b61be8 100644
--- a/r/man/write_feather.Rd
+++ b/r/man/write_feather.Rd
@@ -47,6 +47,13 @@ and to make sharing data across data analysis languages easy.
 This function writes both the original, limited specification of the format
 and the version 2 specification, which is the Apache Arrow IPC file format.
 }
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+tf <- tempfile()
+on.exit(unlink(tf))
+write_feather(mtcars, tf)
+\dontshow{\}) # examplesIf}
+}
 \seealso{
 \link{RecordBatchWriter} for lower-level access to writing Arrow IPC data.
 
diff --git a/r/man/write_ipc_stream.Rd b/r/man/write_ipc_stream.Rd
index 4f742ce9178..888d947eb99 100644
--- a/r/man/write_ipc_stream.Rd
+++ b/r/man/write_ipc_stream.Rd
@@ -31,6 +31,13 @@ with some nonstandard behavior, is deprecated. You should explicitly choose
 the function that will write the desired IPC format (stream or file) since
 either can be written to a file or \code{OutputStream}.
 }
+\examples{
+\dontshow{if (arrow_available() ) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+tf <- tempfile()
+on.exit(unlink(tf))
+write_ipc_stream(mtcars, tf)
+\dontshow{\}) # examplesIf}
+}
 \seealso{
 \code{\link[=write_feather]{write_feather()}} for writing IPC files. \code{\link[=write_to_raw]{write_to_raw()}} to
 serialize data to a buffer.
diff --git a/r/man/write_parquet.Rd b/r/man/write_parquet.Rd
index 823a6038e84..d7147f7e8e6 100644
--- a/r/man/write_parquet.Rd
+++ b/r/man/write_parquet.Rd
@@ -94,3 +94,15 @@ The default "snappy" is used if available, otherwise "uncompressed". To
 disable compression, set \code{compression = "uncompressed"}.
 Note that "uncompressed" columns may still have dictionary encoding.
 }
+\examples{
+\dontshow{if (arrow_with_parquet()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+tf1 <- tempfile(fileext = ".parquet")
+write_parquet(data.frame(x = 1:5), tf1)
+
+# using compression
+if (codec_is_available("gzip")) {
+  tf2 <- tempfile(fileext = ".gz.parquet")
+  write_parquet(data.frame(x = 1:5), tf2, compression = "gzip", compression_level = 5)
+}
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/man/write_to_raw.Rd b/r/man/write_to_raw.Rd
index 46af09a96e8..1f507e384c3 100644
--- a/r/man/write_to_raw.Rd
+++ b/r/man/write_to_raw.Rd
@@ -20,3 +20,10 @@ the data (\code{data.frame}, \code{RecordBatch}, or \code{Table}) they were give
 This function wraps those so that you can serialize data to a buffer and
 access that buffer as a \code{raw} vector in R.
 }
+\examples{
+\dontshow{if (arrow_available()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+# The default format is "stream"
+write_to_raw(mtcars)
+write_to_raw(mtcars, format = "file")
+\dontshow{\}) # examplesIf}
+}

From b00a8a04cbcb99ba6db6e3b911e1be6b4872bec0 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Fri, 11 Jun 2021 07:44:31 -0400
Subject: [PATCH 09/46] removing std::function visitor and adding direct ref
 template

---
 cpp/src/arrow/util/bitmap.h       | 25 +++++++--------
 cpp/src/arrow/util/bitmap_test.cc | 52 +++++++++++++++----------------
 2 files changed, 37 insertions(+), 40 deletions(-)

diff --git a/cpp/src/arrow/util/bitmap.h b/cpp/src/arrow/util/bitmap.h
index 7174923fca4..ec88e7574f1 100644
--- a/cpp/src/arrow/util/bitmap.h
+++ b/cpp/src/arrow/util/bitmap.h
@@ -414,13 +414,15 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
     return min_word_offset;
   }
 
-  template <size_t N, size_t M, typename Word>
-  using MultiOutputVisitor = std::function<void(const std::array<Word, N>& in_words,
-                                                std::array<Word, M>& out_words)>;
+  //  template <size_t N, size_t M, typename Word>
+  //  using MultiOutputVisitor = std::function<void(const std::array<Word, N>& in_words,
+  //                                                std::array<Word, M>& out_words)>;
 
-  template <size_t N, size_t M, typename Word>
+  template <size_t N, size_t M, typename Visitor,
+            typename Word = typename std::decay<
+                internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
   static void VisitWordsAndWrite(const std::array<Bitmap, N>& bitmaps_arg,
-                                 MultiOutputVisitor<N, M, Word>&& visitor,
+                                 Visitor&& visitor,
                                  std::array<Bitmap, M>& out_bitmaps_arg) {
     constexpr int64_t kBitWidth = sizeof(Word) * 8;
 
@@ -510,16 +512,13 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
     }
   }
 
-  template <size_t N, typename Word>
-  using SingleOutputVisitor =
-      std::function<void(const std::array<Word, N>& in_words, Word& out_words)>;
-
-  template <size_t N, typename Word>
+  template <size_t N, typename Visitor,
+            typename Word = typename std::decay<
+                internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
   static void VisitWordsAndWrite(const std::array<Bitmap, N>& bitmaps_arg,
-                                 SingleOutputVisitor<N, Word>&& visitor,
-                                 Bitmap& out_bitmap_arg) {
+                                 Visitor&& visitor, Bitmap& out_bitmap_arg) {
     std::array<Bitmap, 1> out_bitmaps{out_bitmap_arg};
-    VisitWordsAndWrite<N, 1, Word>(
+    VisitWordsAndWrite(
         bitmaps_arg,
         [&](const std::array<Word, N>& in_words, std::array<Word, 1>& out_words) {
           visitor(in_words, out_words[0]);
diff --git a/cpp/src/arrow/util/bitmap_test.cc b/cpp/src/arrow/util/bitmap_test.cc
index 0db0fa5854c..d981cb7611d 100644
--- a/cpp/src/arrow/util/bitmap_test.cc
+++ b/cpp/src/arrow/util/bitmap_test.cc
@@ -80,13 +80,13 @@ TEST_F(TestBitmapVisit, SingleWriterOutputZeroOffset) {
   ASSERT_OK_AND_ASSIGN(auto out, AllocateBitmap(part));
   Bitmap out_bm(out, 0, part);
 
-  auto visitor = [](const std::array<uint64_t, 3>& in_words, uint64_t& out_words) {
-    out_words = (in_words[0] & in_words[1]) | in_words[2];
-  };
-
   // (bm0 & bm1) | bm2
+  std::array<Bitmap, 3> in_bms{bm0, bm1, bm2};
   Bitmap::VisitWordsAndWrite(
-      {bm0, bm1, bm2}, std::forward<Bitmap::SingleOutputVisitor<3, uint64_t>>(visitor),
+      in_bms,
+      [](const std::array<uint64_t, 3>& in_words, uint64_t& out_words) {
+        out_words = (in_words[0] & in_words[1]) | in_words[2];
+      },
       out_bm);
 
   std::vector<bool> v0(data.begin(), data.begin() + part);
@@ -122,13 +122,13 @@ TEST_F(TestBitmapVisit, SingleWriterOutputNonZeroOffset) {
   ASSERT_OK_AND_ASSIGN(auto out, AllocateBitmap(part * 2));
   Bitmap out_bm(out, part, part);
 
-  auto visitor = [](const std::array<uint64_t, 3>& in_words, uint64_t& out_words) {
-    out_words = (in_words[0] & in_words[1]) | in_words[2];
-  };
-
   // (bm0 & bm1) | bm2
+  std::array<Bitmap, 3> in_bms{bm0, bm1, bm2};
   Bitmap::VisitWordsAndWrite(
-      {bm0, bm1, bm2}, std::forward<Bitmap::SingleOutputVisitor<3, uint64_t>>(visitor),
+      in_bms,
+      [](const std::array<uint64_t, 3>& in_words, uint64_t& out_words) {
+        out_words = (in_words[0] & in_words[1]) | in_words[2];
+      },
       out_bm);
 
   std::vector<bool> v0(data.begin(), data.begin() + part);
@@ -171,15 +171,14 @@ TEST_F(TestBitmapVisit, MultiWriterOutputZeroOffset) {
   std::vector<bool> v2(data.begin() + part * 2, data.begin() + part * 3);
 
   // out0 = bm0 & bm1, out1= bm0 | bm2
-  auto visitor_func = [](const std::array<uint64_t, 3>& in,
-                         std::array<uint64_t, 2>& out) {
-    out[0] = in[0] & in[1];
-    out[1] = in[0] | in[2];
-  };
-
+  std::array<Bitmap, 3> in_bms{bm0, bm1, bm2};
   Bitmap::VisitWordsAndWrite(
-      {bm0, bm1, bm2},
-      std::forward<Bitmap::MultiOutputVisitor<3, 2, uint64_t>>(visitor_func), out_bms);
+      in_bms,
+      [](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>& out) {
+        out[0] = in[0] & in[1];
+        out[1] = in[0] | in[2];
+      },
+      out_bms);
 
   std::vector<bool> out_v0(part);
   std::vector<bool> out_v1(part);
@@ -235,15 +234,14 @@ TEST_F(TestBitmapVisit, MultiWriterOutputNonZeroOffset) {
   //  std::cout << "b2: " << bm2.ToString() << "\n";
 
   // out0 = bm0 & bm1, out1= bm0 | bm2
-  auto visitor_func = [](const std::array<uint64_t, 3>& in,
-                         std::array<uint64_t, 2>& out) {
-    out[0] = in[0] & in[1];
-    out[1] = in[0] | in[2];
-  };
-
+  std::array<Bitmap, 3> in_bms{bm0, bm1, bm2};
   Bitmap::VisitWordsAndWrite(
-      {bm0, bm1, bm2},
-      std::forward<Bitmap::MultiOutputVisitor<3, 2, uint64_t>>(visitor_func), out_bms);
+      in_bms,
+      [](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>& out) {
+        out[0] = in[0] & in[1];
+        out[1] = in[0] | in[2];
+      },
+      out_bms);
 
   std::vector<bool> out_v0(part);
   std::vector<bool> out_v1(part);
@@ -259,4 +257,4 @@ TEST_F(TestBitmapVisit, MultiWriterOutputNonZeroOffset) {
 }
 
 }  // namespace internal
-}  // namespace arrow
\ No newline at end of file
+}  // namespace arrow

From 2743309595ce62a35aa2f450bd098c224f82c097 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Fri, 11 Jun 2021 14:24:19 -0400
Subject: [PATCH 10/46] simplifying impl

---
 .../arrow/compute/kernels/scalar_if_else.cc   | 103 ++++----
 cpp/src/arrow/util/bitmap.h                   | 235 ++----------------
 cpp/src/arrow/util/bitmap_reader.h            |  17 +-
 cpp/src/arrow/util/bitmap_test.cc             | 168 +++++--------
 cpp/src/arrow/util/bitmap_writer.h            |  19 +-
 5 files changed, 151 insertions(+), 391 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
index 83e5501a0f1..8a85f61b9a7 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
@@ -108,96 +108,101 @@ Status PromoteNullsVisitor(KernelContext* ctx, const Datum& cond_d, const Datum&
     return c_valid & ((c_data & l_valid) | (~c_data & r_valid));
   };
 
-  Bitmap out_bitmap(output->buffers[0], 0, cond.length);
+  std::array<Bitmap, 1> out_bitmaps{Bitmap{output->buffers[0], 0, cond.length}};
 
   enum { C_VALID, C_DATA, L_VALID, R_VALID };
 
   switch (flag) {
     case COND_CONST | LEFT_CONST | RIGHT_CONST: {
       std::array<Bitmap, 4> bitmaps{_, cond_data, _, _};
-      Bitmap::VisitWordsAndWrite(
-          bitmaps,
-          [&](std::array<uint64_t, 4> words) {
-            return apply(*cond_const, words[C_DATA], *left_const, *right_const);
-          },
-          &out_bitmap);
+      Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+                                 [&](const std::array<uint64_t, 4>& words_in,
+                                     std::array<uint64_t, 1>& word_out) {
+                                   word_out[0] = apply(*cond_const, words_in[C_DATA],
+                                                       *left_const, *right_const);
+                                 });
       break;
     }
     case LEFT_CONST | RIGHT_CONST: {
       std::array<Bitmap, 4> bitmaps{cond_valid, cond_data, _, _};
-      Bitmap::VisitWordsAndWrite(
-          bitmaps,
-          [&](std::array<uint64_t, 4> words) {
-            return apply(words[C_VALID], words[C_DATA], *left_const, *right_const);
-          },
-          &out_bitmap);
+      Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+                                 [&](const std::array<uint64_t, 4>& words_in,
+                                     std::array<uint64_t, 1>& word_out) {
+                                   word_out[0] =
+                                       apply(words_in[C_VALID], words_in[C_DATA],
+                                             *left_const, *right_const);
+                                 });
       break;
     }
     case COND_CONST | RIGHT_CONST: {
       // bitmaps[C_VALID], bitmaps[R_VALID] might be null; override to make it safe for
       // Visit()
       std::array<Bitmap, 4> bitmaps{_, cond_data, left_valid, _};
-      Bitmap::VisitWordsAndWrite(
-          bitmaps,
-          [&](std::array<uint64_t, 4> words) {
-            return apply(*cond_const, words[C_DATA], words[L_VALID], *right_const);
-          },
-          &out_bitmap);
+      Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+                                 [&](const std::array<uint64_t, 4>& words_in,
+                                     std::array<uint64_t, 1>& word_out) {
+                                   word_out[0] = apply(*cond_const, words_in[C_DATA],
+                                                       words_in[L_VALID], *right_const);
+                                 });
       break;
     }
     case RIGHT_CONST: {
       // bitmaps[R_VALID] might be null; override to make it safe for Visit()
       std::array<Bitmap, 4> bitmaps{cond_valid, cond_data, left_valid, _};
-      Bitmap::VisitWordsAndWrite(
-          bitmaps,
-          [&](std::array<uint64_t, 4> words) {
-            return apply(words[C_VALID], words[C_DATA], words[L_VALID], *right_const);
-          },
-          &out_bitmap);
+      Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+                                 [&](const std::array<uint64_t, 4>& words_in,
+                                     std::array<uint64_t, 1>& word_out) {
+                                   word_out[0] =
+                                       apply(words_in[C_VALID], words_in[C_DATA],
+                                             words_in[L_VALID], *right_const);
+                                 });
       break;
     }
     case COND_CONST | LEFT_CONST: {
       // bitmaps[C_VALID], bitmaps[L_VALID] might be null; override to make it safe for
       // Visit()
       std::array<Bitmap, 4> bitmaps{_, cond_data, _, right_valid};
-      Bitmap::VisitWordsAndWrite(
-          bitmaps,
-          [&](std::array<uint64_t, 4> words) {
-            return apply(*cond_const, words[C_DATA], *left_const, words[R_VALID]);
-          },
-          &out_bitmap);
+      Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+                                 [&](const std::array<uint64_t, 4>& words_in,
+                                     std::array<uint64_t, 1>& word_out) {
+                                   word_out[0] = apply(*cond_const, words_in[C_DATA],
+                                                       *left_const, words_in[R_VALID]);
+                                 });
       break;
     }
     case LEFT_CONST: {
       // bitmaps[L_VALID] might be null; override to make it safe for Visit()
       std::array<Bitmap, 4> bitmaps{cond_valid, cond_data, _, right_valid};
-      Bitmap::VisitWordsAndWrite(
-          bitmaps,
-          [&](std::array<uint64_t, 4> words) {
-            return apply(words[C_VALID], words[C_DATA], *left_const, words[R_VALID]);
-          },
-          &out_bitmap);
+      Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+                                 [&](const std::array<uint64_t, 4>& words_in,
+                                     std::array<uint64_t, 1>& word_out) {
+                                   word_out[0] =
+                                       apply(words_in[C_VALID], words_in[C_DATA],
+                                             *left_const, words_in[R_VALID]);
+                                 });
       break;
     }
     case COND_CONST: {
       // bitmaps[C_VALID] might be null; override to make it safe for Visit()
       std::array<Bitmap, 4> bitmaps{_, cond_data, left_valid, right_valid};
-      Bitmap::VisitWordsAndWrite(
-          bitmaps,
-          [&](std::array<uint64_t, 4> words) {
-            return apply(*cond_const, words[C_DATA], words[L_VALID], words[R_VALID]);
-          },
-          &out_bitmap);
+      Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+                                 [&](const std::array<uint64_t, 4>& words_in,
+                                     std::array<uint64_t, 1>& word_out) {
+                                   word_out[0] =
+                                       apply(*cond_const, words_in[C_DATA],
+                                             words_in[L_VALID], words_in[R_VALID]);
+                                 });
       break;
     }
     case 0: {
       std::array<Bitmap, 4> bitmaps{cond_valid, cond_data, left_valid, right_valid};
-      Bitmap::VisitWordsAndWrite(
-          bitmaps,
-          [&](std::array<uint64_t, 4> words) {
-            return apply(words[C_VALID], words[C_DATA], words[L_VALID], words[R_VALID]);
-          },
-          &out_bitmap);
+      Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+                                 [&](const std::array<uint64_t, 4>& words_in,
+                                     std::array<uint64_t, 1>& word_out) {
+                                   word_out[0] =
+                                       apply(words_in[C_VALID], words_in[C_DATA],
+                                             words_in[L_VALID], words_in[R_VALID]);
+                                 });
       break;
     }
   }
diff --git a/cpp/src/arrow/util/bitmap.h b/cpp/src/arrow/util/bitmap.h
index ec88e7574f1..613366eb8af 100644
--- a/cpp/src/arrow/util/bitmap.h
+++ b/cpp/src/arrow/util/bitmap.h
@@ -248,186 +248,31 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
     return min_offset;
   }
 
-  /// \brief Visit words of bits from each bitmap as array<Word, N>
+  /// \brief Visit words of bits from each input bitmap as array<Word, N> and collect
+  /// outputs in an array<Word, M>, to be written into the output bitmaps accordingly.
   ///
   /// All bitmaps must have identical length. The first bit in a visited bitmap
   /// may be offset within the first visited word, but words will otherwise contain
   /// densely packed bits loaded from the bitmap. That offset within the first word is
   /// returned.
+  /// Visitor is expected to have the following signature:
+  ///     [](const std::array<Word, N>& in_words, std::array<Word, M>& out_words){...}
   ///
-  /// TODO(bkietz) allow for early termination
   // NOTE: this function is efficient on 3+ sufficiently large bitmaps.
   // It also has a large prolog / epilog overhead and should be used
   // carefully in other cases.
   // For 2 bitmaps or less, and/or smaller bitmaps, see also VisitTwoBitBlocksVoid
   // and BitmapUInt64Reader.
-  template <size_t N, typename Visitor,
-            typename Word = typename std::decay<
-                internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
-  static int64_t VisitWordsAndWrite(const std::array<Bitmap, N>& bitmaps_arg,
-                                    Visitor&& visitor, Bitmap* out_bitmap_arg) {
-    constexpr int64_t kBitWidth = sizeof(Word) * 8;
-
-    // local, mutable variables which will be sliced/decremented to represent consumption:
-    Bitmap bitmaps[N];  // todo use std::array here
-    int64_t bit_length = BitLength(bitmaps_arg);
-
-    struct BitmapHolder {
-      BitmapHolder() = default;
-      explicit BitmapHolder(Bitmap* bitmap_)
-          : bitmap(bitmap_),
-            word_offset(bitmap_->template word_offset<Word>()),
-            words(bitmap_->template words<Word>()) {
-        assert(BitmapHolder::word_offset >= 0 && BitmapHolder::word_offset < kBitWidth);
-      }
-
-      inline void StrideAndUpdate(int64_t _stride) {
-        BitmapHolder::bitmap->Stride(_stride);
-        BitmapHolder::word_offset = bitmap->template word_offset<Word>();
-        assert(BitmapHolder::word_offset >= 0 && BitmapHolder::word_offset < kBitWidth);
-        BitmapHolder::words = bitmap->template words<Word>();
-      }
-
-      Bitmap* bitmap{};
-      int64_t word_offset = 0;
-      View<Word> words;
-    };
-
-    std::array<BitmapHolder, N> in_bitmaps;
-    Bitmap out_bitmap = *out_bitmap_arg;  // make a copy
-
-    for (size_t i = 0; i < N; ++i) {
-      bitmaps[i] = bitmaps_arg[i];  // make a copy
-      in_bitmaps[i] = BitmapHolder(&bitmaps[i]);
-    }
-
-    auto consume = [&](int64_t consumed_bits) {
-      for (size_t i = 0; i < N; ++i) {
-        in_bitmaps[i].StrideAndUpdate(consumed_bits);
-      }
-      out_bitmap.Stride(consumed_bits);
-
-      bit_length -= consumed_bits;
-    };
-
-    std::array<Word, N> visited_words;
-    visited_words.fill(0);
-
-    if (bit_length <= kBitWidth * 2) {
-      // bitmaps fit into one or two words so don't bother with optimization
-      while (bit_length > 0) {
-        auto leading_bits = std::min(bit_length, kBitWidth);
-        SafeLoadWords(bitmaps, 0, leading_bits, false, &visited_words);
-        Word visit_out = visitor(visited_words);  // outputs a word/ partial word
-        CopyBitmap(reinterpret_cast<uint8_t*>(&visit_out), 0, leading_bits,
-                   out_bitmap.buffer_->mutable_data(), out_bitmap.offset());
-        consume(leading_bits);
-      }
-      return 0;
-    }
-
-    auto word_offset_comp = [](const BitmapHolder& l, const BitmapHolder& r) {
-      return l.word_offset < r.word_offset;
-    };
-
-    int64_t max_word_offset =
-        (*std::max_element(in_bitmaps.begin(), in_bitmaps.end(), word_offset_comp))
-            .word_offset;
-    int64_t min_word_offset =
-        (*std::min_element(in_bitmaps.begin(), in_bitmaps.end(), word_offset_comp))
-            .word_offset;
-    if (max_word_offset > 0) {
-      // consume leading bits
-      auto leading_bits = kBitWidth - min_word_offset;
-      SafeLoadWords(bitmaps, 0, leading_bits, true, &visited_words);
-      Word visit_out = visitor(visited_words);
-      CopyBitmap(reinterpret_cast<uint8_t*>(&visit_out), sizeof(Word) * 8 - leading_bits,
-                 leading_bits, out_bitmap.buffer_->mutable_data(), out_bitmap.offset());
-      consume(leading_bits);
-    }
-    assert((*std::min_element(in_bitmaps.begin(), in_bitmaps.end(), word_offset_comp))
-               .word_offset == 0);
-
-    int64_t whole_word_count = bit_length / kBitWidth;
-    assert(whole_word_count >= 1);
-
-    std::vector<Word> visit_outs;
-    visit_outs.reserve(whole_word_count);
-
-    if (min_word_offset == max_word_offset) {
-      // all offsets were identical, all leading bits have been consumed
-      assert(std::all_of(
-          in_bitmaps.begin(), in_bitmaps.end(),
-          [](const BitmapHolder& holder) { return holder.word_offset == 0; }));
-
-      for (int64_t word_i = 0; word_i < whole_word_count; ++word_i) {
-        for (size_t i = 0; i < N; ++i) {
-          visited_words[i] = in_bitmaps[i].words[word_i];
-        }
-        visit_outs.template emplace_back(visitor(visited_words));
-      }
-      CopyBitmap(reinterpret_cast<const uint8_t*>(visit_outs.data()), 0,
-                 whole_word_count * kBitWidth, out_bitmap.buffer_->mutable_data(),
-                 out_bitmap.offset());
-      consume(whole_word_count * kBitWidth);
-    } else {
-      // leading bits from potentially incomplete words have been consumed
-
-      // word_i such that words[i][word_i] and words[i][word_i + 1] are lie entirely
-      // within the bitmap for all i
-      for (int64_t word_i = 0; word_i < whole_word_count - 1; ++word_i) {
-        for (size_t i = 0; i < N; ++i) {
-          const auto ith_words = in_bitmaps[i].words;
-          const auto ith_word_offset = in_bitmaps[i].word_offset;
-          if (ith_word_offset == 0) {
-            visited_words[i] = ith_words[word_i];
-          } else {
-            auto words0 = BitUtil::ToLittleEndian(ith_words[word_i]);
-            auto words1 = BitUtil::ToLittleEndian(ith_words[word_i + 1]);
-            visited_words[i] = BitUtil::FromLittleEndian(
-                (words0 >> ith_word_offset) | (words1 << (kBitWidth - ith_word_offset)));
-          }
-        }
-        visit_outs.template emplace_back(visitor(visited_words));
-      }
-      CopyBitmap(reinterpret_cast<const uint8_t*>(visit_outs.data()), 0,
-                 (whole_word_count - 1) * kBitWidth, out_bitmap.buffer_->mutable_data(),
-                 out_bitmap.offset());
-      consume((whole_word_count - 1) * kBitWidth);
-
-      SafeLoadWords(bitmaps, 0, kBitWidth, false, &visited_words);
-
-      Word visit_out = visitor(visited_words);  // outputs a word/ partial word
-      CopyBitmap(reinterpret_cast<uint8_t*>(&visit_out), 0, kBitWidth,
-                 out_bitmap.buffer_->mutable_data(), out_bitmap.offset());
-      consume(kBitWidth);
-    }
-
-    // load remaining bits
-    if (bit_length > 0) {
-      SafeLoadWords(bitmaps, 0, bit_length, false, &visited_words);
-      Word visit_out = visitor(visited_words);
-      CopyBitmap(reinterpret_cast<uint8_t*>(&visit_out), 0, bit_length,
-                 out_bitmap.buffer_->mutable_data(), out_bitmap.offset());
-    }
-
-    return min_word_offset;
-  }
-
-  //  template <size_t N, size_t M, typename Word>
-  //  using MultiOutputVisitor = std::function<void(const std::array<Word, N>& in_words,
-  //                                                std::array<Word, M>& out_words)>;
-
   template <size_t N, size_t M, typename Visitor,
             typename Word = typename std::decay<
                 internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
   static void VisitWordsAndWrite(const std::array<Bitmap, N>& bitmaps_arg,
-                                 Visitor&& visitor,
-                                 std::array<Bitmap, M>& out_bitmaps_arg) {
+                                 std::array<Bitmap, M>* out_bitmaps_arg,
+                                 Visitor&& visitor) {
     constexpr int64_t kBitWidth = sizeof(Word) * 8;
 
     int64_t bit_length = BitLength(bitmaps_arg);
-    assert(bit_length == BitLength(out_bitmaps_arg));
+    assert(bit_length == BitLength(*out_bitmaps_arg));
 
     std::array<BitmapWordReader<Word>, N> readers;
     for (size_t i = 0; i < N; ++i) {
@@ -437,9 +282,9 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
 
     std::array<BitmapWordWriter<Word>, M> writers;
     for (size_t i = 0; i < M; ++i) {
-      writers[i] =
-          BitmapWordWriter<Word>(out_bitmaps_arg[i].buffer_->mutable_data(),
-                                 out_bitmaps_arg[i].offset_, out_bitmaps_arg[i].length_);
+      const Bitmap& out_bitmap = out_bitmaps_arg->at(i);
+      writers[i] = BitmapWordWriter<Word>(out_bitmap.buffer_->mutable_data(),
+                                          out_bitmap.offset_, out_bitmap.length_);
     }
 
     std::array<Word, N> visited_words;
@@ -456,76 +301,32 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
       for (size_t i = 0; i < N; i++) {
         visited_words[i] = readers[i].NextWord();
       }
-
       visitor(visited_words, output_words);
-
       for (size_t i = 0; i < M; i++) {
         writers[i].PutNextWord(output_words[i]);
       }
-
       bit_length -= kBitWidth;
     }
 
     // every reader will have same number of trailing bytes, because of the above reason
-    // todo when the above issue is resolved, following logic also needs to be fixed!
     // tailing portion could be more than one word! (ref: BitmapWordReader constructor)
-    assert(static_cast<size_t>(bit_length) < kBitWidth * 2);
-    if (bit_length / kBitWidth) {
-      // there's one full word in trailing portion. Cant use NextWord() here because it
-      // doesn't stride the trailing metadata
-      for (size_t i = 0; i < N; i++) {
-        visited_words[i] = 0;
-        for (size_t b = 0; b < sizeof(Word); b++) {
-          int dummy;
-          auto byte = static_cast<Word>(readers[i].NextTrailingByte(dummy));
-          visited_words[i] |= byte << (b * 8);
-        }
-      }
-
-      visitor(visited_words, output_words);
-
-      for (size_t i = 0; i < M; i++) {
-        writers[i].PutNextWord(output_words[i]);
-      }
-
-      bit_length -= kBitWidth;
-    }
-
-    // clean-up last partial word
-    if (bit_length) {
+    // remaining full/partial words to write
+    n_words = (bit_length + kBitWidth - 1) / kBitWidth;
+    assert(n_words <= 2);
+    while (n_words--) {
+      visited_words.fill(0);
       output_words.fill(0);
+      int valid_bits;
       for (size_t i = 0; i < N; i++) {
-        visited_words[i] = 0;
-        int n_byte = readers[i].trailing_bytes();
-        for (int b = 0; b < n_byte; b++) {
-          int valid_bits;
-          auto byte = static_cast<Word>(readers[i].NextTrailingByte(valid_bits));
-          visited_words[i] |= (byte << b * 8);
-        }
+        visited_words[i] = readers[i].NextTrailingWord(valid_bits);
       }
-
       visitor(visited_words, output_words);
-
       for (size_t i = 0; i < M; i++) {
-        writers[i].PutNextWord(output_words[i], bit_length);
+        writers[i].PutTrailingWord(output_words[i], valid_bits);
       }
     }
   }
 
-  template <size_t N, typename Visitor,
-            typename Word = typename std::decay<
-                internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
-  static void VisitWordsAndWrite(const std::array<Bitmap, N>& bitmaps_arg,
-                                 Visitor&& visitor, Bitmap& out_bitmap_arg) {
-    std::array<Bitmap, 1> out_bitmaps{out_bitmap_arg};
-    VisitWordsAndWrite(
-        bitmaps_arg,
-        [&](const std::array<Word, N>& in_words, std::array<Word, 1>& out_words) {
-          visitor(in_words, out_words[0]);
-        },
-        out_bitmaps);
-  }
-
   const std::shared_ptr<Buffer>& buffer() const { return buffer_; }
 
   /// offset of first bit relative to buffer().data()
diff --git a/cpp/src/arrow/util/bitmap_reader.h b/cpp/src/arrow/util/bitmap_reader.h
index a562e9a1294..70584d5dfe9 100644
--- a/cpp/src/arrow/util/bitmap_reader.h
+++ b/cpp/src/arrow/util/bitmap_reader.h
@@ -153,7 +153,7 @@ class BitmapWordReader {
   BitmapWordReader(const uint8_t* bitmap, int64_t offset, int64_t length) {
     bitmap_ = bitmap + offset / 8;
     offset_ = offset % 8;
-    bitmap_end_ = bitmap_ + BitUtil::BytesForBits(offset + length);
+    bitmap_end_ = bitmap_ + BitUtil::BytesForBits(offset_ + length);
 
     // decrement word count by one as we may touch two adjacent words in one iteration
     nwords_ = length / (sizeof(Word) * 8) - 1;
@@ -193,6 +193,21 @@ class BitmapWordReader {
     return word;
   }
 
+  Word NextTrailingWord(int& valid_bits) {
+    // safest way to create a word from the trailing bits is to concatenate the bytes
+    // returned by NextTrailingByte
+    Word word = 0;  // only a partial word may be returned.
+    valid_bits = 0;
+    int n_byte = std::min(trailing_bytes_, static_cast<int>(sizeof(Word)));
+    for (int b = 0; b < n_byte; b++) {
+      int valid;
+      auto byte = static_cast<Word>(NextTrailingByte(valid));
+      word |= byte << (b * 8);
+      valid_bits += valid;
+    }
+    return word;
+  }
+
   uint8_t NextTrailingByte(int& valid_bits) {
     uint8_t byte;
     assert(trailing_bits_ > 0);
diff --git a/cpp/src/arrow/util/bitmap_test.cc b/cpp/src/arrow/util/bitmap_test.cc
index d981cb7611d..601cf6f65ad 100644
--- a/cpp/src/arrow/util/bitmap_test.cc
+++ b/cpp/src/arrow/util/bitmap_test.cc
@@ -58,95 +58,8 @@ void VerifyBoolOutput(const Bitmap& bitmap, const std::vector<bool>& expected) {
       << "exp: " << VectorToString(expected) << "\ngot: " << bitmap.ToString();
 }
 
-class TestBitmapVisit : public ::testing::Test {};
-
-TEST_F(TestBitmapVisit, SingleWriterOutputZeroOffset) {
-  // choosing part = 199, a prime, so that shifts are falling in-between bytes
-  int64_t part = 199, bits = part * 4;
-
-  std::vector<bool> data;
-  random_bool_vector(data, bits);
-
-  arrow::BooleanBuilder boolean_builder;
-  ASSERT_OK(boolean_builder.AppendValues(data));
-  ASSERT_OK_AND_ASSIGN(auto arrow_data, boolean_builder.Finish());
-
-  std::shared_ptr<Buffer>& arrow_buffer = arrow_data->data()->buffers[1];
-
-  Bitmap bm0(arrow_buffer, 0, part);
-  Bitmap bm1 = bm0.Slice(part * 1, part);  // this goes beyond bm0's len
-  Bitmap bm2 = bm0.Slice(part * 2, part);  // this goes beyond bm0's len
-
-  ASSERT_OK_AND_ASSIGN(auto out, AllocateBitmap(part));
-  Bitmap out_bm(out, 0, part);
-
-  // (bm0 & bm1) | bm2
-  std::array<Bitmap, 3> in_bms{bm0, bm1, bm2};
-  Bitmap::VisitWordsAndWrite(
-      in_bms,
-      [](const std::array<uint64_t, 3>& in_words, uint64_t& out_words) {
-        out_words = (in_words[0] & in_words[1]) | in_words[2];
-      },
-      out_bm);
-
-  std::vector<bool> v0(data.begin(), data.begin() + part);
-  std::vector<bool> v1(data.begin() + part * 1, data.begin() + part * 2);
-  std::vector<bool> v2(data.begin() + part * 2, data.begin() + part * 3);
-  std::vector<bool> v3(part);
-  // v3 = v0 & v1
-  std::transform(v0.begin(), v0.end(), v1.begin(), v3.begin(), std::logical_and<bool>());
-  // v3 |= v2
-  std::transform(v3.begin(), v3.end(), v2.begin(), v3.begin(), std::logical_or<bool>());
-
-  VerifyBoolOutput(out_bm, v3);
-}
-
-TEST_F(TestBitmapVisit, SingleWriterOutputNonZeroOffset) {
-  // choosing part = 199, a prime
-  int64_t part = 199, bits = part * 4;
-
-  std::vector<bool> data;
-  random_bool_vector(data, bits);
-
-  arrow::BooleanBuilder boolean_builder;
-  ASSERT_OK(boolean_builder.AppendValues(data));
-  ASSERT_OK_AND_ASSIGN(auto arrow_data, boolean_builder.Finish());
-
-  std::shared_ptr<Buffer>& arrow_buffer = arrow_data->data()->buffers[1];
-
-  Bitmap bm0(arrow_buffer, 0, part);
-  Bitmap bm1 = bm0.Slice(part * 1, part);  // this goes beyond bm0's len
-  Bitmap bm2 = bm0.Slice(part * 2, part);  // this goes beyond bm0's len
-
-  // allocate lager buffer but only use the last `part`
-  ASSERT_OK_AND_ASSIGN(auto out, AllocateBitmap(part * 2));
-  Bitmap out_bm(out, part, part);
-
-  // (bm0 & bm1) | bm2
-  std::array<Bitmap, 3> in_bms{bm0, bm1, bm2};
-  Bitmap::VisitWordsAndWrite(
-      in_bms,
-      [](const std::array<uint64_t, 3>& in_words, uint64_t& out_words) {
-        out_words = (in_words[0] & in_words[1]) | in_words[2];
-      },
-      out_bm);
-
-  std::vector<bool> v0(data.begin(), data.begin() + part);
-  std::vector<bool> v1(data.begin() + part * 1, data.begin() + part * 2);
-  std::vector<bool> v2(data.begin() + part * 2, data.begin() + part * 3);
-  std::vector<bool> v3(part);
-  // v3 = v0 & v1
-  std::transform(v0.begin(), v0.end(), v1.begin(), v3.begin(), std::logical_and<bool>());
-  // v3 |= v2
-  std::transform(v3.begin(), v3.end(), v2.begin(), v3.begin(), std::logical_or<bool>());
-
-  VerifyBoolOutput(out_bm, v3);
-}
-
-TEST_F(TestBitmapVisit, MultiWriterOutputZeroOffset) {
-  // choosing part = 199, a prime
-  int64_t part = 199, bits = part * 4;
-
+void RunOutputNoOffset(int part) {
+  int64_t bits = 4 * part;
   std::vector<bool> data;
   random_bool_vector(data, bits);
 
@@ -173,12 +86,11 @@ TEST_F(TestBitmapVisit, MultiWriterOutputZeroOffset) {
   // out0 = bm0 & bm1, out1= bm0 | bm2
   std::array<Bitmap, 3> in_bms{bm0, bm1, bm2};
   Bitmap::VisitWordsAndWrite(
-      in_bms,
+      in_bms, &out_bms,
       [](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>& out) {
         out[0] = in[0] & in[1];
         out[1] = in[0] | in[2];
-      },
-      out_bms);
+      });
 
   std::vector<bool> out_v0(part);
   std::vector<bool> out_v1(part);
@@ -200,10 +112,8 @@ TEST_F(TestBitmapVisit, MultiWriterOutputZeroOffset) {
   VerifyBoolOutput(out_bms[1], out_v1);
 }
 
-TEST_F(TestBitmapVisit, MultiWriterOutputNonZeroOffset) {
-  // choosing part = 199, a prime
-  int64_t part = 199, bits = part * 4;
-
+void RunOutputWithOffset(int64_t part) {
+  int64_t bits = part * 4;
   std::vector<bool> data;
   random_bool_vector(data, bits);
 
@@ -226,22 +136,12 @@ TEST_F(TestBitmapVisit, MultiWriterOutputNonZeroOffset) {
   std::vector<bool> v1(data.begin() + part * 1, data.begin() + part * 2);
   std::vector<bool> v2(data.begin() + part * 2, data.begin() + part * 3);
 
-  //  std::cout << "v0: " << VectorToString(v0)<< "\n";
-  //  std::cout << "b0: " << bm0.ToString() << "\n";
-  //  std::cout << "v1: " << VectorToString(v1) << "\n";
-  //  std::cout << "b1: " << bm1.ToString() << "\n";
-  //  std::cout << "v2: " << VectorToString(v2) << "\n";
-  //  std::cout << "b2: " << bm2.ToString() << "\n";
-
-  // out0 = bm0 & bm1, out1= bm0 | bm2
-  std::array<Bitmap, 3> in_bms{bm0, bm1, bm2};
-  Bitmap::VisitWordsAndWrite(
-      in_bms,
-      [](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>& out) {
-        out[0] = in[0] & in[1];
-        out[1] = in[0] | in[2];
-      },
-      out_bms);
+  std::cout << "v0: " << VectorToString(v0) << "\n";
+  std::cout << "b0: " << bm0.ToString() << "\n";
+  std::cout << "v1: " << VectorToString(v1) << "\n";
+  std::cout << "b1: " << bm1.ToString() << "\n";
+  std::cout << "v2: " << VectorToString(v2) << "\n";
+  std::cout << "b2: " << bm2.ToString() << "\n";
 
   std::vector<bool> out_v0(part);
   std::vector<bool> out_v1(part);
@@ -252,9 +152,53 @@ TEST_F(TestBitmapVisit, MultiWriterOutputNonZeroOffset) {
   std::transform(v0.begin(), v0.end(), v2.begin(), out_v1.begin(),
                  std::logical_or<bool>());
 
+  std::cout << "out0: " << VectorToString(out_v0) << "\n";
+  std::cout << "out1: " << VectorToString(out_v1) << "\n";
+
+  // out0 = bm0 & bm1, out1= bm0 | bm2
+  std::array<Bitmap, 3> in_bms{bm0, bm1, bm2};
+  Bitmap::VisitWordsAndWrite(
+      in_bms, &out_bms,
+      [](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>& out) {
+        out[0] = in[0] & in[1];
+        out[1] = in[0] | in[2];
+      });
+
   VerifyBoolOutput(out_bms[0], out_v0);
   VerifyBoolOutput(out_bms[1], out_v1);
 }
 
+class TestBitmapVisitOutputNoOffset : public ::testing::TestWithParam<int32_t> {};
+
+TEST_P(TestBitmapVisitOutputNoOffset, Test1) {
+  auto part = GetParam();
+  RunOutputNoOffset(part);
+}
+
+INSTANTIATE_TEST_SUITE_P(General, TestBitmapVisitOutputNoOffset,
+                         testing::Values(199, 256, 1000));
+
+INSTANTIATE_TEST_SUITE_P(EdgeCases, TestBitmapVisitOutputNoOffset,
+                         testing::Values(5, 13, 21, 29, 37, 41, 51, 59, 64, 97));
+
+INSTANTIATE_TEST_SUITE_P(EdgeCases2, TestBitmapVisitOutputNoOffset,
+                         testing::Values(8, 16, 24, 32, 40, 48, 56, 64));
+
+class TestBitmapVisitOutputWithOffset : public ::testing::TestWithParam<int32_t> {};
+
+TEST_P(TestBitmapVisitOutputWithOffset, Test2) {
+  auto part = GetParam();
+  RunOutputWithOffset(part);
+}
+
+INSTANTIATE_TEST_SUITE_P(General, TestBitmapVisitOutputWithOffset,
+                         testing::Values(199, 256, 1000));
+
+INSTANTIATE_TEST_SUITE_P(EdgeCases, TestBitmapVisitOutputWithOffset,
+                         testing::Values(7, 15, 23, 31, 39, 47, 55, 63, 73, 97));
+
+INSTANTIATE_TEST_SUITE_P(EdgeCases2, TestBitmapVisitOutputWithOffset,
+                         testing::Values(8, 16, 24, 32, 40, 48, 56, 64));
+
 }  // namespace internal
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/bitmap_writer.h b/cpp/src/arrow/util/bitmap_writer.h
index ca75abbf15c..e4f86a269fc 100644
--- a/cpp/src/arrow/util/bitmap_writer.h
+++ b/cpp/src/arrow/util/bitmap_writer.h
@@ -187,7 +187,7 @@ class BitmapWordWriter {
   BitmapWordWriter(uint8_t* bitmap, int64_t offset, int64_t length) {
     bitmap_ = bitmap + offset / 8;
     offset_ = offset % 8;
-    bitmap_end_ = bitmap_ + BitUtil::BytesForBits(offset + length);
+    bitmap_end_ = bitmap_ + BitUtil::BytesForBits(offset_ + length);
     mask_ = (1U << offset_) - 1;
 
     if (offset_) {
@@ -225,22 +225,17 @@ class BitmapWordWriter {
     bitmap_ += sizeof(Word);
   }
 
-  void PutNextWord(Word word, int valid_bits) {
+  void PutTrailingWord(Word word, int valid_bits) {
     assert(static_cast<size_t>(valid_bits) <= sizeof(Word) * 8);
     if (ARROW_PREDICT_FALSE(valid_bits == 0)) {
       return;
-    } else if (ARROW_PREDICT_FALSE(valid_bits == sizeof(Word) * 8)) {
-      return PutNextWord(word);
     }
-    int i = 0;
-    for (; i < valid_bits / 8; i++) {
+
+    int n_bytes = (valid_bits + 7) / 8;
+    for (int i = 0; i < n_bytes; i++) {
       uint8_t byte = *(reinterpret_cast<uint8_t*>(&word) + i);
-      PutNextTrailingByte(byte, 8);
-    }
-    // cleanup
-    if (int remainder = valid_bits - i * 8) {
-      assert(static_cast<size_t>(remainder) < sizeof(Word) * 8);
-      PutNextTrailingByte(*(reinterpret_cast<uint8_t*>(&word) + i), remainder);
+      PutNextTrailingByte(byte, std::min(8, valid_bits));
+      valid_bits -= 8;
     }
   }
 

From 4907fa096e36fda4b0ae590ea2865df0a6c139e6 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Fri, 11 Jun 2021 16:20:23 -0400
Subject: [PATCH 11/46] adding byte visitor to clean up the code

---
 .../arrow/compute/kernels/scalar_if_else.cc   | 84 +++++++++----------
 cpp/src/arrow/util/bitmap.h                   | 44 ++++++----
 cpp/src/arrow/util/bitmap_reader.h            | 15 ----
 cpp/src/arrow/util/bitmap_test.cc             | 32 +++----
 cpp/src/arrow/util/bitmap_writer.h            | 14 ----
 5 files changed, 83 insertions(+), 106 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
index 8a85f61b9a7..37db3391996 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
@@ -72,9 +72,6 @@ Status PromoteNullsVisitor(KernelContext* ctx, const Datum& cond_d, const Datum&
   Bitmap cond_valid{cond.buffers[0], cond.offset, cond.length};
   Bitmap left_valid = GetBitmap(left_d, 0);
   Bitmap right_valid = GetBitmap(right_d, 0);
-  // sometimes Bitmaps will be ignored, in which case we replace access to them with
-  // duplicated (probably elided) access to cond_data
-  const Bitmap& _ = cond_data;
 
   // cond.valid & (cond.data & left.valid | ~cond.data & right.valid)
   // In the following cases, we dont need to allocate out_valid bitmap
@@ -114,83 +111,79 @@ Status PromoteNullsVisitor(KernelContext* ctx, const Datum& cond_d, const Datum&
 
   switch (flag) {
     case COND_CONST | LEFT_CONST | RIGHT_CONST: {
-      std::array<Bitmap, 4> bitmaps{_, cond_data, _, _};
+      std::array<Bitmap, 1> bitmaps{cond_data};
       Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
-                                 [&](const std::array<uint64_t, 4>& words_in,
-                                     std::array<uint64_t, 1>& word_out) {
-                                   word_out[0] = apply(*cond_const, words_in[C_DATA],
-                                                       *left_const, *right_const);
+                                 [&](const std::array<uint64_t, 1>& words_in,
+                                     std::array<uint64_t, 1>* word_out) {
+                                   word_out->at(0) = apply(*cond_const, words_in[0],
+                                                           *left_const, *right_const);
                                  });
       break;
     }
     case LEFT_CONST | RIGHT_CONST: {
-      std::array<Bitmap, 4> bitmaps{cond_valid, cond_data, _, _};
+      std::array<Bitmap, 2> bitmaps{cond_valid, cond_data};
       Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
-                                 [&](const std::array<uint64_t, 4>& words_in,
-                                     std::array<uint64_t, 1>& word_out) {
-                                   word_out[0] =
-                                       apply(words_in[C_VALID], words_in[C_DATA],
-                                             *left_const, *right_const);
+                                 [&](const std::array<uint64_t, 2>& words_in,
+                                     std::array<uint64_t, 1>* word_out) {
+                                   word_out->at(0) = apply(words_in[0], words_in[1],
+                                                           *left_const, *right_const);
                                  });
       break;
     }
     case COND_CONST | RIGHT_CONST: {
       // bitmaps[C_VALID], bitmaps[R_VALID] might be null; override to make it safe for
       // Visit()
-      std::array<Bitmap, 4> bitmaps{_, cond_data, left_valid, _};
+      std::array<Bitmap, 2> bitmaps{cond_data, left_valid};
       Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
-                                 [&](const std::array<uint64_t, 4>& words_in,
-                                     std::array<uint64_t, 1>& word_out) {
-                                   word_out[0] = apply(*cond_const, words_in[C_DATA],
-                                                       words_in[L_VALID], *right_const);
+                                 [&](const std::array<uint64_t, 2>& words_in,
+                                     std::array<uint64_t, 1>* word_out) {
+                                   word_out->at(0) = apply(*cond_const, words_in[0],
+                                                           words_in[1], *right_const);
                                  });
       break;
     }
     case RIGHT_CONST: {
       // bitmaps[R_VALID] might be null; override to make it safe for Visit()
-      std::array<Bitmap, 4> bitmaps{cond_valid, cond_data, left_valid, _};
+      std::array<Bitmap, 3> bitmaps{cond_valid, cond_data, left_valid};
       Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
-                                 [&](const std::array<uint64_t, 4>& words_in,
-                                     std::array<uint64_t, 1>& word_out) {
-                                   word_out[0] =
-                                       apply(words_in[C_VALID], words_in[C_DATA],
-                                             words_in[L_VALID], *right_const);
+                                 [&](const std::array<uint64_t, 3>& words_in,
+                                     std::array<uint64_t, 1>* word_out) {
+                                   word_out->at(0) = apply(words_in[0], words_in[1],
+                                                           words_in[2], *right_const);
                                  });
       break;
     }
     case COND_CONST | LEFT_CONST: {
       // bitmaps[C_VALID], bitmaps[L_VALID] might be null; override to make it safe for
       // Visit()
-      std::array<Bitmap, 4> bitmaps{_, cond_data, _, right_valid};
+      std::array<Bitmap, 2> bitmaps{cond_data, right_valid};
       Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
-                                 [&](const std::array<uint64_t, 4>& words_in,
-                                     std::array<uint64_t, 1>& word_out) {
-                                   word_out[0] = apply(*cond_const, words_in[C_DATA],
-                                                       *left_const, words_in[R_VALID]);
+                                 [&](const std::array<uint64_t, 2>& words_in,
+                                     std::array<uint64_t, 1>* word_out) {
+                                   word_out->at(0) = apply(*cond_const, words_in[0],
+                                                           *left_const, words_in[1]);
                                  });
       break;
     }
     case LEFT_CONST: {
       // bitmaps[L_VALID] might be null; override to make it safe for Visit()
-      std::array<Bitmap, 4> bitmaps{cond_valid, cond_data, _, right_valid};
+      std::array<Bitmap, 3> bitmaps{cond_valid, cond_data, right_valid};
       Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
-                                 [&](const std::array<uint64_t, 4>& words_in,
-                                     std::array<uint64_t, 1>& word_out) {
-                                   word_out[0] =
-                                       apply(words_in[C_VALID], words_in[C_DATA],
-                                             *left_const, words_in[R_VALID]);
+                                 [&](const std::array<uint64_t, 3>& words_in,
+                                     std::array<uint64_t, 1>* word_out) {
+                                   word_out->at(0) = apply(words_in[0], words_in[1],
+                                                           *left_const, words_in[2]);
                                  });
       break;
     }
     case COND_CONST: {
       // bitmaps[C_VALID] might be null; override to make it safe for Visit()
-      std::array<Bitmap, 4> bitmaps{_, cond_data, left_valid, right_valid};
+      std::array<Bitmap, 3> bitmaps{cond_data, left_valid, right_valid};
       Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
-                                 [&](const std::array<uint64_t, 4>& words_in,
-                                     std::array<uint64_t, 1>& word_out) {
-                                   word_out[0] =
-                                       apply(*cond_const, words_in[C_DATA],
-                                             words_in[L_VALID], words_in[R_VALID]);
+                                 [&](const std::array<uint64_t, 3>& words_in,
+                                     std::array<uint64_t, 1>* word_out) {
+                                   word_out->at(0) = apply(*cond_const, words_in[0],
+                                                           words_in[1], words_in[2]);
                                  });
       break;
     }
@@ -198,10 +191,9 @@ Status PromoteNullsVisitor(KernelContext* ctx, const Datum& cond_d, const Datum&
       std::array<Bitmap, 4> bitmaps{cond_valid, cond_data, left_valid, right_valid};
       Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
                                  [&](const std::array<uint64_t, 4>& words_in,
-                                     std::array<uint64_t, 1>& word_out) {
-                                   word_out[0] =
-                                       apply(words_in[C_VALID], words_in[C_DATA],
-                                             words_in[L_VALID], words_in[R_VALID]);
+                                     std::array<uint64_t, 1>* word_out) {
+                                   word_out->at(0) = apply(words_in[0], words_in[1],
+                                                           words_in[2], words_in[3]);
                                  });
       break;
     }
diff --git a/cpp/src/arrow/util/bitmap.h b/cpp/src/arrow/util/bitmap.h
index 613366eb8af..05cc7a309f8 100644
--- a/cpp/src/arrow/util/bitmap.h
+++ b/cpp/src/arrow/util/bitmap.h
@@ -256,7 +256,7 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
   /// densely packed bits loaded from the bitmap. That offset within the first word is
   /// returned.
   /// Visitor is expected to have the following signature
-  ///     [](const std::array<Word, N>& in_words, std::array<Word, M>& out_words){...}
+  ///     [](const std::array<Word, N>& in_words, std::array<Word, M>* out_words){...}
   ///
   // NOTE: this function is efficient on 3+ sufficiently large bitmaps.
   // It also has a large prolog / epilog overhead and should be used
@@ -296,33 +296,47 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
     // todo this will be inefficient in some cases. When there are offsets beyond Word
     //  boundary, every Word would have to be created from 2 adjoining Words
     auto n_words = readers[0].words();
+    bit_length -= n_words * kBitWidth;
     while (n_words--) {
       // first collect all words to visited_words array
       for (size_t i = 0; i < N; i++) {
         visited_words[i] = readers[i].NextWord();
       }
-      visitor(visited_words, output_words);
+      visitor(visited_words, &output_words);
       for (size_t i = 0; i < M; i++) {
         writers[i].PutNextWord(output_words[i]);
       }
-      bit_length -= kBitWidth;
     }
 
     // every reader will have same number of trailing bytes, because of the above reason
     // tailing portion could be more than one word! (ref: BitmapWordReader constructor)
     // remaining full/ partial words to write
-    n_words = (bit_length + kBitWidth - 1) / kBitWidth;
-    assert(n_words <= 2);
-    while (n_words--) {
-      visited_words.fill(0);
-      output_words.fill(0);
-      int valid_bits;
-      for (size_t i = 0; i < N; i++) {
-        visited_words[i] = readers[i].NextTrailingWord(valid_bits);
-      }
-      visitor(visited_words, output_words);
-      for (size_t i = 0; i < M; i++) {
-        writers[i].PutTrailingWord(output_words[i], valid_bits);
+
+    if (bit_length) {
+      // convert the word visitor lambda to a byte_visitor
+      auto byte_visitor = [&](const std::array<uint8_t, N>& in,
+                              std::array<uint8_t, M>* out) {
+        std::array<Word, N> in_words;
+        std::array<Word, M> out_words;
+        std::copy(in.begin(), in.end(), in_words.begin());
+        visitor(in_words, &out_words);
+        std::move(out_words.begin(), out_words.end(), out->begin());
+      };
+
+      std::array<uint8_t, N> visited_bytes;
+      std::array<uint8_t, M> output_bytes;
+      int n_bytes = readers[0].trailing_bytes();
+      while (n_bytes--) {
+        visited_bytes.fill(0);
+        output_bytes.fill(0);
+        int valid_bits;
+        for (size_t i = 0; i < N; i++) {
+          visited_bytes[i] = readers[i].NextTrailingByte(valid_bits);
+        }
+        byte_visitor(visited_bytes, &output_bytes);
+        for (size_t i = 0; i < M; i++) {
+          writers[i].PutNextTrailingByte(output_bytes[i], valid_bits);
+        }
       }
     }
   }
diff --git a/cpp/src/arrow/util/bitmap_reader.h b/cpp/src/arrow/util/bitmap_reader.h
index 70584d5dfe9..ce1d5f376bd 100644
--- a/cpp/src/arrow/util/bitmap_reader.h
+++ b/cpp/src/arrow/util/bitmap_reader.h
@@ -193,21 +193,6 @@ class BitmapWordReader {
     return word;
   }
 
-  Word NextTrailingWord(int& valid_bits) {
-    // safest way to create a word from the trailing bits, is to concatenate bytes
-    // returned by NextTrailingByte
-    Word word = 0;  // only a partial word may be returned.
-    valid_bits = 0;
-    int n_byte = std::min(trailing_bytes_, static_cast<int>(sizeof(Word)));
-    for (int b = 0; b < n_byte; b++) {
-      int valid;
-      auto byte = static_cast<Word>(NextTrailingByte(valid));
-      word |= byte << (b * 8);
-      valid_bits += valid;
-    }
-    return word;
-  }
-
   uint8_t NextTrailingByte(int& valid_bits) {
     uint8_t byte;
     assert(trailing_bits_ > 0);
diff --git a/cpp/src/arrow/util/bitmap_test.cc b/cpp/src/arrow/util/bitmap_test.cc
index 601cf6f65ad..4a782e3ce12 100644
--- a/cpp/src/arrow/util/bitmap_test.cc
+++ b/cpp/src/arrow/util/bitmap_test.cc
@@ -87,9 +87,9 @@ void RunOutputNoOffset(int part) {
   std::array<Bitmap, 3> in_bms{bm0, bm1, bm2};
   Bitmap::VisitWordsAndWrite(
       in_bms, &out_bms,
-      [](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>& out) {
-        out[0] = in[0] & in[1];
-        out[1] = in[0] | in[2];
+      [](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>* out) {
+        out->at(0) = in[0] & in[1];
+        out->at(1) = in[0] | in[2];
       });
 
   std::vector<bool> out_v0(part);
@@ -124,8 +124,8 @@ void RunOutputWithOffset(int64_t part) {
   std::shared_ptr<Buffer>& arrow_buffer = arrow_data->data()->buffers[1];
 
   Bitmap bm0(arrow_buffer, 0, part);
-  Bitmap bm1 = bm0.Slice(part * 1, part);  // this goes beyond bm0's len
-  Bitmap bm2 = bm0.Slice(part * 2, part);  // this goes beyond bm0's len
+  Bitmap bm1(arrow_buffer, part * 1, part);
+  Bitmap bm2(arrow_buffer, part * 2, part);
 
   std::array<Bitmap, 2> out_bms;
   ASSERT_OK_AND_ASSIGN(auto out, AllocateBitmap(part * 4));
@@ -136,12 +136,12 @@ void RunOutputWithOffset(int64_t part) {
   std::vector<bool> v1(data.begin() + part * 1, data.begin() + part * 2);
   std::vector<bool> v2(data.begin() + part * 2, data.begin() + part * 3);
 
-  std::cout << "v0: " << VectorToString(v0) << "\n";
-  std::cout << "b0: " << bm0.ToString() << "\n";
-  std::cout << "v1: " << VectorToString(v1) << "\n";
-  std::cout << "b1: " << bm1.ToString() << "\n";
-  std::cout << "v2: " << VectorToString(v2) << "\n";
-  std::cout << "b2: " << bm2.ToString() << "\n";
+  //  std::cout << "v0: " << VectorToString(v0) << "\n";
+  //  std::cout << "b0: " << bm0.ToString() << "\n";
+  //  std::cout << "v1: " << VectorToString(v1) << "\n";
+  //  std::cout << "b1: " << bm1.ToString() << "\n";
+  //  std::cout << "v2: " << VectorToString(v2) << "\n";
+  //  std::cout << "b2: " << bm2.ToString() << "\n";
 
   std::vector<bool> out_v0(part);
   std::vector<bool> out_v1(part);
@@ -152,16 +152,16 @@ void RunOutputWithOffset(int64_t part) {
   std::transform(v0.begin(), v0.end(), v2.begin(), out_v1.begin(),
                  std::logical_or<bool>());
 
-  std::cout << "out0: " << VectorToString(out_v0) << "\n";
-  std::cout << "out1: " << VectorToString(out_v1) << "\n";
+  //  std::cout << "out0: " << VectorToString(out_v0) << "\n";
+  //  std::cout << "out1: " << VectorToString(out_v1) << "\n";
 
   // out0 = bm0 & bm1, out1= bm0 | bm2
   std::array<Bitmap, 3> in_bms{bm0, bm1, bm2};
   Bitmap::VisitWordsAndWrite(
       in_bms, &out_bms,
-      [](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>& out) {
-        out[0] = in[0] & in[1];
-        out[1] = in[0] | in[2];
+      [](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>* out) {
+        out->at(0) = in[0] & in[1];
+        out->at(1) = in[0] | in[2];
       });
 
   VerifyBoolOutput(out_bms[0], out_v0);
diff --git a/cpp/src/arrow/util/bitmap_writer.h b/cpp/src/arrow/util/bitmap_writer.h
index e4f86a269fc..b15b036c248 100644
--- a/cpp/src/arrow/util/bitmap_writer.h
+++ b/cpp/src/arrow/util/bitmap_writer.h
@@ -225,20 +225,6 @@ class BitmapWordWriter {
     bitmap_ += sizeof(Word);
   }
 
-  void PutTrailingWord(Word word, int valid_bits) {
-    assert(static_cast<size_t>(valid_bits) <= sizeof(Word) * 8);
-    if (ARROW_PREDICT_FALSE(valid_bits == 0)) {
-      return;
-    }
-
-    int n_bytes = (valid_bits + 7) / 8;
-    for (int i = 0; i < n_bytes; i++) {
-      uint8_t byte = *(reinterpret_cast<uint8_t*>(&word) + i);
-      PutNextTrailingByte(byte, std::min(8, valid_bits));
-      valid_bits -= 8;
-    }
-  }
-
   void PutNextTrailingByte(uint8_t byte, int valid_bits) {
     if (valid_bits == 8) {
       if (offset_) {

From 3af7137f0db7b1a45fe332a03ba00078835b5279 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Fri, 11 Jun 2021 17:22:45 -0400
Subject: [PATCH 12/46] adding changes to kleene kernels

---
 .../arrow/compute/kernels/scalar_boolean.cc   | 54 +++++++++----------
 .../arrow/compute/kernels/scalar_if_else.cc   |  2 -
 2 files changed, 27 insertions(+), 29 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_boolean.cc b/cpp/src/arrow/compute/kernels/scalar_boolean.cc
index 89107120fa3..065e01b2780 100644
--- a/cpp/src/arrow/compute/kernels/scalar_boolean.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_boolean.cc
@@ -30,60 +30,60 @@ namespace compute {
 
 namespace {
 
-enum BitmapIndex { LEFT_VALID, LEFT_DATA, RIGHT_VALID, RIGHT_DATA };
-
 template <typename ComputeWord>
 void ComputeKleene(ComputeWord&& compute_word, KernelContext* ctx, const ArrayData& left,
                    const ArrayData& right, ArrayData* out) {
   DCHECK(left.null_count != 0 || right.null_count != 0)
       << "ComputeKleene is unnecessarily expensive for the non-null case";
 
-  Bitmap bitmaps[4];
-  bitmaps[LEFT_VALID] = {left.buffers[0], left.offset, left.length};
-  bitmaps[LEFT_DATA] = {left.buffers[1], left.offset, left.length};
+  Bitmap left_valid_bm{left.buffers[0], left.offset, left.length};
+  Bitmap left_data_bm{left.buffers[1], left.offset, left.length};
 
-  bitmaps[RIGHT_VALID] = {right.buffers[0], right.offset, right.length};
-  bitmaps[RIGHT_DATA] = {right.buffers[1], right.offset, right.length};
+  Bitmap right_valid_bm{right.buffers[0], right.offset, right.length};
+  Bitmap right_data_bm{right.buffers[1], right.offset, right.length};
 
-  auto out_validity = out->GetMutableValues<uint64_t>(0);
-  auto out_data = out->GetMutableValues<uint64_t>(1);
+  std::array<Bitmap, 2> out_bms{Bitmap(out->buffers[0], out->offset, out->length),
+                                Bitmap(out->buffers[1], out->offset, out->length)};
 
-  int64_t i = 0;
   auto apply = [&](uint64_t left_valid, uint64_t left_data, uint64_t right_valid,
-                   uint64_t right_data) {
+                   uint64_t right_data, uint64_t* out_validity, uint64_t* out_data) {
     auto left_true = left_valid & left_data;
     auto left_false = left_valid & ~left_data;
 
     auto right_true = right_valid & right_data;
     auto right_false = right_valid & ~right_data;
 
-    compute_word(left_true, left_false, right_true, right_false, &out_validity[i],
-                 &out_data[i]);
-    ++i;
+    compute_word(left_true, left_false, right_true, right_false, out_validity, out_data);
   };
 
   if (right.null_count == 0) {
-    // bitmaps[RIGHT_VALID] might be null; override to make it safe for Visit()
-    bitmaps[RIGHT_VALID] = bitmaps[RIGHT_DATA];
-    Bitmap::VisitWords(bitmaps, [&](std::array<uint64_t, 4> words) {
-      apply(words[LEFT_VALID], words[LEFT_DATA], ~uint64_t(0), words[RIGHT_DATA]);
-    });
+    std::array<Bitmap, 3> in_bms{left_valid_bm, left_data_bm, right_data_bm};
+    Bitmap::VisitWordsAndWrite(
+        in_bms, &out_bms,
+        [&](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>* out) {
+          apply(in[0], in[1], ~uint64_t(0), in[2], &(out->at(0)), &(out->at(1)));
+        });
     return;
   }
 
   if (left.null_count == 0) {
-    // bitmaps[LEFT_VALID] might be null; override to make it safe for Visit()
-    bitmaps[LEFT_VALID] = bitmaps[LEFT_DATA];
-    Bitmap::VisitWords(bitmaps, [&](std::array<uint64_t, 4> words) {
-      apply(~uint64_t(0), words[LEFT_DATA], words[RIGHT_VALID], words[RIGHT_DATA]);
-    });
+    std::array<Bitmap, 3> in_bms{left_data_bm, right_valid_bm, right_data_bm};
+    Bitmap::VisitWordsAndWrite(
+        in_bms, &out_bms,
+        [&](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>* out) {
+          apply(~uint64_t(0), in[0], in[1], in[2], &(out->at(0)), &(out->at(1)));
+        });
     return;
   }
 
   DCHECK(left.null_count != 0 && right.null_count != 0);
-  Bitmap::VisitWords(bitmaps, [&](std::array<uint64_t, 4> words) {
-    apply(words[LEFT_VALID], words[LEFT_DATA], words[RIGHT_VALID], words[RIGHT_DATA]);
-  });
+  std::array<Bitmap, 4> in_bms{left_valid_bm, left_data_bm, right_valid_bm,
+                               right_data_bm};
+  Bitmap::VisitWordsAndWrite(
+      in_bms, &out_bms,
+      [&](const std::array<uint64_t, 4>& in, std::array<uint64_t, 2>* out) {
+        apply(in[0], in[1], in[2], in[3], &(out->at(0)), &(out->at(1)));
+      });
 }
 
 inline BooleanScalar InvertScalar(const Scalar& in) {
diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
index 37db3391996..147b68f4baa 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
@@ -107,8 +107,6 @@ Status PromoteNullsVisitor(KernelContext* ctx, const Datum& cond_d, const Datum&
 
   std::array<Bitmap, 1> out_bitmaps{Bitmap{output->buffers[0], 0, cond.length}};
 
-  enum { C_VALID, C_DATA, L_VALID, R_VALID };
-
   switch (flag) {
     case COND_CONST | LEFT_CONST | RIGHT_CONST: {
       std::array<Bitmap, 1> bitmaps{cond_data};

From cc659e91cf53f1f268acac4498c7155c6e6e69da Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Sat, 12 Jun 2021 00:46:56 -0400
Subject: [PATCH 13/46] fix for kleene test failures with
 NullHandling::COMPUTED_PREALLOCATE and can_write_into_slices=true

---
 .../arrow/compute/kernels/scalar_boolean.cc   | 25 ++++++++++---------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_boolean.cc b/cpp/src/arrow/compute/kernels/scalar_boolean.cc
index 065e01b2780..cfcad558aed 100644
--- a/cpp/src/arrow/compute/kernels/scalar_boolean.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_boolean.cc
@@ -204,7 +204,9 @@ struct KleeneAndOp : Commutative<KleeneAndOp> {
                      ArrayData* out) {
     if (left.GetNullCount() == 0 && right.GetNullCount() == 0) {
       out->null_count = 0;
-      out->buffers[0] = nullptr;
+      //      out->buffers[0] = nullptr;
+      // Kleene kernels have validity bitmap pre-allocated. Therefore, set it to 1
+      std::memset(out->buffers[0]->mutable_data(), UINT8_MAX, out->buffers[0]->size());
       return AndOp::Call(ctx, left, right, out);
     }
     auto compute_word = [](uint64_t left_true, uint64_t left_false, uint64_t right_true,
@@ -307,7 +309,9 @@ struct KleeneOrOp : Commutative<KleeneOrOp> {
                      ArrayData* out) {
     if (left.GetNullCount() == 0 && right.GetNullCount() == 0) {
       out->null_count = 0;
-      out->buffers[0] = nullptr;
+      //      out->buffers[0] = nullptr;
+      // Kleene kernels have validity bitmap pre-allocated. Therefore, set it to 1
+      std::memset(out->buffers[0]->mutable_data(), UINT8_MAX, out->buffers[0]->size());
       return OrOp::Call(ctx, left, right, out);
     }
 
@@ -437,7 +441,9 @@ struct KleeneAndNotOp {
                      ArrayData* out) {
     if (left.GetNullCount() == 0 && right.GetNullCount() == 0) {
       out->null_count = 0;
-      out->buffers[0] = nullptr;
+      //      out->buffers[0] = nullptr;
+      // Kleene kernels have validity bitmap pre-allocated. Therefore, set it to 1
+      std::memset(out->buffers[0]->mutable_data(), UINT8_MAX, out->buffers[0]->size());
       return AndNotOp::Call(ctx, left, right, out);
     }
 
@@ -453,9 +459,8 @@ struct KleeneAndNotOp {
   }
 };
 
-void MakeFunction(std::string name, int arity, ArrayKernelExec exec,
+void MakeFunction(const std::string& name, int arity, ArrayKernelExec exec,
                   const FunctionDoc* doc, FunctionRegistry* registry,
-                  bool can_write_into_slices = true,
                   NullHandling::type null_handling = NullHandling::INTERSECTION) {
   auto func = std::make_shared<ScalarFunction>(name, Arity(arity), doc);
 
@@ -463,7 +468,6 @@ void MakeFunction(std::string name, int arity, ArrayKernelExec exec,
   std::vector<InputType> in_types(arity, InputType(boolean()));
   ScalarKernel kernel(std::move(in_types), boolean(), exec);
   kernel.null_handling = null_handling;
-  kernel.can_write_into_slices = can_write_into_slices;
 
   DCHECK_OK(func->AddKernel(kernel));
   DCHECK_OK(registry->AddFunction(std::move(func)));
@@ -551,14 +555,11 @@ void RegisterScalarBoolean(FunctionRegistry* registry) {
 
   // The Kleene logic kernels cannot write into sliced output bitmaps
   MakeFunction("and_kleene", 2, applicator::SimpleBinary<KleeneAndOp>, &and_kleene_doc,
-               registry,
-               /*can_write_into_slices=*/false, NullHandling::COMPUTED_PREALLOCATE);
+               registry, NullHandling::COMPUTED_PREALLOCATE);
   MakeFunction("and_not_kleene", 2, applicator::SimpleBinary<KleeneAndNotOp>,
-               &and_not_kleene_doc, registry,
-               /*can_write_into_slices=*/false, NullHandling::COMPUTED_PREALLOCATE);
+               &and_not_kleene_doc, registry, NullHandling::COMPUTED_PREALLOCATE);
   MakeFunction("or_kleene", 2, applicator::SimpleBinary<KleeneOrOp>, &or_kleene_doc,
-               registry,
-               /*can_write_into_slices=*/false, NullHandling::COMPUTED_PREALLOCATE);
+               registry, NullHandling::COMPUTED_PREALLOCATE);
 }
 
 }  // namespace internal

From a0b4b42df009fe8f4c37bff360335552c682719b Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Sun, 13 Jun 2021 20:30:26 -0400
Subject: [PATCH 14/46] adding set/clear bitmap methods

---
 .../arrow/compute/kernels/scalar_boolean.cc   | 12 +++---
 cpp/src/arrow/util/bitmap_ops.cc              | 40 +++++++++++++++++++
 cpp/src/arrow/util/bitmap_ops.h               |  8 ++++
 3 files changed, 54 insertions(+), 6 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_boolean.cc b/cpp/src/arrow/compute/kernels/scalar_boolean.cc
index cfcad558aed..cba07eb057f 100644
--- a/cpp/src/arrow/compute/kernels/scalar_boolean.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_boolean.cc
@@ -204,9 +204,9 @@ struct KleeneAndOp : Commutative<KleeneAndOp> {
                      ArrayData* out) {
     if (left.GetNullCount() == 0 && right.GetNullCount() == 0) {
       out->null_count = 0;
-      //      out->buffers[0] = nullptr;
       // Kleene kernels have validity bitmap pre-allocated. Therefore, set it to 1
-      std::memset(out->buffers[0]->mutable_data(), UINT8_MAX, out->buffers[0]->size());
+      arrow::internal::SetBitmap(out->buffers[0]->mutable_data(), out->offset,
+                                 out->length);
       return AndOp::Call(ctx, left, right, out);
     }
     auto compute_word = [](uint64_t left_true, uint64_t left_false, uint64_t right_true,
@@ -309,9 +309,9 @@ struct KleeneOrOp : Commutative<KleeneOrOp> {
                      ArrayData* out) {
     if (left.GetNullCount() == 0 && right.GetNullCount() == 0) {
       out->null_count = 0;
-      //      out->buffers[0] = nullptr;
       // Kleene kernels have validity bitmap pre-allocated. Therefore, set it to 1
-      std::memset(out->buffers[0]->mutable_data(), UINT8_MAX, out->buffers[0]->size());
+      arrow::internal::SetBitmap(out->buffers[0]->mutable_data(), out->offset,
+                                 out->length);
       return OrOp::Call(ctx, left, right, out);
     }
 
@@ -441,9 +441,9 @@ struct KleeneAndNotOp {
                      ArrayData* out) {
     if (left.GetNullCount() == 0 && right.GetNullCount() == 0) {
       out->null_count = 0;
-      //      out->buffers[0] = nullptr;
       // Kleene kernels have validity bitmap pre-allocated. Therefore, set it to 1
-      std::memset(out->buffers[0]->mutable_data(), UINT8_MAX, out->buffers[0]->size());
+      arrow::internal::SetBitmap(out->buffers[0]->mutable_data(), out->offset,
+                                 out->length);
       return AndNotOp::Call(ctx, left, right, out);
     }
 
diff --git a/cpp/src/arrow/util/bitmap_ops.cc b/cpp/src/arrow/util/bitmap_ops.cc
index 63c8b008f4a..f657bc8db95 100644
--- a/cpp/src/arrow/util/bitmap_ops.cc
+++ b/cpp/src/arrow/util/bitmap_ops.cc
@@ -383,5 +383,45 @@ void BitmapOrNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
   BitmapOp<OrNotOp>(left, left_offset, right, right_offset, length, out_offset, out);
 }
 
+template <bool value>
+void SetBitmapImpl(uint8_t* data, int64_t offset, int64_t length) {
+  int64_t prologue = std::min(((offset + 7) / 8) * 8 - offset, length);
+
+  if (prologue) { // align to a byte boundary
+    DCHECK_LT(prologue, 8);
+    BitmapWriter writer(data, offset, prologue);
+    for (auto i = 0; i < prologue; i++) {
+      value ? writer.Set() : writer.Clear();
+      writer.Next();
+    }
+    writer.Finish();
+    offset += prologue;
+    length -= prologue;
+  }
+
+  if (length) { // set values per byte
+    DCHECK_EQ(offset % 8, 0);
+    std::memset(data + offset / 8, value ? UINT8_MAX : 0, length / 8);
+    offset += ((length / 8) * 8);
+    length -= ((length / 8) * 8);
+  }
+
+  if (length) { // clean up
+    BitmapWriter writer(data, offset, length);
+    for (auto i = 0; i < length; i++) {
+      value ? writer.Set() : writer.Clear();
+      writer.Next();
+    }
+    writer.Finish();
+  }
+}
+
+void SetBitmap(uint8_t* data, int64_t offset, int64_t length) {
+  SetBitmapImpl<true>(data, offset, length);
+}
+
+void ClearBitmap(uint8_t* data, int64_t offset, int64_t length) {
+  SetBitmapImpl<false>(data, offset, length);
+}
 }  // namespace internal
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/bitmap_ops.h b/cpp/src/arrow/util/bitmap_ops.h
index 40a7797a239..ecc8a77f024 100644
--- a/cpp/src/arrow/util/bitmap_ops.h
+++ b/cpp/src/arrow/util/bitmap_ops.h
@@ -202,5 +202,13 @@ ARROW_EXPORT
 void BitmapOrNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
                  int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
 
+/// \brief Sets all bits in the bitmap to true
+ARROW_EXPORT
+void SetBitmap(uint8_t* data, int64_t offset, int64_t length);
+
+/// \brief Clears all bits in the bitmap (set to false)
+ARROW_EXPORT
+void ClearBitmap(uint8_t* data, int64_t offset, int64_t length);
+
 }  // namespace internal
 }  // namespace arrow

From aea1b0f5873bdd91f8e736cf91a5560b47b6b533 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Mon, 14 Jun 2021 00:28:58 -0400
Subject: [PATCH 15/46] lint fixes

---
 cpp/src/arrow/util/bitmap_ops.cc  | 7 ++++---
 cpp/src/arrow/util/bitmap_test.cc | 3 +--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/src/arrow/util/bitmap_ops.cc b/cpp/src/arrow/util/bitmap_ops.cc
index f657bc8db95..c1d94b6b588 100644
--- a/cpp/src/arrow/util/bitmap_ops.cc
+++ b/cpp/src/arrow/util/bitmap_ops.cc
@@ -387,7 +387,7 @@ template <bool value>
 void SetBitmapImpl(uint8_t* data, int64_t offset, int64_t length) {
   int64_t prologue = std::min(((offset + 7) / 8) * 8 - offset, length);
 
-  if (prologue) { // align to a byte boundary
+  if (prologue) {  // align to a byte boundary
     DCHECK_LT(prologue, 8);
     BitmapWriter writer(data, offset, prologue);
     for (auto i = 0; i < prologue; i++) {
@@ -399,14 +399,15 @@ void SetBitmapImpl(uint8_t* data, int64_t offset, int64_t length) {
     length -= prologue;
   }
 
-  if (length) { // set values per byte
+  if (length) {  // set values per byte
     DCHECK_EQ(offset % 8, 0);
     std::memset(data + offset / 8, value ? UINT8_MAX : 0, length / 8);
     offset += ((length / 8) * 8);
     length -= ((length / 8) * 8);
   }
 
-  if (length) { // clean up
+  if (length) {  // clean up
+    DCHECK_LT(prologue, 8);
     BitmapWriter writer(data, offset, length);
     for (auto i = 0; i < length; i++) {
       value ? writer.Set() : writer.Clear();
diff --git a/cpp/src/arrow/util/bitmap_test.cc b/cpp/src/arrow/util/bitmap_test.cc
index 4a782e3ce12..4c2958f6432 100644
--- a/cpp/src/arrow/util/bitmap_test.cc
+++ b/cpp/src/arrow/util/bitmap_test.cc
@@ -18,14 +18,13 @@
 #include "arrow/util/bitmap.h"
 
 #include <arrow/array/builder_primitive.h>
+#include <arrow/buffer.h>
 #include <arrow/testing/gtest_util.h>
 #include <gtest/gtest.h>
 
 #include <numeric>
 #include <random>
 
-#include "arrow/buffer.h"
-
 namespace arrow {
 namespace internal {
 

From 6f30a986d6ebb8c5950b04a066e657076b5671f6 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Mon, 14 Jun 2021 13:57:46 -0400
Subject: [PATCH 16/46] adding SpliceWord and refactoring code

---
 cpp/src/arrow/util/CMakeLists.txt   |   1 -
 cpp/src/arrow/util/bit_util.h       |  16 +++
 cpp/src/arrow/util/bit_util_test.cc | 195 ++++++++++++++++++++++++++
 cpp/src/arrow/util/bitmap_ops.cc    |  48 ++++---
 cpp/src/arrow/util/bitmap_test.cc   | 203 ----------------------------
 5 files changed, 242 insertions(+), 221 deletions(-)
 delete mode 100644 cpp/src/arrow/util/bitmap_test.cc

diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt
index 571834dfca6..e26a17120cd 100644
--- a/cpp/src/arrow/util/CMakeLists.txt
+++ b/cpp/src/arrow/util/CMakeLists.txt
@@ -44,7 +44,6 @@ add_arrow_test(utility-test
                async_generator_test.cc
                bit_block_counter_test.cc
                bit_util_test.cc
-               bitmap_test.cc
                cache_test.cc
                checked_cast_test.cc
                compression_test.cc
diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h
index 01845791faa..60e5ef6543f 100644
--- a/cpp/src/arrow/util/bit_util.h
+++ b/cpp/src/arrow/util/bit_util.h
@@ -316,5 +316,21 @@ static inline void SetBitTo(uint8_t* bits, int64_t i, bool bit_is_set) {
 ARROW_EXPORT
 void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length, bool bits_are_set);
 
+template <typename Word>
+constexpr Word WordBitMask(int i) {
+  return (static_cast<Word>(1) << i) - 1;
+}
+
+/// \brief Create a word with the low `n` bits taken from `low` and the remaining
+/// high `8 * sizeof(Word) - n` bits taken from `high` (bit i is the i-th LSB):
+/// Word ret
+/// for (i = 0; i < 8 * sizeof(Word); i++){
+///     ret[i]= i < n ? low[i]: high[i];
+/// }
+template <typename Word>
+constexpr Word SpliceWord(int n, Word low, Word high) {
+  return (high & ~WordBitMask<Word>(n)) | (low & WordBitMask<Word>(n));
+}
+
 }  // namespace BitUtil
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc
index e5a5e4c39be..0fe39fa804b 100644
--- a/cpp/src/arrow/util/bit_util_test.cc
+++ b/cpp/src/arrow/util/bit_util_test.cc
@@ -1975,6 +1975,37 @@ TEST(BitUtil, BitsetStack) {
   ASSERT_EQ(stack.TopSize(), 0);
 }
 
+template <typename Word>
+void CheckSplice(int n, Word low, Word high) {
+  std::bitset<sizeof(Word) * 8> ret;
+  for (size_t i = 0; i < ret.size(); i++) {
+    ret[i] = i < static_cast<size_t>(n)
+                 ? BitUtil::GetBit(reinterpret_cast<uint8_t*>(&low), i)
+                 : BitUtil::GetBit(reinterpret_cast<uint8_t*>(&high), i);
+  }
+
+  ASSERT_EQ(static_cast<Word>(ret.to_ulong()), BitUtil::SpliceWord(n, low, high));
+}
+
+TEST(SpliceWord, SpliceWord) {
+  uint64_t low = 123456789, high = 987654321;
+  // Keep n within [0, bit width): a negative or >= width shift count is undefined.
+  CheckSplice<uint8_t>(0, static_cast<uint8_t>(low), static_cast<uint8_t>(high));
+  CheckSplice<uint8_t>(7, static_cast<uint8_t>(low), static_cast<uint8_t>(high));
+  CheckSplice<uint8_t>(8 * sizeof(uint8_t) / 3, static_cast<uint8_t>(low),
+                       static_cast<uint8_t>(high));
+
+  CheckSplice<uint32_t>(0, static_cast<uint32_t>(low), static_cast<uint32_t>(high));
+  CheckSplice<uint32_t>(8 * sizeof(uint32_t) - 1, static_cast<uint32_t>(low),
+                        static_cast<uint32_t>(high));
+  CheckSplice<uint32_t>(8 * sizeof(uint32_t) / 3, static_cast<uint32_t>(low),
+                        static_cast<uint32_t>(high));
+
+  CheckSplice(0, low, high);
+  CheckSplice(63, low, high);
+  CheckSplice(8 * sizeof(uint64_t) / 3, low, high);
+}
+
 // test the basic assumption of word level Bitmap::Visit
 TEST(Bitmap, ShiftingWordsOptimization) {
   // single word
@@ -2156,5 +2187,169 @@ TEST(Bitmap, VisitWordsAnd) {
   }
 }
 
+void random_bool_vector(std::vector<bool>& vec, int64_t size, double p = 0.5) {
+  vec.reserve(size);
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::bernoulli_distribution d(p);
+
+  for (int n = 0; n < size; ++n) {
+    vec.push_back(d(gen));
+  }
+}
+
+std::string VectorToString(const std::vector<bool>& v) {
+  std::string out(v.empty() ? 0 : v.size() + (v.size() - 1) / 8, ' ');
+  for (size_t i = 0; i < v.size(); ++i) {
+    out[i + (i / 8)] = v[i] ? '1' : '0';  // one ' ' separator after every 8 bits
+  }
+  return out;
+}
+
+void VerifyBoolVectorAndBitmap(const Bitmap& bitmap, const std::vector<bool>& expected) {
+  arrow::BooleanBuilder boolean_builder;
+  ASSERT_OK(boolean_builder.AppendValues(expected));
+  ASSERT_OK_AND_ASSIGN(auto arr, boolean_builder.Finish());
+
+  ASSERT_TRUE(BitmapEquals(bitmap.buffer()->data(), bitmap.offset(),
+                           arr->data()->buffers[1]->data(), 0, expected.size()))
+      << "exp: " << VectorToString(expected) << "\ngot: " << bitmap.ToString();
+}
+
+class TestBitmapVisitAndWriteOutputNoOffset : public ::testing::TestWithParam<int32_t> {};
+
+TEST_P(TestBitmapVisitAndWriteOutputNoOffset, Test1) {
+  auto part = GetParam();
+  int64_t bits = 4 * part;
+  std::vector<bool> data;
+  random_bool_vector(data, bits);
+
+  arrow::BooleanBuilder boolean_builder;
+  ASSERT_OK(boolean_builder.AppendValues(data));
+  ASSERT_OK_AND_ASSIGN(auto arrow_data, boolean_builder.Finish());
+
+  std::shared_ptr<Buffer>& arrow_buffer = arrow_data->data()->buffers[1];
+
+  Bitmap bm0(arrow_buffer, 0, part);
+  Bitmap bm1 = bm0.Slice(part * 1, part);  // this goes beyond bm0's len
+  Bitmap bm2 = bm0.Slice(part * 2, part);  // this goes beyond bm0's len
+
+  std::array<Bitmap, 2> out_bms;
+  ASSERT_OK_AND_ASSIGN(auto out0, AllocateBitmap(part));
+  ASSERT_OK_AND_ASSIGN(auto out1, AllocateBitmap(part));
+  out_bms[0] = Bitmap(out0, 0, part);
+  out_bms[1] = Bitmap(out1, 0, part);
+
+  std::vector<bool> v0(data.begin(), data.begin() + part);
+  std::vector<bool> v1(data.begin() + part * 1, data.begin() + part * 2);
+  std::vector<bool> v2(data.begin() + part * 2, data.begin() + part * 3);
+
+  // out0 = bm0 & bm1, out1= bm0 | bm2
+  std::array<Bitmap, 3> in_bms{bm0, bm1, bm2};
+  Bitmap::VisitWordsAndWrite(
+      in_bms, &out_bms,
+      [](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>* out) {
+        out->at(0) = in[0] & in[1];
+        out->at(1) = in[0] | in[2];
+      });
+
+  std::vector<bool> out_v0(part);
+  std::vector<bool> out_v1(part);
+  // out_v0 = v0 & v1
+  std::transform(v0.begin(), v0.end(), v1.begin(), out_v0.begin(),
+                 std::logical_and<bool>());
+  // out_v1 = v0 | v2
+  std::transform(v0.begin(), v0.end(), v2.begin(), out_v1.begin(),
+                 std::logical_or<bool>());
+
+  //  std::cout << "v0: " << VectorToString(v0) << "\n"
+  //            << "b0: " << bm0.ToString() << "\n"
+  //            << "v1: " << VectorToString(v1) << "\n"
+  //            << "b1: " << bm1.ToString() << "\n"
+  //            << "v2: " << VectorToString(v2) << "\n"
+  //            << "b2: " << bm2.ToString() << "\n";
+
+  VerifyBoolVectorAndBitmap(out_bms[0], out_v0);
+  VerifyBoolVectorAndBitmap(out_bms[1], out_v1);
+}
+
+INSTANTIATE_TEST_SUITE_P(VisitWriteGeneral, TestBitmapVisitAndWriteOutputNoOffset,
+                         testing::Values(199, 256, 1000));
+
+INSTANTIATE_TEST_SUITE_P(VisitWriteEdgeCases, TestBitmapVisitAndWriteOutputNoOffset,
+                         testing::Values(5, 13, 21, 29, 37, 41, 51, 59, 64, 97));
+
+INSTANTIATE_TEST_SUITE_P(VisitWriteEdgeCases2, TestBitmapVisitAndWriteOutputNoOffset,
+                         testing::Values(8, 16, 24, 32, 40, 48, 56, 64));
+
+class TestBitmapVisitAndWriteOutputWithOffset : public ::testing::TestWithParam<int32_t> {
+};
+
+TEST_P(TestBitmapVisitAndWriteOutputWithOffset, Test2) {
+  auto part = GetParam();
+  int64_t bits = part * 4;
+  std::vector<bool> data;
+  random_bool_vector(data, bits);
+
+  arrow::BooleanBuilder boolean_builder;
+  ASSERT_OK(boolean_builder.AppendValues(data));
+  ASSERT_OK_AND_ASSIGN(auto arrow_data, boolean_builder.Finish());
+
+  std::shared_ptr<Buffer>& arrow_buffer = arrow_data->data()->buffers[1];
+
+  Bitmap bm0(arrow_buffer, 0, part);
+  Bitmap bm1(arrow_buffer, part * 1, part);
+  Bitmap bm2(arrow_buffer, part * 2, part);
+
+  std::array<Bitmap, 2> out_bms;
+  ASSERT_OK_AND_ASSIGN(auto out, AllocateBitmap(part * 4));
+  out_bms[0] = Bitmap(out, part, part);
+  out_bms[1] = Bitmap(out, part * 2, part);
+
+  std::vector<bool> v0(data.begin(), data.begin() + part);
+  std::vector<bool> v1(data.begin() + part * 1, data.begin() + part * 2);
+  std::vector<bool> v2(data.begin() + part * 2, data.begin() + part * 3);
+
+  //  std::cout << "v0: " << VectorToString(v0) << "\n"
+  //            << "b0: " << bm0.ToString() << "\n"
+  //            << "v1: " << VectorToString(v1) << "\n"
+  //            << "b1: " << bm1.ToString() << "\n"
+  //            << "v2: " << VectorToString(v2) << "\n"
+  //            << "b2: " << bm2.ToString() << "\n";
+
+  std::vector<bool> out_v0(part);
+  std::vector<bool> out_v1(part);
+  // out_v0 = v0 & v1
+  std::transform(v0.begin(), v0.end(), v1.begin(), out_v0.begin(),
+                 std::logical_and<bool>());
+  // out_v1 = v0 | v2
+  std::transform(v0.begin(), v0.end(), v2.begin(), out_v1.begin(),
+                 std::logical_or<bool>());
+
+  //  std::cout << "out0: " << VectorToString(out_v0) << "\n"
+  //            << "out1: " << VectorToString(out_v1) << "\n";
+
+  // out0 = bm0 & bm1, out1= bm0 | bm2
+  std::array<Bitmap, 3> in_bms{bm0, bm1, bm2};
+  Bitmap::VisitWordsAndWrite(
+      in_bms, &out_bms,
+      [](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>* out) {
+        out->at(0) = in[0] & in[1];
+        out->at(1) = in[0] | in[2];
+      });
+
+  VerifyBoolVectorAndBitmap(out_bms[0], out_v0);
+  VerifyBoolVectorAndBitmap(out_bms[1], out_v1);
+}
+
+INSTANTIATE_TEST_SUITE_P(VisitWriteGeneral, TestBitmapVisitAndWriteOutputWithOffset,
+                         testing::Values(199, 256, 1000));
+
+INSTANTIATE_TEST_SUITE_P(VisitWriteEdgeCases, TestBitmapVisitAndWriteOutputWithOffset,
+                         testing::Values(7, 15, 23, 31, 39, 47, 55, 63, 73, 97));
+
+INSTANTIATE_TEST_SUITE_P(VisitWriteEdgeCases2, TestBitmapVisitAndWriteOutputWithOffset,
+                         testing::Values(8, 16, 24, 32, 40, 48, 56, 64));
+
 }  // namespace internal
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/bitmap_ops.cc b/cpp/src/arrow/util/bitmap_ops.cc
index c1d94b6b588..fce23de35d8 100644
--- a/cpp/src/arrow/util/bitmap_ops.cc
+++ b/cpp/src/arrow/util/bitmap_ops.cc
@@ -385,35 +385,49 @@ void BitmapOrNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
 
 template <bool value>
 void SetBitmapImpl(uint8_t* data, int64_t offset, int64_t length) {
-  int64_t prologue = std::min(((offset + 7) / 8) * 8 - offset, length);
+  //                 offset  length
+  // data              |<------------->|
+  //   |--------|...|--------|...|--------|
+  //                   |<--->|   |<--->|
+  //                     pro       epi
+  if (ARROW_PREDICT_FALSE(length == 0)) {
+    return;
+  }
+
+  constexpr uint8_t set_byte = value ? UINT8_MAX : 0;
+
+  int prologue = static_cast<int>(((offset + 7) / 8) * 8 - offset);
+  DCHECK_LT(prologue, 8);
+
+  if (length < prologue) {  // special case where a mask is required
+    //             offset length
+    // data             |<->|
+    //   |--------|...|--------|...
+    //             mask |111|
+    //                  |<---->|
+    //                     pro
+    uint8_t mask = BitUtil::kPrecedingBitmask[8 - prologue] ^
+                   BitUtil::kPrecedingBitmask[8 - prologue + length];
+    data[offset / 8] = value ? data[offset / 8] | mask : data[offset / 8] & ~mask;
+    return;
+  }
 
   if (prologue) {  // align to a byte boundary
-    DCHECK_LT(prologue, 8);
-    BitmapWriter writer(data, offset, prologue);
-    for (auto i = 0; i < prologue; i++) {
-      value ? writer.Set() : writer.Clear();
-      writer.Next();
-    }
-    writer.Finish();
+    data[offset / 8] = BitUtil::SpliceWord(8 - prologue, data[offset / 8], set_byte);
     offset += prologue;
     length -= prologue;
   }
 
-  if (length) {  // set values per byte
+  if (length / 8) {  // set values per byte
     DCHECK_EQ(offset % 8, 0);
-    std::memset(data + offset / 8, value ? UINT8_MAX : 0, length / 8);
+    std::memset(data + offset / 8, set_byte, length / 8);
     offset += ((length / 8) * 8);
     length -= ((length / 8) * 8);
   }
 
   if (length) {  // clean up
-    DCHECK_LT(prologue, 8);
-    BitmapWriter writer(data, offset, length);
-    for (auto i = 0; i < length; i++) {
-      value ? writer.Set() : writer.Clear();
-      writer.Next();
-    }
-    writer.Finish();
+    DCHECK_LT(length, 8);
+    data[offset / 8] = BitUtil::SpliceWord(length, set_byte, data[offset / 8]);
   }
 }
 
diff --git a/cpp/src/arrow/util/bitmap_test.cc b/cpp/src/arrow/util/bitmap_test.cc
deleted file mode 100644
index 4c2958f6432..00000000000
--- a/cpp/src/arrow/util/bitmap_test.cc
+++ /dev/null
@@ -1,203 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/util/bitmap.h"
-
-#include <arrow/array/builder_primitive.h>
-#include <arrow/buffer.h>
-#include <arrow/testing/gtest_util.h>
-#include <gtest/gtest.h>
-
-#include <numeric>
-#include <random>
-
-namespace arrow {
-namespace internal {
-
-void random_bool_vector(std::vector<bool>& vec, int64_t size, double p = 0.5) {
-  vec.reserve(size);
-  std::random_device rd;
-  std::mt19937 gen(rd());
-  std::bernoulli_distribution d(p);
-
-  for (int n = 0; n < size; ++n) {
-    vec.push_back(d(gen));
-  }
-}
-
-std::string VectorToString(const std::vector<bool>& v) {
-  std::string out(v.size() + +((v.size() - 1) / 8), ' ');
-  for (size_t i = 0; i < v.size(); ++i) {
-    out[i + (i / 8)] = v[i] ? '1' : '0';
-  }
-  return out;
-}
-
-void VerifyBoolOutput(const Bitmap& bitmap, const std::vector<bool>& expected) {
-  arrow::BooleanBuilder boolean_builder;
-  ASSERT_OK(boolean_builder.AppendValues(expected));
-  ASSERT_OK_AND_ASSIGN(auto arr, boolean_builder.Finish());
-
-  ASSERT_TRUE(BitmapEquals(bitmap.buffer()->data(), bitmap.offset(),
-                           arr->data()->buffers[1]->data(), 0, expected.size()))
-      << "exp: " << VectorToString(expected) << "\ngot: " << bitmap.ToString();
-}
-
-void RunOutputNoOffset(int part) {
-  int64_t bits = 4 * part;
-  std::vector<bool> data;
-  random_bool_vector(data, bits);
-
-  arrow::BooleanBuilder boolean_builder;
-  ASSERT_OK(boolean_builder.AppendValues(data));
-  ASSERT_OK_AND_ASSIGN(auto arrow_data, boolean_builder.Finish());
-
-  std::shared_ptr<Buffer>& arrow_buffer = arrow_data->data()->buffers[1];
-
-  Bitmap bm0(arrow_buffer, 0, part);
-  Bitmap bm1 = bm0.Slice(part * 1, part);  // this goes beyond bm0's len
-  Bitmap bm2 = bm0.Slice(part * 2, part);  // this goes beyond bm0's len
-
-  std::array<Bitmap, 2> out_bms;
-  ASSERT_OK_AND_ASSIGN(auto out0, AllocateBitmap(part));
-  ASSERT_OK_AND_ASSIGN(auto out1, AllocateBitmap(part));
-  out_bms[0] = Bitmap(out0, 0, part);
-  out_bms[1] = Bitmap(out1, 0, part);
-
-  std::vector<bool> v0(data.begin(), data.begin() + part);
-  std::vector<bool> v1(data.begin() + part * 1, data.begin() + part * 2);
-  std::vector<bool> v2(data.begin() + part * 2, data.begin() + part * 3);
-
-  // out0 = bm0 & bm1, out1= bm0 | bm2
-  std::array<Bitmap, 3> in_bms{bm0, bm1, bm2};
-  Bitmap::VisitWordsAndWrite(
-      in_bms, &out_bms,
-      [](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>* out) {
-        out->at(0) = in[0] & in[1];
-        out->at(1) = in[0] | in[2];
-      });
-
-  std::vector<bool> out_v0(part);
-  std::vector<bool> out_v1(part);
-  // v3 = v0 & v1
-  std::transform(v0.begin(), v0.end(), v1.begin(), out_v0.begin(),
-                 std::logical_and<bool>());
-  // v3 |= v2
-  std::transform(v0.begin(), v0.end(), v2.begin(), out_v1.begin(),
-                 std::logical_or<bool>());
-
-  //  std::cout << "v0: " << VectorToString(v0)<< "\n";
-  //  std::cout << "b0: " << bm0.ToString()<< "\n";
-  //  std::cout << "v1: " << VectorToString(v1)<< "\n";
-  //  std::cout << "b1: " << bm1.ToString()<< "\n";
-  //  std::cout << "v2: " << VectorToString(v2) << "\n";
-  //  std::cout << "b2: " << bm2.ToString() << "\n";
-
-  VerifyBoolOutput(out_bms[0], out_v0);
-  VerifyBoolOutput(out_bms[1], out_v1);
-}
-
-void RunOutputWithOffset(int64_t part) {
-  int64_t bits = part * 4;
-  std::vector<bool> data;
-  random_bool_vector(data, bits);
-
-  arrow::BooleanBuilder boolean_builder;
-  ASSERT_OK(boolean_builder.AppendValues(data));
-  ASSERT_OK_AND_ASSIGN(auto arrow_data, boolean_builder.Finish());
-
-  std::shared_ptr<Buffer>& arrow_buffer = arrow_data->data()->buffers[1];
-
-  Bitmap bm0(arrow_buffer, 0, part);
-  Bitmap bm1(arrow_buffer, part * 1, part);
-  Bitmap bm2(arrow_buffer, part * 2, part);
-
-  std::array<Bitmap, 2> out_bms;
-  ASSERT_OK_AND_ASSIGN(auto out, AllocateBitmap(part * 4));
-  out_bms[0] = Bitmap(out, part, part);
-  out_bms[1] = Bitmap(out, part * 2, part);
-
-  std::vector<bool> v0(data.begin(), data.begin() + part);
-  std::vector<bool> v1(data.begin() + part * 1, data.begin() + part * 2);
-  std::vector<bool> v2(data.begin() + part * 2, data.begin() + part * 3);
-
-  //  std::cout << "v0: " << VectorToString(v0) << "\n";
-  //  std::cout << "b0: " << bm0.ToString() << "\n";
-  //  std::cout << "v1: " << VectorToString(v1) << "\n";
-  //  std::cout << "b1: " << bm1.ToString() << "\n";
-  //  std::cout << "v2: " << VectorToString(v2) << "\n";
-  //  std::cout << "b2: " << bm2.ToString() << "\n";
-
-  std::vector<bool> out_v0(part);
-  std::vector<bool> out_v1(part);
-  // v3 = v0 & v1
-  std::transform(v0.begin(), v0.end(), v1.begin(), out_v0.begin(),
-                 std::logical_and<bool>());
-  // v3 |= v2
-  std::transform(v0.begin(), v0.end(), v2.begin(), out_v1.begin(),
-                 std::logical_or<bool>());
-
-  //  std::cout << "out0: " << VectorToString(out_v0) << "\n";
-  //  std::cout << "out1: " << VectorToString(out_v1) << "\n";
-
-  // out0 = bm0 & bm1, out1= bm0 | bm2
-  std::array<Bitmap, 3> in_bms{bm0, bm1, bm2};
-  Bitmap::VisitWordsAndWrite(
-      in_bms, &out_bms,
-      [](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>* out) {
-        out->at(0) = in[0] & in[1];
-        out->at(1) = in[0] | in[2];
-      });
-
-  VerifyBoolOutput(out_bms[0], out_v0);
-  VerifyBoolOutput(out_bms[1], out_v1);
-}
-
-class TestBitmapVisitOutputNoOffset : public ::testing::TestWithParam<int32_t> {};
-
-TEST_P(TestBitmapVisitOutputNoOffset, Test1) {
-  auto part = GetParam();
-  RunOutputNoOffset(part);
-}
-
-INSTANTIATE_TEST_SUITE_P(General, TestBitmapVisitOutputNoOffset,
-                         testing::Values(199, 256, 1000));
-
-INSTANTIATE_TEST_SUITE_P(EdgeCases, TestBitmapVisitOutputNoOffset,
-                         testing::Values(5, 13, 21, 29, 37, 41, 51, 59, 64, 97));
-
-INSTANTIATE_TEST_SUITE_P(EdgeCases2, TestBitmapVisitOutputNoOffset,
-                         testing::Values(8, 16, 24, 32, 40, 48, 56, 64));
-
-class TestBitmapVisitOutputWithOffset : public ::testing::TestWithParam<int32_t> {};
-
-TEST_P(TestBitmapVisitOutputWithOffset, Test2) {
-  auto part = GetParam();
-  RunOutputWithOffset(part);
-}
-
-INSTANTIATE_TEST_SUITE_P(General, TestBitmapVisitOutputWithOffset,
-                         testing::Values(199, 256, 1000));
-
-INSTANTIATE_TEST_SUITE_P(EdgeCases, TestBitmapVisitOutputWithOffset,
-                         testing::Values(7, 15, 23, 31, 39, 47, 55, 63, 73, 97));
-
-INSTANTIATE_TEST_SUITE_P(EdgeCases2, TestBitmapVisitOutputWithOffset,
-                         testing::Values(8, 16, 24, 32, 40, 48, 56, 64));
-
-}  // namespace internal
-}  // namespace arrow

From 40ba1c7628a41f5e6381278c6bf42c23276f0b76 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Mon, 14 Jun 2021 14:25:08 -0400
Subject: [PATCH 17/46] refactor

---
 .../arrow/compute/kernels/scalar_boolean.cc   |  9 +--
 cpp/src/arrow/util/bit_util.cc                | 55 +++++++++++++++++++
 cpp/src/arrow/util/bit_util.h                 |  8 +++
 cpp/src/arrow/util/bitmap_ops.cc              | 55 -------------------
 cpp/src/arrow/util/bitmap_ops.h               |  8 ---
 5 files changed, 66 insertions(+), 69 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_boolean.cc b/cpp/src/arrow/compute/kernels/scalar_boolean.cc
index cba07eb057f..6de4ef16031 100644
--- a/cpp/src/arrow/compute/kernels/scalar_boolean.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_boolean.cc
@@ -205,8 +205,7 @@ struct KleeneAndOp : Commutative<KleeneAndOp> {
     if (left.GetNullCount() == 0 && right.GetNullCount() == 0) {
       out->null_count = 0;
       // Kleene kernels have validity bitmap pre-allocated. Therefore, set it to 1
-      arrow::internal::SetBitmap(out->buffers[0]->mutable_data(), out->offset,
-                                 out->length);
+      BitUtil::SetBitmap(out->buffers[0]->mutable_data(), out->offset, out->length);
       return AndOp::Call(ctx, left, right, out);
     }
     auto compute_word = [](uint64_t left_true, uint64_t left_false, uint64_t right_true,
@@ -310,8 +309,7 @@ struct KleeneOrOp : Commutative<KleeneOrOp> {
     if (left.GetNullCount() == 0 && right.GetNullCount() == 0) {
       out->null_count = 0;
       // Kleene kernels have validity bitmap pre-allocated. Therefore, set it to 1
-      arrow::internal::SetBitmap(out->buffers[0]->mutable_data(), out->offset,
-                                 out->length);
+      BitUtil::SetBitmap(out->buffers[0]->mutable_data(), out->offset, out->length);
       return OrOp::Call(ctx, left, right, out);
     }
 
@@ -442,8 +440,7 @@ struct KleeneAndNotOp {
     if (left.GetNullCount() == 0 && right.GetNullCount() == 0) {
       out->null_count = 0;
       // Kleene kernels have validity bitmap pre-allocated. Therefore, set it to 1
-      arrow::internal::SetBitmap(out->buffers[0]->mutable_data(), out->offset,
-                                 out->length);
+      BitUtil::SetBitmap(out->buffers[0]->mutable_data(), out->offset, out->length);
       return AndNotOp::Call(ctx, left, right, out);
     }
 
diff --git a/cpp/src/arrow/util/bit_util.cc b/cpp/src/arrow/util/bit_util.cc
index 6e23678ddf9..9c0ef6bc9bf 100644
--- a/cpp/src/arrow/util/bit_util.cc
+++ b/cpp/src/arrow/util/bit_util.cc
@@ -20,6 +20,8 @@
 #include <cstdint>
 #include <cstring>
 
+#include "arrow/util/logging.h"
+
 namespace arrow {
 namespace BitUtil {
 
@@ -67,5 +69,58 @@ void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length, bool bits_ar
   bits[bytes_end - 1] |= static_cast<uint8_t>(fill_byte & ~last_byte_mask);
 }
 
+// Set (value == true) or clear (value == false) the `length` bits of `data` that
+// start at bit `offset`. Bits outside [offset, offset + length) are left untouched.
+template <bool value>
+void SetBitmapImpl(uint8_t* data, int64_t offset, int64_t length) {
+  //                 offset  length
+  // data              |<------------->|
+  //   |--------|...|--------|...|--------|
+  //                   |<--->|   |<--->|
+  //                     pro       epi
+  if (ARROW_PREDICT_FALSE(length == 0)) {
+    return;
+  }
+
+  constexpr uint8_t set_byte = value ? UINT8_MAX : 0;
+  // bits from `offset` up to the next byte boundary (0 when already aligned)
+  int prologue = static_cast<int>(((offset + 7) / 8) * 8 - offset);
+  DCHECK_LT(prologue, 8);
+
+  if (length < prologue) {  // range ends before the byte boundary: mask needed
+    //             offset length
+    // data             |<->|
+    //   |--------|...|--------|...
+    //         mask --> |111|
+    uint8_t mask = BitUtil::kPrecedingBitmask[8 - prologue] ^
+                   BitUtil::kPrecedingBitmask[8 - prologue + length];
+    uint8_t& byte = data[offset / 8];  // OR sets the bits, AND-NOT clears them
+    byte = static_cast<uint8_t>(value ? (byte | mask) : (byte & ~mask));
+    return;
+  }
+
+  if (prologue > 0) {  // keep the low (8 - prologue) bits, fill the rest of the byte
+    data[offset / 8] = BitUtil::SpliceWord(8 - prologue, data[offset / 8], set_byte);
+    offset += prologue;
+    length -= prologue;
+  }
+  DCHECK_EQ(offset % 8, 0);
+  std::memset(data + offset / 8, set_byte, length / 8);  // whole bytes
+  offset += ((length / 8) * 8);
+  length -= ((length / 8) * 8);
+  DCHECK_LT(length, 8);
+  if (length > 0) {  // epilogue; guarded so no byte past the range is touched
+    data[offset / 8] = BitUtil::SpliceWord(length, set_byte, data[offset / 8]);
+  }
+}
+
+void SetBitmap(uint8_t* data, int64_t offset, int64_t length) {
+  SetBitmapImpl<true>(data, offset, length);
+}
+
+void ClearBitmap(uint8_t* data, int64_t offset, int64_t length) {
+  SetBitmapImpl<false>(data, offset, length);
+}
+
 }  // namespace BitUtil
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h
index 60e5ef6543f..f0a556e21b1 100644
--- a/cpp/src/arrow/util/bit_util.h
+++ b/cpp/src/arrow/util/bit_util.h
@@ -316,6 +316,14 @@ static inline void SetBitTo(uint8_t* bits, int64_t i, bool bit_is_set) {
 ARROW_EXPORT
 void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length, bool bits_are_set);
 
+/// \brief Sets `length` bits of `data`, starting at bit `offset`, to true
+ARROW_EXPORT
+void SetBitmap(uint8_t* data, int64_t offset, int64_t length);
+
+/// \brief Clears `length` bits of `data`, starting at bit `offset` (set to false)
+ARROW_EXPORT
+void ClearBitmap(uint8_t* data, int64_t offset, int64_t length);
+
 template <typename Word>
 constexpr Word WordBitMask(int i) {
   return (static_cast<Word>(1) << i) - 1;
diff --git a/cpp/src/arrow/util/bitmap_ops.cc b/cpp/src/arrow/util/bitmap_ops.cc
index fce23de35d8..63c8b008f4a 100644
--- a/cpp/src/arrow/util/bitmap_ops.cc
+++ b/cpp/src/arrow/util/bitmap_ops.cc
@@ -383,60 +383,5 @@ void BitmapOrNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
   BitmapOp<OrNotOp>(left, left_offset, right, right_offset, length, out_offset, out);
 }
 
-template <bool value>
-void SetBitmapImpl(uint8_t* data, int64_t offset, int64_t length) {
-  //                 offset  length
-  // data              |<------------->|
-  //   |--------|...|--------|...|--------|
-  //                   |<--->|   |<--->|
-  //                     pro       epi
-  if (ARROW_PREDICT_FALSE(length == 0)) {
-    return;
-  }
-
-  constexpr uint8_t set_byte = value ? UINT8_MAX : 0;
-
-  int prologue = static_cast<int>(((offset + 7) / 8) * 8 - offset);
-  DCHECK_LT(prologue, 8);
-
-  if (length < prologue) {  // special case where a mask is required
-    //             offset length
-    // data             |<->|
-    //   |--------|...|--------|...
-    //             mask |111|
-    //                  |<---->|
-    //                     pro
-    uint8_t mask = BitUtil::kPrecedingBitmask[8 - prologue] ^
-                   BitUtil::kPrecedingBitmask[8 - prologue + length];
-    data[offset / 8] |= mask;
-    return;
-  }
-
-  if (prologue) {  // align to a byte boundary
-    data[offset / 8] = BitUtil::SpliceWord(offset, data[offset / 8], set_byte);
-    offset += prologue;
-    length -= prologue;
-  }
-
-  if (length / 8) {  // set values per byte
-    DCHECK_EQ(offset % 8, 0);
-    std::memset(data + offset / 8, set_byte, length / 8);
-    offset += ((length / 8) * 8);
-    length -= ((length / 8) * 8);
-  }
-
-  if (length) {  // clean up
-    DCHECK_LT(length, 8);
-    data[offset / 8] = BitUtil::SpliceWord(length, set_byte, data[offset / 8]);
-  }
-}
-
-void SetBitmap(uint8_t* data, int64_t offset, int64_t length) {
-  SetBitmapImpl<true>(data, offset, length);
-}
-
-void ClearBitmap(uint8_t* data, int64_t offset, int64_t length) {
-  SetBitmapImpl<false>(data, offset, length);
-}
 }  // namespace internal
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/bitmap_ops.h b/cpp/src/arrow/util/bitmap_ops.h
index ecc8a77f024..40a7797a239 100644
--- a/cpp/src/arrow/util/bitmap_ops.h
+++ b/cpp/src/arrow/util/bitmap_ops.h
@@ -202,13 +202,5 @@ ARROW_EXPORT
 void BitmapOrNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
                  int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
 
-/// \brief Sets all bits in the bitmap to true
-ARROW_EXPORT
-void SetBitmap(uint8_t* data, int64_t offset, int64_t length);
-
-/// \brief Clears all bits in the bitmap (set to false)
-ARROW_EXPORT
-void ClearBitmap(uint8_t* data, int64_t offset, int64_t length);
-
 }  // namespace internal
 }  // namespace arrow

From ae38c47d5e67c08c3ac99ec6c9e16a3908afa1ee Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Wed, 16 Jun 2021 18:06:18 -0400
Subject: [PATCH 18/46] adding benchmark

---
 cpp/src/arrow/util/CMakeLists.txt             |   1 +
 cpp/src/arrow/util/bit_util_benchmark_temp.cc | 110 ++++++++++++++++++
 2 files changed, 111 insertions(+)
 create mode 100644 cpp/src/arrow/util/bit_util_benchmark_temp.cc

diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt
index e26a17120cd..1851a9afa5e 100644
--- a/cpp/src/arrow/util/CMakeLists.txt
+++ b/cpp/src/arrow/util/CMakeLists.txt
@@ -93,3 +93,4 @@ add_arrow_benchmark(trie_benchmark)
 add_arrow_benchmark(utf8_util_benchmark)
 add_arrow_benchmark(value_parsing_benchmark)
 add_arrow_benchmark(variant_benchmark)
+add_arrow_benchmark(bit_util_benchmark_temp)
diff --git a/cpp/src/arrow/util/bit_util_benchmark_temp.cc b/cpp/src/arrow/util/bit_util_benchmark_temp.cc
new file mode 100644
index 00000000000..d0b67dea701
--- /dev/null
+++ b/cpp/src/arrow/util/bit_util_benchmark_temp.cc
@@ -0,0 +1,110 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <bitset>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <utility>
+
+#include "arrow/buffer.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/util.h"
+#include "arrow/util/bit_block_counter.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_reader.h"
+#include "benchmark/benchmark.h"
+
+namespace arrow {
+namespace BitUtil {
+
+using internal::BitBlockCount;
+using internal::BitBlockCounter;
+using internal::BitmapWordReader;
+
+const int64_t kBufferSize = 1024 * (std::rand() % 25 + 1000);
+
+// const int seed = std::rand();
+
+static std::shared_ptr<Buffer> CreateRandomBuffer(int64_t nbytes) {
+  auto buffer = *AllocateBuffer(nbytes);
+  memset(buffer->mutable_data(), 0, nbytes);
+  random_bytes(nbytes, /*seed=*/0, buffer->mutable_data());
+  return std::move(buffer);
+}
+
+static void BitBlockCounterBench(benchmark::State& state) {
+  int64_t nbytes = state.range(0);
+  std::shared_ptr<Buffer> cond_buf = CreateRandomBuffer(nbytes);
+  for (auto _ : state) {
+    BitBlockCounter counter(cond_buf->data(), 0, nbytes * 8);
+
+    int64_t offset = 0;
+    int64_t set_bits = 0;
+
+    while (offset < nbytes * 8) {
+      const BitBlockCount& word = counter.NextWord();
+      //      if (word.AllSet()) {
+      //        set_bits += word.length;
+      //      } else if (word.popcount) {
+      //        set_bits += word.popcount;
+      //      }
+      set_bits += word.popcount;
+      offset += word.length;
+    }
+    benchmark::ClobberMemory();
+  }
+
+  state.SetBytesProcessed(state.iterations() * nbytes);
+}
+
+static void BitmapWordReaderBench(benchmark::State& state) {
+  int64_t nbytes = state.range(0);
+  std::shared_ptr<Buffer> cond_buf = CreateRandomBuffer(nbytes);
+  for (auto _ : state) {
+    BitmapWordReader<uint64_t> counter(cond_buf->data(), 0, nbytes * 8);
+
+    int64_t set_bits = 0;
+
+    int64_t cnt = counter.words();
+    while (cnt--) {
+      const auto& word = counter.NextWord();
+      //      if (word == UINT64_MAX) {
+      //        set_bits += sizeof(uint64_t) * 8;
+      //      } else if (word) {
+      //        set_bits += PopCount(word);
+      //      }
+      set_bits += PopCount(word);
+    }
+
+    cnt = counter.trailing_bytes();
+    while (cnt--) {
+      int valid_bits;
+      const auto& byte = static_cast<uint32_t>(counter.NextTrailingByte(valid_bits));
+      set_bits += PopCount(kPrecedingBitmask[valid_bits] & byte);
+    }
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * nbytes);
+}
+
+BENCHMARK(BitBlockCounterBench)->Arg(kBufferSize);
+BENCHMARK(BitmapWordReaderBench)->Arg(kBufferSize);
+
+}  // namespace BitUtil
+}  // namespace arrow

From e25a0d2883cae4d3ac5bd505327700a0d2bac87f Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Wed, 16 Jun 2021 19:09:37 -0400
Subject: [PATCH 19/46] adding benchmark1

---
 cpp/src/arrow/util/bit_util_benchmark_temp.cc | 65 +++++++++++++------
 1 file changed, 45 insertions(+), 20 deletions(-)

diff --git a/cpp/src/arrow/util/bit_util_benchmark_temp.cc b/cpp/src/arrow/util/bit_util_benchmark_temp.cc
index d0b67dea701..2230f2c6dfb 100644
--- a/cpp/src/arrow/util/bit_util_benchmark_temp.cc
+++ b/cpp/src/arrow/util/bit_util_benchmark_temp.cc
@@ -37,9 +37,7 @@ using internal::BitBlockCount;
 using internal::BitBlockCounter;
 using internal::BitmapWordReader;
 
-const int64_t kBufferSize = 1024 * (std::rand() % 25 + 1000);
-
-// const int seed = std::rand();
+const int64_t kBufferSize = 1024 * 1024;
 
 static std::shared_ptr<Buffer> CreateRandomBuffer(int64_t nbytes) {
   auto buffer = *AllocateBuffer(nbytes);
@@ -51,20 +49,27 @@ static std::shared_ptr<Buffer> CreateRandomBuffer(int64_t nbytes) {
 static void BitBlockCounterBench(benchmark::State& state) {
   int64_t nbytes = state.range(0);
   std::shared_ptr<Buffer> cond_buf = CreateRandomBuffer(nbytes);
+  std::shared_ptr<Buffer> data_buf = CreateRandomBuffer(nbytes * 8 * 8);
+  std::shared_ptr<Buffer> dest_buf = CreateRandomBuffer(nbytes * 8 * 8);
   for (auto _ : state) {
     BitBlockCounter counter(cond_buf->data(), 0, nbytes * 8);
 
-    int64_t offset = 0;
-    int64_t set_bits = 0;
+    const uint8_t* cond_ptr = cond_buf->data();
+    const uint64_t* data_ptr = reinterpret_cast<const uint64_t*>(data_buf->data());
+    uint64_t* dest_ptr = reinterpret_cast<uint64_t*>(dest_buf->mutable_data());
 
+    int64_t offset = 0;
     while (offset < nbytes * 8) {
       const BitBlockCount& word = counter.NextWord();
-      //      if (word.AllSet()) {
-      //        set_bits += word.length;
-      //      } else if (word.popcount) {
-      //        set_bits += word.popcount;
-      //      }
-      set_bits += word.popcount;
+      if (word.AllSet()) {
+        std::memcpy(dest_ptr + offset, data_ptr + offset, word.length * 8);
+      } else if (word.popcount) {
+        for (int64_t i = 0; i < word.length; i++) {
+          if (GetBit(cond_ptr, offset + i)) {
+            dest_ptr[offset + i] = data_ptr[offset + i];
+          }
+        }
+      }
       offset += word.length;
     }
     benchmark::ClobberMemory();
@@ -76,27 +81,47 @@ static void BitBlockCounterBench(benchmark::State& state) {
 static void BitmapWordReaderBench(benchmark::State& state) {
   int64_t nbytes = state.range(0);
   std::shared_ptr<Buffer> cond_buf = CreateRandomBuffer(nbytes);
+  std::shared_ptr<Buffer> data_buf = CreateRandomBuffer(nbytes * 8 * 8);
+  std::shared_ptr<Buffer> dest_buf = CreateRandomBuffer(nbytes * 8 * 8);
+
   for (auto _ : state) {
     BitmapWordReader<uint64_t> counter(cond_buf->data(), 0, nbytes * 8);
 
-    int64_t set_bits = 0;
+    const uint8_t* cond_ptr = cond_buf->data();
+    const auto* data_ptr = reinterpret_cast<const uint64_t*>(data_buf->data());
+    auto* dest_ptr = reinterpret_cast<uint64_t*>(dest_buf->mutable_data());
 
+    int64_t offset = 0;
     int64_t cnt = counter.words();
     while (cnt--) {
       const auto& word = counter.NextWord();
-      //      if (word == UINT64_MAX) {
-      //        set_bits += sizeof(uint64_t) * 8;
-      //      } else if (word) {
-      //        set_bits += PopCount(word);
-      //      }
-      set_bits += PopCount(word);
+      if (word == UINT64_MAX) {
+        std::memcpy(dest_ptr + offset, data_ptr + offset, 64 * 8);
+      } else if (word) {
+        for (int64_t i = 0; i < 8; i++) {
+          if (GetBit(cond_ptr, offset + i)) {
+            dest_ptr[offset + i] = data_ptr[offset + i];
+          }
+        }
+      }
+      offset += 8;
     }
 
     cnt = counter.trailing_bytes();
     while (cnt--) {
       int valid_bits;
-      const auto& byte = static_cast<uint32_t>(counter.NextTrailingByte(valid_bits));
-      set_bits += PopCount(kPrecedingBitmask[valid_bits] & byte);
+      const auto& byte = counter.NextTrailingByte(valid_bits);
+      if (byte == UINT8_MAX && valid_bits == 8) {
+        std::memcpy(dest_ptr, data_ptr, 8 * 8);
+      } else {
+        for (int64_t i = 0; i < valid_bits; i++) {
+          if (GetBit(cond_ptr, offset + i)) {
+            dest_ptr[offset + i] = data_ptr[offset + i];
+          }
+        }
+      }
+
+      offset += valid_bits;
     }
     benchmark::ClobberMemory();
   }

From bd7463a3e50db47b906ff4f15297554a0445da72 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Thu, 17 Jun 2021 09:19:54 -0400
Subject: [PATCH 20/46] Revert "adding benchmark1"

This reverts commit 67d60872
---
 cpp/src/arrow/util/bit_util_benchmark_temp.cc | 65 ++++++-------------
 1 file changed, 20 insertions(+), 45 deletions(-)

diff --git a/cpp/src/arrow/util/bit_util_benchmark_temp.cc b/cpp/src/arrow/util/bit_util_benchmark_temp.cc
index 2230f2c6dfb..d0b67dea701 100644
--- a/cpp/src/arrow/util/bit_util_benchmark_temp.cc
+++ b/cpp/src/arrow/util/bit_util_benchmark_temp.cc
@@ -37,7 +37,9 @@ using internal::BitBlockCount;
 using internal::BitBlockCounter;
 using internal::BitmapWordReader;
 
-const int64_t kBufferSize = 1024 * 1024;
+const int64_t kBufferSize = 1024 * (std::rand() % 25 + 1000);
+
+// const int seed = std::rand();
 
 static std::shared_ptr<Buffer> CreateRandomBuffer(int64_t nbytes) {
   auto buffer = *AllocateBuffer(nbytes);
@@ -49,27 +51,20 @@ static std::shared_ptr<Buffer> CreateRandomBuffer(int64_t nbytes) {
 static void BitBlockCounterBench(benchmark::State& state) {
   int64_t nbytes = state.range(0);
   std::shared_ptr<Buffer> cond_buf = CreateRandomBuffer(nbytes);
-  std::shared_ptr<Buffer> data_buf = CreateRandomBuffer(nbytes * 8 * 8);
-  std::shared_ptr<Buffer> dest_buf = CreateRandomBuffer(nbytes * 8 * 8);
   for (auto _ : state) {
     BitBlockCounter counter(cond_buf->data(), 0, nbytes * 8);
 
-    const uint8_t* cond_ptr = cond_buf->data();
-    const uint64_t* data_ptr = reinterpret_cast<const uint64_t*>(data_buf->data());
-    uint64_t* dest_ptr = reinterpret_cast<uint64_t*>(dest_buf->mutable_data());
-
     int64_t offset = 0;
+    int64_t set_bits = 0;
+
     while (offset < nbytes * 8) {
       const BitBlockCount& word = counter.NextWord();
-      if (word.AllSet()) {
-        std::memcpy(dest_ptr + offset, data_ptr + offset, word.length * 8);
-      } else if (word.popcount) {
-        for (int64_t i = 0; i < word.length; i++) {
-          if (GetBit(cond_ptr, offset + i)) {
-            dest_ptr[offset + i] = data_ptr[offset + i];
-          }
-        }
-      }
+      //      if (word.AllSet()) {
+      //        set_bits += word.length;
+      //      } else if (word.popcount) {
+      //        set_bits += word.popcount;
+      //      }
+      set_bits += word.popcount;
       offset += word.length;
     }
     benchmark::ClobberMemory();
@@ -81,47 +76,27 @@ static void BitBlockCounterBench(benchmark::State& state) {
 static void BitmapWordReaderBench(benchmark::State& state) {
   int64_t nbytes = state.range(0);
   std::shared_ptr<Buffer> cond_buf = CreateRandomBuffer(nbytes);
-  std::shared_ptr<Buffer> data_buf = CreateRandomBuffer(nbytes * 8 * 8);
-  std::shared_ptr<Buffer> dest_buf = CreateRandomBuffer(nbytes * 8 * 8);
-
   for (auto _ : state) {
     BitmapWordReader<uint64_t> counter(cond_buf->data(), 0, nbytes * 8);
 
-    const uint8_t* cond_ptr = cond_buf->data();
-    const auto* data_ptr = reinterpret_cast<const uint64_t*>(data_buf->data());
-    auto* dest_ptr = reinterpret_cast<uint64_t*>(dest_buf->mutable_data());
+    int64_t set_bits = 0;
 
-    int64_t offset = 0;
     int64_t cnt = counter.words();
     while (cnt--) {
       const auto& word = counter.NextWord();
-      if (word == UINT64_MAX) {
-        std::memcpy(dest_ptr + offset, data_ptr + offset, 64 * 8);
-      } else if (word) {
-        for (int64_t i = 0; i < 8; i++) {
-          if (GetBit(cond_ptr, offset + i)) {
-            dest_ptr[offset + i] = data_ptr[offset + i];
-          }
-        }
-      }
-      offset += 8;
+      //      if (word == UINT64_MAX) {
+      //        set_bits += sizeof(uint64_t) * 8;
+      //      } else if (word) {
+      //        set_bits += PopCount(word);
+      //      }
+      set_bits += PopCount(word);
     }
 
     cnt = counter.trailing_bytes();
     while (cnt--) {
       int valid_bits;
-      const auto& byte = counter.NextTrailingByte(valid_bits);
-      if (byte == UINT8_MAX && valid_bits == 8) {
-        std::memcpy(dest_ptr, data_ptr, 8 * 8);
-      } else {
-        for (int64_t i = 0; i < valid_bits; i++) {
-          if (GetBit(cond_ptr, offset + i)) {
-            dest_ptr[offset + i] = data_ptr[offset + i];
-          }
-        }
-      }
-
-      offset += valid_bits;
+      const auto& byte = static_cast<uint32_t>(counter.NextTrailingByte(valid_bits));
+      set_bits += PopCount(kPrecedingBitmask[valid_bits] & byte);
     }
     benchmark::ClobberMemory();
   }

From 83904223d81580b89fd2588df38332f72cbf6a55 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Thu, 17 Jun 2021 09:20:18 -0400
Subject: [PATCH 21/46] adding do not optimize

---
 cpp/src/arrow/util/bit_util_benchmark_temp.cc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cpp/src/arrow/util/bit_util_benchmark_temp.cc b/cpp/src/arrow/util/bit_util_benchmark_temp.cc
index d0b67dea701..359653c9644 100644
--- a/cpp/src/arrow/util/bit_util_benchmark_temp.cc
+++ b/cpp/src/arrow/util/bit_util_benchmark_temp.cc
@@ -55,7 +55,7 @@ static void BitBlockCounterBench(benchmark::State& state) {
     BitBlockCounter counter(cond_buf->data(), 0, nbytes * 8);
 
     int64_t offset = 0;
-    int64_t set_bits = 0;
+    uint64_t set_bits = 0;
 
     while (offset < nbytes * 8) {
       const BitBlockCount& word = counter.NextWord();
@@ -65,6 +65,7 @@ static void BitBlockCounterBench(benchmark::State& state) {
       //        set_bits += word.popcount;
       //      }
       set_bits += word.popcount;
+      benchmark::DoNotOptimize(set_bits);
       offset += word.length;
     }
     benchmark::ClobberMemory();
@@ -90,6 +91,7 @@ static void BitmapWordReaderBench(benchmark::State& state) {
       //        set_bits += PopCount(word);
       //      }
       set_bits += PopCount(word);
+      benchmark::DoNotOptimize(set_bits);
     }
 
     cnt = counter.trailing_bytes();
@@ -97,6 +99,7 @@ static void BitmapWordReaderBench(benchmark::State& state) {
       int valid_bits;
       const auto& byte = static_cast<uint32_t>(counter.NextTrailingByte(valid_bits));
       set_bits += PopCount(kPrecedingBitmask[valid_bits] & byte);
+      benchmark::DoNotOptimize(set_bits);
     }
     benchmark::ClobberMemory();
   }

From 952015a4fdc5a4708123393dd09f16e8f935da96 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Thu, 17 Jun 2021 11:11:19 -0400
Subject: [PATCH 22/46] adding ifelse bench

---
 cpp/src/arrow/compute/kernels/CMakeLists.txt  |  1 +
 .../kernels/scalar_if_else_benchmark.cc       | 64 +++++++++++++++++++
 2 files changed, 65 insertions(+)
 create mode 100644 cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc

diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt
index 326578588a7..3362d91cbe8 100644
--- a/cpp/src/arrow/compute/kernels/CMakeLists.txt
+++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt
@@ -37,6 +37,7 @@ add_arrow_benchmark(scalar_arithmetic_benchmark PREFIX "arrow-compute")
 add_arrow_benchmark(scalar_boolean_benchmark PREFIX "arrow-compute")
 add_arrow_benchmark(scalar_cast_benchmark PREFIX "arrow-compute")
 add_arrow_benchmark(scalar_compare_benchmark PREFIX "arrow-compute")
+add_arrow_benchmark(scalar_if_else_benchmark PREFIX "arrow-compute")
 add_arrow_benchmark(scalar_set_lookup_benchmark PREFIX "arrow-compute")
 add_arrow_benchmark(scalar_string_benchmark PREFIX "arrow-compute")
 
diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc
new file mode 100644
index 00000000000..09336d93091
--- /dev/null
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc
@@ -0,0 +1,64 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <arrow/compute/api_scalar.h>
+#include <arrow/testing/gtest_util.h>
+#include <arrow/testing/random.h>
+#include <benchmark/benchmark.h>
+
+namespace arrow {
+namespace compute {
+
+const int64_t elems = 1024 * 1024;
+
+template <typename Type>
+static void IfElseBench(benchmark::State& state) {
+  using CType = typename Type::c_type;
+  auto type = TypeTraits<Type>::type_singleton();
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+  int64_t len = state.range(0);
+
+  random::RandomArrayGenerator rand(/*seed=*/0);
+
+  auto cond = std::static_pointer_cast<BooleanArray>(
+      rand.ArrayOf(boolean(), len, /*null_probability=*/0.01));
+  auto left = std::static_pointer_cast<ArrayType>(
+      rand.ArrayOf(type, len, /*null_probability=*/0.01));
+  auto right = std::static_pointer_cast<ArrayType>(
+      rand.ArrayOf(type, len, /*null_probability=*/0.01));
+
+  for (auto _ : state) {
+    ABORT_NOT_OK(IfElse(cond, left, right));
+  }
+
+  state.SetBytesProcessed(state.iterations() * (len / 8 + 2 * len * sizeof(CType)));
+}
+
+static void IfElseBench64Wide(benchmark::State& state) {
+  return IfElseBench<UInt64Type>(state);
+}
+
+static void IfElseBench32Wide(benchmark::State& state) {
+  return IfElseBench<UInt32Type>(state);
+}
+
+BENCHMARK(IfElseBench32Wide)->Arg(elems);
+BENCHMARK(IfElseBench64Wide)->Arg(elems);
+
+}  // namespace compute
+}  // namespace arrow
\ No newline at end of file

From 588373bf4fb98ccbdf0a7a204a45124635098e65 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Thu, 17 Jun 2021 11:29:52 -0400
Subject: [PATCH 23/46] adding offset bench

---
 .../compute/kernels/scalar_if_else_benchmark.cc     | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc
index 09336d93091..c3afa94da5f 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc
@@ -32,6 +32,7 @@ static void IfElseBench(benchmark::State& state) {
   using ArrayType = typename TypeTraits<Type>::ArrayType;
 
   int64_t len = state.range(0);
+  int64_t offset = state.range(1);
 
   random::RandomArrayGenerator rand(/*seed=*/0);
 
@@ -43,10 +44,11 @@ static void IfElseBench(benchmark::State& state) {
       rand.ArrayOf(type, len, /*null_probability=*/0.01));
 
   for (auto _ : state) {
-    ABORT_NOT_OK(IfElse(cond, left, right));
+    ABORT_NOT_OK(IfElse(cond->Slice(offset), left->Slice(offset), right->Slice(offset)));
   }
 
-  state.SetBytesProcessed(state.iterations() * (len / 8 + 2 * len * sizeof(CType)));
+  state.SetBytesProcessed(state.iterations() *
+                          ((len - offset) / 8 + 2 * (len - offset) * sizeof(CType)));
 }
 
 static void IfElseBench64Wide(benchmark::State& state) {
@@ -57,8 +59,11 @@ static void IfElseBench32Wide(benchmark::State& state) {
   return IfElseBench<UInt32Type>(state);
 }
 
-BENCHMARK(IfElseBench32Wide)->Arg(elems);
-BENCHMARK(IfElseBench64Wide)->Arg(elems);
+BENCHMARK(IfElseBench32Wide)->Args({elems, 0});
+BENCHMARK(IfElseBench64Wide)->Args({elems, 0});
+
+BENCHMARK(IfElseBench32Wide)->Args({elems, 99});
+BENCHMARK(IfElseBench64Wide)->Args({elems, 99});
 
 }  // namespace compute
 }  // namespace arrow
\ No newline at end of file

From ec127b7ee1563d104b51c45ab4783575f39b7ed6 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Thu, 17 Jun 2021 12:54:24 -0400
Subject: [PATCH 24/46] replacing bitblockcounter in ifelse

---
 .../arrow/compute/kernels/scalar_if_else.cc   | 66 +++++++++++++++----
 1 file changed, 53 insertions(+), 13 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
index 147b68f4baa..e9da2b2942f 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
@@ -21,11 +21,13 @@
 #include <arrow/util/bit_block_counter.h>
 #include <arrow/util/bitmap.h>
 #include <arrow/util/bitmap_ops.h>
+#include <arrow/util/bitmap_reader.h>
 
 namespace arrow {
 using internal::BitBlockCount;
 using internal::BitBlockCounter;
 using internal::Bitmap;
+using internal::BitmapWordReader;
 
 namespace compute {
 
@@ -223,28 +225,66 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
     std::memcpy(out_values, right_data, right.length * sizeof(T));
 
     const auto* cond_data = cond.buffers[1]->data();  // this is a BoolArray
-    BitBlockCounter bit_counter(cond_data, cond.offset, cond.length);
+    //    BitBlockCounter bit_counter(cond_data, cond.offset, cond.length);
+    BitmapWordReader<uint64_t> cond_reader(cond_data, cond.offset, cond.length);
 
     // selectively copy values from left data
     const T* left_data = left.GetValues<T>(1);
-    int64_t offset = cond.offset;
+    int64_t offset = 0;
+    int64_t bit_offset = cond.offset;
+
+    int64_t cnt = cond_reader.words();
 
     // todo this can be improved by intrinsics. ex: _mm*_mask_store_e* (vmovdqa*)
-    while (offset < cond.offset + cond.length) {
-      const BitBlockCount& block = bit_counter.NextWord();
-      if (block.AllSet()) {  // all from left
-        std::memcpy(out_values, left_data, block.length * sizeof(T));
-      } else if (block.popcount) {  // selectively copy from left
-        for (int64_t i = 0; i < block.length; ++i) {
-          if (BitUtil::GetBit(cond_data, offset + i)) {
-            out_values[i] = left_data[i];
+
+    //    while (offset < cond.offset + cond.length) {
+    //      const BitBlockCount& block = bit_counter.NextWord();
+    //      if (block.AllSet()) {  // all from left
+    //        std::memcpy(out_values, left_data, block.length * sizeof(T));
+    //      } else if (block.popcount) {  // selectively copy from left
+    //        for (int64_t i = 0; i < block.length; ++i) {
+    //          if (BitUtil::GetBit(cond_data, offset + i)) {
+    //            out_values[i] = left_data[i];
+    //          }
+    //        }
+    //      }
+    //
+    //      offset += block.length;
+    //      out_values += block.length;
+    //      left_data += block.length;
+    //    }
+
+    constexpr int64_t WordBitsSize = sizeof(uint64_t) * 8;
+    while (cnt--) {
+      uint64_t word = cond_reader.NextWord();
+      if (word == UINT64_MAX) {
+        std::memcpy(out_values + offset, left_data + offset, WordBitsSize * sizeof(T));
+      } else if (word) {
+        for (int64_t i = 0; i < WordBitsSize; ++i) {
+          if (BitUtil::GetBit(cond_data, bit_offset + i)) {
+            out_values[i + offset] = left_data[i + offset];
           }
         }
       }
+      offset += WordBitsSize;
+      bit_offset += WordBitsSize;
+    }
 
-      offset += block.length;
-      out_values += block.length;
-      left_data += block.length;
+    cnt = cond_reader.trailing_bytes();
+    while (cnt--) {
+      int valid_bits;
+      uint8_t byte = cond_reader.NextTrailingByte(valid_bits);
+      if (byte == UINT8_MAX && valid_bits == 8) {
+        std::memcpy(out_values + offset, left_data + offset, 8 * sizeof(T));
+      } else if (byte) {
+        for (int i = 0; i < valid_bits; ++i) {
+          if (BitUtil::GetBit(cond_data, bit_offset + i)) {
+            out_values[i + offset] = left_data[i + offset];
+          }
+        }
+      }
+      offset += 8;
+      bit_offset += 8;
     }
 
     out->buffers[1] = std::move(out_buf);

From 8d9023c43ed36bffe2825bdde0d03c86f4fd79b5 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Thu, 17 Jun 2021 14:43:45 -0400
Subject: [PATCH 25/46] replacing bitblockcounter in ifelse

---
 .../arrow/compute/kernels/scalar_if_else.cc   | 177 +++++++++++-------
 1 file changed, 108 insertions(+), 69 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
index e9da2b2942f..4d31cdbcf5d 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
@@ -212,6 +212,8 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
   using T = typename TypeTraits<Type>::CType;
   // A - Array
   // S - Scalar
+  using Word = uint64_t ;
+  static constexpr int64_t word_len = sizeof(Word) * 8;
 
   //  AAA
   static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
@@ -225,49 +227,29 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
     std::memcpy(out_values, right_data, right.length * sizeof(T));
 
     const auto* cond_data = cond.buffers[1]->data();  // this is a BoolArray
-    //    BitBlockCounter bit_counter(cond_data, cond.offset, cond.length);
-    BitmapWordReader<uint64_t> cond_reader(cond_data, cond.offset, cond.length);
+    BitmapWordReader<Word> cond_reader(cond_data, cond.offset, cond.length);
 
     // selectively copy values from left data
     const T* left_data = left.GetValues<T>(1);
-    int64_t offset = 0;
+    int64_t data_offset = 0;
     int64_t bit_offset = cond.offset;
 
-    int64_t cnt = cond_reader.words();
-
     // todo this can be improved by intrinsics. ex: _mm*_mask_store_e* (vmovdqa*)
-
-    //    while (offset < cond.offset + cond.length) {
-    //      const BitBlockCount& block = bit_counter.NextWord();
-    //      if (block.AllSet()) {  // all from left
-    //        std::memcpy(out_values, left_data, block.length * sizeof(T));
-    //      } else if (block.popcount) {  // selectively copy from left
-    //        for (int64_t i = 0; i < block.length; ++i) {
-    //          if (BitUtil::GetBit(cond_data, offset + i)) {
-    //            out_values[i] = left_data[i];
-    //          }
-    //        }
-    //      }
-    //
-    //      offset += block.length;
-    //      out_values += block.length;
-    //      left_data += block.length;
-    //    }
-
-    constexpr int64_t WordBitsSize = sizeof(uint64_t) * 8;
+    int64_t cnt = cond_reader.words();
     while (cnt--) {
-      uint64_t word = cond_reader.NextWord();
+      Word word = cond_reader.NextWord();
       if (word == UINT64_MAX) {
-        std::memcpy(out_values + offset, left_data + offset, WordBitsSize * sizeof(T));
+        std::memcpy(out_values + data_offset, left_data + data_offset,
+                    word_len * sizeof(T));
       } else if (word) {
-        for (int64_t i = 0; i < WordBitsSize; ++i) {
+        for (int64_t i = 0; i < word_len; ++i) {
           if (BitUtil::GetBit(cond_data, bit_offset + i)) {
-            out_values[i + offset] = left_data[i + offset];
+            out_values[data_offset + i] = left_data[data_offset + i];
           }
         }
       }
-      offset += WordBitsSize;
-      bit_offset += WordBitsSize;
+      data_offset += word_len;
+      bit_offset += word_len;
     }
 
     cnt = cond_reader.trailing_bytes();
@@ -275,15 +257,15 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
       int valid_bits;
       uint8_t byte = cond_reader.NextTrailingByte(valid_bits);
       if (byte == UINT8_MAX && valid_bits == 8) {
-        std::memcpy(out_values + offset, left_data + offset, 8 * sizeof(T));
+        std::memcpy(out_values + data_offset, left_data + data_offset, 8 * sizeof(T));
       } else if (byte) {
         for (int i = 0; i < valid_bits; ++i) {
           if (BitUtil::GetBit(cond_data, bit_offset + i)) {
-            out_values[i + offset] = left_data[i + offset];
+            out_values[data_offset + i] = left_data[data_offset + i];
           }
         }
       }
-      offset += 8;
+      data_offset += 8;
       bit_offset += 8;
     }
 
@@ -303,27 +285,46 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
     std::memcpy(out_values, right_data, right.length * sizeof(T));
 
     const auto* cond_data = cond.buffers[1]->data();  // this is a BoolArray
-    BitBlockCounter bit_counter(cond_data, cond.offset, cond.length);
+    BitmapWordReader<Word> cond_reader(cond_data, cond.offset, cond.length);
 
     // selectively copy values from left data
     T left_data = internal::UnboxScalar<Type>::Unbox(left);
-    int64_t offset = cond.offset;
+    int64_t data_offset = 0;
+    int64_t bit_offset = cond.offset;
 
     // todo this can be improved by intrinsics. ex: _mm*_mask_store_e* (vmovdqa*)
-    while (offset < cond.offset + cond.length) {
-      const BitBlockCount& block = bit_counter.NextWord();
-      if (block.AllSet()) {  // all from left
-        std::fill(out_values, out_values + block.length, left_data);
-      } else if (block.popcount) {  // selectively copy from left
-        for (int64_t i = 0; i < block.length; ++i) {
-          if (BitUtil::GetBit(cond_data, offset + i)) {
-            out_values[i] = left_data;
+    int64_t cnt = cond_reader.words();
+    while (cnt--) {
+      Word word = cond_reader.NextWord();
+      if (word == UINT64_MAX) {
+        std::fill(out_values + data_offset, out_values + data_offset + word_len,
+                  left_data);
+      } else if (word) {
+        for (int64_t i = 0; i < word_len; ++i) {
+          if (BitUtil::GetBit(cond_data, bit_offset + i)) {
+            out_values[data_offset + i] = left_data;
           }
         }
       }
+      data_offset += word_len;
+      bit_offset += word_len;
+    }
 
-      offset += block.length;
-      out_values += block.length;
+    cnt = cond_reader.trailing_bytes();
+    while (cnt--) {
+      int valid_bits;
+      uint8_t byte = cond_reader.NextTrailingByte(valid_bits);
+      if (byte == UINT8_MAX && valid_bits == 8) {
+        std::fill(out_values + data_offset, out_values + data_offset + 8, left_data);
+      } else if (byte) {
+        for (int i = 0; i < valid_bits; ++i) {
+          if (BitUtil::GetBit(cond_data, bit_offset + i)) {
+            out_values[data_offset + i] = left_data;
+          }
+        }
+      }
+      data_offset += 8;
+      bit_offset += 8;
     }
 
     out->buffers[1] = std::move(out_buf);
@@ -342,28 +343,47 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
     std::memcpy(out_values, left_data, left.length * sizeof(T));
 
     const auto* cond_data = cond.buffers[1]->data();  // this is a BoolArray
-    BitBlockCounter bit_counter(cond_data, cond.offset, cond.length);
+    BitmapWordReader<Word> cond_reader(cond_data, cond.offset, cond.length);
 
     // selectively copy values from left data
     T right_data = internal::UnboxScalar<Type>::Unbox(right);
-    int64_t offset = cond.offset;
+    int64_t data_offset = 0;
+    int64_t bit_offset = cond.offset;
 
     // todo this can be improved by intrinsics. ex: _mm*_mask_store_e* (vmovdqa*)
     // left data is already in the output buffer. Therefore, mask needs to be inverted
-    while (offset < cond.offset + cond.length) {
-      const BitBlockCount& block = bit_counter.NextWord();
-      if (block.NoneSet()) {  // all from right
-        std::fill(out_values, out_values + block.length, right_data);
-      } else if (block.popcount) {  // selectively copy from right
-        for (int64_t i = 0; i < block.length; ++i) {
-          if (!BitUtil::GetBit(cond_data, offset + i)) {
-            out_values[i] = right_data;
+    int64_t cnt = cond_reader.words();
+    while (cnt--) {
+      Word word = cond_reader.NextWord();
+      if (word == 0) {  // all from right
+        std::fill(out_values + data_offset, out_values + data_offset + word_len,
+                  right_data);
+      } else if (word != UINT64_MAX) {  // selectively copy from right
+        for (int64_t i = 0; i < word_len; ++i) {
+          if (!BitUtil::GetBit(cond_data, bit_offset + i)) {
+            out_values[data_offset + i] = right_data;
           }
         }
       }
+      data_offset += word_len;
+      bit_offset += word_len;
+    }
 
-      offset += block.length;
-      out_values += block.length;
+    cnt = cond_reader.trailing_bytes();
+    while (cnt--) {
+      int valid_bits;
+      uint8_t byte = cond_reader.NextTrailingByte(valid_bits);
+      if (byte == 0 && valid_bits == 8) {
+        std::fill(out_values + data_offset, out_values + data_offset + 8, right_data);
+      } else if (byte != UINT8_MAX) {
+        for (int i = 0; i < valid_bits; ++i) {
+          if (!BitUtil::GetBit(cond_data, bit_offset + i)) {
+            out_values[data_offset + i] = right_data;
+          }
+        }
+      }
+      data_offset += 8;
+      bit_offset += 8;
     }
 
     out->buffers[1] = std::move(out_buf);
@@ -382,27 +402,46 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
     std::fill(out_values, out_values + cond.length, right_data);
 
     const auto* cond_data = cond.buffers[1]->data();  // this is a BoolArray
-    BitBlockCounter bit_counter(cond_data, cond.offset, cond.length);
+    BitmapWordReader<Word> cond_reader(cond_data, cond.offset, cond.length);
 
     // selectively copy values from left data
     T left_data = internal::UnboxScalar<Type>::Unbox(left);
-    int64_t offset = cond.offset;
+    int64_t data_offset = 0;
+    int64_t bit_offset = cond.offset;
 
     // todo this can be improved by intrinsics. ex: _mm*_mask_store_e* (vmovdqa*)
-    while (offset < cond.offset + cond.length) {
-      const BitBlockCount& block = bit_counter.NextWord();
-      if (block.AllSet()) {  // all from left
-        std::fill(out_values, out_values + block.length, left_data);
-      } else if (block.popcount) {  // selectively copy from left
-        for (int64_t i = 0; i < block.length; ++i) {
-          if (BitUtil::GetBit(cond_data, offset + i)) {
-            out_values[i] = left_data;
+    int64_t cnt = cond_reader.words();
+    while (cnt--) {
+      Word word = cond_reader.NextWord();
+      if (word == UINT64_MAX) { // all from left
+        std::fill(out_values + data_offset, out_values + data_offset + word_len,
+                  left_data);
+      } else if (word) {  // selectively copy from left
+        for (int64_t i = 0; i < word_len; ++i) {
+          if (BitUtil::GetBit(cond_data, bit_offset + i)) {
+            out_values[data_offset + i] = left_data;
           }
         }
       }
+      data_offset += word_len;
+      bit_offset += word_len;
+    }
 
-      offset += block.length;
-      out_values += block.length;
+    cnt = cond_reader.trailing_bytes();
+    while (cnt--) {
+      int valid_bits;
+      uint8_t byte = cond_reader.NextTrailingByte(valid_bits);
+      if (byte == UINT8_MAX && valid_bits == 8) {
+        std::fill(out_values + data_offset, out_values + data_offset + 8, left_data);
+      } else if (byte) {
+        for (int i = 0; i < valid_bits; ++i) {
+          if (BitUtil::GetBit(cond_data, bit_offset + i)) {
+            out_values[data_offset + i] = left_data;
+          }
+        }
+      }
+      data_offset += 8;
+      bit_offset += 8;
     }
 
     out->buffers[1] = std::move(out_buf);

From 4e640b1c225368d4df332a4f82d747c568f2781c Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Thu, 17 Jun 2021 15:14:18 -0400
Subject: [PATCH 26/46] extending bench suite

---
 .../kernels/scalar_if_else_benchmark.cc       | 56 +++++++++++++++++--
 .../compute/kernels/scalar_if_else_test.cc    |  9 ++-
 2 files changed, 57 insertions(+), 8 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc
index c3afa94da5f..937921a05b2 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include <arrow/array/concatenate.h>
 #include <arrow/compute/api_scalar.h>
 #include <arrow/testing/gtest_util.h>
 #include <arrow/testing/random.h>
@@ -51,19 +52,62 @@ static void IfElseBench(benchmark::State& state) {
                           ((len - offset) / 8 + 2 * (len - offset) * sizeof(CType)));
 }
 
-static void IfElseBench64Wide(benchmark::State& state) {
+template <typename Type>
+static void IfElseBenchContiguous(benchmark::State& state) {
+  using CType = typename Type::c_type;
+  auto type = TypeTraits<Type>::type_singleton();
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+  int64_t len = state.range(0);
+  int64_t offset = state.range(1);
+
+  ASSERT_OK_AND_ASSIGN(auto temp1, MakeArrayFromScalar(BooleanScalar(true), len / 2));
+  ASSERT_OK_AND_ASSIGN(auto temp2,
+                       MakeArrayFromScalar(BooleanScalar(false), len - len / 2));
+  ASSERT_OK_AND_ASSIGN(auto concat, Concatenate({temp1, temp2}));
+  auto cond = std::static_pointer_cast<BooleanArray>(concat);
+
+  random::RandomArrayGenerator rand(/*seed=*/0);
+  auto left = std::static_pointer_cast<ArrayType>(
+      rand.ArrayOf(type, len, /*null_probability=*/0.01));
+  auto right = std::static_pointer_cast<ArrayType>(
+      rand.ArrayOf(type, len, /*null_probability=*/0.01));
+
+  for (auto _ : state) {
+    ABORT_NOT_OK(IfElse(cond->Slice(offset), left->Slice(offset), right->Slice(offset)));
+  }
+
+  state.SetBytesProcessed(state.iterations() *
+                          ((len - offset) / 8 + 2 * (len - offset) * sizeof(CType)));
+}
+
+static void IfElseBench64(benchmark::State& state) {
   return IfElseBench<UInt64Type>(state);
 }
 
-static void IfElseBench32Wide(benchmark::State& state) {
+static void IfElseBench32(benchmark::State& state) {
   return IfElseBench<UInt32Type>(state);
 }
 
-BENCHMARK(IfElseBench32Wide)->Args({elems, 0});
-BENCHMARK(IfElseBench64Wide)->Args({elems, 0});
+static void IfElseBench64Contiguous(benchmark::State& state) {
+  return IfElseBenchContiguous<UInt64Type>(state);
+}
+
+static void IfElseBench32Contiguous(benchmark::State& state) {
+  return IfElseBenchContiguous<UInt32Type>(state);
+}
+
+BENCHMARK(IfElseBench32)->Args({elems, 0});
+BENCHMARK(IfElseBench64)->Args({elems, 0});
+
+BENCHMARK(IfElseBench32)->Args({elems, 99});
+BENCHMARK(IfElseBench64)->Args({elems, 99});
+
+BENCHMARK(IfElseBench32Contiguous)->Args({elems, 0});
+BENCHMARK(IfElseBench64Contiguous)->Args({elems, 0});
 
-BENCHMARK(IfElseBench32Wide)->Args({elems, 99});
-BENCHMARK(IfElseBench64Wide)->Args({elems, 99});
+BENCHMARK(IfElseBench32Contiguous)->Args({elems, 99});
+BENCHMARK(IfElseBench64Contiguous)->Args({elems, 99});
 
 }  // namespace compute
 }  // namespace arrow
\ No newline at end of file
diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
index 2b63af2f26f..c9347bc6a4b 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
@@ -16,6 +16,7 @@
 // under the License.
 
 #include <arrow/array.h>
+#include <arrow/array/concatenate.h>
 #include <arrow/compute/api_scalar.h>
 #include <arrow/compute/kernels/test_util.h>
 #include <arrow/testing/gtest_util.h>
@@ -56,8 +57,12 @@ TYPED_TEST(TestIfElsePrimitive, IfElseFixedSizeRand) {
 
   random::RandomArrayGenerator rand(/*seed=*/0);
   int64_t len = 1000;
-  auto cond = std::static_pointer_cast<BooleanArray>(
-      rand.ArrayOf(boolean(), len, /*null_probability=*/0.01));
+  ASSERT_OK_AND_ASSIGN(auto temp1, MakeArrayFromScalar(BooleanScalar(true), 64));
+  ASSERT_OK_AND_ASSIGN(auto temp2, MakeArrayFromScalar(BooleanScalar(false), 64));
+  auto temp3 = rand.ArrayOf(boolean(), len - 64 * 2, /*null_probability=*/0.01);
+  ASSERT_OK_AND_ASSIGN(auto concat, Concatenate({temp1, temp2, temp3}));
+  auto cond = std::static_pointer_cast<BooleanArray>(concat);
+
   auto left = std::static_pointer_cast<ArrayType>(
       rand.ArrayOf(type, len, /*null_probability=*/0.01));
   auto right = std::static_pointer_cast<ArrayType>(

From d9ee399b4185c15bb2fb52be36823a6a003bd9e8 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Thu, 17 Jun 2021 15:34:13 -0400
Subject: [PATCH 27/46] Update cpp/src/arrow/compute/kernels/scalar_boolean.cc

Co-authored-by: Benjamin Kietzman <bengilgit@gmail.com>
---
 cpp/src/arrow/compute/kernels/scalar_boolean.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_boolean.cc b/cpp/src/arrow/compute/kernels/scalar_boolean.cc
index 6de4ef16031..7a0e3654edb 100644
--- a/cpp/src/arrow/compute/kernels/scalar_boolean.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_boolean.cc
@@ -550,7 +550,6 @@ void RegisterScalarBoolean(FunctionRegistry* registry) {
   MakeFunction("or", 2, applicator::SimpleBinary<OrOp>, &or_doc, registry);
   MakeFunction("xor", 2, applicator::SimpleBinary<XorOp>, &xor_doc, registry);
 
-  // The Kleene logic kernels cannot write into sliced output bitmaps
   MakeFunction("and_kleene", 2, applicator::SimpleBinary<KleeneAndOp>, &and_kleene_doc,
                registry, NullHandling::COMPUTED_PREALLOCATE);
   MakeFunction("and_not_kleene", 2, applicator::SimpleBinary<KleeneAndNotOp>,

From 197e1c4663f8e9dc058d8a7b096761df7d137ecc Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Thu, 17 Jun 2021 15:51:34 -0400
Subject: [PATCH 28/46] Apply suggestions from code review

Co-authored-by: Benjamin Kietzman <bengilgit@gmail.com>
---
 cpp/src/arrow/util/bit_util.cc      | 6 +++---
 cpp/src/arrow/util/bit_util.h       | 6 +++---
 cpp/src/arrow/util/bit_util_test.cc | 9 +--------
 cpp/src/arrow/util/bitmap.h         | 9 ++-------
 4 files changed, 9 insertions(+), 21 deletions(-)

diff --git a/cpp/src/arrow/util/bit_util.cc b/cpp/src/arrow/util/bit_util.cc
index 9c0ef6bc9bf..47bf1563150 100644
--- a/cpp/src/arrow/util/bit_util.cc
+++ b/cpp/src/arrow/util/bit_util.cc
@@ -82,7 +82,7 @@ void SetBitmapImpl(uint8_t* data, int64_t offset, int64_t length) {
 
   constexpr uint8_t set_byte = value ? UINT8_MAX : 0;
 
-  int prologue = static_cast<int>(((offset + 7) / 8) * 8 - offset);
+  auto prologue = BitUtil::RoundUp(offset, 8) - offset;
   DCHECK_LT(prologue, 8);
 
   if (length < prologue) {  // special case where a mask is required
@@ -106,8 +106,8 @@ void SetBitmapImpl(uint8_t* data, int64_t offset, int64_t length) {
   // set values per byte
   DCHECK_EQ(offset % 8, 0);
   std::memset(data + offset / 8, set_byte, length / 8);
-  offset += ((length / 8) * 8);
-  length -= ((length / 8) * 8);
+  offset += BitUtil::RoundDown(length, 8);
+  length -= BitUtil::RoundDown(length, 8);
 
   // clean up
   DCHECK_LT(length, 8);
diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h
index f0a556e21b1..adce96308b1 100644
--- a/cpp/src/arrow/util/bit_util.h
+++ b/cpp/src/arrow/util/bit_util.h
@@ -324,9 +324,9 @@ void SetBitmap(uint8_t* data, int64_t offset, int64_t length);
 ARROW_EXPORT
 void ClearBitmap(uint8_t* data, int64_t offset, int64_t length);
 
-template <typename Word>
-constexpr Word WordBitMask(int i) {
-  return (static_cast<Word>(1) << i) - 1;
+template <typename Word, Word all = static_cast<Word>(~static_cast<Word>(0))>
+constexpr Word TrailingWordBitmask(int i) {
+  return ARROW_PREDICT_FALSE(i >= sizeof(Word) * 8) ? 0 : all << i;
 }
 
 /// \brief Create a word with low `n` bits from `low` and high `sizeof(Word)-n` bits
diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc
index 0fe39fa804b..bbd06d3cbbb 100644
--- a/cpp/src/arrow/util/bit_util_test.cc
+++ b/cpp/src/arrow/util/bit_util_test.cc
@@ -1984,7 +1984,7 @@ void CheckSplice(int n, Word low, Word high) {
                  : BitUtil::GetBit(reinterpret_cast<uint8_t*>(&high), i);
   }
 
-  ASSERT_EQ(static_cast<Word>(ret.to_ulong()), BitUtil::SpliceWord(n, low, high));
+  ASSERT_EQ(BitUtil::SpliceWord(n, low, high), static_cast<Word>(ret.to_ulong());
 }
 
 TEST(SpliceWord, SpliceWord) {
@@ -2262,13 +2262,6 @@ TEST_P(TestBitmapVisitAndWriteOutputNoOffset, Test1) {
   std::transform(v0.begin(), v0.end(), v2.begin(), out_v1.begin(),
                  std::logical_or<bool>());
 
-  //  std::cout << "v0: " << VectorToString(v0) << "\n"
-  //            << "b0: " << bm0.ToString() << "\n"
-  //            << "v1: " << VectorToString(v1) << "\n"
-  //            << "b1: " << bm1.ToString() << "\n"
-  //            << "v2: " << VectorToString(v2) << "\n"
-  //            << "b2: " << bm2.ToString() << "\n";
-
   VerifyBoolVectorAndBitmap(out_bms[0], out_v0);
   VerifyBoolVectorAndBitmap(out_bms[1], out_v1);
 }
diff --git a/cpp/src/arrow/util/bitmap.h b/cpp/src/arrow/util/bitmap.h
index 05cc7a309f8..78bfca8d408 100644
--- a/cpp/src/arrow/util/bitmap.h
+++ b/cpp/src/arrow/util/bitmap.h
@@ -76,11 +76,6 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
     return Bitmap(buffer_, offset_ + offset, length);
   }
 
-  void Stride(int64_t stride) {
-    this->offset_ += stride;
-    this->length_ -= stride;
-  }
-
   std::string ToString() const;
 
   bool Equals(const Bitmap& other) const;
@@ -293,8 +288,8 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
     output_words.fill(0);
 
     // every reader will have same number of words, since they are same length'ed
-    // todo this will be inefficient in some cases. When there are offsets beyond Word
-    //  boundary, every Word would have to be created from 2 adjoining Words
+    // TODO($JIRA) this will be inefficient in some cases. When there are offsets beyond Word
+    // boundary, every Word would have to be created from 2 adjoining Words
     auto n_words = readers[0].words();
     bit_length -= n_words * kBitWidth;
     while (n_words--) {

From 4c7f445aab1fbe14777776cb1bf5dcac60c9c18d Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Thu, 17 Jun 2021 19:42:13 -0400
Subject: [PATCH 29/46] adding PR comments

---
 cpp/src/arrow/util/bit_util.cc      |   4 +-
 cpp/src/arrow/util/bit_util.h       |  16 ++-
 cpp/src/arrow/util/bit_util_test.cc | 198 +++++++++-------------------
 cpp/src/arrow/util/bitmap.h         |   4 +-
 4 files changed, 76 insertions(+), 146 deletions(-)

diff --git a/cpp/src/arrow/util/bit_util.cc b/cpp/src/arrow/util/bit_util.cc
index 47bf1563150..b1ac21e8e41 100644
--- a/cpp/src/arrow/util/bit_util.cc
+++ b/cpp/src/arrow/util/bit_util.cc
@@ -94,12 +94,12 @@ void SetBitmapImpl(uint8_t* data, int64_t offset, int64_t length) {
     //                     pro
     uint8_t mask = BitUtil::kPrecedingBitmask[8 - prologue] ^
                    BitUtil::kPrecedingBitmask[8 - prologue + length];
-    data[offset / 8] |= mask;
+    data[offset / 8] = value ? data[offset / 8] | mask : data[offset / 8] & ~mask;
     return;
   }
 
   // align to a byte boundary
-  data[offset / 8] = BitUtil::SpliceWord(offset, data[offset / 8], set_byte);
+  data[offset / 8] = BitUtil::SpliceWord(prologue, data[offset / 8], set_byte);
   offset += prologue;
   length -= prologue;
 
diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h
index adce96308b1..a9775552c7b 100644
--- a/cpp/src/arrow/util/bit_util.h
+++ b/cpp/src/arrow/util/bit_util.h
@@ -324,9 +324,17 @@ void SetBitmap(uint8_t* data, int64_t offset, int64_t length);
 ARROW_EXPORT
 void ClearBitmap(uint8_t* data, int64_t offset, int64_t length);
 
-template <typename Word, Word all = static_cast<Word>(~static_cast<Word>(0))>
-constexpr Word TrailingWordBitmask(int i) {
-  return ARROW_PREDICT_FALSE(i >= sizeof(Word) * 8) ? 0 : all << i;
+/// Returns a mask with lower i bits set to 1. If i >= sizeof(Word)*8, all-ones will be
+/// returned
+/// ex:
+/// PrecedingWordBitmask<uint8_t>(0)= 0x00
+/// PrecedingWordBitmask<uint8_t>(4)= 0x0f
+/// PrecedingWordBitmask<uint8_t>(8)= 0xff
+/// PrecedingWordBitmask<uint32_t>(8)= 0x000000ff
+/// ref: https://stackoverflow.com/a/59523400
+template <typename Word>
+constexpr Word PrecedingWordBitmask(unsigned int const i) {
+  return (static_cast<Word>(i < sizeof(Word) * 8) << (i & (sizeof(Word) * 8 - 1))) - 1;
 }
 
 /// \brief Create a word with low `n` bits from `low` and high `sizeof(Word)-n` bits
@@ -337,7 +345,7 @@ constexpr Word TrailingWordBitmask(int i) {
 /// }
 template <typename Word>
 constexpr Word SpliceWord(int n, Word low, Word high) {
-  return (high & ~WordBitMask<Word>(n)) | (low & WordBitMask<Word>(n));
+  return (high & ~PrecedingWordBitmask<Word>(n)) | (low & PrecedingWordBitmask<Word>(n));
 }
 
 }  // namespace BitUtil
diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc
index bbd06d3cbbb..316a52de087 100644
--- a/cpp/src/arrow/util/bit_util_test.cc
+++ b/cpp/src/arrow/util/bit_util_test.cc
@@ -1984,26 +1984,36 @@ void CheckSplice(int n, Word low, Word high) {
                  : BitUtil::GetBit(reinterpret_cast<uint8_t*>(&high), i);
   }
 
-  ASSERT_EQ(BitUtil::SpliceWord(n, low, high), static_cast<Word>(ret.to_ulong());
+  Word res = BitUtil::SpliceWord<Word>(n, low, high);
+  Word exp = static_cast<Word>(ret.to_ulong());
+  assert(res == exp);
 }
 
 TEST(SpliceWord, SpliceWord) {
   uint64_t low = 123456789, high = 987654321;
 
+  static_assert(
+      BitUtil::PrecedingWordBitmask<uint8_t>(0) == BitUtil::kPrecedingBitmask[0], "");
+  static_assert(
+      BitUtil::PrecedingWordBitmask<uint8_t>(5) == BitUtil::kPrecedingBitmask[5], "");
+  static_assert(BitUtil::PrecedingWordBitmask<uint8_t>(8) == UINT8_MAX, "");
+
+  static_assert(BitUtil::PrecedingWordBitmask<uint64_t>(0) == uint64_t(0), "");
+  static_assert(BitUtil::PrecedingWordBitmask<uint64_t>(33) == 8589934591, "");
+  static_assert(BitUtil::PrecedingWordBitmask<uint64_t>(64) == UINT64_MAX, "");
+  static_assert(BitUtil::PrecedingWordBitmask<uint64_t>(65) == UINT64_MAX, "");
+
   CheckSplice<uint8_t>(0, static_cast<uint8_t>(low), static_cast<uint8_t>(high));
-  CheckSplice<uint8_t>(UINT8_MAX, static_cast<uint8_t>(low), static_cast<uint8_t>(high));
-  CheckSplice<uint8_t>(sizeof(uint8_t) / 3, static_cast<uint8_t>(low),
-                       static_cast<uint8_t>(high));
+  CheckSplice<uint8_t>(8, static_cast<uint8_t>(low), static_cast<uint8_t>(high));
+  CheckSplice<uint8_t>(8 / 3, static_cast<uint8_t>(low), static_cast<uint8_t>(high));
 
   CheckSplice<uint32_t>(0, static_cast<uint32_t>(low), static_cast<uint32_t>(high));
-  CheckSplice<uint32_t>(UINT32_MAX, static_cast<uint32_t>(low),
-                        static_cast<uint32_t>(high));
-  CheckSplice<uint32_t>(sizeof(uint32_t) / 3, static_cast<uint32_t>(low),
-                        static_cast<uint32_t>(high));
+  CheckSplice<uint32_t>(32, static_cast<uint32_t>(low), static_cast<uint32_t>(high));
+  CheckSplice<uint32_t>(32 / 3, static_cast<uint32_t>(low), static_cast<uint32_t>(high));
 
   CheckSplice(0, low, high);
-  CheckSplice(UINT32_MAX, low, high);
-  CheckSplice(sizeof(uint32_t) / 3, low, high);
+  CheckSplice(64, low, high);
+  CheckSplice(64 / 3, low, high);
 }
 
 // test the basic assumption of word level Bitmap::Visit
@@ -2187,62 +2197,29 @@ TEST(Bitmap, VisitWordsAnd) {
   }
 }
 
-void random_bool_vector(std::vector<bool>& vec, int64_t size, double p = 0.5) {
-  vec.reserve(size);
-  std::random_device rd;
-  std::mt19937 gen(rd());
-  std::bernoulli_distribution d(p);
-
-  for (int n = 0; n < size; ++n) {
-    vec.push_back(d(gen));
-  }
-}
-
-std::string VectorToString(const std::vector<bool>& v) {
-  std::string out(v.size() + +((v.size() - 1) / 8), ' ');
-  for (size_t i = 0; i < v.size(); ++i) {
-    out[i + (i / 8)] = v[i] ? '1' : '0';
-  }
-  return out;
-}
-
-void VerifyBoolVectorAndBitmap(const Bitmap& bitmap, const std::vector<bool>& expected) {
-  arrow::BooleanBuilder boolean_builder;
-  ASSERT_OK(boolean_builder.AppendValues(expected));
-  ASSERT_OK_AND_ASSIGN(auto arr, boolean_builder.Finish());
-
-  ASSERT_TRUE(BitmapEquals(bitmap.buffer()->data(), bitmap.offset(),
-                           arr->data()->buffers[1]->data(), 0, expected.size()))
-      << "exp: " << VectorToString(expected) << "\ngot: " << bitmap.ToString();
-}
-
-class TestBitmapVisitAndWriteOutputNoOffset : public ::testing::TestWithParam<int32_t> {};
-
-TEST_P(TestBitmapVisitAndWriteOutputNoOffset, Test1) {
-  auto part = GetParam();
-  int64_t bits = 4 * part;
-  std::vector<bool> data;
-  random_bool_vector(data, bits);
+void DoBitmapVisitAndWrite(int64_t part, bool with_offset) {
+  int64_t bits = part * 4;
 
-  arrow::BooleanBuilder boolean_builder;
-  ASSERT_OK(boolean_builder.AppendValues(data));
-  ASSERT_OK_AND_ASSIGN(auto arrow_data, boolean_builder.Finish());
+  random::RandomArrayGenerator rand(/*seed=*/0);
+  auto arrow_data = rand.ArrayOf(boolean(), bits, 0);
 
   std::shared_ptr<Buffer>& arrow_buffer = arrow_data->data()->buffers[1];
 
   Bitmap bm0(arrow_buffer, 0, part);
-  Bitmap bm1 = bm0.Slice(part * 1, part);  // this goes beyond bm0's len
-  Bitmap bm2 = bm0.Slice(part * 2, part);  // this goes beyond bm0's len
+  Bitmap bm1(arrow_buffer, part * 1, part);
+  Bitmap bm2(arrow_buffer, part * 2, part);
 
   std::array<Bitmap, 2> out_bms;
-  ASSERT_OK_AND_ASSIGN(auto out0, AllocateBitmap(part));
-  ASSERT_OK_AND_ASSIGN(auto out1, AllocateBitmap(part));
-  out_bms[0] = Bitmap(out0, 0, part);
-  out_bms[1] = Bitmap(out1, 0, part);
-
-  std::vector<bool> v0(data.begin(), data.begin() + part);
-  std::vector<bool> v1(data.begin() + part * 1, data.begin() + part * 2);
-  std::vector<bool> v2(data.begin() + part * 2, data.begin() + part * 3);
+  if (with_offset) {
+    ASSERT_OK_AND_ASSIGN(auto out, AllocateBitmap(part * 4));
+    out_bms[0] = Bitmap(out, part, part);
+    out_bms[1] = Bitmap(out, part * 2, part);
+  } else {
+    ASSERT_OK_AND_ASSIGN(auto out0, AllocateBitmap(part));
+    ASSERT_OK_AND_ASSIGN(auto out1, AllocateBitmap(part));
+    out_bms[0] = Bitmap(out0, 0, part);
+    out_bms[1] = Bitmap(out1, 0, part);
+  }
 
   // out0 = bm0 & bm1, out1= bm0 | bm2
   std::array<Bitmap, 3> in_bms{bm0, bm1, bm2};
@@ -2253,96 +2230,39 @@ TEST_P(TestBitmapVisitAndWriteOutputNoOffset, Test1) {
         out->at(1) = in[0] | in[2];
       });
 
-  std::vector<bool> out_v0(part);
-  std::vector<bool> out_v1(part);
-  // v3 = v0 & v1
-  std::transform(v0.begin(), v0.end(), v1.begin(), out_v0.begin(),
-                 std::logical_and<bool>());
-  // v3 |= v2
-  std::transform(v0.begin(), v0.end(), v2.begin(), out_v1.begin(),
-                 std::logical_or<bool>());
+  auto pool = MemoryPool::CreateDefault();
+  ASSERT_OK_AND_ASSIGN(auto exp_0,
+                       BitmapAnd(pool.get(), bm0.buffer()->data(), bm0.offset(),
+                                 bm1.buffer()->data(), bm1.offset(), part, 0));
+  ASSERT_OK_AND_ASSIGN(auto exp_1,
+                       BitmapOr(pool.get(), bm0.buffer()->data(), bm0.offset(),
+                                bm2.buffer()->data(), bm2.offset(), part, 0));
 
-  VerifyBoolVectorAndBitmap(out_bms[0], out_v0);
-  VerifyBoolVectorAndBitmap(out_bms[1], out_v1);
+  ASSERT_TRUE(BitmapEquals(exp_0->data(), 0, out_bms[0].buffer()->data(),
+                           out_bms[0].offset(), part))
+      << "exp: " << Bitmap(exp_0->data(), 0, part).ToString() << std::endl
+      << "got: " << out_bms[0].ToString();
+
+  ASSERT_TRUE(BitmapEquals(exp_1->data(), 0, out_bms[1].buffer()->data(),
+                           out_bms[1].offset(), part))
+      << "exp: " << Bitmap(exp_1->data(), 0, part).ToString() << std::endl
+      << "got: " << out_bms[1].ToString();
 }
 
-INSTANTIATE_TEST_SUITE_P(VisitWriteGeneral, TestBitmapVisitAndWriteOutputNoOffset,
+class TestBitmapVisitAndWrite : public ::testing::TestWithParam<int32_t> {};
+
+INSTANTIATE_TEST_SUITE_P(VisitWriteGeneral, TestBitmapVisitAndWrite,
                          testing::Values(199, 256, 1000));
 
-INSTANTIATE_TEST_SUITE_P(VisitWriteEdgeCases, TestBitmapVisitAndWriteOutputNoOffset,
+INSTANTIATE_TEST_SUITE_P(VisitWriteEdgeCases, TestBitmapVisitAndWrite,
                          testing::Values(5, 13, 21, 29, 37, 41, 51, 59, 64, 97));
 
-INSTANTIATE_TEST_SUITE_P(VisitWriteEdgeCases2, TestBitmapVisitAndWriteOutputNoOffset,
+INSTANTIATE_TEST_SUITE_P(VisitWriteEdgeCases2, TestBitmapVisitAndWrite,
                          testing::Values(8, 16, 24, 32, 40, 48, 56, 64));
 
-class TestBitmapVisitAndWriteOutputWithOffset : public ::testing::TestWithParam<int32_t> {
-};
-
-TEST_P(TestBitmapVisitAndWriteOutputWithOffset, Test2) {
-  auto part = GetParam();
-  int64_t bits = part * 4;
-  std::vector<bool> data;
-  random_bool_vector(data, bits);
-
-  arrow::BooleanBuilder boolean_builder;
-  ASSERT_OK(boolean_builder.AppendValues(data));
-  ASSERT_OK_AND_ASSIGN(auto arrow_data, boolean_builder.Finish());
+TEST_P(TestBitmapVisitAndWrite, NoOffset) { DoBitmapVisitAndWrite(GetParam(), false); }
 
-  std::shared_ptr<Buffer>& arrow_buffer = arrow_data->data()->buffers[1];
-
-  Bitmap bm0(arrow_buffer, 0, part);
-  Bitmap bm1(arrow_buffer, part * 1, part);
-  Bitmap bm2(arrow_buffer, part * 2, part);
-
-  std::array<Bitmap, 2> out_bms;
-  ASSERT_OK_AND_ASSIGN(auto out, AllocateBitmap(part * 4));
-  out_bms[0] = Bitmap(out, part, part);
-  out_bms[1] = Bitmap(out, part * 2, part);
-
-  std::vector<bool> v0(data.begin(), data.begin() + part);
-  std::vector<bool> v1(data.begin() + part * 1, data.begin() + part * 2);
-  std::vector<bool> v2(data.begin() + part * 2, data.begin() + part * 3);
-
-  //  std::cout << "v0: " << VectorToString(v0) << "\n"
-  //            << "b0: " << bm0.ToString() << "\n"
-  //            << "v1: " << VectorToString(v1) << "\n"
-  //            << "b1: " << bm1.ToString() << "\n"
-  //            << "v2: " << VectorToString(v2) << "\n"
-  //            << "b2: " << bm2.ToString() << "\n";
-
-  std::vector<bool> out_v0(part);
-  std::vector<bool> out_v1(part);
-  // v3 = v0 & v1
-  std::transform(v0.begin(), v0.end(), v1.begin(), out_v0.begin(),
-                 std::logical_and<bool>());
-  // v3 |= v2
-  std::transform(v0.begin(), v0.end(), v2.begin(), out_v1.begin(),
-                 std::logical_or<bool>());
-
-  //  std::cout << "out0: " << VectorToString(out_v0) << "\n"
-  //            << "out1: " << VectorToString(out_v1) << "\n";
-
-  // out0 = bm0 & bm1, out1= bm0 | bm2
-  std::array<Bitmap, 3> in_bms{bm0, bm1, bm2};
-  Bitmap::VisitWordsAndWrite(
-      in_bms, &out_bms,
-      [](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>* out) {
-        out->at(0) = in[0] & in[1];
-        out->at(1) = in[0] | in[2];
-      });
-
-  VerifyBoolVectorAndBitmap(out_bms[0], out_v0);
-  VerifyBoolVectorAndBitmap(out_bms[1], out_v1);
-}
-
-INSTANTIATE_TEST_SUITE_P(VisitWriteGeneral, TestBitmapVisitAndWriteOutputWithOffset,
-                         testing::Values(199, 256, 1000));
-
-INSTANTIATE_TEST_SUITE_P(VisitWriteEdgeCases, TestBitmapVisitAndWriteOutputWithOffset,
-                         testing::Values(7, 15, 23, 31, 39, 47, 55, 63, 73, 97));
-
-INSTANTIATE_TEST_SUITE_P(VisitWriteEdgeCases2, TestBitmapVisitAndWriteOutputWithOffset,
-                         testing::Values(8, 16, 24, 32, 40, 48, 56, 64));
+TEST_P(TestBitmapVisitAndWrite, WithOffset) { DoBitmapVisitAndWrite(GetParam(), true); }
 
 }  // namespace internal
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/bitmap.h b/cpp/src/arrow/util/bitmap.h
index 78bfca8d408..619135bab2d 100644
--- a/cpp/src/arrow/util/bitmap.h
+++ b/cpp/src/arrow/util/bitmap.h
@@ -315,7 +315,9 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
         std::array<Word, M> out_words;
         std::copy(in.begin(), in.end(), in_words.begin());
         visitor(in_words, &out_words);
-        std::move(out_words.begin(), out_words.end(), out->begin());
+        for (size_t i = 0; i < M; i++) {
+          out->at(i) = static_cast<uint8_t>(out_words[i]);
+        }
       };
 
       std::array<uint8_t, N> visited_bytes;

From b12a20d87605bf84a07759c5c16e034c0caa8f53 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Fri, 18 Jun 2021 08:50:14 -0400
Subject: [PATCH 30/46] fixing errors

---
 cpp/src/arrow/compute/kernels/scalar_if_else.cc | 4 ++--
 cpp/src/arrow/util/bit_util.cc                  | 5 +++--
 cpp/src/arrow/util/bit_util_test.cc             | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
index 4d31cdbcf5d..a71705816b8 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
@@ -212,7 +212,7 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
   using T = typename TypeTraits<Type>::CType;
   // A - Array
   // S - Scalar
-  using Word = uint64_t ;
+  using Word = uint64_t;
   static constexpr int64_t word_len = sizeof(Word) * 8;
 
   //  AAA
@@ -413,7 +413,7 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
     int64_t cnt = cond_reader.words();
     while (cnt--) {
       Word word = cond_reader.NextWord();
-      if (word == UINT64_MAX) { // all from left
+      if (word == UINT64_MAX) {  // all from left
         std::fill(out_values + data_offset, out_values + data_offset + word_len,
                   left_data);
       } else if (word) {  // selectively copy from left
diff --git a/cpp/src/arrow/util/bit_util.cc b/cpp/src/arrow/util/bit_util.cc
index b1ac21e8e41..e33b65b841a 100644
--- a/cpp/src/arrow/util/bit_util.cc
+++ b/cpp/src/arrow/util/bit_util.cc
@@ -82,7 +82,7 @@ void SetBitmapImpl(uint8_t* data, int64_t offset, int64_t length) {
 
   constexpr uint8_t set_byte = value ? UINT8_MAX : 0;
 
-  auto prologue = BitUtil::RoundUp(offset, 8) - offset;
+  auto prologue = static_cast<int32_t>(BitUtil::RoundUp(offset, 8) - offset);
   DCHECK_LT(prologue, 8);
 
   if (length < prologue) {  // special case where a mask is required
@@ -111,7 +111,8 @@ void SetBitmapImpl(uint8_t* data, int64_t offset, int64_t length) {
 
   // clean up
   DCHECK_LT(length, 8);
-  data[offset / 8] = BitUtil::SpliceWord(length, set_byte, data[offset / 8]);
+  data[offset / 8] =
+      BitUtil::SpliceWord(static_cast<int32_t>(length), set_byte, data[offset / 8]);
 }
 
 void SetBitmap(uint8_t* data, int64_t offset, int64_t length) {
diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc
index 316a52de087..4568dc4219d 100644
--- a/cpp/src/arrow/util/bit_util_test.cc
+++ b/cpp/src/arrow/util/bit_util_test.cc
@@ -1986,7 +1986,7 @@ void CheckSplice(int n, Word low, Word high) {
 
   Word res = BitUtil::SpliceWord<Word>(n, low, high);
   Word exp = static_cast<Word>(ret.to_ulong());
-  assert(res == exp);
+  ASSERT_EQ(exp, res) << "exp: " << exp << " got: " << res << std::endl;
 }
 
 TEST(SpliceWord, SpliceWord) {

From a20e2959440ca54c1a8ee61a0a690ed806041ee6 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Fri, 18 Jun 2021 15:24:34 -0400
Subject: [PATCH 31/46] simplifying if-else

---
 .../arrow/compute/kernels/scalar_if_else.cc   | 148 ++++++------------
 1 file changed, 47 insertions(+), 101 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
index a71705816b8..e59b9828ee0 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
@@ -215,36 +215,24 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
   using Word = uint64_t;
   static constexpr int64_t word_len = sizeof(Word) * 8;
 
-  //  AAA
-  static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
-                     const ArrayData& right, ArrayData* out) {
-    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> out_buf,
-                          ctx->Allocate(cond.length * sizeof(T)));
-    T* out_values = reinterpret_cast<T*>(out_buf->mutable_data());
-
-    // copy right data to out_buff
-    const T* right_data = right.GetValues<T>(1);
-    std::memcpy(out_values, right_data, right.length * sizeof(T));
-
-    const auto* cond_data = cond.buffers[1]->data();  // this is a BoolArray
-    BitmapWordReader<Word> cond_reader(cond_data, cond.offset, cond.length);
-
-    // selectively copy values from left data
-    const T* left_data = left.GetValues<T>(1);
+  template <typename HandleBulk, typename HandleEach>
+  static void RunIfElseLoop(const ArrayData& cond, HandleBulk handle_bulk,
+                            HandleEach handle_each) {
     int64_t data_offset = 0;
     int64_t bit_offset = cond.offset;
+    const auto* cond_data = cond.buffers[1]->data();  // this is a BoolArray
+
+    BitmapWordReader<Word> cond_reader(cond_data, cond.offset, cond.length);
 
-    // todo this can be improved by intrinsics. ex: _mm*_mask_store_e* (vmovdqa*)
     int64_t cnt = cond_reader.words();
     while (cnt--) {
       Word word = cond_reader.NextWord();
       if (word == UINT64_MAX) {
-        std::memcpy(out_values + data_offset, left_data + data_offset,
-                    word_len * sizeof(T));
+        handle_bulk(data_offset, word_len);
       } else if (word) {
         for (int64_t i = 0; i < word_len; ++i) {
           if (BitUtil::GetBit(cond_data, bit_offset + i)) {
-            out_values[data_offset + i] = left_data[data_offset + i];
+            handle_each(data_offset + i);
           }
         }
       }
@@ -257,17 +245,40 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
       int valid_bits;
       uint8_t byte = cond_reader.NextTrailingByte(valid_bits);
       if (byte == UINT8_MAX && valid_bits == 8) {
-        std::memcpy(out_values + data_offset, left_data + data_offset, 8 * sizeof(T));
+        handle_bulk(data_offset, 8);
       } else if (byte) {
         for (int i = 0; i < valid_bits; ++i) {
           if (BitUtil::GetBit(cond_data, bit_offset + i)) {
-            out_values[data_offset + i] = left_data[data_offset + i];
+            handle_each(data_offset + i);
           }
         }
       }
       data_offset += 8;
       bit_offset += 8;
     }
+  }
+
+  //  AAA
+  static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+                     const ArrayData& right, ArrayData* out) {
+    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> out_buf,
+                          ctx->Allocate(cond.length * sizeof(T)));
+    T* out_values = reinterpret_cast<T*>(out_buf->mutable_data());
+
+    // copy right data to out_buff
+    const T* right_data = right.GetValues<T>(1);
+    std::memcpy(out_values, right_data, right.length * sizeof(T));
+
+    // selectively copy values from left data
+    const T* left_data = left.GetValues<T>(1);
+
+    RunIfElseLoop(
+        cond,
+        [&](int64_t data_offset, int64_t num_elems) {
+          std::memcpy(out_values + data_offset, left_data + data_offset,
+                      num_elems * sizeof(T));
+        },
+        [&](int64_t data_offset) { out_values[data_offset] = left_data[data_offset]; });
 
     out->buffers[1] = std::move(out_buf);
     return Status::OK();
@@ -284,48 +295,16 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
     const T* right_data = right.GetValues<T>(1);
     std::memcpy(out_values, right_data, right.length * sizeof(T));
 
-    const auto* cond_data = cond.buffers[1]->data();  // this is a BoolArray
-    BitmapWordReader<Word> cond_reader(cond_data, cond.offset, cond.length);
-
     // selectively copy values from left data
     T left_data = internal::UnboxScalar<Type>::Unbox(left);
-    int64_t data_offset = 0;
-    int64_t bit_offset = cond.offset;
 
-    // todo this can be improved by intrinsics. ex: _mm*_mask_store_e* (vmovdqa*)
-    int64_t cnt = cond_reader.words();
-    while (cnt--) {
-      Word word = cond_reader.NextWord();
-      if (word == UINT64_MAX) {
-        std::fill(out_values + data_offset, out_values + data_offset + word_len,
-                  left_data);
-      } else if (word) {
-        for (int64_t i = 0; i < word_len; ++i) {
-          if (BitUtil::GetBit(cond_data, bit_offset + i)) {
-            out_values[data_offset + i] = left_data;
-          }
-        }
-      }
-      data_offset += word_len;
-      bit_offset += word_len;
-    }
-
-    cnt = cond_reader.trailing_bytes();
-    while (cnt--) {
-      int valid_bits;
-      uint8_t byte = cond_reader.NextTrailingByte(valid_bits);
-      if (byte == UINT8_MAX && valid_bits == 8) {
-        std::fill(out_values + data_offset, out_values + data_offset + 8, left_data);
-      } else if (byte) {
-        for (int i = 0; i < valid_bits; ++i) {
-          if (BitUtil::GetBit(cond_data, bit_offset + i)) {
-            out_values[data_offset + i] = left_data;
-          }
-        }
-      }
-      data_offset += 8;
-      bit_offset += 8;
-    }
+    RunIfElseLoop(
+        cond,
+        [&](int64_t data_offset, int64_t num_elems) {
+          std::fill(out_values + data_offset, out_values + data_offset + num_elems,
+                    left_data);
+        },
+        [&](int64_t data_offset) { out_values[data_offset] = left_data; });
 
     out->buffers[1] = std::move(out_buf);
     return Status::OK();
@@ -401,48 +380,15 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
     T right_data = internal::UnboxScalar<Type>::Unbox(right);
     std::fill(out_values, out_values + cond.length, right_data);
 
-    const auto* cond_data = cond.buffers[1]->data();  // this is a BoolArray
-    BitmapWordReader<Word> cond_reader(cond_data, cond.offset, cond.length);
-
     // selectively copy values from left data
     T left_data = internal::UnboxScalar<Type>::Unbox(left);
-    int64_t data_offset = 0;
-    int64_t bit_offset = cond.offset;
-
-    // todo this can be improved by intrinsics. ex: _mm*_mask_store_e* (vmovdqa*)
-    int64_t cnt = cond_reader.words();
-    while (cnt--) {
-      Word word = cond_reader.NextWord();
-      if (word == UINT64_MAX) {  // all from left
-        std::fill(out_values + data_offset, out_values + data_offset + word_len,
-                  left_data);
-      } else if (word) {  // selectively copy from left
-        for (int64_t i = 0; i < word_len; ++i) {
-          if (BitUtil::GetBit(cond_data, bit_offset + i)) {
-            out_values[data_offset + i] = left_data;
-          }
-        }
-      }
-      data_offset += word_len;
-      bit_offset += word_len;
-    }
-
-    cnt = cond_reader.trailing_bytes();
-    while (cnt--) {
-      int valid_bits;
-      uint8_t byte = cond_reader.NextTrailingByte(valid_bits);
-      if (byte == UINT8_MAX && valid_bits == 8) {
-        std::fill(out_values + data_offset, out_values + data_offset + 8, left_data);
-      } else if (byte) {
-        for (int i = 0; i < valid_bits; ++i) {
-          if (BitUtil::GetBit(cond_data, bit_offset + i)) {
-            out_values[data_offset + i] = left_data;
-          }
-        }
-      }
-      data_offset += 8;
-      bit_offset += 8;
-    }
+    RunIfElseLoop(
+        cond,
+        [&](int64_t data_offset, int64_t num_elems) {
+          std::fill(out_values + data_offset, out_values + data_offset + num_elems,
+                    left_data);
+        },
+        [&](int64_t data_offset) { out_values[data_offset] = left_data; });
 
     out->buffers[1] = std::move(out_buf);
     return Status::OK();

From e37be50942c1e47bd46ec2c71b406f1c3b70de27 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Fri, 18 Jun 2021 16:07:27 -0400
Subject: [PATCH 32/46] simplifying if-else

---
 .../arrow/compute/kernels/scalar_if_else.cc   | 79 ++++++++-----------
 1 file changed, 31 insertions(+), 48 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
index e59b9828ee0..69426bb630a 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
@@ -215,7 +215,15 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
   using Word = uint64_t;
   static constexpr int64_t word_len = sizeof(Word) * 8;
 
-  template <typename HandleBulk, typename HandleEach>
+  /// Runs the main if_else loop. Here, it is expected that the right data has already
+  /// been copied to the output.
+  /// If invert_mask is meant to invert the cond.data. If is set to ~Word(0), then the
+  /// buffer will be inverted before calling the handle_bulk or handle_each functions.
+  /// This is useful, when left is an array and right is scalar. Then rather than
+  /// copying data from the right to output, we can copy left data to the output and
+  /// invert the cond data to fill right values. Filling out with a scalar is presumed to
+  /// be more efficient than filling with an array
+  template <typename HandleBulk, typename HandleEach, Word invert_mask = Word(0)>
   static void RunIfElseLoop(const ArrayData& cond, HandleBulk handle_bulk,
                             HandleEach handle_each) {
     int64_t data_offset = 0;
@@ -227,11 +235,12 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
     int64_t cnt = cond_reader.words();
     while (cnt--) {
       Word word = cond_reader.NextWord();
-      if (word == UINT64_MAX) {
+      if ((word ^ invert_mask) == UINT64_MAX) {
         handle_bulk(data_offset, word_len);
-      } else if (word) {
+      } else if (word ^ invert_mask) {
         for (int64_t i = 0; i < word_len; ++i) {
-          if (BitUtil::GetBit(cond_data, bit_offset + i)) {
+          if (BitUtil::GetBit(cond_data, bit_offset + i) ^
+              static_cast<bool>(invert_mask)) {
             handle_each(data_offset + i);
           }
         }
@@ -244,11 +253,12 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
     while (cnt--) {
       int valid_bits;
       uint8_t byte = cond_reader.NextTrailingByte(valid_bits);
-      if (byte == UINT8_MAX && valid_bits == 8) {
+      if (((byte ^ static_cast<uint8_t>(invert_mask)) == UINT8_MAX) && valid_bits == 8) {
         handle_bulk(data_offset, 8);
-      } else if (byte) {
+      } else if (byte ^ static_cast<uint8_t>(invert_mask)) {
         for (int i = 0; i < valid_bits; ++i) {
-          if (BitUtil::GetBit(cond_data, bit_offset + i)) {
+          if (BitUtil::GetBit(cond_data, bit_offset + i) ^
+              static_cast<bool>(invert_mask)) {
             handle_each(data_offset + i);
           }
         }
@@ -258,6 +268,13 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
     }
   }
 
+  template <typename HandleBulk, typename HandleEach>
+  static void RunIfElseLoopInverted(const ArrayData& cond, HandleBulk handle_bulk,
+                                    HandleEach handle_each) {
+    return RunIfElseLoop<HandleBulk, HandleEach, ~Word(0)>(cond, handle_bulk,
+                                                           handle_each);
+  }
+
   //  AAA
   static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
                      const ArrayData& right, ArrayData* out) {
@@ -321,49 +338,15 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
     const T* left_data = left.GetValues<T>(1);
     std::memcpy(out_values, left_data, left.length * sizeof(T));
 
-    const auto* cond_data = cond.buffers[1]->data();  // this is a BoolArray
-    BitmapWordReader<Word> cond_reader(cond_data, cond.offset, cond.length);
-
-    // selectively copy values from left data
     T right_data = internal::UnboxScalar<Type>::Unbox(right);
-    int64_t data_offset = 0;
-    int64_t bit_offset = cond.offset;
-
-    // todo this can be improved by intrinsics. ex: _mm*_mask_store_e* (vmovdqa*)
-    // left data is already in the output buffer. Therefore, mask needs to be inverted
-    int64_t cnt = cond_reader.words();
-    while (cnt--) {
-      Word word = cond_reader.NextWord();
-      if (word == 0) {  // all from right
-        std::fill(out_values + data_offset, out_values + data_offset + word_len,
-                  right_data);
-      } else if (word != UINT64_MAX) {  // selectively copy from right
-        for (int64_t i = 0; i < word_len; ++i) {
-          if (!BitUtil::GetBit(cond_data, bit_offset + i)) {
-            out_values[data_offset + i] = right_data;
-          }
-        }
-      }
-      data_offset += word_len;
-      bit_offset += word_len;
-    }
 
-    cnt = cond_reader.trailing_bytes();
-    while (cnt--) {
-      int valid_bits;
-      uint8_t byte = cond_reader.NextTrailingByte(valid_bits);
-      if (byte == 0 && valid_bits == 8) {
-        std::fill(out_values + data_offset, out_values + data_offset + 8, right_data);
-      } else if (byte != UINT8_MAX) {
-        for (int i = 0; i < valid_bits; ++i) {
-          if (!BitUtil::GetBit(cond_data, bit_offset + i)) {
-            out_values[data_offset + i] = right_data;
-          }
-        }
-      }
-      data_offset += 8;
-      bit_offset += 8;
-    }
+    RunIfElseLoopInverted(
+        cond,
+        [&](int64_t data_offset, int64_t num_elems) {
+          std::fill(out_values + data_offset, out_values + data_offset + num_elems,
+                    right_data);
+        },
+        [&](int64_t data_offset) { out_values[data_offset] = right_data; });
 
     out->buffers[1] = std::move(out_buf);
     return Status::OK();

From 6c71c36c4f1141c8a728d0d5a0641f3b9740d147 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Fri, 18 Jun 2021 18:09:35 -0400
Subject: [PATCH 33/46] fixing errors

---
 cpp/src/arrow/util/bit_util.h       |  2 +-
 cpp/src/arrow/util/bit_util_test.cc | 37 ++++++++++-------------------
 2 files changed, 13 insertions(+), 26 deletions(-)

diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h
index a9775552c7b..95969dbd2da 100644
--- a/cpp/src/arrow/util/bit_util.h
+++ b/cpp/src/arrow/util/bit_util.h
@@ -340,7 +340,7 @@ constexpr Word PrecedingWordBitmask(unsigned int const i) {
 /// \brief Create a word with low `n` bits from `low` and high `sizeof(Word)-n` bits
 /// from `high`.
 /// Word ret
-/// for (i = 0; i < sizeof(Word); i++){
+/// for (i = 0; i < sizeof(Word)*8; i++){
 ///     ret[i]= i < n ? low[i]: high[i];
 /// }
 template <typename Word>
diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc
index 4568dc4219d..2b42e3b34e4 100644
--- a/cpp/src/arrow/util/bit_util_test.cc
+++ b/cpp/src/arrow/util/bit_util_test.cc
@@ -1975,23 +1975,7 @@ TEST(BitUtil, BitsetStack) {
   ASSERT_EQ(stack.TopSize(), 0);
 }
 
-template <typename Word>
-void CheckSplice(int n, Word low, Word high) {
-  std::bitset<sizeof(Word) * 8> ret;
-  for (size_t i = 0; i < ret.size(); i++) {
-    ret[i] = i < static_cast<size_t>(n)
-                 ? BitUtil::GetBit(reinterpret_cast<uint8_t*>(&low), i)
-                 : BitUtil::GetBit(reinterpret_cast<uint8_t*>(&high), i);
-  }
-
-  Word res = BitUtil::SpliceWord<Word>(n, low, high);
-  Word exp = static_cast<Word>(ret.to_ulong());
-  ASSERT_EQ(exp, res) << "exp: " << exp << " got: " << res << std::endl;
-}
-
 TEST(SpliceWord, SpliceWord) {
-  uint64_t low = 123456789, high = 987654321;
-
   static_assert(
       BitUtil::PrecedingWordBitmask<uint8_t>(0) == BitUtil::kPrecedingBitmask[0], "");
   static_assert(
@@ -2003,17 +1987,20 @@ TEST(SpliceWord, SpliceWord) {
   static_assert(BitUtil::PrecedingWordBitmask<uint64_t>(64) == UINT64_MAX, "");
   static_assert(BitUtil::PrecedingWordBitmask<uint64_t>(65) == UINT64_MAX, "");
 
-  CheckSplice<uint8_t>(0, static_cast<uint8_t>(low), static_cast<uint8_t>(high));
-  CheckSplice<uint8_t>(8, static_cast<uint8_t>(low), static_cast<uint8_t>(high));
-  CheckSplice<uint8_t>(8 / 3, static_cast<uint8_t>(low), static_cast<uint8_t>(high));
+  ASSERT_EQ(BitUtil::SpliceWord<uint8_t>(0, 0x12, 0xef), 0xef);
+  ASSERT_EQ(BitUtil::SpliceWord<uint8_t>(8, 0x12, 0xef), 0x12);
+  ASSERT_EQ(BitUtil::SpliceWord<uint8_t>(3, 0x12, 0xef), 0xea);
 
-  CheckSplice<uint32_t>(0, static_cast<uint32_t>(low), static_cast<uint32_t>(high));
-  CheckSplice<uint32_t>(32, static_cast<uint32_t>(low), static_cast<uint32_t>(high));
-  CheckSplice<uint32_t>(32 / 3, static_cast<uint32_t>(low), static_cast<uint32_t>(high));
+  ASSERT_EQ(BitUtil::SpliceWord<uint32_t>(0, 0x12345678, 0xfedcba98), 0xfedcba98);
+  ASSERT_EQ(BitUtil::SpliceWord<uint32_t>(32, 0x12345678, 0xfedcba98), 0x12345678);
+  ASSERT_EQ(BitUtil::SpliceWord<uint32_t>(24, 0x12345678, 0xfedcba98), 0xfe345678);
 
-  CheckSplice(0, low, high);
-  CheckSplice(64, low, high);
-  CheckSplice(64 / 3, low, high);
+  ASSERT_EQ(BitUtil::SpliceWord<uint64_t>(0, 0x0123456789abcdef, 0xfedcba9876543210),
+            0xfedcba9876543210);
+  ASSERT_EQ(BitUtil::SpliceWord<uint64_t>(64, 0x0123456789abcdef, 0xfedcba9876543210),
+            0x0123456789abcdef);
+  ASSERT_EQ(BitUtil::SpliceWord<uint64_t>(48, 0x0123456789abcdef, 0xfedcba9876543210),
+            0xfedc456789abcdef);
 }
 
 // test the basic assumption of word level Bitmap::Visit

From aeb48ae03562b398c19728babfc95f026eb2bcac Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Sat, 19 Jun 2021 11:10:50 -0400
Subject: [PATCH 34/46] attempting to fix msvc error

---
 cpp/src/arrow/compute/kernels/scalar_if_else.cc | 4 ++--
 cpp/src/arrow/util/bitmap.h                     | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
index 69426bb630a..7b4be7ed46c 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
@@ -239,7 +239,7 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
         handle_bulk(data_offset, word_len);
       } else if (word ^ invert_mask) {
         for (int64_t i = 0; i < word_len; ++i) {
-          if (BitUtil::GetBit(cond_data, bit_offset + i) ^
+          if (BitUtil::GetBit(cond_data, bit_offset + i) !=
               static_cast<bool>(invert_mask)) {
             handle_each(data_offset + i);
           }
@@ -257,7 +257,7 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
         handle_bulk(data_offset, 8);
       } else if (byte ^ static_cast<uint8_t>(invert_mask)) {
         for (int i = 0; i < valid_bits; ++i) {
-          if (BitUtil::GetBit(cond_data, bit_offset + i) ^
+          if (BitUtil::GetBit(cond_data, bit_offset + i) !=
               static_cast<bool>(invert_mask)) {
             handle_each(data_offset + i);
           }
diff --git a/cpp/src/arrow/util/bitmap.h b/cpp/src/arrow/util/bitmap.h
index 619135bab2d..674ff96ca5d 100644
--- a/cpp/src/arrow/util/bitmap.h
+++ b/cpp/src/arrow/util/bitmap.h
@@ -288,8 +288,8 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
     output_words.fill(0);
 
     // every reader will have same number of words, since they are same length'ed
-    // TODO($JIRA) this will be inefficient in some cases. When there are offsets beyond Word
-    // boundary, every Word would have to be created from 2 adjoining Words
+    // TODO($JIRA) this will be inefficient in some cases. When there are offsets beyond
+    //  Word boundary, every Word would have to be created from 2 adjoining Words
     auto n_words = readers[0].words();
     bit_length -= n_words * kBitWidth;
     while (n_words--) {

From 4519dd3585fecef8645267a2ae457554aa9be0a6 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Sat, 19 Jun 2021 14:14:58 -0400
Subject: [PATCH 35/46] lint fix

---
 cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc
index 937921a05b2..98fb675da40 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc
@@ -110,4 +110,4 @@ BENCHMARK(IfElseBench32Contiguous)->Args({elems, 99});
 BENCHMARK(IfElseBench64Contiguous)->Args({elems, 99});
 
 }  // namespace compute
-}  // namespace arrow
\ No newline at end of file
+}  // namespace arrow

From 30ec72ec7aa7e6744521168b84790fbf238501b9 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Sat, 19 Jun 2021 14:51:20 -0400
Subject: [PATCH 36/46] fixing the down casting issue

---
 .../arrow/compute/kernels/scalar_if_else.cc   | 55 +++++++++++++------
 1 file changed, 38 insertions(+), 17 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
index 7b4be7ed46c..f83b01df913 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
@@ -223,7 +223,7 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
   /// copying data from the right to output, we can copy left data to the output and
   /// invert the cond data to fill right values. Filling out with a scalar is presumed to
   /// be more efficient than filling with an array
-  template <typename HandleBulk, typename HandleEach, Word invert_mask = Word(0)>
+  template <typename HandleBulk, typename HandleEach, bool invert = false>
   static void RunIfElseLoop(const ArrayData& cond, HandleBulk handle_bulk,
                             HandleEach handle_each) {
     int64_t data_offset = 0;
@@ -235,13 +235,24 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
     int64_t cnt = cond_reader.words();
     while (cnt--) {
       Word word = cond_reader.NextWord();
-      if ((word ^ invert_mask) == UINT64_MAX) {
-        handle_bulk(data_offset, word_len);
-      } else if (word ^ invert_mask) {
-        for (int64_t i = 0; i < word_len; ++i) {
-          if (BitUtil::GetBit(cond_data, bit_offset + i) !=
-              static_cast<bool>(invert_mask)) {
-            handle_each(data_offset + i);
+      if (invert) {
+        if (word == 0) {
+          handle_bulk(data_offset, word_len);
+        } else if (word != UINT64_MAX) {
+          for (int64_t i = 0; i < word_len; ++i) {
+            if (!BitUtil::GetBit(cond_data, bit_offset + i)) {
+              handle_each(data_offset + i);
+            }
+          }
+        }
+      } else {
+        if (word == UINT64_MAX) {
+          handle_bulk(data_offset, word_len);
+        } else if (word) {
+          for (int64_t i = 0; i < word_len; ++i) {
+            if (BitUtil::GetBit(cond_data, bit_offset + i)) {
+              handle_each(data_offset + i);
+            }
           }
         }
       }
@@ -253,13 +264,24 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
     while (cnt--) {
       int valid_bits;
       uint8_t byte = cond_reader.NextTrailingByte(valid_bits);
-      if (((byte ^ static_cast<uint8_t>(invert_mask)) == UINT8_MAX) && valid_bits == 8) {
-        handle_bulk(data_offset, 8);
-      } else if (byte ^ static_cast<uint8_t>(invert_mask)) {
-        for (int i = 0; i < valid_bits; ++i) {
-          if (BitUtil::GetBit(cond_data, bit_offset + i) !=
-              static_cast<bool>(invert_mask)) {
-            handle_each(data_offset + i);
+      if (invert) {
+        if (byte == 0 && valid_bits == 8) {
+          handle_bulk(data_offset, 8);
+        } else if (byte != UINT8_MAX) {
+          for (int i = 0; i < valid_bits; ++i) {
+            if (!BitUtil::GetBit(cond_data, bit_offset + i)) {
+              handle_each(data_offset + i);
+            }
+          }
+        }
+      } else {
+        if (byte == UINT8_MAX && valid_bits == 8) {
+          handle_bulk(data_offset, 8);
+        } else if (byte) {
+          for (int i = 0; i < valid_bits; ++i) {
+            if (BitUtil::GetBit(cond_data, bit_offset + i)) {
+              handle_each(data_offset + i);
+            }
           }
         }
       }
@@ -271,8 +293,7 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
   template <typename HandleBulk, typename HandleEach>
   static void RunIfElseLoopInverted(const ArrayData& cond, HandleBulk handle_bulk,
                                     HandleEach handle_each) {
-    return RunIfElseLoop<HandleBulk, HandleEach, ~Word(0)>(cond, handle_bulk,
-                                                           handle_each);
+    return RunIfElseLoop<HandleBulk, HandleEach, true>(cond, handle_bulk, handle_each);
   }
 
   //  AAA

From 0e4f1a050a0c2bcb7727bf51104b28b82b3fd4a8 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Sat, 19 Jun 2021 14:52:30 -0400
Subject: [PATCH 37/46] fixing the down casting issue

---
 cpp/src/arrow/compute/kernels/scalar_if_else.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
index f83b01df913..aa1962e7919 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
@@ -217,7 +217,7 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
 
   /// Runs the main if_else loop. Here, it is expected that the right data has already
   /// been copied to the output.
-  /// If invert_mask is meant to invert the cond.data. If is set to ~Word(0), then the
+  /// If `invert` is meant to invert the cond.data. If is set to `true`, then the
   /// buffer will be inverted before calling the handle_bulk or handle_each functions.
   /// This is useful, when left is an array and right is scalar. Then rather than
   /// copying data from the right to output, we can copy left data to the output and

From f5a14c0709f6ac58a8ce1b4a648daa9676160393 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Sat, 19 Jun 2021 17:22:24 -0400
Subject: [PATCH 38/46] refactor

---
 cpp/src/arrow/util/CMakeLists.txt                               | 2 +-
 .../{bit_util_benchmark_temp.cc => bitmap_reader_benchmark.cc}  | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename cpp/src/arrow/util/{bit_util_benchmark_temp.cc => bitmap_reader_benchmark.cc} (100%)

diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt
index 1851a9afa5e..660fb2657b6 100644
--- a/cpp/src/arrow/util/CMakeLists.txt
+++ b/cpp/src/arrow/util/CMakeLists.txt
@@ -79,6 +79,7 @@ add_arrow_test(threading-utility-test
 
 add_arrow_benchmark(bit_block_counter_benchmark)
 add_arrow_benchmark(bit_util_benchmark)
+add_arrow_benchmark(bitmap_reader_benchmark)
 add_arrow_benchmark(cache_benchmark)
 add_arrow_benchmark(compression_benchmark)
 add_arrow_benchmark(decimal_benchmark)
@@ -93,4 +94,3 @@ add_arrow_benchmark(trie_benchmark)
 add_arrow_benchmark(utf8_util_benchmark)
 add_arrow_benchmark(value_parsing_benchmark)
 add_arrow_benchmark(variant_benchmark)
-add_arrow_benchmark(bit_util_benchmark_temp)
diff --git a/cpp/src/arrow/util/bit_util_benchmark_temp.cc b/cpp/src/arrow/util/bitmap_reader_benchmark.cc
similarity index 100%
rename from cpp/src/arrow/util/bit_util_benchmark_temp.cc
rename to cpp/src/arrow/util/bitmap_reader_benchmark.cc

From cfb88f8c7c2f5d91083be6d7d1fee3f4d6f1bc2f Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Mon, 21 Jun 2021 16:42:29 -0400
Subject: [PATCH 39/46] adding set/clearbitmap tests

---
 cpp/src/arrow/util/bit_util.cc      |  2 +-
 cpp/src/arrow/util/bit_util_test.cc | 37 +++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/cpp/src/arrow/util/bit_util.cc b/cpp/src/arrow/util/bit_util.cc
index e33b65b841a..ee4bcde7713 100644
--- a/cpp/src/arrow/util/bit_util.cc
+++ b/cpp/src/arrow/util/bit_util.cc
@@ -99,7 +99,7 @@ void SetBitmapImpl(uint8_t* data, int64_t offset, int64_t length) {
   }
 
   // align to a byte boundary
-  data[offset / 8] = BitUtil::SpliceWord(prologue, data[offset / 8], set_byte);
+  data[offset / 8] = BitUtil::SpliceWord(8 - prologue, data[offset / 8], set_byte);
   offset += prologue;
   length -= prologue;
 
diff --git a/cpp/src/arrow/util/bit_util_test.cc b/cpp/src/arrow/util/bit_util_test.cc
index 2b42e3b34e4..ded37398f95 100644
--- a/cpp/src/arrow/util/bit_util_test.cc
+++ b/cpp/src/arrow/util/bit_util_test.cc
@@ -1532,6 +1532,43 @@ TEST(BitUtilTests, TestSetBitsTo) {
   }
 }
 
+TEST(BitUtilTests, TestSetBitmap) {
+  using BitUtil::SetBitsTo;  // NOTE(review): not referenced within this test
+  for (const auto fill_byte_int : {0xff}) {  // only an all-ones fill is exercised
+    const uint8_t fill_byte = static_cast<uint8_t>(fill_byte_int);
+    {
+      // set bits [2, 4) and clear bits [4, 6): both ranges lie within the first byte
+      uint8_t bitmap[] = {fill_byte, fill_byte, fill_byte, fill_byte};
+      BitUtil::SetBitmap(bitmap, 2, 2);
+      BitUtil::ClearBitmap(bitmap, 4, 2);
+      ASSERT_BYTES_EQ(bitmap, {static_cast<uint8_t>((fill_byte & ~0x3C) | 0xC)});
+    }
+    {
+      // set bits [4, 11) and clear bits [11, 18): each range straddles a byte boundary
+      uint8_t bitmap[] = {fill_byte, fill_byte, fill_byte, fill_byte};
+      BitUtil::SetBitmap(bitmap, 4, 7);
+      BitUtil::ClearBitmap(bitmap, 11, 7);
+      ASSERT_BYTES_EQ(bitmap, {static_cast<uint8_t>((fill_byte & 0xF) | 0xF0), 0x7,
+                               static_cast<uint8_t>(fill_byte & ~0x3)});
+    }
+    {
+      // set bits [4, 8) and clear bits [8, 16): both ranges end on a byte boundary
+      uint8_t bitmap[] = {fill_byte, fill_byte, fill_byte, fill_byte};
+      BitUtil::SetBitmap(bitmap, 4, 4);
+      BitUtil::ClearBitmap(bitmap, 8, 8);
+      ASSERT_BYTES_EQ(bitmap,
+                      {static_cast<uint8_t>((fill_byte & 0xF) | 0xF0), 0x00, fill_byte});
+    }
+    {
+      // clear bits [0, 24): byte-aligned range covering three whole bytes
+      uint8_t bitmap[] = {fill_byte, fill_byte, fill_byte, fill_byte};
+      BitUtil::ClearBitmap(bitmap, 0, 24);
+      uint8_t false_byte = static_cast<uint8_t>(0);
+      ASSERT_BYTES_EQ(bitmap, {false_byte, false_byte, false_byte, fill_byte});
+    }
+  }
+}
+
 TEST(BitUtilTests, TestCopyBitmap) {
   const int kBufferSize = 1000;
 

From 33444d151108acb8d4afe1f9e358aa33597c9979 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Mon, 21 Jun 2021 16:42:52 -0400
Subject: [PATCH 40/46] making if_else kernels write_to_slices

---
 .../arrow/compute/kernels/scalar_if_else.cc   | 452 ++++++++++--------
 1 file changed, 249 insertions(+), 203 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
index aa1962e7919..c82adcfc316 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
@@ -78,36 +78,30 @@ Status PromoteNullsVisitor(KernelContext* ctx, const Datum& cond_d, const Datum&
   // cond.valid & (cond.data & left.valid | ~cond.data & right.valid)
   // In the following cases, we dont need to allocate out_valid bitmap
 
-  // if cond & left & right all ones, then output is all valid --> out_valid = nullptr
+  // if cond & left & right all ones, then output is all valid. output validity buffer
+  // is already allocated, hence set all bits
   if (cond_const == kAllValid && left_const == kAllValid && right_const == kAllValid) {
+    BitUtil::SetBitmap(output->buffers[0]->mutable_data(), output->offset,
+                       output->length);
     return Status::OK();
   }
 
   if (left_const == kAllValid && right_const == kAllValid) {
-    // if both left and right are valid, no need to calculate out_valid bitmap. Pass
+    // if both left and right are valid, no need to calculate out_valid bitmap. Copy
     // cond validity buffer
-    // if there's an offset, copy bitmap (cannot slice a bitmap)
-    if (cond.offset) {
-      ARROW_ASSIGN_OR_RAISE(
-          output->buffers[0],
-          arrow::internal::CopyBitmap(ctx->memory_pool(), cond.buffers[0]->data(),
-                                      cond.offset, cond.length));
-    } else {  // just copy assign cond validity buffer
-      output->buffers[0] = cond.buffers[0];
-    }
+    arrow::internal::CopyBitmap(cond.buffers[0]->data(), cond.offset, cond.length,
+                                output->buffers[0]->mutable_data(), output->offset);
     return Status::OK();
   }
 
-  // following cases requires a separate out_valid buffer
-  ARROW_ASSIGN_OR_RAISE(output->buffers[0], ctx->AllocateBitmap(cond.length));
-
   // lambda function that will be used inside the visitor
   auto apply = [&](uint64_t c_valid, uint64_t c_data, uint64_t l_valid,
                    uint64_t r_valid) {
     return c_valid & ((c_data & l_valid) | (~c_data & r_valid));
   };
 
-  std::array<Bitmap, 1> out_bitmaps{Bitmap{output->buffers[0], 0, cond.length}};
+  std::array<Bitmap, 1> out_bitmaps{
+      Bitmap{output->buffers[0], output->offset, output->length}};
 
   switch (flag) {
     case COND_CONST | LEFT_CONST | RIGHT_CONST: {
@@ -201,107 +195,179 @@ Status PromoteNullsVisitor(KernelContext* ctx, const Datum& cond_d, const Datum&
   return Status::OK();
 }
 
-template <typename Type, typename Enable = void>
-struct IfElseFunctor {};
-
-// only number types needs to be handled for Fixed sized primitive data types because,
-// internal::GenerateTypeAgnosticPrimitive forwards types to the corresponding unsigned
-// int type
-template <typename Type>
-struct IfElseFunctor<Type, enable_if_number<Type>> {
-  using T = typename TypeTraits<Type>::CType;
-  // A - Array
-  // S - Scalar
-  using Word = uint64_t;
-  static constexpr int64_t word_len = sizeof(Word) * 8;
-
-  /// Runs the main if_else loop. Here, it is expected that the right data has already
-  /// been copied to the output.
-  /// If `invert` is meant to invert the cond.data. If is set to `true`, then the
-  /// buffer will be inverted before calling the handle_bulk or handle_each functions.
-  /// This is useful, when left is an array and right is scalar. Then rather than
-  /// copying data from the right to output, we can copy left data to the output and
-  /// invert the cond data to fill right values. Filling out with a scalar is presumed to
-  /// be more efficient than filling with an array
-  template <typename HandleBulk, typename HandleEach, bool invert = false>
-  static void RunIfElseLoop(const ArrayData& cond, HandleBulk handle_bulk,
-                            HandleEach handle_each) {
-    int64_t data_offset = 0;
-    int64_t bit_offset = cond.offset;
-    const auto* cond_data = cond.buffers[1]->data();  // this is a BoolArray
-
-    BitmapWordReader<Word> cond_reader(cond_data, cond.offset, cond.length);
-
-    int64_t cnt = cond_reader.words();
-    while (cnt--) {
-      Word word = cond_reader.NextWord();
-      if (invert) {
-        if (word == 0) {
-          handle_bulk(data_offset, word_len);
-        } else if (word != UINT64_MAX) {
-          for (int64_t i = 0; i < word_len; ++i) {
-            if (!BitUtil::GetBit(cond_data, bit_offset + i)) {
-              handle_each(data_offset + i);
-            }
+using Word = uint64_t;
+static constexpr int64_t word_len = sizeof(Word) * 8;
+
+/// Runs the main if_else loop. Here, it is expected that the right data has already
+/// been copied to the output.
+/// `invert` is meant to invert the cond.data. If it is set to `true`, then the
+/// buffer will be inverted before calling the handle_bulk or handle_each functions.
+/// This is useful, when left is an array and right is scalar. Then rather than
+/// copying data from the right to output, we can copy left data to the output and
+/// invert the cond data to fill right values. Filling out with a scalar is presumed to
+/// be more efficient than filling with an array
+template <typename HandleBulk, typename HandleEach, bool invert = false>
+static void RunIfElseLoop(const ArrayData& cond, HandleBulk handle_bulk,
+                          HandleEach handle_each) {
+  int64_t data_offset = 0;
+  int64_t bit_offset = cond.offset;
+  const auto* cond_data = cond.buffers[1]->data();  // this is a BoolArray
+
+  BitmapWordReader<Word> cond_reader(cond_data, cond.offset, cond.length);
+
+  int64_t cnt = cond_reader.words();
+  while (cnt--) {
+    Word word = cond_reader.NextWord();
+    if (invert) {
+      if (word == 0) {
+        handle_bulk(data_offset, word_len);
+      } else if (word != UINT64_MAX) {
+        for (int64_t i = 0; i < word_len; ++i) {
+          if (!BitUtil::GetBit(cond_data, bit_offset + i)) {
+            handle_each(data_offset + i);
           }
         }
-      } else {
-        if (word == UINT64_MAX) {
-          handle_bulk(data_offset, word_len);
-        } else if (word) {
-          for (int64_t i = 0; i < word_len; ++i) {
-            if (BitUtil::GetBit(cond_data, bit_offset + i)) {
-              handle_each(data_offset + i);
-            }
+      }
+    } else {
+      if (word == UINT64_MAX) {
+        handle_bulk(data_offset, word_len);
+      } else if (word) {
+        for (int64_t i = 0; i < word_len; ++i) {
+          if (BitUtil::GetBit(cond_data, bit_offset + i)) {
+            handle_each(data_offset + i);
           }
         }
       }
-      data_offset += word_len;
-      bit_offset += word_len;
     }
+    data_offset += word_len;
+    bit_offset += word_len;
+  }
 
-    cnt = cond_reader.trailing_bytes();
-    while (cnt--) {
-      int valid_bits;
-      uint8_t byte = cond_reader.NextTrailingByte(valid_bits);
-      if (invert) {
-        if (byte == 0 && valid_bits == 8) {
-          handle_bulk(data_offset, 8);
-        } else if (byte != UINT8_MAX) {
-          for (int i = 0; i < valid_bits; ++i) {
-            if (!BitUtil::GetBit(cond_data, bit_offset + i)) {
-              handle_each(data_offset + i);
-            }
+  cnt = cond_reader.trailing_bytes();
+  while (cnt--) {
+    int valid_bits;
+    uint8_t byte = cond_reader.NextTrailingByte(valid_bits);
+    if (invert) {
+      if (byte == 0 && valid_bits == 8) {
+        handle_bulk(data_offset, 8);
+      } else if (byte != UINT8_MAX) {
+        for (int i = 0; i < valid_bits; ++i) {
+          if (!BitUtil::GetBit(cond_data, bit_offset + i)) {
+            handle_each(data_offset + i);
           }
         }
-      } else {
-        if (byte == UINT8_MAX && valid_bits == 8) {
-          handle_bulk(data_offset, 8);
-        } else if (byte) {
-          for (int i = 0; i < valid_bits; ++i) {
-            if (BitUtil::GetBit(cond_data, bit_offset + i)) {
-              handle_each(data_offset + i);
-            }
+      }
+    } else {
+      if (byte == UINT8_MAX && valid_bits == 8) {
+        handle_bulk(data_offset, 8);
+      } else if (byte) {
+        for (int i = 0; i < valid_bits; ++i) {
+          if (BitUtil::GetBit(cond_data, bit_offset + i)) {
+            handle_each(data_offset + i);
           }
         }
       }
-      data_offset += 8;
-      bit_offset += 8;
     }
+    data_offset += 8;
+    bit_offset += 8;
   }
+}
 
-  template <typename HandleBulk, typename HandleEach>
-  static void RunIfElseLoopInverted(const ArrayData& cond, HandleBulk handle_bulk,
-                                    HandleEach handle_each) {
-    return RunIfElseLoop<HandleBulk, HandleEach, true>(cond, handle_bulk, handle_each);
+template <typename HandleBulk, typename HandleEach>
+static void RunIfElseLoopInverted(const ArrayData& cond, HandleBulk handle_bulk,
+                                  HandleEach handle_each) {
+  return RunIfElseLoop<HandleBulk, HandleEach, true>(cond, handle_bulk, handle_each);
+}
+
+/// Runs if-else when cond is a scalar. The caller supplies two type-specific callables:
+/// copy_array_data (copies array values to out) and broadcast_scalar (fills out)
+template <typename CopyArrayData, typename BroadcastScalar>
+static Status RunIfElseScalar(const BooleanScalar& cond, const Datum& left,
+                              const Datum& right, Datum* out,
+                              CopyArrayData copy_array_data,
+                              BroadcastScalar broadcast_scalar) {
+  if (left.is_scalar() && right.is_scalar()) {  // output will be a scalar
+    if (cond.is_valid) {
+      *out = cond.value ? left.scalar() : right.scalar();
+    } else {
+      *out = MakeNullScalar(left.type());
+    }
+    return Status::OK();
+  }
+
+  // either left or right is an array. Output is always an array
+  const std::shared_ptr<ArrayData>& out_array = out->array();
+  if (!cond.is_valid) {
+    // cond is null; output is all null --> clear validity buffer
+    BitUtil::ClearBitmap(out_array->buffers[0]->mutable_data(), out_array->offset,
+                         out_array->length);
+    return Status::OK();
+  }
+
+  // cond is a non-null scalar: its value selects left (true) or right (false)
+  const auto& valid_data = cond.value ? left : right;
+  if (valid_data.is_array()) {
+    // valid_data is an array. Hence copy validity and data to the output buffers
+    const auto& valid_array = valid_data.array();
+    if (valid_array->MayHaveNulls()) {
+      arrow::internal::CopyBitmap(
+          valid_array->buffers[0]->data(), valid_array->offset, valid_array->length,
+          out_array->buffers[0]->mutable_data(), out_array->offset);
+    } else {  // no nulls reported by MayHaveNulls() --> set all output validity bits
+      BitUtil::SetBitmap(out_array->buffers[0]->mutable_data(), out_array->offset,
+                         out_array->length);
+    }
+    copy_array_data(*valid_array, out_array.get());
+    return Status::OK();
+
+  } else {  // valid data is scalar
+    // valid data is a scalar that needs to be broadcast over the output
+    const auto& valid_scalar = *valid_data.scalar();
+    if (valid_scalar.is_valid) {  // if the scalar is non-null, broadcast
+      BitUtil::SetBitmap(out_array->buffers[0]->mutable_data(), out_array->offset,
+                         out_array->length);
+      broadcast_scalar(*valid_data.scalar(), out_array.get());
+    } else {  // scalar is null, clear the output validity buffer
+      BitUtil::ClearBitmap(out_array->buffers[0]->mutable_data(), out_array->offset,
+                           out_array->length);
+    }
+    return Status::OK();
+  }
+}
+
+template <typename Type, typename Enable = void>
+struct IfElseFunctor {};
+
+// only number types need to be handled for fixed-size primitive data types because
+// internal::GenerateTypeAgnosticPrimitive forwards types to the corresponding unsigned
+// int type
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_number<Type>> {
+  using T = typename TypeTraits<Type>::CType;
+  // A - Array, S - Scalar, X = Array/Scalar
+
+  // SXX: cond is a scalar; left and right may each be an array or a scalar
+  static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
+                     const Datum& right, Datum* out) {
+    return RunIfElseScalar(
+        cond, left, right, out,
+        /*CopyArrayData: memcpy valid_array's values into out_array's values buffer*/
+        [&](const ArrayData& valid_array, ArrayData* out_array) {
+          std::memcpy(out_array->GetMutableValues<T>(1), valid_array.GetValues<T>(1),
+                      valid_array.length * sizeof(T));
+        },
+        /*BroadcastScalar: fill out_array's values with the unboxed scalar value*/
+        [&](const Scalar& scalar, ArrayData* out_array) {
+          T scalar_data = internal::UnboxScalar<Type>::Unbox(scalar);
+          std::fill(out_array->GetMutableValues<T>(1),
+                    out_array->GetMutableValues<T>(1) + out_array->length, scalar_data);
+        });
+  }
 
   //  AAA
   static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
                      const ArrayData& right, ArrayData* out) {
-    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> out_buf,
-                          ctx->Allocate(cond.length * sizeof(T)));
-    T* out_values = reinterpret_cast<T*>(out_buf->mutable_data());
+    T* out_values = out->template GetMutableValues<T>(1);
 
     // copy right data to out_buff
     const T* right_data = right.GetValues<T>(1);
@@ -318,16 +384,13 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
         },
         [&](int64_t data_offset) { out_values[data_offset] = left_data[data_offset]; });
 
-    out->buffers[1] = std::move(out_buf);
     return Status::OK();
   }
 
   // ASA
   static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
                      const ArrayData& right, ArrayData* out) {
-    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> out_buf,
-                          ctx->Allocate(cond.length * sizeof(T)));
-    T* out_values = reinterpret_cast<T*>(out_buf->mutable_data());
+    T* out_values = out->template GetMutableValues<T>(1);
 
     // copy right data to out_buff
     const T* right_data = right.GetValues<T>(1);
@@ -344,16 +407,13 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
         },
         [&](int64_t data_offset) { out_values[data_offset] = left_data; });
 
-    out->buffers[1] = std::move(out_buf);
     return Status::OK();
   }
 
   // AAS
   static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
                      const Scalar& right, ArrayData* out) {
-    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> out_buf,
-                          ctx->Allocate(cond.length * sizeof(T)));
-    T* out_values = reinterpret_cast<T*>(out_buf->mutable_data());
+    T* out_values = out->template GetMutableValues<T>(1);
 
     // copy left data to out_buff
     const T* left_data = left.GetValues<T>(1);
@@ -369,16 +429,13 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
         },
         [&](int64_t data_offset) { out_values[data_offset] = right_data; });
 
-    out->buffers[1] = std::move(out_buf);
     return Status::OK();
   }
 
   // ASS
   static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
                      const Scalar& right, ArrayData* out) {
-    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> out_buf,
-                          ctx->Allocate(cond.length * sizeof(T)));
-    T* out_values = reinterpret_cast<T*>(out_buf->mutable_data());
+    T* out_values = out->template GetMutableValues<T>(1);
 
     // copy right data to out_buff
     T right_data = internal::UnboxScalar<Type>::Unbox(right);
@@ -394,21 +451,41 @@ struct IfElseFunctor<Type, enable_if_number<Type>> {
         },
         [&](int64_t data_offset) { out_values[data_offset] = left_data; });
 
-    out->buffers[1] = std::move(out_buf);
     return Status::OK();
   }
 };
 
 template <typename Type>
 struct IfElseFunctor<Type, enable_if_boolean<Type>> {
+  // A - Array, S - Scalar, X = Array/Scalar
+
+  // SXX: cond is a scalar; left and right may each be an array or a scalar
+  static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
+                     const Datum& right, Datum* out) {
+    return RunIfElseScalar(
+        cond, left, right, out,
+        /*CopyArrayData: copy valid_array's data bitmap into out_array at its offset*/
+        [&](const ArrayData& valid_array, ArrayData* out_array) {
+          arrow::internal::CopyBitmap(
+              valid_array.buffers[1]->data(), valid_array.offset, valid_array.length,
+              out_array->buffers[1]->mutable_data(), out_array->offset);
+        },
+        /*BroadcastScalar: set every output data bit to the unboxed scalar value*/
+        [&](const Scalar& scalar, ArrayData* out_array) {
+          bool scalar_data = internal::UnboxScalar<Type>::Unbox(scalar);
+          BitUtil::SetBitsTo(out_array->buffers[1]->mutable_data(), out_array->offset,
+                             out_array->length, scalar_data);
+        });
+  }
+
   // AAA
   static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
                      const ArrayData& right, ArrayData* out) {
     // out_buff = right & ~cond
-    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> out_buf,
-                          arrow::internal::BitmapAndNot(
-                              ctx->memory_pool(), right.buffers[1]->data(), right.offset,
-                              cond.buffers[1]->data(), cond.offset, cond.length, 0));
+    const auto& out_buf = out->buffers[1];
+    arrow::internal::BitmapAndNot(right.buffers[1]->data(), right.offset,
+                                  cond.buffers[1]->data(), cond.offset, cond.length,
+                                  out->offset, out_buf->mutable_data());
 
     // out_buff = left & cond
     ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> temp_buf,
@@ -416,9 +493,9 @@ struct IfElseFunctor<Type, enable_if_boolean<Type>> {
                               ctx->memory_pool(), left.buffers[1]->data(), left.offset,
                               cond.buffers[1]->data(), cond.offset, cond.length, 0));
 
-    arrow::internal::BitmapOr(out_buf->data(), 0, temp_buf->data(), 0, cond.length, 0,
-                              out_buf->mutable_data());
-    out->buffers[1] = std::move(out_buf);
+    arrow::internal::BitmapOr(out_buf->data(), out->offset, temp_buf->data(), 0,
+                              cond.length, out->offset, out_buf->mutable_data());
+
     return Status::OK();
   }
 
@@ -426,19 +503,19 @@ struct IfElseFunctor<Type, enable_if_boolean<Type>> {
   static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
                      const ArrayData& right, ArrayData* out) {
     // out_buff = right & ~cond
-    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> out_buf,
-                          arrow::internal::BitmapAndNot(
-                              ctx->memory_pool(), right.buffers[1]->data(), right.offset,
-                              cond.buffers[1]->data(), cond.offset, cond.length, 0));
+    const auto& out_buf = out->buffers[1];
+    arrow::internal::BitmapAndNot(right.buffers[1]->data(), right.offset,
+                                  cond.buffers[1]->data(), cond.offset, cond.length,
+                                  out->offset, out_buf->mutable_data());
 
     // out_buff = left & cond
     bool left_data = internal::UnboxScalar<BooleanType>::Unbox(left);
     if (left_data) {
-      arrow::internal::BitmapOr(out_buf->data(), 0, cond.buffers[1]->data(), cond.offset,
-                                cond.length, 0, out_buf->mutable_data());
+      arrow::internal::BitmapOr(out_buf->data(), out->offset, cond.buffers[1]->data(),
+                                cond.offset, cond.length, out->offset,
+                                out_buf->mutable_data());
     }
 
-    out->buffers[1] = std::move(out_buf);
     return Status::OK();
   }
 
@@ -446,20 +523,20 @@ struct IfElseFunctor<Type, enable_if_boolean<Type>> {
   static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
                      const Scalar& right, ArrayData* out) {
     // out_buff = left & cond
-    ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> out_buf,
-                          arrow::internal::BitmapAnd(
-                              ctx->memory_pool(), left.buffers[1]->data(), left.offset,
-                              cond.buffers[1]->data(), cond.offset, cond.length, 0));
+    const auto& out_buf = out->buffers[1];
+    arrow::internal::BitmapAnd(left.buffers[1]->data(), left.offset,
+                               cond.buffers[1]->data(), cond.offset, cond.length,
+                               out->offset, out_buf->mutable_data());
 
     bool right_data = internal::UnboxScalar<BooleanType>::Unbox(right);
 
     // out_buff = left & cond | right & ~cond
     if (right_data) {
-      arrow::internal::BitmapOrNot(out_buf->data(), 0, cond.buffers[1]->data(),
-                                   cond.offset, cond.length, 0, out_buf->mutable_data());
+      arrow::internal::BitmapOrNot(out_buf->data(), out->offset, cond.buffers[1]->data(),
+                                   cond.offset, cond.length, out->offset,
+                                   out_buf->mutable_data());
     }
 
-    out->buffers[1] = std::move(out_buf);
     return Status::OK();
   }
 
@@ -469,66 +546,32 @@ struct IfElseFunctor<Type, enable_if_boolean<Type>> {
     bool left_data = internal::UnboxScalar<BooleanType>::Unbox(left);
     bool right_data = internal::UnboxScalar<BooleanType>::Unbox(right);
 
+    const auto& out_buf = out->buffers[1];
+
     // out_buf = left & cond | right & ~cond
-    std::shared_ptr<Buffer> out_buf = nullptr;
+    //    std::shared_ptr<Buffer> out_buf = nullptr;
     if (left_data) {
       if (right_data) {
         // out_buf = ones
-        ARROW_ASSIGN_OR_RAISE(out_buf, ctx->AllocateBitmap(cond.length));
-        // filling with UINT8_MAX upto the buffer's size (in bytes)
-        std::memset(out_buf->mutable_data(), UINT8_MAX, out_buf->size());
+        BitUtil::SetBitmap(out_buf->mutable_data(), out->offset, cond.length);
       } else {
         // out_buf = cond
-        out_buf = SliceBuffer(cond.buffers[1], cond.offset, cond.length);
+        arrow::internal::CopyBitmap(cond.buffers[1]->data(), cond.offset, cond.length,
+                                    out_buf->mutable_data(), out->offset);
       }
     } else {
       if (right_data) {
         // out_buf = ~cond
-        ARROW_ASSIGN_OR_RAISE(out_buf, arrow::internal::InvertBitmap(
-                                           ctx->memory_pool(), cond.buffers[1]->data(),
-                                           cond.offset, cond.length))
+        arrow::internal::InvertBitmap(cond.buffers[1]->data(), cond.offset, cond.length,
+                                      out_buf->mutable_data(), out->offset);
       } else {
         // out_buf = zeros
-        ARROW_ASSIGN_OR_RAISE(out_buf, ctx->AllocateBitmap(cond.length));
+        BitUtil::ClearBitmap(out_buf->mutable_data(), out->offset, cond.length);
       }
     }
-    out->buffers[1] = std::move(out_buf);
-    return Status::OK();
-  }
-};
 
-template <typename Type>
-struct IfElseFunctor<Type, enable_if_null<Type>> {
-  template <typename T>
-  static inline Status ReturnCopy(const T& in, T* out) {
-    // Nothing preallocated, so we assign in into the output
-    *out = in;
     return Status::OK();
   }
-
-  // AAA
-  static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
-                     const ArrayData& right, ArrayData* out) {
-    return ReturnCopy(left, out);
-  }
-
-  // ASA
-  static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
-                     const ArrayData& right, ArrayData* out) {
-    return ReturnCopy(right, out);
-  }
-
-  // AAS
-  static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
-                     const Scalar& right, ArrayData* out) {
-    return ReturnCopy(left, out);
-  }
-
-  // ASS
-  static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
-                     const Scalar& right, ArrayData* out) {
-    return ReturnCopy(cond, out);
-  }
 };
 
 template <typename Type>
@@ -537,32 +580,7 @@ struct ResolveIfElseExec {
     // cond is scalar
     if (batch[0].is_scalar()) {
       const auto& cond = batch[0].scalar_as<BooleanScalar>();
-      if (batch[1].is_scalar() && batch[2].is_scalar()) {
-        if (cond.is_valid) {
-          *out = cond.value ? batch[1].scalar() : batch[2].scalar();
-        } else {
-          *out = MakeNullScalar(batch[1].type());
-        }
-        return Status::OK();
-      }
-      // either left or right is an array. Output is always an array
-      if (!cond.is_valid) {
-        // cond is null; just create a null array
-        ARROW_ASSIGN_OR_RAISE(
-            *out, MakeArrayOfNull(batch[1].type(), batch.length, ctx->memory_pool()))
-        return Status::OK();
-      }
-
-      const auto& valid_data = cond.value ? batch[1] : batch[2];
-      if (valid_data.is_array()) {
-        *out = valid_data;
-      } else {
-        // valid data is a scalar that needs to be broadcasted
-        ARROW_ASSIGN_OR_RAISE(
-            *out,
-            MakeArrayFromScalar(*valid_data.scalar(), batch.length, ctx->memory_pool()));
-      }
-      return Status::OK();
+      return IfElseFunctor<Type>::Call(ctx, cond, batch[1], batch[2], out);
     }
 
     // cond is array. Use functors to sort things out
@@ -589,6 +607,22 @@ struct ResolveIfElseExec {
   }
 };
 
+
+template <>
+struct ResolveIfElseExec<NullType> {
+  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    // Always null: a scalar iff all arguments are scalars, else an array of batch.length
+    if (batch[0].is_scalar() && batch[1].is_scalar() && batch[2].is_scalar()) {
+      *out = MakeNullScalar(null());
+    } else {
+      ARROW_ASSIGN_OR_RAISE(*out,
+                            MakeArrayOfNull(null(), batch.length, ctx->memory_pool()));
+    }
+    return Status::OK();
+  }
+};
+
+
 struct IfElseFunction : ScalarFunction {
   using ScalarFunction::ScalarFunction;
 
@@ -620,14 +654,25 @@ struct IfElseFunction : ScalarFunction {
   }
 };
 
-void AddPrimitiveIfElseKernels(const std::shared_ptr<IfElseFunction>& scalar_function,
+// Adds the (boolean, null, null) -> null kernel, whose exec creates its own output.
+void AddNullIfElseKernel(const std::shared_ptr<IfElseFunction>& scalar_function) {
+  ScalarKernel null_kernel({boolean(), null(), null()}, null(),
+                           ResolveIfElseExec<NullType>::Exec);
+  null_kernel.can_write_into_slices = false;
+  null_kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
+  null_kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
+  DCHECK_OK(scalar_function->AddKernel(std::move(null_kernel)));
+}
+
+void AddPrimitiveIfElseKernels(const std::shared_ptr<ScalarFunction>& scalar_function,
                                const std::vector<std::shared_ptr<DataType>>& types) {
   for (auto&& type : types) {
     auto exec = internal::GenerateTypeAgnosticPrimitive<ResolveIfElseExec>(*type);
     // cond array needs to be boolean always
     ScalarKernel kernel({boolean(), type, type}, type, exec);
-    kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
-    kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
+    kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE;
+    kernel.mem_allocation = MemAllocation::PREALLOCATE;
+    kernel.can_write_into_slices = true;
 
     DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
   }
@@ -653,7 +698,8 @@ void RegisterScalarIfElse(FunctionRegistry* registry) {
 
   AddPrimitiveIfElseKernels(func, NumericTypes());
   AddPrimitiveIfElseKernels(func, TemporalTypes());
-  AddPrimitiveIfElseKernels(func, {boolean(), null()});
+  AddPrimitiveIfElseKernels(func, {boolean()});
+  AddNullIfElseKernel(func);
   // todo add binary kernels
 
   DCHECK_OK(registry->AddFunction(std::move(func)));

From 984b7dba94288bf271c1c6c5dbd85436eacdd85a Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Mon, 21 Jun 2021 17:03:59 -0400
Subject: [PATCH 41/46] fixing lint

---
 cpp/src/arrow/compute/kernels/scalar_if_else.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
index c82adcfc316..54e0725fce7 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
@@ -607,7 +607,6 @@ struct ResolveIfElseExec {
   }
 };
 
-
 template <>
 struct ResolveIfElseExec<NullType> {
   static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
@@ -622,7 +621,6 @@ struct ResolveIfElseExec<NullType> {
   }
 };
 
-
 struct IfElseFunction : ScalarFunction {
   using ScalarFunction::ScalarFunction;
 

From adfb0fd0003b4b754f55e22667c6401d9d6172e1 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Mon, 28 Jun 2021 15:35:46 -0400
Subject: [PATCH 42/46] fixing performance issue

---
 cpp/src/arrow/util/bitmap.h        | 105 +++++++++++++++++++----------
 cpp/src/arrow/util/bitmap_reader.h |  15 ++---
 cpp/src/arrow/util/bitmap_writer.h |  17 +++--
 3 files changed, 83 insertions(+), 54 deletions(-)

diff --git a/cpp/src/arrow/util/bitmap.h b/cpp/src/arrow/util/bitmap.h
index 674ff96ca5d..4c19da17819 100644
--- a/cpp/src/arrow/util/bitmap.h
+++ b/cpp/src/arrow/util/bitmap.h
@@ -243,49 +243,17 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
     return min_offset;
   }
 
-  /// \brief Visit words of bits from each input bitmap as array<Word, N> and collects
-  /// outputs to an array<Word, M>, to be written into the output bitmaps accordingly.
-  ///
-  /// All bitmaps must have identical length. The first bit in a visited bitmap
-  /// may be offset within the first visited word, but words will otherwise contain
-  /// densely packed bits loaded from the bitmap. That offset within the first word is
-  /// returned.
-  /// Visitor is expected to have the following signature
-  ///     [](const std::array<Word, N>& in_words, std::array<Word, M>* out_words){...}
-  ///
-  // NOTE: this function is efficient on 3+ sufficiently large bitmaps.
-  // It also has a large prolog / epilog overhead and should be used
-  // carefully in other cases.
-  // For 2 bitmaps or less, and/or smaller bitmaps, see also VisitTwoBitBlocksVoid
-  // and BitmapUInt64Reader.
-  template <size_t N, size_t M, typename Visitor,
+  template <size_t N, size_t M, typename ReaderT, typename WriterT, typename Visitor,
             typename Word = typename std::decay<
                 internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
-  static void VisitWordsAndWrite(const std::array<Bitmap, N>& bitmaps_arg,
-                                 std::array<Bitmap, M>* out_bitmaps_arg,
-                                 Visitor&& visitor) {
+  static void RunVisitWordsAndWriteLoop(int64_t bit_length,
+                                        std::array<ReaderT, N>& readers,
+                                        std::array<WriterT, M>& writers,
+                                        Visitor&& visitor) {
     constexpr int64_t kBitWidth = sizeof(Word) * 8;
 
-    int64_t bit_length = BitLength(bitmaps_arg);
-    assert(bit_length == BitLength(*out_bitmaps_arg));
-
-    std::array<BitmapWordReader<Word>, N> readers;
-    for (size_t i = 0; i < N; ++i) {
-      readers[i] = BitmapWordReader<Word>(bitmaps_arg[i].buffer_->data(),
-                                          bitmaps_arg[i].offset_, bitmaps_arg[i].length_);
-    }
-
-    std::array<BitmapWordWriter<Word>, M> writers;
-    for (size_t i = 0; i < M; ++i) {
-      const Bitmap& out_bitmap = out_bitmaps_arg->at(i);
-      writers[i] = BitmapWordWriter<Word>(out_bitmap.buffer_->mutable_data(),
-                                          out_bitmap.offset_, out_bitmap.length_);
-    }
-
     std::array<Word, N> visited_words;
-    visited_words.fill(0);
     std::array<Word, M> output_words;
-    output_words.fill(0);
 
     // every reader will have same number of words, since they are same length'ed
     // TODO($JIRA) this will be inefficient in some cases. When there are offsets beyond
@@ -338,6 +306,69 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
     }
   }
 
+  /// \brief Visit words of bits from each input bitmap as array<Word, N> and collects
+  /// outputs to an array<Word, M>, to be written into the output bitmaps accordingly.
+  ///
+  /// All bitmaps must have identical length. The first bit in a visited bitmap
+  /// may be offset within the first visited word, but words will otherwise contain
+  /// densely packed bits loaded from the bitmap. Nothing is returned: the words the
+  /// visitor produces are written into the output bitmaps.
+  /// Visitor is expected to have the following signature
+  ///     [](const std::array<Word, N>& in_words, std::array<Word, M>* out_words){...}
+  ///
+  // NOTE: this function is efficient on 3+ sufficiently large bitmaps.
+  // It also has a large prolog / epilog overhead and should be used
+  // carefully in other cases.
+  // For 2 bitmaps or less, and/or smaller bitmaps, see also VisitTwoBitBlocksVoid
+  // and BitmapUInt64Reader.
+  template <size_t N, size_t M, typename Visitor,
+            typename Word = typename std::decay<
+                internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
+  static void VisitWordsAndWrite(const std::array<Bitmap, N>& bitmaps_arg,
+                                 std::array<Bitmap, M>* out_bitmaps_arg,
+                                 Visitor&& visitor) {
+    int64_t bit_length = BitLength(bitmaps_arg);
+    assert(bit_length == BitLength(*out_bitmaps_arg));
+
+    // if both input and output bitmaps have no byte offset, then use special template
+    if (std::all_of(bitmaps_arg.begin(), bitmaps_arg.end(),
+                    [](const Bitmap& b) { return b.offset_ % 8 == 0; }) &&
+        std::all_of(out_bitmaps_arg->begin(), out_bitmaps_arg->end(),
+                    [](const Bitmap& b) { return b.offset_ % 8 == 0; })) {
+      std::array<BitmapWordReader<Word, /*may_have_byte_offset=*/false>, N> readers;
+      for (size_t i = 0; i < N; ++i) {
+        const Bitmap& in_bitmap = bitmaps_arg[i];
+        readers[i] = BitmapWordReader<Word, /*may_have_byte_offset=*/false>(
+            in_bitmap.buffer_->data(), in_bitmap.offset_, in_bitmap.length_);
+      }
+
+      std::array<BitmapWordWriter<Word, /*may_have_byte_offset=*/false>, M> writers;
+      for (size_t i = 0; i < M; ++i) {
+        const Bitmap& out_bitmap = out_bitmaps_arg->at(i);
+        writers[i] = BitmapWordWriter<Word, /*may_have_byte_offset=*/false>(
+            out_bitmap.buffer_->mutable_data(), out_bitmap.offset_, out_bitmap.length_);
+      }
+
+      RunVisitWordsAndWriteLoop(bit_length, readers, writers, std::move(visitor));
+    } else {
+      std::array<BitmapWordReader<Word>, N> readers;
+      for (size_t i = 0; i < N; ++i) {
+        const Bitmap& in_bitmap = bitmaps_arg[i];
+        readers[i] = BitmapWordReader<Word>(in_bitmap.buffer_->data(), in_bitmap.offset_,
+                                            in_bitmap.length_);
+      }
+
+      std::array<BitmapWordWriter<Word>, M> writers;
+      for (size_t i = 0; i < M; ++i) {
+        const Bitmap& out_bitmap = out_bitmaps_arg->at(i);
+        writers[i] = BitmapWordWriter<Word>(out_bitmap.buffer_->mutable_data(),
+                                            out_bitmap.offset_, out_bitmap.length_);
+      }
+
+      RunVisitWordsAndWriteLoop(bit_length, readers, writers, std::move(visitor));
+    }
+  }
+
   const std::shared_ptr<Buffer>& buffer() const { return buffer_; }
 
   /// offset of first bit relative to buffer().data()
diff --git a/cpp/src/arrow/util/bitmap_reader.h b/cpp/src/arrow/util/bitmap_reader.h
index ce1d5f376bd..7c43747fafb 100644
--- a/cpp/src/arrow/util/bitmap_reader.h
+++ b/cpp/src/arrow/util/bitmap_reader.h
@@ -146,15 +146,14 @@ class BitmapUInt64Reader {
 // on sufficiently large inputs.  However, it has a larger prolog / epilog overhead
 // and should probably not be used for small bitmaps.
 
-template <typename Word>
+template <typename Word, bool may_have_byte_offset = true>
 class BitmapWordReader {
  public:
   BitmapWordReader() = default;
-  BitmapWordReader(const uint8_t* bitmap, int64_t offset, int64_t length) {
-    bitmap_ = bitmap + offset / 8;
-    offset_ = offset % 8;
-    bitmap_end_ = bitmap_ + BitUtil::BytesForBits(offset_ + length);
-
+  BitmapWordReader(const uint8_t* bitmap, int64_t offset, int64_t length)
+      : offset_(static_cast<int64_t>(may_have_byte_offset) * (offset % 8)),
+        bitmap_(bitmap + offset / 8),
+        bitmap_end_(bitmap_ + BitUtil::BytesForBits(offset_ + length)) {
     // decrement word count by one as we may touch two adjacent words in one iteration
     nwords_ = length / (sizeof(Word) * 8) - 1;
     if (nwords_ < 0) {
@@ -174,7 +173,7 @@ class BitmapWordReader {
     bitmap_ += sizeof(Word);
     const Word next_word = load<Word>(bitmap_);
     Word word = current_word_;
-    if (offset_) {
+    if (may_have_byte_offset && offset_) {
       // combine two adjacent words into one word
       // |<------ next ----->|<---- current ---->|
       // +-------------+-----+-------------+-----+
@@ -215,7 +214,7 @@ class BitmapWordReader {
       ++bitmap_;
       const uint8_t next_byte = load<uint8_t>(bitmap_);
       byte = current_byte_;
-      if (offset_) {
+      if (may_have_byte_offset && offset_) {
         byte >>= offset_;
         byte |= next_byte << (8 - offset_);
       }
diff --git a/cpp/src/arrow/util/bitmap_writer.h b/cpp/src/arrow/util/bitmap_writer.h
index b15b036c248..d5c6d909df0 100644
--- a/cpp/src/arrow/util/bitmap_writer.h
+++ b/cpp/src/arrow/util/bitmap_writer.h
@@ -180,16 +180,15 @@ class FirstTimeBitmapWriter {
   int64_t byte_offset_;
 };
 
-template <typename Word>
+template <typename Word, bool may_have_byte_offset = true>
 class BitmapWordWriter {
  public:
   BitmapWordWriter() = default;
-  BitmapWordWriter(uint8_t* bitmap, int64_t offset, int64_t length) {
-    bitmap_ = bitmap + offset / 8;
-    offset_ = offset % 8;
-    bitmap_end_ = bitmap_ + BitUtil::BytesForBits(offset_ + length);
-    mask_ = (1U << offset_) - 1;
-
+  BitmapWordWriter(uint8_t* bitmap, int64_t offset, int64_t length)
+      : offset_(static_cast<int64_t>(may_have_byte_offset) * (offset % 8)),
+        bitmap_(bitmap + offset / 8),
+        bitmap_end_(bitmap_ + BitUtil::BytesForBits(offset_ + length)),
+        mask_((1U << offset_) - 1) {
     if (offset_) {
       if (length >= static_cast<int>(sizeof(Word) * 8)) {
         current_word_ = load<Word>(bitmap_);
@@ -200,7 +199,7 @@ class BitmapWordWriter {
   }
 
   void PutNextWord(Word word) {
-    if (offset_) {
+    if (may_have_byte_offset && offset_) {
       // split one word into two adjacent words, don't touch unused bits
       //               |<------ word ----->|
       //               +-----+-------------+
@@ -227,7 +226,7 @@ class BitmapWordWriter {
 
   void PutNextTrailingByte(uint8_t byte, int valid_bits) {
     if (valid_bits == 8) {
-      if (offset_) {
+      if (may_have_byte_offset && offset_) {
         byte = (byte << offset_) | (byte >> (8 - offset_));
         uint8_t next_byte = load<uint8_t>(bitmap_ + 1);
         current_byte_ = (current_byte_ & mask_) | (byte & ~mask_);

From 6d48f7a9162142ab094be1d8d47680576383e2a6 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Mon, 28 Jun 2021 16:35:07 -0400
Subject: [PATCH 43/46] dummy

---
 cpp/src/arrow/util/bitmap.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/src/arrow/util/bitmap.h b/cpp/src/arrow/util/bitmap.h
index 4c19da17819..c0bd30bdd5c 100644
--- a/cpp/src/arrow/util/bitmap.h
+++ b/cpp/src/arrow/util/bitmap.h
@@ -358,6 +358,7 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
                                             in_bitmap.length_);
       }
 
+
       std::array<BitmapWordWriter<Word>, M> writers;
       for (size_t i = 0; i < M; ++i) {
         const Bitmap& out_bitmap = out_bitmaps_arg->at(i);

From d3688664783bb0f7696918981bfee7619b567737 Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Mon, 28 Jun 2021 16:35:28 -0400
Subject: [PATCH 44/46] Revert "dummy"

This reverts commit 97091f85
---
 cpp/src/arrow/util/bitmap.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cpp/src/arrow/util/bitmap.h b/cpp/src/arrow/util/bitmap.h
index c0bd30bdd5c..4c19da17819 100644
--- a/cpp/src/arrow/util/bitmap.h
+++ b/cpp/src/arrow/util/bitmap.h
@@ -358,7 +358,6 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
                                             in_bitmap.length_);
       }
 
-
       std::array<BitmapWordWriter<Word>, M> writers;
       for (size_t i = 0; i < M; ++i) {
         const Bitmap& out_bitmap = out_bitmaps_arg->at(i);

From 4324a73d7ecf3ac513c48fc66ea0a6f4a8240d0b Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Mon, 28 Jun 2021 19:18:00 -0400
Subject: [PATCH 45/46] Apply suggestions from code review

Co-authored-by: Benjamin Kietzman <bengilgit@gmail.com>
---
 cpp/src/arrow/util/bit_util.h | 8 ++++----
 cpp/src/arrow/util/bitmap.h   | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h
index 95969dbd2da..1e97e467610 100644
--- a/cpp/src/arrow/util/bit_util.h
+++ b/cpp/src/arrow/util/bit_util.h
@@ -327,15 +327,15 @@ void ClearBitmap(uint8_t* data, int64_t offset, int64_t length);
 /// Returns a mask with lower i bits set to 1. If i >= sizeof(Word)*8, all-ones will be
 /// returned
 /// ex:
-/// PrecedingWordBitmask<uint_8>(0)= 0x00
-/// PrecedingWordBitmask<uint_8>(4)= 0x0f
-/// PrecedingWordBitmask<uint_8>(8)= 0xff
-/// PrecedingWordBitmask<uint_32>(8)= 0x00ff
 /// ref: https://stackoverflow.com/a/59523400
 template <typename Word>
 constexpr Word PrecedingWordBitmask(unsigned int const i) {
   return (static_cast<Word>(i < sizeof(Word) * 8) << (i & (sizeof(Word) * 8 - 1))) - 1;
 }
+static_assert(PrecedingWordBitmask<uint8_t>(0) == 0x00, "");
+static_assert(PrecedingWordBitmask<uint8_t>(4) == 0x0f, "");
+static_assert(PrecedingWordBitmask<uint8_t>(8) == 0xff, "");
+static_assert(PrecedingWordBitmask<uint16_t>(8) == 0x00ff, "");
 
 /// \brief Create a word with low `n` bits from `low` and high `sizeof(Word)-n` bits
 /// from `high`.
diff --git a/cpp/src/arrow/util/bitmap.h b/cpp/src/arrow/util/bitmap.h
index 4c19da17819..461647e6b6c 100644
--- a/cpp/src/arrow/util/bitmap.h
+++ b/cpp/src/arrow/util/bitmap.h
@@ -365,7 +365,7 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
                                             out_bitmap.offset_, out_bitmap.length_);
       }
 
-      RunVisitWordsAndWriteLoop(bit_length, readers, writers, std::move(visitor));
+      RunVisitWordsAndWriteLoop(bit_length, readers, writers, visitor);
     }
   }
 

From 1b3144b0c7767c1359e3bbbdcbe000d728ef29fc Mon Sep 17 00:00:00 2001
From: niranda perera <niranda.perera@gmail.com>
Date: Mon, 28 Jun 2021 19:19:39 -0400
Subject: [PATCH 46/46] applying PR comments

---
 cpp/src/arrow/compute/kernels/scalar_if_else_test.cc | 3 +++
 cpp/src/arrow/util/bitmap.h                          | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
index c9347bc6a4b..670a2d42a3a 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
@@ -57,6 +57,9 @@ TYPED_TEST(TestIfElsePrimitive, IfElseFixedSizeRand) {
 
   random::RandomArrayGenerator rand(/*seed=*/0);
   int64_t len = 1000;
+
+  // adding 64 consecutive 1's and 0's in the cond array to test all-true/ all-false
+  // word code paths
   ASSERT_OK_AND_ASSIGN(auto temp1, MakeArrayFromScalar(BooleanScalar(true), 64));
   ASSERT_OK_AND_ASSIGN(auto temp2, MakeArrayFromScalar(BooleanScalar(false), 64));
   auto temp3 = rand.ArrayOf(boolean(), len - 64 * 2, /*null_probability=*/0.01);
diff --git a/cpp/src/arrow/util/bitmap.h b/cpp/src/arrow/util/bitmap.h
index 461647e6b6c..141f863c0b8 100644
--- a/cpp/src/arrow/util/bitmap.h
+++ b/cpp/src/arrow/util/bitmap.h
@@ -349,7 +349,7 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
             out_bitmap.buffer_->mutable_data(), out_bitmap.offset_, out_bitmap.length_);
       }
 
-      RunVisitWordsAndWriteLoop(bit_length, readers, writers, std::move(visitor));
+      RunVisitWordsAndWriteLoop(bit_length, readers, writers, visitor);
     } else {
       std::array<BitmapWordReader<Word>, N> readers;
       for (size_t i = 0; i < N; ++i) {