From 6131c02b90a2549a680c87a71292031bd882349b Mon Sep 17 00:00:00 2001
From: Mahmut Bulut <vertexclique@gmail.com>
Date: Wed, 4 Nov 2020 23:05:46 +0100
Subject: [PATCH 01/22] ARROW-10588 - Implement safe bit operations over all
 crates that uses Arrow

---
 rust/arrow/Cargo.toml                         |   1 +
 rust/arrow/src/array/array_binary.rs          |   6 +-
 rust/arrow/src/array/array_list.rs            |  48 ++-
 rust/arrow/src/array/array_primitive.rs       |  22 +-
 rust/arrow/src/array/array_string.rs          |   5 +-
 rust/arrow/src/array/array_struct.rs          |   7 +-
 rust/arrow/src/array/array_union.rs           |   5 +-
 rust/arrow/src/array/builder.rs               |  39 +-
 rust/arrow/src/array/data.rs                  |  36 +-
 rust/arrow/src/array/equal/utils.rs           |  12 +-
 rust/arrow/src/bitmap.rs                      |   7 +-
 rust/arrow/src/buffer.rs                      | 171 ++++++--
 rust/arrow/src/compute/kernels/aggregate.rs   |  47 +-
 rust/arrow/src/compute/kernels/arithmetic.rs  |   6 +-
 rust/arrow/src/compute/kernels/boolean.rs     |   4 +-
 rust/arrow/src/compute/kernels/comparison.rs  |  19 +-
 rust/arrow/src/compute/kernels/filter.rs      |  12 +-
 rust/arrow/src/compute/kernels/limit.rs       |  13 +-
 rust/arrow/src/compute/kernels/take.rs        |  39 +-
 rust/arrow/src/compute/util.rs                |   8 +-
 rust/arrow/src/datatypes.rs                   |   8 +-
 rust/arrow/src/ipc/writer.rs                  |   4 +-
 rust/arrow/src/util/bit_chunk_iterator.rs     | 257 -----------
 rust/arrow/src/util/bit_ops.rs                | 407 ++++++++++++++++++
 rust/arrow/src/util/bit_util.rs               | 390 -----------------
 rust/arrow/src/util/buffered_iterator.rs      |   2 +-
 rust/arrow/src/util/mod.rs                    |   4 +-
 rust/arrow/src/util/utils.rs                  | 119 +++++
 rust/datafusion/tests/sql.rs                  |  34 +-
 .../src/bin/arrow-json-integration-test.rs    |   7 +-
 rust/parquet/src/arrow/array_reader.rs        |   7 +-
 31 files changed, 876 insertions(+), 870 deletions(-)
 delete mode 100644 rust/arrow/src/util/bit_chunk_iterator.rs
 create mode 100644 rust/arrow/src/util/bit_ops.rs
 delete mode 100644 rust/arrow/src/util/bit_util.rs
 create mode 100644 rust/arrow/src/util/utils.rs

diff --git a/rust/arrow/Cargo.toml b/rust/arrow/Cargo.toml
index 71445768207..86f1fac627d 100644
--- a/rust/arrow/Cargo.toml
+++ b/rust/arrow/Cargo.toml
@@ -48,6 +48,7 @@ lazy_static = "1.4"
 packed_simd = { version = "0.3.4", optional = true, package = "packed_simd_2" }
 chrono = "0.4"
 flatbuffers = "0.6"
+bitvec = "0.19"
 hex = "0.4"
 prettytable-rs = { version = "0.8.0", optional = true }
 
diff --git a/rust/arrow/src/array/array_binary.rs b/rust/arrow/src/array/array_binary.rs
index d7a3eb7217a..c8898001880 100644
--- a/rust/arrow/src/array/array_binary.rs
+++ b/rust/arrow/src/array/array_binary.rs
@@ -25,7 +25,7 @@ use super::{
     Array, ArrayData, ArrayDataRef, FixedSizeListArray, GenericBinaryIter,
     GenericListArray, LargeListArray, ListArray, OffsetSizeTrait,
 };
-use crate::util::bit_util;
+
 use crate::{buffer::Buffer, datatypes::ToByteSlice};
 use crate::{buffer::MutableBuffer, datatypes::DataType};
 
@@ -231,12 +231,10 @@ where
         offsets.push(length_so_far);
 
         {
-            let null_slice = null_buf.data_mut();
-
             for (i, s) in iter.enumerate() {
                 if let Some(s) = s {
                     let s = s.as_ref();
-                    bit_util::set_bit(null_slice, i);
+                    null_buf.set_bit(i);
                     length_so_far =
                         length_so_far + OffsetSize::from_usize(s.len()).unwrap();
                     values.extend_from_slice(s);
diff --git a/rust/arrow/src/array/array_list.rs b/rust/arrow/src/array/array_list.rs
index 4eb8dc56640..72a152f6f59 100644
--- a/rust/arrow/src/array/array_list.rs
+++ b/rust/arrow/src/array/array_list.rs
@@ -302,10 +302,10 @@ mod tests {
         buffer::Buffer,
         datatypes::{Field, ToByteSlice},
         memory,
-        util::bit_util,
     };
 
     use super::*;
+    use crate::util::bit_ops::BufferBitSliceMut;
 
     #[test]
     fn test_list_array() {
@@ -552,11 +552,14 @@ mod tests {
             Buffer::from(&[0, 2, 2, 2, 4, 6, 6, 9, 9, 10].to_byte_slice());
         // 01011001 00000001
         let mut null_bits: [u8; 2] = [0; 2];
-        bit_util::set_bit(&mut null_bits, 0);
-        bit_util::set_bit(&mut null_bits, 3);
-        bit_util::set_bit(&mut null_bits, 4);
-        bit_util::set_bit(&mut null_bits, 6);
-        bit_util::set_bit(&mut null_bits, 8);
+        {
+            let mut null_bit_slice = BufferBitSliceMut::new(&mut null_bits);
+            null_bit_slice.set_bit(0, true);
+            null_bit_slice.set_bit(3, true);
+            null_bit_slice.set_bit(4, true);
+            null_bit_slice.set_bit(6, true);
+            null_bit_slice.set_bit(8, true);
+        }
 
         // Construct a list array from the above two
         let list_data_type =
@@ -582,8 +585,9 @@ mod tests {
         assert_eq!(1, sliced_array.offset());
         assert_eq!(3, sliced_array.null_count());
 
+        let null_bit_slice = BufferBitSliceMut::new(&mut null_bits);
         for i in 0..sliced_array.len() {
-            if bit_util::get_bit(&null_bits, sliced_array.offset() + i) {
+            if null_bit_slice.get_bit(sliced_array.offset() + i) {
                 assert!(sliced_array.is_valid(i));
             } else {
                 assert!(sliced_array.is_null(i));
@@ -617,11 +621,14 @@ mod tests {
             Buffer::from(&[0i64, 2, 2, 2, 4, 6, 6, 9, 9, 10].to_byte_slice());
         // 01011001 00000001
         let mut null_bits: [u8; 2] = [0; 2];
-        bit_util::set_bit(&mut null_bits, 0);
-        bit_util::set_bit(&mut null_bits, 3);
-        bit_util::set_bit(&mut null_bits, 4);
-        bit_util::set_bit(&mut null_bits, 6);
-        bit_util::set_bit(&mut null_bits, 8);
+        {
+            let mut null_bit_slice = BufferBitSliceMut::new(&mut null_bits);
+            null_bit_slice.set_bit(0, true);
+            null_bit_slice.set_bit(3, true);
+            null_bit_slice.set_bit(4, true);
+            null_bit_slice.set_bit(6, true);
+            null_bit_slice.set_bit(8, true);
+        }
 
         // Construct a list array from the above two
         let list_data_type =
@@ -647,8 +654,9 @@ mod tests {
         assert_eq!(1, sliced_array.offset());
         assert_eq!(3, sliced_array.null_count());
 
+        let null_bit_slice = BufferBitSliceMut::new(&mut null_bits);
         for i in 0..sliced_array.len() {
-            if bit_util::get_bit(&null_bits, sliced_array.offset() + i) {
+            if null_bit_slice.get_bit(sliced_array.offset() + i) {
                 assert!(sliced_array.is_valid(i));
             } else {
                 assert!(sliced_array.is_null(i));
@@ -678,13 +686,16 @@ mod tests {
             ))
             .build();
 
-        // Set null buts for the nested array:
+        // Set null bits for the nested array:
         //  [[0, 1], null, null, [6, 7], [8, 9]]
         // 01011001 00000001
         let mut null_bits: [u8; 1] = [0; 1];
-        bit_util::set_bit(&mut null_bits, 0);
-        bit_util::set_bit(&mut null_bits, 3);
-        bit_util::set_bit(&mut null_bits, 4);
+        {
+            let mut null_bit_slice = BufferBitSliceMut::new(&mut null_bits);
+            null_bit_slice.set_bit(0, true);
+            null_bit_slice.set_bit(3, true);
+            null_bit_slice.set_bit(4, true);
+        }
 
         // Construct a fixed size list array from the above two
         let list_data_type = DataType::FixedSizeList(
@@ -711,8 +722,9 @@ mod tests {
         assert_eq!(1, sliced_array.offset());
         assert_eq!(2, sliced_array.null_count());
 
+        let null_bit_slice = BufferBitSliceMut::new(&mut null_bits);
         for i in 0..sliced_array.len() {
-            if bit_util::get_bit(&null_bits, sliced_array.offset() + i) {
+            if null_bit_slice.get_bit(sliced_array.offset() + i) {
                 assert!(sliced_array.is_valid(i));
             } else {
                 assert!(sliced_array.is_null(i));
diff --git a/rust/arrow/src/array/array_primitive.rs b/rust/arrow/src/array/array_primitive.rs
index d1014dbb940..34fbfc856af 100644
--- a/rust/arrow/src/array/array_primitive.rs
+++ b/rust/arrow/src/array/array_primitive.rs
@@ -30,7 +30,7 @@ use super::raw_pointer::RawPtrBox;
 use super::*;
 use crate::buffer::{Buffer, MutableBuffer};
 use crate::memory;
-use crate::util::bit_util;
+use crate::util::utils;
 
 /// Number of seconds in a day
 const SECONDS_IN_DAY: i64 = 86_400;
@@ -296,7 +296,7 @@ impl<T: ArrowPrimitiveType, Ptr: Borrow<Option<<T as ArrowPrimitiveType>::Native
         let (_, data_len) = iter.size_hint();
         let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound.
 
-        let num_bytes = bit_util::ceil(data_len, 8);
+        let num_bytes = utils::ceil(data_len, 8);
         let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, false);
         let mut val_buf = MutableBuffer::new(
             data_len * mem::size_of::<<T as ArrowPrimitiveType>::Native>(),
@@ -304,10 +304,9 @@ impl<T: ArrowPrimitiveType, Ptr: Borrow<Option<<T as ArrowPrimitiveType>::Native
 
         let null = vec![0; mem::size_of::<<T as ArrowPrimitiveType>::Native>()];
 
-        let null_slice = null_buf.data_mut();
         iter.enumerate().for_each(|(i, item)| {
             if let Some(a) = item.borrow() {
-                bit_util::set_bit(null_slice, i);
+                null_buf.set_bit(i);
                 val_buf.extend_from_slice(a.to_byte_slice());
             } else {
                 val_buf.extend_from_slice(&null);
@@ -401,10 +400,9 @@ impl<T: ArrowTimestampType> PrimitiveArray<T> {
 
         {
             let null = vec![0; mem::size_of::<i64>()];
-            let null_slice = null_buf.data_mut();
             for (i, v) in data.iter().enumerate() {
                 if let Some(n) = v {
-                    bit_util::set_bit(null_slice, i);
+                    null_buf.set_bit(i);
                     val_buf.extend_from_slice(&n.to_byte_slice());
                 } else {
                     val_buf.extend_from_slice(&null);
@@ -427,10 +425,9 @@ impl From<Vec<bool>> for BooleanArray {
     fn from(data: Vec<bool>) -> Self {
         let mut mut_buf = MutableBuffer::new_null(data.len());
         {
-            let mut_slice = mut_buf.data_mut();
             for (i, b) in data.iter().enumerate() {
                 if *b {
-                    bit_util::set_bit(mut_slice, i);
+                    mut_buf.set_bit(i);
                 }
             }
         }
@@ -445,19 +442,16 @@ impl From<Vec<bool>> for BooleanArray {
 impl From<Vec<Option<bool>>> for BooleanArray {
     fn from(data: Vec<Option<bool>>) -> Self {
         let data_len = data.len();
-        let num_byte = bit_util::ceil(data_len, 8);
+        let num_byte = utils::ceil(data_len, 8);
         let mut null_buf = MutableBuffer::new_null(data.len());
         let mut val_buf = MutableBuffer::new(num_byte).with_bitset(num_byte, false);
 
         {
-            let null_slice = null_buf.data_mut();
-            let val_slice = val_buf.data_mut();
-
             for (i, v) in data.iter().enumerate() {
                 if let Some(b) = v {
-                    bit_util::set_bit(null_slice, i);
+                    null_buf.set_bit(i);
                     if *b {
-                        bit_util::set_bit(val_slice, i);
+                        val_buf.set_bit(i);
                     }
                 }
             }
diff --git a/rust/arrow/src/array/array_string.rs b/rust/arrow/src/array/array_string.rs
index ec79ac9010d..4d61a203921 100644
--- a/rust/arrow/src/array/array_string.rs
+++ b/rust/arrow/src/array/array_string.rs
@@ -25,7 +25,7 @@ use super::{
     Array, ArrayData, ArrayDataRef, GenericListArray, GenericStringIter, LargeListArray,
     ListArray, OffsetSizeTrait,
 };
-use crate::util::bit_util;
+
 use crate::{buffer::Buffer, datatypes::ToByteSlice};
 use crate::{buffer::MutableBuffer, datatypes::DataType};
 
@@ -168,8 +168,7 @@ where
             if let Some(s) = s {
                 let s = s.as_ref();
                 // set null bit
-                let null_slice = null_buf.data_mut();
-                bit_util::set_bit(null_slice, i);
+                null_buf.set_bit(i);
 
                 length_so_far = length_so_far + OffsetSize::from_usize(s.len()).unwrap();
                 offsets.push(length_so_far);
diff --git a/rust/arrow/src/array/array_struct.rs b/rust/arrow/src/array/array_struct.rs
index 3715a8b1501..4ada0fd0679 100644
--- a/rust/arrow/src/array/array_struct.rs
+++ b/rust/arrow/src/array/array_struct.rs
@@ -23,14 +23,11 @@ use std::{any::Any, sync::Arc};
 
 use super::{make_array, Array, ArrayData, ArrayDataRef, ArrayRef};
 use crate::datatypes::DataType;
+use crate::error::{ArrowError, Result};
 use crate::{
     buffer::{buffer_bin_or, Buffer},
     datatypes::Field,
 };
-use crate::{
-    error::{ArrowError, Result},
-    util::bit_util,
-};
 
 /// A nested array type where each child (called *field*) is represented by a separate
 /// array.
@@ -149,7 +146,7 @@ impl TryFrom<Vec<(&str, ArrayRef)>> for StructArray {
             .len(len)
             .child_data(child_data);
         if let Some(null_buffer) = null {
-            let null_count = len - bit_util::count_set_bits(null_buffer.data());
+            let null_count = len - null_buffer.count_ones();
             builder = builder.null_count(null_count).null_bit_buffer(null_buffer);
         }
 
diff --git a/rust/arrow/src/array/array_union.rs b/rust/arrow/src/array/array_union.rs
index 8509228aa68..bcead845bd4 100644
--- a/rust/arrow/src/array/array_union.rs
+++ b/rust/arrow/src/array/array_union.rs
@@ -78,7 +78,6 @@ use crate::buffer::Buffer;
 use crate::datatypes::*;
 use crate::error::{ArrowError, Result};
 
-use crate::util::bit_util;
 use core::fmt;
 use std::any::Any;
 use std::mem;
@@ -145,7 +144,7 @@ impl UnionArray {
         bitmap: Option<Buffer>,
     ) -> Result<Self> {
         let bitmap_data = bitmap.map(|b| {
-            let null_count = type_ids.len() - bit_util::count_set_bits(b.data());
+            let null_count = type_ids.len() - b.count_ones();
             (b, null_count)
         });
 
@@ -232,7 +231,7 @@ impl UnionArray {
         assert!(index - self.offset() < self.len());
         if self.is_dense() {
             let valid_slots = match self.data.null_buffer() {
-                Some(b) => bit_util::count_set_bits_offset(b.data(), 0, index),
+                Some(b) => b.bit_slice().view(0, index).count_ones(),
                 None => index,
             };
             self.data().buffers()[1].data()[valid_slots * size_of::<i32>()] as i32
diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs
index d3b2f9890a5..607e4a42aca 100644
--- a/rust/arrow/src/array/builder.rs
+++ b/rust/arrow/src/array/builder.rs
@@ -31,7 +31,8 @@ use crate::array::*;
 use crate::buffer::{Buffer, MutableBuffer};
 use crate::datatypes::*;
 use crate::error::{ArrowError, Result};
-use crate::util::bit_util;
+use crate::util::bit_ops::BufferBitSliceMut;
+use crate::util::utils;
 
 ///  Converts a `MutableBuffer` to a `BufferBuilder<T>`.
 ///
@@ -188,7 +189,7 @@ pub trait BufferBuilderTrait<T: ArrowPrimitiveType> {
     ///
     /// assert!(builder.capacity() >= 20);
     /// ```
-    fn reserve(&mut self, n: usize) -> ();
+    fn reserve(&mut self, n: usize);
 
     /// Appends a value of type `T` into the builder,
     /// growing the internal buffer as needed.
@@ -255,8 +256,8 @@ impl<T: ArrowPrimitiveType> BufferBuilderTrait<T> for BufferBuilder<T> {
     #[inline]
     fn new(capacity: usize) -> Self {
         let buffer = if T::DATA_TYPE == DataType::Boolean {
-            let byte_capacity = bit_util::ceil(capacity, 8);
-            let actual_capacity = bit_util::round_upto_multiple_of_64(byte_capacity);
+            let byte_capacity = utils::ceil(capacity, 8);
+            let actual_capacity = utils::round_upto_multiple_of_64(byte_capacity);
             let mut buffer = MutableBuffer::new(actual_capacity);
             buffer.set_null_bits(0, actual_capacity);
             buffer
@@ -287,7 +288,7 @@ impl<T: ArrowPrimitiveType> BufferBuilderTrait<T> for BufferBuilder<T> {
     #[inline]
     fn advance(&mut self, i: usize) -> Result<()> {
         let new_buffer_len = if T::DATA_TYPE == DataType::Boolean {
-            bit_util::ceil(self.len + i, 8)
+            utils::ceil(self.len + i, 8)
         } else {
             (self.len + i) * mem::size_of::<T::Native>()
         };
@@ -301,7 +302,7 @@ impl<T: ArrowPrimitiveType> BufferBuilderTrait<T> for BufferBuilder<T> {
         let new_capacity = self.len + n;
         if T::DATA_TYPE == DataType::Boolean {
             if new_capacity > self.capacity() {
-                let new_byte_capacity = bit_util::ceil(new_capacity, 8);
+                let new_byte_capacity = utils::ceil(new_capacity, 8);
                 let existing_capacity = self.buffer.capacity();
                 let new_capacity = self.buffer.reserve(new_byte_capacity);
                 self.buffer
@@ -318,9 +319,7 @@ impl<T: ArrowPrimitiveType> BufferBuilderTrait<T> for BufferBuilder<T> {
         self.reserve(1);
         if T::DATA_TYPE == DataType::Boolean {
             if v != T::default_value() {
-                unsafe {
-                    bit_util::set_bit_raw(self.buffer.raw_data_mut(), self.len);
-                }
+                self.buffer.set_bit(self.len);
             }
             self.len += 1;
         } else {
@@ -340,7 +339,9 @@ impl<T: ArrowPrimitiveType> BufferBuilderTrait<T> for BufferBuilder<T> {
                         self.buffer.capacity(),
                     )
                 };
-                (self.len..self.len + n).for_each(|i| bit_util::set_bit(data, i))
+                (self.len..self.len + n).for_each(|i| {
+                    BufferBitSliceMut::new(data).set_bit(i, true);
+                });
             }
             self.len += n;
         } else {
@@ -362,9 +363,7 @@ impl<T: ArrowPrimitiveType> BufferBuilderTrait<T> for BufferBuilder<T> {
                     // For performance the `len` of the buffer is not
                     // updated on each append but is updated in the
                     // `freeze` method instead.
-                    unsafe {
-                        bit_util::set_bit_raw(self.buffer.raw_data_mut(), self.len);
-                    }
+                    self.buffer.set_bit(self.len);
                 }
                 self.len += 1;
             }
@@ -378,7 +377,7 @@ impl<T: ArrowPrimitiveType> BufferBuilderTrait<T> for BufferBuilder<T> {
     fn finish(&mut self) -> Buffer {
         if T::DATA_TYPE == DataType::Boolean {
             // `append` does not update the buffer's `len` so do it before `freeze` is called.
-            let new_buffer_len = bit_util::ceil(self.len, 8);
+            let new_buffer_len = utils::ceil(self.len, 8);
             debug_assert!(new_buffer_len >= self.buffer.len());
             let mut buf = std::mem::replace(&mut self.buffer, MutableBuffer::new(0));
             self.len = 0;
@@ -602,7 +601,7 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
     pub fn finish(&mut self) -> PrimitiveArray<T> {
         let len = self.len();
         let null_bit_buffer = self.bitmap_builder.finish();
-        let null_count = len - bit_util::count_set_bits(null_bit_buffer.data());
+        let null_count = len - null_bit_buffer.count_ones();
         let mut builder = ArrayData::builder(T::DATA_TYPE)
             .len(len)
             .add_buffer(self.values_builder.finish());
@@ -619,7 +618,7 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
     pub fn finish_dict(&mut self, values: ArrayRef) -> DictionaryArray<T> {
         let len = self.len();
         let null_bit_buffer = self.bitmap_builder.finish();
-        let null_count = len - bit_util::count_set_bits(null_bit_buffer.data());
+        let null_count = len - null_bit_buffer.count_ones();
         let data_type = DataType::Dictionary(
             Box::new(T::DATA_TYPE),
             Box::new(values.data_type().clone()),
@@ -831,7 +830,7 @@ where
 
         let offset_buffer = self.offsets_builder.finish();
         let null_bit_buffer = self.bitmap_builder.finish();
-        let nulls = bit_util::count_set_bits(null_bit_buffer.data());
+        let nulls = null_bit_buffer.count_ones();
         self.offsets_builder.append(0).unwrap();
         let data = ArrayData::builder(DataType::List(Box::new(Field::new(
             "item",
@@ -1043,7 +1042,7 @@ where
 
         let offset_buffer = self.offsets_builder.finish();
         let null_bit_buffer = self.bitmap_builder.finish();
-        let nulls = bit_util::count_set_bits(null_bit_buffer.data());
+        let nulls = null_bit_buffer.count_ones();
         self.offsets_builder.append(0).unwrap();
         let data = ArrayData::builder(DataType::LargeList(Box::new(Field::new(
             "item",
@@ -1234,7 +1233,7 @@ where
         }
 
         let null_bit_buffer = self.bitmap_builder.finish();
-        let nulls = bit_util::count_set_bits(null_bit_buffer.data());
+        let nulls = null_bit_buffer.count_ones();
         let data = ArrayData::builder(DataType::FixedSizeList(
             Box::new(Field::new("item", values_data.data_type().clone(), true)),
             self.list_len,
@@ -2134,7 +2133,7 @@ impl StructBuilder {
         }
 
         let null_bit_buffer = self.bitmap_builder.finish();
-        let null_count = self.len - bit_util::count_set_bits(null_bit_buffer.data());
+        let null_count = self.len - null_bit_buffer.count_ones();
         let mut builder = ArrayData::builder(DataType::Struct(self.fields.clone()))
             .len(self.len)
             .child_data(child_data);
diff --git a/rust/arrow/src/array/data.rs b/rust/arrow/src/array/data.rs
index 0aec4e614d9..0372d6c889a 100644
--- a/rust/arrow/src/array/data.rs
+++ b/rust/arrow/src/array/data.rs
@@ -23,7 +23,7 @@ use std::sync::Arc;
 
 use crate::buffer::Buffer;
 use crate::datatypes::DataType;
-use crate::util::bit_util;
+
 use crate::{bitmap::Bitmap, datatypes::ArrowNativeType};
 
 use super::equal::equal;
@@ -31,8 +31,8 @@ use super::equal::equal;
 #[inline]
 fn count_nulls(null_bit_buffer: Option<&Buffer>, offset: usize, len: usize) -> usize {
     if let Some(ref buf) = null_bit_buffer {
-        len.checked_sub(bit_util::count_set_bits_offset(buf.data(), offset, len))
-            .unwrap()
+        let ones = buf.bit_slice().view(offset, len).count_ones();
+        len.checked_sub(ones).unwrap()
     } else {
         0
     }
@@ -342,7 +342,7 @@ mod tests {
 
     use crate::buffer::Buffer;
     use crate::datatypes::ToByteSlice;
-    use crate::util::bit_util;
+    use crate::util::bit_ops::BufferBitSliceMut;
 
     #[test]
     fn test_new() {
@@ -388,9 +388,10 @@ mod tests {
     #[test]
     fn test_null_count() {
         let mut bit_v: [u8; 2] = [0; 2];
-        bit_util::set_bit(&mut bit_v, 0);
-        bit_util::set_bit(&mut bit_v, 3);
-        bit_util::set_bit(&mut bit_v, 10);
+        let mut bit_v_slice = BufferBitSliceMut::new(&mut bit_v);
+        bit_v_slice.set_bit(0, true);
+        bit_v_slice.set_bit(3, true);
+        bit_v_slice.set_bit(10, true);
         let arr_data = ArrayData::builder(DataType::Int32)
             .len(16)
             .null_bit_buffer(Buffer::from(bit_v))
@@ -399,9 +400,10 @@ mod tests {
 
         // Test with offset
         let mut bit_v: [u8; 2] = [0; 2];
-        bit_util::set_bit(&mut bit_v, 0);
-        bit_util::set_bit(&mut bit_v, 3);
-        bit_util::set_bit(&mut bit_v, 10);
+        let mut bit_v_slice = BufferBitSliceMut::new(&mut bit_v);
+        bit_v_slice.set_bit(0, true);
+        bit_v_slice.set_bit(3, true);
+        bit_v_slice.set_bit(10, true);
         let arr_data = ArrayData::builder(DataType::Int32)
             .len(12)
             .offset(2)
@@ -413,9 +415,10 @@ mod tests {
     #[test]
     fn test_null_buffer_ref() {
         let mut bit_v: [u8; 2] = [0; 2];
-        bit_util::set_bit(&mut bit_v, 0);
-        bit_util::set_bit(&mut bit_v, 3);
-        bit_util::set_bit(&mut bit_v, 10);
+        let mut bit_v_slice = BufferBitSliceMut::new(&mut bit_v);
+        bit_v_slice.set_bit(0, true);
+        bit_v_slice.set_bit(3, true);
+        bit_v_slice.set_bit(10, true);
         let arr_data = ArrayData::builder(DataType::Int32)
             .len(16)
             .null_bit_buffer(Buffer::from(bit_v))
@@ -427,9 +430,10 @@ mod tests {
     #[test]
     fn test_slice() {
         let mut bit_v: [u8; 2] = [0; 2];
-        bit_util::set_bit(&mut bit_v, 0);
-        bit_util::set_bit(&mut bit_v, 3);
-        bit_util::set_bit(&mut bit_v, 10);
+        let mut bit_v_slice = BufferBitSliceMut::new(&mut bit_v);
+        bit_v_slice.set_bit(0, true);
+        bit_v_slice.set_bit(3, true);
+        bit_v_slice.set_bit(10, true);
         let data = ArrayData::builder(DataType::Int32)
             .len(16)
             .null_bit_buffer(Buffer::from(bit_v))
diff --git a/rust/arrow/src/array/equal/utils.rs b/rust/arrow/src/array/equal/utils.rs
index f9e8860a5bb..343c86c3d88 100644
--- a/rust/arrow/src/array/equal/utils.rs
+++ b/rust/arrow/src/array/equal/utils.rs
@@ -15,7 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::{array::ArrayData, util::bit_util};
+use crate::array::ArrayData;
+use crate::util::bit_ops::BufferBitSlice;
 
 // whether bits along the positions are equal
 // `lhs_start`, `rhs_start` and `len` are _measured in bits_.
@@ -27,10 +28,11 @@ pub(super) fn equal_bits(
     rhs_start: usize,
     len: usize,
 ) -> bool {
-    (0..len).all(|i| {
-        bit_util::get_bit(lhs_values, lhs_start + i)
-            == bit_util::get_bit(rhs_values, rhs_start + i)
-    })
+    let (lhs, rhs) = (
+        BufferBitSlice::new(lhs_values),
+        BufferBitSlice::new(rhs_values),
+    );
+    lhs.view(lhs_start, len) == rhs.view(rhs_start, len)
 }
 
 #[inline]
diff --git a/rust/arrow/src/bitmap.rs b/rust/arrow/src/bitmap.rs
index 7df609fb88b..1bf7ce14361 100644
--- a/rust/arrow/src/bitmap.rs
+++ b/rust/arrow/src/bitmap.rs
@@ -20,7 +20,7 @@
 
 use crate::buffer::Buffer;
 use crate::error::Result;
-use crate::util::bit_util;
+
 use std::mem;
 
 use std::ops::{BitAnd, BitOr};
@@ -44,6 +44,7 @@ impl Bitmap {
         }
     }
 
+    #[inline]
     pub fn len(&self) -> usize {
         self.bits.len()
     }
@@ -52,9 +53,9 @@ impl Bitmap {
         self.bits.is_empty()
     }
 
+    #[inline]
     pub fn is_set(&self, i: usize) -> bool {
-        assert!(i < (self.bits.len() << 3));
-        unsafe { bit_util::get_bit_raw(self.bits.raw_data(), i) }
+        self.bits.get_bit(i)
     }
 
     pub fn buffer_ref(&self) -> &Buffer {
diff --git a/rust/arrow/src/buffer.rs b/rust/arrow/src/buffer.rs
index d5b824e2992..c381ccdb5cf 100644
--- a/rust/arrow/src/buffer.rs
+++ b/rust/arrow/src/buffer.rs
@@ -33,9 +33,10 @@ use std::sync::Arc;
 use crate::datatypes::ArrowNativeType;
 use crate::error::{ArrowError, Result};
 use crate::memory;
-use crate::util::bit_chunk_iterator::BitChunks;
-use crate::util::bit_util;
-use crate::util::bit_util::ceil;
+use crate::util::bit_ops::*;
+use crate::util::utils;
+use crate::util::utils::ceil;
+
 #[cfg(feature = "simd")]
 use std::borrow::BorrowMut;
 
@@ -258,19 +259,52 @@ impl Buffer {
     /// Returns a slice of this buffer starting at a certain bit offset.
     /// If the offset is byte-aligned the returned buffer is a shallow clone,
     /// otherwise a new buffer is allocated and filled with a copy of the bits in the range.
-    pub fn bit_slice(&self, offset: usize, len: usize) -> Self {
-        if offset % 8 == 0 && len % 8 == 0 {
-            return self.slice(offset / 8);
+    #[inline]
+    pub fn bit_view(&self, offset_in_bits: usize, len_in_bits: usize) -> Self {
+        if offset_in_bits % 8 == 0 && len_in_bits % 8 == 0 {
+            self.slice(offset_in_bits / 8)
+        } else {
+            self.bit_slice()
+                .view(offset_in_bits, len_in_bits)
+                .as_buffer()
         }
+    }
+
+    /// Gives bit slice of the underlying buffer
+    /// This method can be used to get bit views for bit operations on the immutable view over the buffer.
+    #[inline]
+    pub fn bit_slice(&self) -> BufferBitSlice {
+        BufferBitSlice::new(self.data.data())
+    }
+
+    /// Count one bits in this buffer
+    #[inline]
+    pub fn count_ones(&self) -> usize {
+        self.bit_slice().count_ones()
+    }
 
-        bitwise_unary_op_helper(&self, offset, len, |a| a)
+    /// Count zero bits in this buffer
+    #[inline]
+    pub fn count_zeros(&self) -> usize {
+        self.bit_slice().count_zeros()
     }
 
-    pub fn bit_chunks(&self, offset: usize, len: usize) -> BitChunks {
-        BitChunks::new(&self, offset, len)
+    ///
+    /// Get bit value at the given index in this bit view
+    #[inline]
+    pub fn get_bit(&self, index: usize) -> bool {
+        self.bit_slice().get_bit(index)
+    }
+
+    ///
+    /// Get bits in this view as vector of booleans
+    #[inline]
+    pub fn typed_bits(&mut self) -> Vec<bool> {
+        self.bit_slice().typed_bits()
     }
 
     /// Returns an empty buffer.
+    #[inline]
     pub fn empty() -> Self {
         unsafe { Self::from_raw_parts(BUFFER_INIT.as_ptr() as _, 0, 0) }
     }
@@ -283,7 +317,7 @@ impl<T: AsRef<[u8]>> From<T> for Buffer {
         // allocate aligned memory buffer
         let slice = p.as_ref();
         let len = slice.len() * mem::size_of::<u8>();
-        let capacity = bit_util::round_upto_multiple_of_64(len);
+        let capacity = utils::round_upto_multiple_of_64(len);
         let buffer = memory::allocate_aligned(capacity);
         unsafe {
             memory::memcpy(buffer, slice.as_ptr(), len);
@@ -322,7 +356,7 @@ where
         .borrow_mut()
         .zip(left_chunks.borrow_mut().zip(right_chunks.borrow_mut()))
         .for_each(|(res, (left, right))| {
-            unsafe { bit_util::bitwise_bin_op_simd(&left, &right, res, &simd_op) };
+            unsafe { utils::bitwise_bin_op_simd(&left, &right, res, &simd_op) };
         });
 
     result_chunks
@@ -401,20 +435,27 @@ where
     let mut result =
         MutableBuffer::new(ceil(len_in_bits, 8)).with_bitset(len_in_bits / 64 * 8, false);
 
-    let left_chunks = left.bit_chunks(left_offset_in_bits, len_in_bits);
-    let right_chunks = right.bit_chunks(right_offset_in_bits, len_in_bits);
+    let left_slice = left.bit_slice().view(left_offset_in_bits, len_in_bits);
+    let left_chunks = left_slice.chunks::<u64>();
+
+    let right_slice = right.bit_slice().view(right_offset_in_bits, len_in_bits);
+    let right_chunks = right_slice.chunks::<u64>();
+
+    let remainder_bytes = ceil(left_chunks.remainder_bit_len(), 8);
+    let rem = op(left_chunks.remainder_bits(), right_chunks.remainder_bits());
+    let rem = &rem.to_ne_bytes()[0..remainder_bytes];
+
+    let left_chunk_iter = left_chunks.interpret();
+    let right_chunk_iter = right_chunks.interpret();
+
     let result_chunks = result.typed_data_mut::<u64>().iter_mut();
 
     result_chunks
-        .zip(left_chunks.iter().zip(right_chunks.iter()))
+        .zip(left_chunk_iter.zip(right_chunk_iter))
         .for_each(|(res, (left, right))| {
             *res = op(left, right);
         });
 
-    let remainder_bytes = ceil(left_chunks.remainder_len(), 8);
-    let rem = op(left_chunks.remainder_bits(), right_chunks.remainder_bits());
-    // we are counting its starting from the least significant bit, to to_le_bytes should be correct
-    let rem = &rem.to_le_bytes()[0..remainder_bytes];
     result.extend_from_slice(rem);
 
     result.freeze()
@@ -435,19 +476,21 @@ where
     let mut result =
         MutableBuffer::new(ceil(len_in_bits, 8)).with_bitset(len_in_bits / 64 * 8, false);
 
-    let left_chunks = left.bit_chunks(offset_in_bits, len_in_bits);
+    let left_slice = left.bit_slice().view(offset_in_bits, len_in_bits);
+    let left_chunks = left_slice.chunks::<u64>();
+
+    let remainder_bytes = ceil(left_chunks.remainder_bit_len(), 8);
+    let rem = op(left_chunks.remainder_bits());
+    let rem = &rem.to_ne_bytes()[0..remainder_bytes];
+
+    let left_chunk_iter = left_chunks.interpret();
+
     let result_chunks = result.typed_data_mut::<u64>().iter_mut();
 
-    result_chunks
-        .zip(left_chunks.iter())
-        .for_each(|(res, left)| {
-            *res = op(left);
-        });
+    result_chunks.zip(left_chunk_iter).for_each(|(res, left)| {
+        *res = op(left);
+    });
 
-    let remainder_bytes = ceil(left_chunks.remainder_len(), 8);
-    let rem = op(left_chunks.remainder_bits());
-    // we are counting its starting from the least significant bit, to to_le_bytes should be correct
-    let rem = &rem.to_le_bytes()[0..remainder_bytes];
     result.extend_from_slice(rem);
 
     result.freeze()
@@ -605,7 +648,7 @@ pub struct MutableBuffer {
 impl MutableBuffer {
     /// Allocate a new mutable buffer with initial capacity to be `capacity`.
     pub fn new(capacity: usize) -> Self {
-        let new_capacity = bit_util::round_upto_multiple_of_64(capacity);
+        let new_capacity = utils::round_upto_multiple_of_64(capacity);
         let ptr = memory::allocate_aligned(new_capacity);
         Self {
             data: ptr,
@@ -616,7 +659,7 @@ impl MutableBuffer {
 
     /// creates a new [MutableBuffer] where every bit is initialized to `0`
     pub fn new_null(len: usize) -> Self {
-        let num_bytes = bit_util::ceil(len, 8);
+        let num_bytes = utils::ceil(len, 8);
         MutableBuffer::new(num_bytes).with_bitset(num_bytes, false)
     }
 
@@ -654,7 +697,7 @@ impl MutableBuffer {
     /// Returns the new capacity for this buffer.
     pub fn reserve(&mut self, capacity: usize) -> usize {
         if capacity > self.capacity {
-            let new_capacity = bit_util::round_upto_multiple_of_64(capacity);
+            let new_capacity = utils::round_upto_multiple_of_64(capacity);
             let new_capacity = cmp::max(new_capacity, self.capacity * 2);
             let new_data =
                 unsafe { memory::reallocate(self.data, self.capacity, new_capacity) };
@@ -675,7 +718,7 @@ impl MutableBuffer {
         if new_len > self.len {
             self.reserve(new_len);
         } else {
-            let new_capacity = bit_util::round_upto_multiple_of_64(new_len);
+            let new_capacity = utils::round_upto_multiple_of_64(new_len);
             if new_capacity < self.capacity {
                 let new_data =
                     unsafe { memory::reallocate(self.data, self.capacity, new_capacity) };
@@ -729,6 +772,8 @@ impl MutableBuffer {
 
     /// Returns a raw pointer for this buffer.
     ///
+    /// # Safety
+    ///
     /// Note that this should be used cautiously, and the returned pointer should not be
     /// stored anywhere, to avoid dangling pointers.
     #[inline]
@@ -736,6 +781,13 @@ impl MutableBuffer {
         self.data
     }
 
+    /// Returns a mutable raw pointer for this buffer.
+    ///
+    /// # Safety
+    ///
+    /// Note that this should be used cautiously, and the returned pointer should not be
+    /// stored anywhere, to avoid dangling pointers. Moreover, arguments and return type using this
+    /// method should be either immutable, or mutable.
     #[inline]
     pub fn raw_data_mut(&mut self) -> *mut u8 {
         self.data
@@ -779,6 +831,51 @@ impl MutableBuffer {
         }
         self.len += bytes.len();
     }
+
+    /// Gives immutable bit slice of the underlying buffer
+    /// This method can be used to get bit views for bit operations on the immutable view over the buffer.
+    #[inline]
+    pub fn bit_slice(&self) -> BufferBitSlice {
+        BufferBitSlice::new(self.data())
+    }
+
+    /// Gives mutable bit slice of the underlying buffer
+    /// This method can be used to get bit views for bit operations on the mutable view over the buffer.
+    #[inline]
+    pub fn bit_slice_mut(&mut self) -> BufferBitSliceMut {
+        BufferBitSliceMut::new(self.data_mut())
+    }
+
+    /// Count one bits in this buffer
+    #[inline]
+    pub fn count_ones(&self) -> usize {
+        self.bit_slice().count_ones()
+    }
+
+    /// Count zero bits in this buffer
+    #[inline]
+    pub fn count_zeros(&self) -> usize {
+        self.bit_slice().count_zeros()
+    }
+
+    /// Unsets bit at the given position in the underlying mutable buffer
+    #[inline]
+    pub fn set_bit(&mut self, i: usize) {
+        self.bit_slice_mut().set_bit(i, true)
+    }
+
+    /// Unsets bit at the given position in the underlying mutable buffer
+    #[inline]
+    pub fn unset_bit(&mut self, i: usize) {
+        self.bit_slice_mut().set_bit(i, false)
+    }
+
+    ///
+    /// Get bits in this view as vector of booleans
+    #[inline]
+    pub fn typed_bits(&mut self) -> Vec<bool> {
+        self.bit_slice().typed_bits()
+    }
 }
 
 impl Drop for MutableBuffer {
@@ -806,7 +903,7 @@ unsafe impl Send for MutableBuffer {}
 
 #[cfg(test)]
 mod tests {
-    use crate::util::bit_util;
+
     use std::ptr::null_mut;
     use std::thread;
 
@@ -908,11 +1005,11 @@ mod tests {
     fn test_with_bitset() {
         let mut_buf = MutableBuffer::new(64).with_bitset(64, false);
         let buf = mut_buf.freeze();
-        assert_eq!(0, bit_util::count_set_bits(buf.data()));
+        assert_eq!(0, buf.count_ones());
 
         let mut_buf = MutableBuffer::new(64).with_bitset(64, true);
         let buf = mut_buf.freeze();
-        assert_eq!(512, bit_util::count_set_bits(buf.data()));
+        assert_eq!(512, buf.count_ones());
     }
 
     #[test]
@@ -920,12 +1017,12 @@ mod tests {
         let mut mut_buf = MutableBuffer::new(64).with_bitset(64, true);
         mut_buf.set_null_bits(0, 64);
         let buf = mut_buf.freeze();
-        assert_eq!(0, bit_util::count_set_bits(buf.data()));
+        assert_eq!(0, buf.count_ones());
 
         let mut mut_buf = MutableBuffer::new(64).with_bitset(64, true);
         mut_buf.set_null_bits(32, 32);
         let buf = mut_buf.freeze();
-        assert_eq!(256, bit_util::count_set_bits(buf.data()));
+        assert_eq!(256, buf.count_ones());
     }
 
     #[test]
diff --git a/rust/arrow/src/compute/kernels/aggregate.rs b/rust/arrow/src/compute/kernels/aggregate.rs
index 444e2454a1c..e332a644de5 100644
--- a/rust/arrow/src/compute/kernels/aggregate.rs
+++ b/rust/arrow/src/compute/kernels/aggregate.rs
@@ -152,9 +152,15 @@ where
             let data_chunks = data.chunks_exact(64);
             let remainder = data_chunks.remainder();
 
-            let bit_chunks = buffer.bit_chunks(array.offset(), array.len());
+            let buffer_slice = buffer.bit_slice().view(array.offset(), array.len());
+            let buffer_chunks = buffer_slice.chunks::<u64>();
+
+            let buffer_remainder_bits: u64 = buffer_chunks.remainder_bits();
+
+            let buffer_chunk_iter = buffer_chunks.interpret();
+
             &data_chunks
-                .zip(bit_chunks.iter())
+                .zip(buffer_chunk_iter)
                 .for_each(|(chunk, mask)| {
                     chunk.iter().enumerate().for_each(|(i, value)| {
                         if (mask & (1 << i)) != 0 {
@@ -163,10 +169,8 @@ where
                     });
                 });
 
-            let remainder_bits = bit_chunks.remainder_bits();
-
             remainder.iter().enumerate().for_each(|(i, value)| {
-                if remainder_bits & (1 << i) != 0 {
+                if buffer_remainder_bits & (1 << i) != 0 {
                     sum = sum + *value;
                 }
             });
@@ -216,24 +220,27 @@ where
             let data_chunks = data.chunks_exact(64);
             let remainder = data_chunks.remainder();
 
-            let bit_chunks = buffer.bit_chunks(array.offset(), array.len());
+            let bit_slice = buffer.bit_slice().view(array.offset(), array.len());
+            let bit_chunks = bit_slice.chunks::<u64>();
             let remainder_bits = bit_chunks.remainder_bits();
 
-            data_chunks.zip(bit_chunks).for_each(|(chunk, mut mask)| {
-                // split chunks further into slices corresponding to the vector length
-                // the compiler is able to unroll this inner loop and remove bounds checks
-                // since the outer chunk size (64) is always a multiple of the number of lanes
-                chunk.chunks_exact(T::lanes()).for_each(|chunk| {
-                    let zero = T::init(T::default_value());
-                    let vecmask = T::mask_from_u64(mask);
-                    let chunk = T::load(&chunk);
-                    let blended = T::mask_select(vecmask, chunk, zero);
-
-                    vector_sum = vector_sum + blended;
-
-                    mask = mask >> T::lanes();
+            data_chunks
+                .zip(bit_chunks.interpret())
+                .for_each(|(chunk, mut mask)| {
+                    // split chunks further into slices corresponding to the vector length
+                    // the compiler is able to unroll this inner loop and remove bounds checks
+                    // since the outer chunk size (64) is always a multiple of the number of lanes
+                    chunk.chunks_exact(T::lanes()).for_each(|chunk| {
+                        let zero = T::init(T::default_value());
+                        let vecmask = T::mask_from_u64(mask);
+                        let chunk = T::load(&chunk);
+                        let blended = T::mask_select(vecmask, chunk, zero);
+
+                        vector_sum = vector_sum + blended;
+
+                        mask = mask >> T::lanes();
+                    });
                 });
-            });
 
             remainder.iter().enumerate().for_each(|(i, value)| {
                 if remainder_bits & (1 << i) != 0 {
diff --git a/rust/arrow/src/compute/kernels/arithmetic.rs b/rust/arrow/src/compute/kernels/arithmetic.rs
index fe1bda5e26f..c88680e6d6a 100644
--- a/rust/arrow/src/compute/kernels/arithmetic.rs
+++ b/rust/arrow/src/compute/kernels/arithmetic.rs
@@ -31,6 +31,7 @@ use std::sync::Arc;
 
 use num::{One, Zero};
 
+use crate::array::*;
 #[cfg(feature = "simd")]
 use crate::bitmap::Bitmap;
 use crate::buffer::Buffer;
@@ -42,7 +43,6 @@ use crate::compute::util::simd_load_set_invalid;
 use crate::datatypes;
 use crate::datatypes::ToByteSlice;
 use crate::error::{ArrowError, Result};
-use crate::{array::*, util::bit_util};
 
 /// Helper function to perform math lambda function on values from two arrays. If either
 /// left or right value is null then the output value is also null, so `1 + null` is
@@ -113,8 +113,8 @@ where
     if let Some(b) = &null_bit_buffer {
         // some value is null
         for i in 0..left.len() {
-            values.push(unsafe {
-                if bit_util::get_bit_raw(b.raw_data(), i) {
+            values.push({
+                if b.get_bit(i) {
                     let right_value = right.value(i);
                     if right_value.is_zero() {
                         return Err(ArrowError::DivideByZero);
diff --git a/rust/arrow/src/compute/kernels/boolean.rs b/rust/arrow/src/compute/kernels/boolean.rs
index 075ffb0d67b..ac9d24d20bb 100644
--- a/rust/arrow/src/compute/kernels/boolean.rs
+++ b/rust/arrow/src/compute/kernels/boolean.rs
@@ -31,7 +31,7 @@ use crate::buffer::{
 use crate::compute::util::combine_option_bitmap;
 use crate::datatypes::DataType;
 use crate::error::{ArrowError, Result};
-use crate::util::bit_util::ceil;
+use crate::util::utils::ceil;
 
 /// Helper function to implement binary kernels
 fn binary_boolean_kernel<F>(
@@ -140,7 +140,7 @@ pub fn is_not_null(input: &ArrayRef) -> Result<BooleanArray> {
                 .with_bitset(len_bytes, true)
                 .freeze()
         }
-        Some(buffer) => buffer.bit_slice(input.offset(), len),
+        Some(buffer) => buffer.bit_view(input.offset(), len),
     };
 
     let data =
diff --git a/rust/arrow/src/compute/kernels/comparison.rs b/rust/arrow/src/compute/kernels/comparison.rs
index d73356e44fe..f0bcf7e6f76 100644
--- a/rust/arrow/src/compute/kernels/comparison.rs
+++ b/rust/arrow/src/compute/kernels/comparison.rs
@@ -31,7 +31,7 @@ use crate::buffer::{Buffer, MutableBuffer};
 use crate::compute::util::combine_option_bitmap;
 use crate::datatypes::{ArrowNumericType, BooleanType, DataType};
 use crate::error::{ArrowError, Result};
-use crate::util::bit_util;
+use crate::util::utils;
 
 /// Helper function to perform boolean lambda function on values from two arrays, this
 /// version does not attempt to use SIMD.
@@ -570,27 +570,26 @@ where
         ));
     }
 
-    let num_bytes = bit_util::ceil(left_len, 8);
+    let num_bytes = utils::ceil(left_len, 8);
 
     let not_both_null_bit_buffer =
         match combine_option_bitmap(left.data_ref(), right.data_ref(), left_len)? {
             Some(buff) => buff,
             None => new_all_set_buffer(num_bytes),
         };
-    let not_both_null_bitmap = not_both_null_bit_buffer.data();
+    let _not_both_null_bitmap = not_both_null_bit_buffer.data();
 
     let mut bool_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, false);
-    let bool_slice = bool_buf.data_mut();
 
     // if both array slots are valid, check if list contains primitive
     for i in 0..left_len {
-        if bit_util::get_bit(not_both_null_bitmap, i) {
+        if not_both_null_bit_buffer.get_bit(i) {
             let list = right.value(i);
             let list = list.as_any().downcast_ref::<PrimitiveArray<T>>().unwrap();
 
             for j in 0..list.len() {
                 if list.is_valid(j) && (left.value(i) == list.value(j)) {
-                    bit_util::set_bit(bool_slice, i);
+                    bool_buf.set_bit(i);
                     continue;
                 }
             }
@@ -625,21 +624,19 @@ where
         ));
     }
 
-    let num_bytes = bit_util::ceil(left_len, 8);
+    let num_bytes = utils::ceil(left_len, 8);
 
     let not_both_null_bit_buffer =
         match combine_option_bitmap(left.data_ref(), right.data_ref(), left_len)? {
             Some(buff) => buff,
             None => new_all_set_buffer(num_bytes),
         };
-    let not_both_null_bitmap = not_both_null_bit_buffer.data();
 
     let mut bool_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, false);
-    let bool_slice = bool_buf.data_mut();
 
     for i in 0..left_len {
         // contains(null, null) = false
-        if bit_util::get_bit(not_both_null_bitmap, i) {
+        if not_both_null_bit_buffer.get_bit(i) {
             let list = right.value(i);
             let list = list
                 .as_any()
@@ -648,7 +645,7 @@ where
 
             for j in 0..list.len() {
                 if list.is_valid(j) && (left.value(i) == list.value(j)) {
-                    bit_util::set_bit(bool_slice, i);
+                    bool_buf.set_bit(i);
                     continue;
                 }
             }
diff --git a/rust/arrow/src/compute/kernels/filter.rs b/rust/arrow/src/compute/kernels/filter.rs
index 55024ff2ae2..a171c1fcebe 100644
--- a/rust/arrow/src/compute/kernels/filter.rs
+++ b/rust/arrow/src/compute/kernels/filter.rs
@@ -21,10 +21,10 @@ use crate::array::*;
 use crate::datatypes::*;
 use crate::error::{ArrowError, Result};
 use crate::record_batch::RecordBatch;
+use crate::util::bit_ops::BufferBitSlice;
 use crate::{
     bitmap::Bitmap,
     buffer::{Buffer, MutableBuffer},
-    util::bit_util,
 };
 use std::{mem, sync::Arc};
 
@@ -93,8 +93,9 @@ impl<'a> NullBitSetter<'a> {
 impl<'a> CopyNullBit for NullBitSetter<'a> {
     #[inline]
     fn copy_null_bit(&mut self, source_index: usize) {
-        if !bit_util::get_bit(self.source_bytes, source_index) {
-            bit_util::unset_bit(self.target_buffer.data_mut(), self.target_index);
+        let source = BufferBitSlice::new(self.source_bytes);
+        if !source.get_bit(source_index) {
+            self.target_buffer.unset_bit(self.target_index);
             self.null_count += 1;
         }
         self.target_index += 1;
@@ -320,8 +321,9 @@ impl FilterContext {
         }
         let filter_mask: Vec<u64> = (0..64).map(|x| 1u64 << x).collect();
         let filter_bytes = filter_array.data_ref().buffers()[0].data();
-        let filtered_count =
-            bit_util::count_set_bits_offset(filter_bytes, 0, filter_array.len());
+        let filtered_count = BufferBitSlice::new(filter_bytes)
+            .view(0, filter_array.len())
+            .count_ones();
 
         // transmute filter_bytes to &[u64]
         let mut u64_buffer = MutableBuffer::new(filter_bytes.len());
diff --git a/rust/arrow/src/compute/kernels/limit.rs b/rust/arrow/src/compute/kernels/limit.rs
index 65f66bce8e5..8f191c3e933 100644
--- a/rust/arrow/src/compute/kernels/limit.rs
+++ b/rust/arrow/src/compute/kernels/limit.rs
@@ -37,8 +37,8 @@ mod tests {
     use crate::array::*;
     use crate::buffer::Buffer;
     use crate::datatypes::{DataType, Field, ToByteSlice};
-    use crate::util::bit_util;
 
+    use crate::util::bit_ops::BufferBitSliceMut;
     use std::sync::Arc;
 
     #[test]
@@ -103,11 +103,12 @@ mod tests {
             Buffer::from(&[0, 2, 2, 4, 4, 6, 6, 9, 9, 10].to_byte_slice());
         // 01010101 00000001
         let mut null_bits: [u8; 2] = [0; 2];
-        bit_util::set_bit(&mut null_bits, 0);
-        bit_util::set_bit(&mut null_bits, 2);
-        bit_util::set_bit(&mut null_bits, 4);
-        bit_util::set_bit(&mut null_bits, 6);
-        bit_util::set_bit(&mut null_bits, 8);
+        let mut null_bit_slice = BufferBitSliceMut::new(&mut null_bits);
+        null_bit_slice.set_bit(0, true);
+        null_bit_slice.set_bit(2, true);
+        null_bit_slice.set_bit(4, true);
+        null_bit_slice.set_bit(6, true);
+        null_bit_slice.set_bit(8, true);
 
         // Construct a list array from the above two
         let list_data_type =
diff --git a/rust/arrow/src/compute/kernels/take.rs b/rust/arrow/src/compute/kernels/take.rs
index 9c9ca56fec3..8788e760035 100644
--- a/rust/arrow/src/compute/kernels/take.rs
+++ b/rust/arrow/src/compute/kernels/take.rs
@@ -23,7 +23,7 @@ use crate::buffer::{Buffer, MutableBuffer};
 use crate::compute::util::take_value_indices_from_list;
 use crate::datatypes::*;
 use crate::error::{ArrowError, Result};
-use crate::util::bit_util;
+use crate::util::utils;
 use crate::{array::*, buffer::buffer_bin_and};
 
 use num::{ToPrimitive, Zero};
@@ -212,11 +212,9 @@ where
 
     let array = values.as_any().downcast_ref::<PrimitiveArray<T>>().unwrap();
 
-    let num_bytes = bit_util::ceil(data_len, 8);
+    let num_bytes = utils::ceil(data_len, 8);
     let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true);
 
-    let null_slice = null_buf.data_mut();
-
     // This iteration is implemented with a while loop, rather than a
     // map()/collect(), since the while loop performs better in the benchmarks.
     let mut new_values: Vec<T::Native> = Vec::with_capacity(data_len);
@@ -227,7 +225,7 @@ where
         })?;
 
         if array.is_null(index) {
-            bit_util::unset_bit(null_slice, i);
+            null_buf.unset_bit(i);
         }
 
         new_values.push(array.value(index));
@@ -265,22 +263,19 @@ where
 
     let array = values.as_any().downcast_ref::<BooleanArray>().unwrap();
 
-    let num_byte = bit_util::ceil(data_len, 8);
+    let num_byte = utils::ceil(data_len, 8);
     let mut null_buf = MutableBuffer::new(num_byte).with_bitset(num_byte, true);
     let mut val_buf = MutableBuffer::new(num_byte).with_bitset(num_byte, false);
 
-    let null_slice = null_buf.data_mut();
-    let val_slice = val_buf.data_mut();
-
     (0..data_len).try_for_each::<_, Result<()>>(|i| {
         let index = ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| {
             ArrowError::ComputeError("Cast to usize failed".to_string())
         })?;
 
         if array.is_null(index) {
-            bit_util::unset_bit(null_slice, i);
+            null_buf.unset_bit(i);
         } else if array.value(index) {
-            bit_util::set_bit(val_slice, i);
+            val_buf.set_bit(i);
         }
 
         Ok(())
@@ -320,9 +315,8 @@ where
         .downcast_ref::<GenericStringArray<OffsetSize>>()
         .unwrap();
 
-    let num_bytes = bit_util::ceil(data_len, 8);
+    let num_bytes = utils::ceil(data_len, 8);
     let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true);
-    let null_slice = null_buf.data_mut();
 
     let mut offsets = Vec::with_capacity(data_len + 1);
     let mut values = Vec::with_capacity(data_len);
@@ -341,7 +335,7 @@ where
             values.extend_from_slice(s.as_bytes());
         } else {
             // set null bit
-            bit_util::unset_bit(null_slice, i);
+            null_buf.unset_bit(i);
         }
         offsets.push(length_so_far);
     }
@@ -388,17 +382,14 @@ where
 
     let taken = take_impl::<OffsetType>(&list.values(), &list_indices, None)?;
     // determine null count and null buffer, which are a function of `values` and `indices`
-    let mut null_count = 0;
-    let num_bytes = bit_util::ceil(indices.len(), 8);
+    let num_bytes = utils::ceil(indices.len(), 8);
     let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true);
     {
-        let null_slice = null_buf.data_mut();
         offsets[..].windows(2).enumerate().for_each(
             |(i, window): (usize, &[OffsetType::Native])| {
                 if window[0] == window[1] {
                     // offsets are equal, slot is null
-                    bit_util::unset_bit(null_slice, i);
-                    null_count += 1;
+                    null_buf.unset_bit(i);
                 }
             },
         );
@@ -407,7 +398,7 @@ where
     // create a new list with taken data and computed null information
     let list_data = ArrayDataBuilder::new(list.data_type().clone())
         .len(indices.len())
-        .null_count(null_count)
+        .null_count(null_buf.count_zeros())
         .null_bit_buffer(null_buf.freeze())
         .offset(0)
         .add_child_data(taken.data())
@@ -452,6 +443,7 @@ where
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::util::bit_ops::*;
 
     fn test_take_primitive_arrays<T>(
         data: Vec<Option<T::Native>>,
@@ -951,9 +943,10 @@ mod tests {
             let expected_offsets = Buffer::from(&expected_offsets.to_byte_slice());
             // construct list array from the two
             let mut null_bits: [u8; 1] = [0; 1];
-            bit_util::set_bit(&mut null_bits, 2);
-            bit_util::set_bit(&mut null_bits, 3);
-            bit_util::set_bit(&mut null_bits, 4);
+            let mut null_bit_view = BufferBitSliceMut::new(&mut null_bits);
+            null_bit_view.set_bit(2, true);
+            null_bit_view.set_bit(3, true);
+            null_bit_view.set_bit(4, true);
             let expected_list_data = ArrayData::builder(list_data_type)
                 .len(5)
                 .null_count(2)
diff --git a/rust/arrow/src/compute/util.rs b/rust/arrow/src/compute/util.rs
index 3b578011640..dde50e4511c 100644
--- a/rust/arrow/src/compute/util.rs
+++ b/rust/arrow/src/compute/util.rs
@@ -45,10 +45,10 @@ pub(super) fn combine_option_bitmap(
     match left {
         None => match right {
             None => Ok(None),
-            Some(r) => Ok(Some(r.bit_slice(right_offset_in_bits, len_in_bits))),
+            Some(r) => Ok(Some(r.bit_view(right_offset_in_bits, len_in_bits))),
         },
         Some(l) => match right {
-            None => Ok(Some(l.bit_slice(left_offset_in_bits, len_in_bits))),
+            None => Ok(Some(l.bit_view(left_offset_in_bits, len_in_bits))),
 
             Some(r) => Ok(Some(buffer_bin_and(
                 &l,
@@ -78,10 +78,10 @@ pub(super) fn compare_option_bitmap(
     match left {
         None => match right {
             None => Ok(None),
-            Some(r) => Ok(Some(r.bit_slice(right_offset_in_bits, len_in_bits))),
+            Some(r) => Ok(Some(r.bit_view(right_offset_in_bits, len_in_bits))),
         },
         Some(l) => match right {
-            None => Ok(Some(l.bit_slice(left_offset_in_bits, len_in_bits))),
+            None => Ok(Some(l.bit_view(left_offset_in_bits, len_in_bits))),
 
             Some(r) => Ok(Some(buffer_bin_or(
                 &l,
diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs
index d6dd7a544cd..68be32fea0e 100644
--- a/rust/arrow/src/datatypes.rs
+++ b/rust/arrow/src/datatypes.rs
@@ -40,7 +40,7 @@ use serde_json::{
 };
 
 use crate::error::{ArrowError, Result};
-use crate::util::bit_util;
+use crate::util::bit_ops::BufferBitSlice;
 
 /// The set of datatypes that are supported by this implementation of Apache Arrow.
 ///
@@ -386,7 +386,11 @@ impl ArrowPrimitiveType for BooleanType {
     /// The pointer must be part of a bit-packed boolean array, and the index must be less than the
     /// size of the array.
     unsafe fn index(raw_ptr: *const Self::Native, i: usize) -> Self::Native {
-        bit_util::get_bit_raw(raw_ptr as *const u8, i)
+        let data = std::slice::from_raw_parts(
+            raw_ptr as *const u8,
+            std::mem::size_of::<isize>(),
+        );
+        BufferBitSlice::new(data).get_bit(i)
     }
 }
 
diff --git a/rust/arrow/src/ipc/writer.rs b/rust/arrow/src/ipc/writer.rs
index cb861609f51..d3a17730a60 100644
--- a/rust/arrow/src/ipc/writer.rs
+++ b/rust/arrow/src/ipc/writer.rs
@@ -30,7 +30,7 @@ use crate::datatypes::*;
 use crate::error::{ArrowError, Result};
 use crate::ipc;
 use crate::record_batch::RecordBatch;
-use crate::util::bit_util;
+use crate::util::utils;
 
 use ipc::CONTINUATION_MARKER;
 
@@ -533,7 +533,7 @@ fn write_array_data(
         let null_buffer = match array_data.null_buffer() {
             None => {
                 // create a buffer and fill it with valid bits
-                let num_bytes = bit_util::ceil(num_rows, 8);
+                let num_bytes = utils::ceil(num_rows, 8);
                 let buffer = MutableBuffer::new(num_bytes);
                 let buffer = buffer.with_bitset(num_bytes, true);
                 buffer.freeze()
diff --git a/rust/arrow/src/util/bit_chunk_iterator.rs b/rust/arrow/src/util/bit_chunk_iterator.rs
deleted file mode 100644
index ec10c6f1d41..00000000000
--- a/rust/arrow/src/util/bit_chunk_iterator.rs
+++ /dev/null
@@ -1,257 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-use crate::buffer::Buffer;
-use crate::util::bit_util::ceil;
-use std::fmt::Debug;
-
-#[derive(Debug)]
-pub struct BitChunks<'a> {
-    buffer: &'a Buffer,
-    raw_data: *const u8,
-    /// offset inside a byte, guaranteed to be between 0 and 7 (inclusive)
-    bit_offset: usize,
-    /// number of complete u64 chunks
-    chunk_len: usize,
-    /// number of remaining bits, guaranteed to be between 0 and 63 (inclusive)
-    remainder_len: usize,
-}
-
-impl<'a> BitChunks<'a> {
-    pub fn new(buffer: &'a Buffer, offset: usize, len: usize) -> Self {
-        assert!(ceil(offset + len, 8) <= buffer.len() * 8);
-
-        let byte_offset = offset / 8;
-        let bit_offset = offset % 8;
-
-        let raw_data = unsafe { buffer.raw_data().add(byte_offset) };
-
-        let chunk_bits = 8 * std::mem::size_of::<u64>();
-
-        let chunk_len = len / chunk_bits;
-        let remainder_len = len & (chunk_bits - 1);
-
-        BitChunks::<'a> {
-            buffer: &buffer,
-            raw_data,
-            bit_offset,
-            chunk_len,
-            remainder_len,
-        }
-    }
-}
-
-#[derive(Debug)]
-pub struct BitChunkIterator<'a> {
-    buffer: &'a Buffer,
-    raw_data: *const u8,
-    bit_offset: usize,
-    chunk_len: usize,
-    index: usize,
-}
-
-impl<'a> BitChunks<'a> {
-    /// Returns the number of remaining bits, guaranteed to be between 0 and 63 (inclusive)
-    #[inline]
-    pub const fn remainder_len(&self) -> usize {
-        self.remainder_len
-    }
-
-    /// Returns the bitmask of remaining bits
-    #[inline]
-    pub fn remainder_bits(&self) -> u64 {
-        let bit_len = self.remainder_len;
-        if bit_len == 0 {
-            0
-        } else {
-            let bit_offset = self.bit_offset;
-            // number of bytes to read
-            // might be one more than sizeof(u64) if the offset is in the middle of a byte
-            let byte_len = ceil(bit_len + bit_offset, 8);
-            // pointer to remainder bytes after all complete chunks
-            let base = unsafe {
-                self.raw_data
-                    .add(self.chunk_len * std::mem::size_of::<u64>())
-            };
-
-            let mut bits = unsafe { std::ptr::read(base) } as u64 >> bit_offset;
-            for i in 1..byte_len {
-                let byte = unsafe { std::ptr::read(base.add(i)) };
-                bits |= (byte as u64) << (i * 8 - bit_offset);
-            }
-
-            bits & ((1 << bit_len) - 1)
-        }
-    }
-
-    /// Returns an iterator over chunks of 64 bits represented as an u64
-    #[inline]
-    pub const fn iter(&self) -> BitChunkIterator<'a> {
-        BitChunkIterator::<'a> {
-            buffer: self.buffer,
-            raw_data: self.raw_data,
-            bit_offset: self.bit_offset,
-            chunk_len: self.chunk_len,
-            index: 0,
-        }
-    }
-}
-
-impl<'a> IntoIterator for BitChunks<'a> {
-    type Item = u64;
-    type IntoIter = BitChunkIterator<'a>;
-
-    fn into_iter(self) -> Self::IntoIter {
-        self.iter()
-    }
-}
-
-impl Iterator for BitChunkIterator<'_> {
-    type Item = u64;
-
-    #[inline]
-    fn next(&mut self) -> Option<u64> {
-        let index = self.index;
-        if index >= self.chunk_len {
-            return None;
-        }
-
-        // cast to *const u64 should be fine since we are using read_unaligned below
-        #[allow(clippy::cast_ptr_alignment)]
-        let raw_data = self.raw_data as *const u64;
-
-        // bit-packed buffers are stored starting with the least-significant byte first
-        // so when reading as u64 on a big-endian machine, the bytes need to be swapped
-        let current = unsafe { std::ptr::read_unaligned(raw_data.add(index)).to_le() };
-
-        let combined = if self.bit_offset == 0 {
-            current
-        } else {
-            let next =
-                unsafe { std::ptr::read_unaligned(raw_data.add(index + 1)).to_le() };
-
-            current >> self.bit_offset
-                | (next & ((1 << self.bit_offset) - 1)) << (64 - self.bit_offset)
-        };
-
-        self.index = index + 1;
-
-        Some(combined)
-    }
-
-    #[inline]
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        (
-            self.chunk_len - self.index,
-            Some(self.chunk_len - self.index),
-        )
-    }
-}
-
-impl ExactSizeIterator for BitChunkIterator<'_> {
-    #[inline]
-    fn len(&self) -> usize {
-        self.chunk_len - self.index
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use crate::buffer::Buffer;
-
-    #[test]
-    fn test_iter_aligned() {
-        let input: &[u8] = &[0, 1, 2, 3, 4, 5, 6, 7];
-        let buffer: Buffer = Buffer::from(input);
-
-        let bitchunks = buffer.bit_chunks(0, 64);
-        let result = bitchunks.into_iter().collect::<Vec<_>>();
-
-        assert_eq!(vec![0x0706050403020100], result);
-    }
-
-    #[test]
-    fn test_iter_unaligned() {
-        let input: &[u8] = &[
-            0b00000000, 0b00000001, 0b00000010, 0b00000100, 0b00001000, 0b00010000,
-            0b00100000, 0b01000000, 0b11111111,
-        ];
-        let buffer: Buffer = Buffer::from(input);
-
-        let bitchunks = buffer.bit_chunks(4, 64);
-
-        assert_eq!(0, bitchunks.remainder_len());
-        assert_eq!(0, bitchunks.remainder_bits());
-
-        let result = bitchunks.into_iter().collect::<Vec<_>>();
-
-        assert_eq!(
-            vec![0b1111010000000010000000010000000010000000010000000010000000010000],
-            result
-        );
-    }
-
-    #[test]
-    fn test_iter_unaligned_remainder_1_byte() {
-        let input: &[u8] = &[
-            0b00000000, 0b00000001, 0b00000010, 0b00000100, 0b00001000, 0b00010000,
-            0b00100000, 0b01000000, 0b11111111,
-        ];
-        let buffer: Buffer = Buffer::from(input);
-
-        let bitchunks = buffer.bit_chunks(4, 66);
-
-        assert_eq!(2, bitchunks.remainder_len());
-        assert_eq!(0b00000011, bitchunks.remainder_bits());
-
-        let result = bitchunks.into_iter().collect::<Vec<_>>();
-
-        assert_eq!(
-            vec![0b1111010000000010000000010000000010000000010000000010000000010000],
-            result
-        );
-    }
-
-    #[test]
-    fn test_iter_unaligned_remainder_bits_across_bytes() {
-        let input: &[u8] = &[0b00111111, 0b11111100];
-        let buffer: Buffer = Buffer::from(input);
-
-        // remainder contains bits from both bytes
-        // result should be the highest 2 bits from first byte followed by lowest 5 bits of second bytes
-        let bitchunks = buffer.bit_chunks(6, 7);
-
-        assert_eq!(7, bitchunks.remainder_len());
-        assert_eq!(0b1110000, bitchunks.remainder_bits());
-    }
-
-    #[test]
-    fn test_iter_unaligned_remainder_bits_large() {
-        let input: &[u8] = &[
-            0b11111111, 0b00000000, 0b11111111, 0b00000000, 0b11111111, 0b00000000,
-            0b11111111, 0b00000000, 0b11111111,
-        ];
-        let buffer: Buffer = Buffer::from(input);
-
-        let bitchunks = buffer.bit_chunks(2, 63);
-
-        assert_eq!(63, bitchunks.remainder_len());
-        assert_eq!(
-            0b1000000_00111111_11000000_00111111_11000000_00111111_11000000_00111111,
-            bitchunks.remainder_bits()
-        );
-    }
-}
diff --git a/rust/arrow/src/util/bit_ops.rs b/rust/arrow/src/util/bit_ops.rs
new file mode 100644
index 00000000000..0dd4571d46e
--- /dev/null
+++ b/rust/arrow/src/util/bit_ops.rs
@@ -0,0 +1,407 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::buffer::Buffer;
+
+use bitvec::prelude::*;
+use bitvec::slice::ChunksExact;
+
+use std::fmt::Debug;
+
+///
+/// Immutable bit slice representation of buffer data
+#[derive(Debug)]
+pub struct BufferBitSlice<'a> {
+    buffer_data: &'a [u8],
+    bit_slice: &'a BitSlice<LocalBits, u8>,
+}
+
+impl<'a> BufferBitSlice<'a> {
+    ///
+    /// Creates a immutable bit slice over the given data
+    #[inline]
+    pub fn new(buffer_data: &'a [u8]) -> Self {
+        let bit_slice = BitSlice::<LocalBits, _>::from_slice(buffer_data).unwrap();
+
+        BufferBitSlice {
+            buffer_data,
+            bit_slice: &bit_slice,
+        }
+    }
+
+    ///
+    /// Returns immutable view with the given offset in bits and length in bits.
+    /// This view have zero-copy representation over the actual data.
+    #[inline(always)]
+    pub fn view(&self, offset_in_bits: usize, len_in_bits: usize) -> Self {
+        Self {
+            buffer_data: self.buffer_data,
+            bit_slice: &self.bit_slice[offset_in_bits..offset_in_bits + len_in_bits],
+        }
+    }
+
+    ///
+    /// Returns bit chunks in native 64-bit allocation size.
+    /// Native representations in Arrow follows 64-bit convention.
+    /// Chunks can still be reinterpreted in any primitive type lower than u64.
+    #[inline(always)]
+    pub fn chunks<T>(&self) -> BufferBitChunksExact<T>
+    where
+        T: BitMemory,
+    {
+        let offset_size_in_bits = 8 * std::mem::size_of::<T>();
+        let chunks_exact = self.bit_slice.chunks_exact(offset_size_in_bits);
+        let remainder_bits = chunks_exact.remainder();
+        let remainder: T = if remainder_bits.len() == 0 {
+            T::default()
+        } else {
+            remainder_bits.load::<T>()
+        };
+        BufferBitChunksExact {
+            chunks_exact,
+            remainder,
+            remainder_len_in_bits: remainder_bits.len(),
+        }
+    }
+
+    ///
+    /// Converts the bit view into the Buffer.
+    /// Buffer is always byte-aligned and well-aligned.
+    #[inline]
+    pub fn as_buffer(&self) -> Buffer {
+        Buffer::from(self.bit_slice.as_slice())
+    }
+
+    ///
+    /// Count ones in the given bit view
+    #[inline]
+    pub fn count_ones(&self) -> usize {
+        self.bit_slice.count_ones()
+    }
+
+    ///
+    /// Count zeros in the given bit view
+    #[inline]
+    pub fn count_zeros(&self) -> usize {
+        self.bit_slice.count_zeros()
+    }
+
+    ///
+    /// Get bit value at the given index in this bit view
+    #[inline]
+    pub fn get_bit(&self, index: usize) -> bool {
+        *unsafe { self.bit_slice.get_unchecked(index) }
+    }
+
+    ///
+    /// Get bits in this view as vector of booleans
+    #[inline]
+    pub fn typed_bits(&self) -> Vec<bool> {
+        self.bit_slice.iter().map(|e| *e).collect()
+    }
+
+    ///
+    /// Get manipulated data as byte slice
+    #[inline]
+    pub fn to_slice(&self) -> &[u8] {
+        self.bit_slice.as_slice()
+    }
+}
+
+impl<'a> PartialEq for BufferBitSlice<'a> {
+    fn eq(&self, other: &Self) -> bool {
+        self.bit_slice == other.bit_slice
+    }
+}
+
+///
+/// Conversion from mutable slice to immutable bit slice
+impl<'a> From<&'a [u8]> for BufferBitSlice<'a> {
+    fn from(data: &'a [u8]) -> Self {
+        BufferBitSlice::new(data)
+    }
+}
+
+///
+/// Mutable bit slice representation of buffer data
+#[derive(Debug)]
+pub struct BufferBitSliceMut<'a> {
+    bit_slice: &'a mut BitSlice<LocalBits, u8>,
+}
+
+impl<'a> BufferBitSliceMut<'a> {
+    ///
+    /// Creates a mutable bit slice over the given data
+    #[inline]
+    pub fn new(buffer_data: &'a mut [u8]) -> Self {
+        let bit_slice = BitSlice::<LocalBits, _>::from_slice_mut(buffer_data).unwrap();
+
+        BufferBitSliceMut { bit_slice }
+    }
+
+    ///
+    /// Returns mutable view with the given offset in bits and length in bits.
+    /// This view have zero-copy representation over the actual data.
+    #[inline(always)]
+    pub fn view(&'a mut self, offset_in_bits: usize, len_in_bits: usize) -> Self {
+        Self {
+            bit_slice: &mut self.bit_slice[offset_in_bits..offset_in_bits + len_in_bits],
+        }
+    }
+
+    ///
+    /// Set given bit at the position to a given value
+    #[inline]
+    pub fn set_bit_all(&mut self, value: bool) {
+        self.bit_slice.set_all(value)
+    }
+
+    ///
+    /// Count zeros in the given bit view
+    #[inline]
+    pub fn set_bit(&mut self, index: usize, value: bool) {
+        unsafe { self.bit_slice.set_unchecked(index, value) }
+    }
+
+    ///
+    /// Converts the bit view into the Buffer.
+    /// Buffer is always byte-aligned and well-aligned.
+    #[inline]
+    pub fn as_buffer(&self) -> Buffer {
+        Buffer::from(self.bit_slice.as_slice())
+    }
+
+    ///
+    /// Count ones in the given bit view
+    #[inline]
+    pub fn count_ones(&self) -> usize {
+        self.bit_slice.count_ones()
+    }
+
+    ///
+    /// Count zeros in the given bit view
+    #[inline]
+    pub fn count_zeros(&self) -> usize {
+        self.bit_slice.count_zeros()
+    }
+
+    ///
+    /// Get bit value at the given index in this bit view
+    #[inline]
+    pub fn get_bit(&self, index: usize) -> bool {
+        *unsafe { self.bit_slice.get_unchecked(index) }
+    }
+
+    ///
+    /// Get bits in this view as vector of booleans
+    #[inline]
+    pub fn typed_bits(&self) -> Vec<bool> {
+        self.bit_slice.iter().map(|e| *e).collect()
+    }
+
+    ///
+    /// Get manipulated data as byte slice
+    #[inline]
+    pub fn to_slice(&self) -> &[u8] {
+        self.bit_slice.as_slice()
+    }
+}
+
+impl<'a> PartialEq for BufferBitSliceMut<'a> {
+    fn eq(&self, other: &Self) -> bool {
+        self.bit_slice == other.bit_slice
+    }
+}
+
+///
+/// Conversion from mutable slice to mutable bit slice
+impl<'a> From<&'a mut [u8]> for BufferBitSliceMut<'a> {
+    fn from(data: &'a mut [u8]) -> Self {
+        BufferBitSliceMut::new(data)
+    }
+}
+
+///
+/// Exact chunk view over the bit slice
+#[derive(Clone, Debug)]
+pub struct BufferBitChunksExact<'a, T>
+where
+    T: BitMemory,
+{
+    chunks_exact: ChunksExact<'a, LocalBits, u8>,
+    remainder: T,
+    remainder_len_in_bits: usize,
+}
+
+impl<'a, T> BufferBitChunksExact<'a, T>
+where
+    T: BitMemory,
+{
+    ///
+    /// Returns remainder bit length from the exact chunk iterator
+    #[inline(always)]
+    pub fn remainder_bit_len(&self) -> usize {
+        self.remainder_len_in_bits
+    }
+
+    ///
+    /// Returns the remainder bits interpreted as given type.
+    #[inline(always)]
+    pub fn remainder_bits(&self) -> T {
+        self.remainder
+    }
+
+    ///
+    /// Interprets underlying chunk's view's bits as a given type.
+    #[inline(always)]
+    pub fn interpret(self) -> impl Iterator<Item = T> + 'a
+    where
+        T: BitMemory,
+    {
+        self.chunks_exact.map(|e| e.load::<T>())
+    }
+
+    ///
+    /// Returns underlying iterator as it is
+    #[inline(always)]
+    pub fn iter(&self) -> &ChunksExact<'a, LocalBits, u8> {
+        &self.chunks_exact
+    }
+}
+
+///
+/// Implements consuming iterator for exact chunk iterator
+impl<'a, T> IntoIterator for BufferBitChunksExact<'a, T>
+where
+    T: BitMemory,
+{
+    type Item = &'a BitSlice<LocalBits, u8>;
+    type IntoIter = ChunksExact<'a, LocalBits, u8>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.chunks_exact
+    }
+}
+
+#[cfg(all(test, target_endian = "little"))]
+mod tests_bit_slices_little_endian {
+    use super::*;
+    use crate::datatypes::ToByteSlice;
+
+    #[test]
+    fn test_bit_slice_iter_aligned() {
+        let input: &[u8] = &[0, 1, 2, 3, 4, 5, 6, 7];
+        let buffer: Buffer = Buffer::from(input);
+
+        let bit_slice = buffer.bit_slice();
+        let result = bit_slice.chunks().interpret().collect::<Vec<u64>>();
+
+        assert_eq!(vec![0x0706050403020100], result);
+    }
+
+    #[test]
+    fn test_bit_slice_iter_unaligned() {
+        let input: &[u8] = &[
+            0b00000000, 0b00000001, 0b00000010, 0b00000100, 0b00001000, 0b00010000,
+            0b00100000, 0b01000000, 0b11111111,
+        ];
+        let buffer: Buffer = Buffer::from(input);
+
+        let bit_slice = buffer.bit_slice().view(4, 64);
+        let chunks = bit_slice.chunks::<u64>();
+
+        assert_eq!(0, chunks.remainder_bit_len());
+        assert_eq!(0, chunks.remainder_bits());
+
+        let result = chunks.interpret().collect::<Vec<u64>>();
+
+        assert_eq!(
+            vec![0b1111_01000000_00100000_00010000_00001000_00000100_00000010_00000001_0000],
+            result
+        );
+    }
+
+    #[test]
+    fn test_bit_slice_iter_unaligned_remainder_1_byte() {
+        let input: &[u8] = &[
+            0b00000000, 0b00000001, 0b00000010, 0b00000100, 0b00001000, 0b00010000,
+            0b00100000, 0b01000000, 0b11111111,
+        ];
+        let buffer: Buffer = Buffer::from(input);
+
+        let bit_slice = buffer.bit_slice().view(4, 66);
+        let chunks = bit_slice.chunks::<u64>();
+
+        assert_eq!(2, chunks.remainder_bit_len());
+        assert_eq!(0b00000011, chunks.remainder_bits());
+
+        let result = chunks.interpret().collect::<Vec<u64>>();
+
+        assert_eq!(
+            vec![0b1111_01000000_00100000_00010000_00001000_00000100_00000010_00000001_0000],
+            result
+        );
+    }
+
+    #[test]
+    fn test_bit_slice_iter_unaligned_remainder_bits_across_bytes() {
+        let input: &[u8] = &[0b00111111, 0b11111100];
+        let buffer: Buffer = Buffer::from(input);
+
+        // remainder contains bits from both bytes
+        // result should be the highest 2 bits from first byte followed by lowest 5 bits of second bytes
+        let bit_slice = buffer.bit_slice().view(6, 7);
+        let chunks = bit_slice.chunks::<u64>();
+
+        assert_eq!(7, chunks.remainder_bit_len());
+        assert_eq!(0b1110000, chunks.remainder_bits());
+    }
+
+    #[test]
+    fn test_bit_slice_iter_unaligned_remainder_bits_large() {
+        let input: &[u8] = &[
+            0b11111111, 0b00000000, 0b11111111, 0b00000000, 0b11111111, 0b00000000,
+            0b11111111, 0b00000000, 0b11111111,
+        ];
+        let buffer: Buffer = Buffer::from(input);
+
+        let bit_slice = buffer.bit_slice().view(2, 63);
+        let chunks = bit_slice.chunks::<u64>();
+
+        assert_eq!(63, chunks.remainder_bit_len());
+        assert_eq!(
+            0b1000000_00111111_11000000_00111111_11000000_00111111_11000000_00111111,
+            chunks.remainder_bits()
+        );
+    }
+
+    #[test]
+    fn test_bit_slice_iter_reinterpret() {
+        assert_eq!(LocalBits::default(), Lsb0::default());
+        let buffer_slice = &[0, 1, 2, 3, 4, 5, 6, 7].to_byte_slice();
+        // Name of the bit slice comes from byte slice, since it is still on the stack and behaves similarly to Rust's byte slice.
+        let buffer = Buffer::from(buffer_slice);
+
+        // Let's get the whole buffer.
+        let bit_slice = buffer.bit_slice().view(0, buffer_slice.len() * 8);
+        // Let's also get a chunked bits as u8, not u64 this time...
+        let chunks = bit_slice.chunks::<u8>();
+
+        let result = chunks.interpret().collect::<Vec<_>>();
+        assert_eq!(buffer_slice.to_vec(), result);
+    }
+}
diff --git a/rust/arrow/src/util/bit_util.rs b/rust/arrow/src/util/bit_util.rs
deleted file mode 100644
index 269eceb8be2..00000000000
--- a/rust/arrow/src/util/bit_util.rs
+++ /dev/null
@@ -1,390 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! Utils for working with bits
-
-#[cfg(feature = "simd")]
-use packed_simd::u8x64;
-
-const BIT_MASK: [u8; 8] = [1, 2, 4, 8, 16, 32, 64, 128];
-
-const POPCOUNT_TABLE: [u8; 256] = [
-    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4,
-    3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5,
-    3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4,
-    3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4,
-    3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5,
-    3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4,
-    3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7,
-    6, 7, 7, 8,
-];
-
-/// Returns the nearest number that is `>=` than `num` and is a multiple of 64
-#[inline]
-pub fn round_upto_multiple_of_64(num: usize) -> usize {
-    round_upto_power_of_2(num, 64)
-}
-
-/// Returns the nearest multiple of `factor` that is `>=` than `num`. Here `factor` must
-/// be a power of 2.
-pub fn round_upto_power_of_2(num: usize, factor: usize) -> usize {
-    debug_assert!(factor > 0 && (factor & (factor - 1)) == 0);
-    (num + (factor - 1)) & !(factor - 1)
-}
-
-/// Returns whether bit at position `i` in `data` is set or not
-#[inline]
-pub fn get_bit(data: &[u8], i: usize) -> bool {
-    (data[i >> 3] & BIT_MASK[i & 7]) != 0
-}
-
-/// Returns whether bit at position `i` in `data` is set or not.
-///
-/// # Safety
-///
-/// Note this doesn't do any bound checking, for performance reason. The caller is
-/// responsible to guarantee that `i` is within bounds.
-#[inline]
-pub unsafe fn get_bit_raw(data: *const u8, i: usize) -> bool {
-    (*data.add(i >> 3) & BIT_MASK[i & 7]) != 0
-}
-
-/// Sets bit at position `i` for `data`
-#[inline]
-pub fn set_bit(data: &mut [u8], i: usize) {
-    data[i >> 3] |= BIT_MASK[i & 7];
-}
-
-/// Sets bit at position `i` for `data`
-///
-/// # Safety
-///
-/// Note this doesn't do any bound checking, for performance reason. The caller is
-/// responsible to guarantee that `i` is within bounds.
-#[inline]
-pub unsafe fn set_bit_raw(data: *mut u8, i: usize) {
-    *data.add(i >> 3) |= BIT_MASK[i & 7];
-}
-
-/// Sets bit at position `i` for `data` to 0
-#[inline]
-pub fn unset_bit(data: &mut [u8], i: usize) {
-    data[i >> 3] ^= BIT_MASK[i & 7];
-}
-
-/// Sets bit at position `i` for `data` to 0
-///
-/// # Safety
-///
-/// Note this doesn't do any bound checking, for performance reason. The caller is
-/// responsible to guarantee that `i` is within bounds.
-#[inline]
-pub unsafe fn unset_bit_raw(data: *mut u8, i: usize) {
-    *data.add(i >> 3) ^= BIT_MASK[i & 7];
-}
-
-/// Returns the number of 1-bits in `data`
-#[inline]
-pub fn count_set_bits(data: &[u8]) -> usize {
-    let mut count: usize = 0;
-    for u in data {
-        count += POPCOUNT_TABLE[*u as usize] as usize;
-    }
-    count
-}
-
-/// Returns the number of 1-bits in `data`, starting from `offset` with `length` bits
-/// inspected. Note that both `offset` and `length` are measured in bits.
-#[inline]
-pub fn count_set_bits_offset(data: &[u8], offset: usize, length: usize) -> usize {
-    let bit_end = offset + length;
-    assert!(bit_end <= (data.len() << 3));
-
-    let byte_start = std::cmp::min(round_upto_power_of_2(offset, 8), bit_end);
-    let num_bytes = (bit_end - byte_start) >> 3;
-
-    let mut result = 0;
-
-    for i in offset..byte_start {
-        if get_bit(data, i) {
-            result += 1;
-        }
-    }
-    for i in 0..num_bytes {
-        result += POPCOUNT_TABLE[data[(byte_start >> 3) + i] as usize] as usize;
-    }
-    for i in (byte_start + (num_bytes << 3))..bit_end {
-        if get_bit(data, i) {
-            result += 1;
-        }
-    }
-
-    result
-}
-
-/// Returns the ceil of `value`/`divisor`
-#[inline]
-pub fn ceil(value: usize, divisor: usize) -> usize {
-    let (quot, rem) = (value / divisor, value % divisor);
-    if rem > 0 && divisor > 0 {
-        quot + 1
-    } else {
-        quot
-    }
-}
-
-/// Performs SIMD bitwise binary operations.
-///
-/// # Safety
-///
-/// Note that each slice should be 64 bytes and it is the callers responsibility to ensure
-/// that this is the case.  If passed slices larger than 64 bytes the operation will only
-/// be performed on the first 64 bytes.  Slices less than 64 bytes will panic.
-#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "simd"))]
-pub unsafe fn bitwise_bin_op_simd<F>(left: &[u8], right: &[u8], result: &mut [u8], op: F)
-where
-    F: Fn(u8x64, u8x64) -> u8x64,
-{
-    let left_simd = u8x64::from_slice_unaligned_unchecked(left);
-    let right_simd = u8x64::from_slice_unaligned_unchecked(right);
-    let simd_result = op(left_simd, right_simd);
-    simd_result.write_to_slice_unaligned_unchecked(result);
-}
-
-#[cfg(test)]
-mod tests {
-    use std::collections::HashSet;
-
-    use super::*;
-    use crate::util::test_util::seedable_rng;
-    use rand::Rng;
-
-    #[test]
-    fn test_round_upto_multiple_of_64() {
-        assert_eq!(0, round_upto_multiple_of_64(0));
-        assert_eq!(64, round_upto_multiple_of_64(1));
-        assert_eq!(64, round_upto_multiple_of_64(63));
-        assert_eq!(64, round_upto_multiple_of_64(64));
-        assert_eq!(128, round_upto_multiple_of_64(65));
-        assert_eq!(192, round_upto_multiple_of_64(129));
-    }
-
-    #[test]
-    fn test_get_bit() {
-        // 00001101
-        assert_eq!(true, get_bit(&[0b00001101], 0));
-        assert_eq!(false, get_bit(&[0b00001101], 1));
-        assert_eq!(true, get_bit(&[0b00001101], 2));
-        assert_eq!(true, get_bit(&[0b00001101], 3));
-
-        // 01001001 01010010
-        assert_eq!(true, get_bit(&[0b01001001, 0b01010010], 0));
-        assert_eq!(false, get_bit(&[0b01001001, 0b01010010], 1));
-        assert_eq!(false, get_bit(&[0b01001001, 0b01010010], 2));
-        assert_eq!(true, get_bit(&[0b01001001, 0b01010010], 3));
-        assert_eq!(false, get_bit(&[0b01001001, 0b01010010], 4));
-        assert_eq!(false, get_bit(&[0b01001001, 0b01010010], 5));
-        assert_eq!(true, get_bit(&[0b01001001, 0b01010010], 6));
-        assert_eq!(false, get_bit(&[0b01001001, 0b01010010], 7));
-        assert_eq!(false, get_bit(&[0b01001001, 0b01010010], 8));
-        assert_eq!(true, get_bit(&[0b01001001, 0b01010010], 9));
-        assert_eq!(false, get_bit(&[0b01001001, 0b01010010], 10));
-        assert_eq!(false, get_bit(&[0b01001001, 0b01010010], 11));
-        assert_eq!(true, get_bit(&[0b01001001, 0b01010010], 12));
-        assert_eq!(false, get_bit(&[0b01001001, 0b01010010], 13));
-        assert_eq!(true, get_bit(&[0b01001001, 0b01010010], 14));
-        assert_eq!(false, get_bit(&[0b01001001, 0b01010010], 15));
-    }
-
-    #[test]
-    fn test_get_bit_raw() {
-        const NUM_BYTE: usize = 10;
-        let mut buf = vec![0; NUM_BYTE];
-        let mut expected = vec![];
-        let mut rng = seedable_rng();
-        for i in 0..8 * NUM_BYTE {
-            let b = rng.gen_bool(0.5);
-            expected.push(b);
-            if b {
-                set_bit(&mut buf[..], i)
-            }
-        }
-
-        let raw_ptr = buf.as_ptr();
-        for (i, b) in expected.iter().enumerate() {
-            unsafe {
-                assert_eq!(*b, get_bit_raw(raw_ptr, i));
-            }
-        }
-    }
-
-    #[test]
-    fn test_set_bit() {
-        let mut b = [0b00000000];
-        set_bit(&mut b, 0);
-        assert_eq!([0b00000001], b);
-        set_bit(&mut b, 2);
-        assert_eq!([0b00000101], b);
-        set_bit(&mut b, 5);
-        assert_eq!([0b00100101], b);
-    }
-
-    #[test]
-    fn test_unset_bit() {
-        let mut b = [0b11111111];
-        unset_bit(&mut b, 0);
-        assert_eq!([0b11111110], b);
-        unset_bit(&mut b, 2);
-        assert_eq!([0b11111010], b);
-        unset_bit(&mut b, 5);
-        assert_eq!([0b11011010], b);
-    }
-
-    #[test]
-    fn test_set_bit_raw() {
-        const NUM_BYTE: usize = 10;
-        let mut buf = vec![0; NUM_BYTE];
-        let mut expected = vec![];
-        let mut rng = seedable_rng();
-        for i in 0..8 * NUM_BYTE {
-            let b = rng.gen_bool(0.5);
-            expected.push(b);
-            if b {
-                unsafe {
-                    set_bit_raw(buf.as_mut_ptr(), i);
-                }
-            }
-        }
-
-        let raw_ptr = buf.as_ptr();
-        for (i, b) in expected.iter().enumerate() {
-            unsafe {
-                assert_eq!(*b, get_bit_raw(raw_ptr, i));
-            }
-        }
-    }
-
-    #[test]
-    fn test_unset_bit_raw() {
-        const NUM_BYTE: usize = 10;
-        let mut buf = vec![255; NUM_BYTE];
-        let mut expected = vec![];
-        let mut rng = seedable_rng();
-        for i in 0..8 * NUM_BYTE {
-            let b = rng.gen_bool(0.5);
-            expected.push(b);
-            if !b {
-                unsafe {
-                    unset_bit_raw(buf.as_mut_ptr(), i);
-                }
-            }
-        }
-
-        let raw_ptr = buf.as_ptr();
-        for (i, b) in expected.iter().enumerate() {
-            unsafe {
-                assert_eq!(*b, get_bit_raw(raw_ptr, i));
-            }
-        }
-    }
-
-    #[test]
-    fn test_get_set_bit_roundtrip() {
-        const NUM_BYTES: usize = 10;
-        const NUM_SETS: usize = 10;
-
-        let mut buffer: [u8; NUM_BYTES * 8] = [0; NUM_BYTES * 8];
-        let mut v = HashSet::new();
-        let mut rng = seedable_rng();
-        for _ in 0..NUM_SETS {
-            let offset = rng.gen_range(0, 8 * NUM_BYTES);
-            v.insert(offset);
-            set_bit(&mut buffer[..], offset);
-        }
-        for i in 0..NUM_BYTES * 8 {
-            assert_eq!(v.contains(&i), get_bit(&buffer[..], i));
-        }
-    }
-
-    #[test]
-    fn test_count_bits_slice() {
-        assert_eq!(0, count_set_bits(&[0b00000000]));
-        assert_eq!(8, count_set_bits(&[0b11111111]));
-        assert_eq!(3, count_set_bits(&[0b00001101]));
-        assert_eq!(6, count_set_bits(&[0b01001001, 0b01010010]));
-    }
-
-    #[test]
-    fn test_count_bits_offset_slice() {
-        assert_eq!(8, count_set_bits_offset(&[0b11111111], 0, 8));
-        assert_eq!(3, count_set_bits_offset(&[0b11111111], 0, 3));
-        assert_eq!(5, count_set_bits_offset(&[0b11111111], 3, 5));
-        assert_eq!(1, count_set_bits_offset(&[0b11111111], 3, 1));
-        assert_eq!(0, count_set_bits_offset(&[0b11111111], 8, 0));
-        assert_eq!(2, count_set_bits_offset(&[0b01010101], 0, 3));
-        assert_eq!(16, count_set_bits_offset(&[0b11111111, 0b11111111], 0, 16));
-        assert_eq!(10, count_set_bits_offset(&[0b11111111, 0b11111111], 0, 10));
-        assert_eq!(10, count_set_bits_offset(&[0b11111111, 0b11111111], 3, 10));
-        assert_eq!(8, count_set_bits_offset(&[0b11111111, 0b11111111], 8, 8));
-        assert_eq!(5, count_set_bits_offset(&[0b11111111, 0b11111111], 11, 5));
-        assert_eq!(0, count_set_bits_offset(&[0b11111111, 0b11111111], 16, 0));
-        assert_eq!(2, count_set_bits_offset(&[0b01101101, 0b10101010], 7, 5));
-        assert_eq!(4, count_set_bits_offset(&[0b01101101, 0b10101010], 7, 9));
-    }
-
-    #[test]
-    #[cfg(all(any(target_arch = "x86", target_arch = "x86_64")))]
-    fn test_ceil() {
-        assert_eq!(ceil(0, 1), 0);
-        assert_eq!(ceil(1, 1), 1);
-        assert_eq!(ceil(1, 2), 1);
-        assert_eq!(ceil(1, 8), 1);
-        assert_eq!(ceil(7, 8), 1);
-        assert_eq!(ceil(8, 8), 1);
-        assert_eq!(ceil(9, 8), 2);
-        assert_eq!(ceil(9, 9), 1);
-        assert_eq!(ceil(10000000000, 10), 1000000000);
-        assert_eq!(ceil(10, 10000000000), 1);
-        assert_eq!(ceil(10000000000, 1000000000), 10);
-    }
-
-    #[test]
-    #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "simd"))]
-    fn test_bitwise_and_simd() {
-        let buf1 = [0b00110011u8; 64];
-        let buf2 = [0b11110000u8; 64];
-        let mut buf3 = [0b00000000; 64];
-        unsafe { bitwise_bin_op_simd(&buf1, &buf2, &mut buf3, |a, b| a & b) };
-        for i in buf3.iter() {
-            assert_eq!(&0b00110000u8, i);
-        }
-    }
-
-    #[test]
-    #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "simd"))]
-    fn test_bitwise_or_simd() {
-        let buf1 = [0b00110011u8; 64];
-        let buf2 = [0b11110000u8; 64];
-        let mut buf3 = [0b00000000; 64];
-        unsafe { bitwise_bin_op_simd(&buf1, &buf2, &mut buf3, |a, b| a | b) };
-        for i in buf3.iter() {
-            assert_eq!(&0b11110011u8, i);
-        }
-    }
-}
diff --git a/rust/arrow/src/util/buffered_iterator.rs b/rust/arrow/src/util/buffered_iterator.rs
index 059b82424a8..5d42ee43e66 100644
--- a/rust/arrow/src/util/buffered_iterator.rs
+++ b/rust/arrow/src/util/buffered_iterator.rs
@@ -54,7 +54,7 @@ where
     /// Useful to extract the exact item where an error occurred
     #[inline]
     pub fn n(&self) -> usize {
-        return self.buffer.len();
+        self.buffer.len()
     }
 }
 
diff --git a/rust/arrow/src/util/mod.rs b/rust/arrow/src/util/mod.rs
index 053d1329631..ca090c0a40b 100644
--- a/rust/arrow/src/util/mod.rs
+++ b/rust/arrow/src/util/mod.rs
@@ -15,8 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-pub mod bit_chunk_iterator;
-pub mod bit_util;
+pub mod bit_ops;
 pub mod buffered_iterator;
 pub mod display;
 pub mod integration_util;
@@ -24,3 +23,4 @@ pub mod integration_util;
 pub mod pretty;
 pub mod string_writer;
 pub mod test_util;
+pub mod utils;
diff --git a/rust/arrow/src/util/utils.rs b/rust/arrow/src/util/utils.rs
new file mode 100644
index 00000000000..84435830896
--- /dev/null
+++ b/rust/arrow/src/util/utils.rs
@@ -0,0 +1,119 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Utils for working with bits
+
+#[cfg(feature = "simd")]
+use packed_simd::u8x64;
+
+/// Returns the nearest number that is `>=` than `num` and is a multiple of 64
+#[inline]
+pub fn round_upto_multiple_of_64(num: usize) -> usize {
+    round_upto_power_of_2(num, 64)
+}
+
+/// Returns the nearest multiple of `factor` that is `>=` than `num`. Here `factor` must
+/// be a power of 2.
+pub fn round_upto_power_of_2(num: usize, factor: usize) -> usize {
+    debug_assert!(factor > 0 && (factor & (factor - 1)) == 0);
+    (num + (factor - 1)) & !(factor - 1)
+}
+
+/// Returns the ceil of `value`/`divisor`
+#[inline]
+pub fn ceil(value: usize, divisor: usize) -> usize {
+    let (quot, rem) = (value / divisor, value % divisor);
+    if rem > 0 && divisor > 0 {
+        quot + 1
+    } else {
+        quot
+    }
+}
+
+/// Performs SIMD bitwise binary operations.
+///
+/// # Safety
+///
+/// Note that each slice should be 64 bytes and it is the callers responsibility to ensure
+/// that this is the case.  If passed slices larger than 64 bytes the operation will only
+/// be performed on the first 64 bytes.  Slices less than 64 bytes will panic.
+#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "simd"))]
+pub unsafe fn bitwise_bin_op_simd<F>(left: &[u8], right: &[u8], result: &mut [u8], op: F)
+where
+    F: Fn(u8x64, u8x64) -> u8x64,
+{
+    let left_simd = u8x64::from_slice_unaligned_unchecked(left);
+    let right_simd = u8x64::from_slice_unaligned_unchecked(right);
+    let simd_result = op(left_simd, right_simd);
+    simd_result.write_to_slice_unaligned_unchecked(result);
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+
+    #[test]
+    fn test_round_upto_multiple_of_64() {
+        assert_eq!(0, round_upto_multiple_of_64(0));
+        assert_eq!(64, round_upto_multiple_of_64(1));
+        assert_eq!(64, round_upto_multiple_of_64(63));
+        assert_eq!(64, round_upto_multiple_of_64(64));
+        assert_eq!(128, round_upto_multiple_of_64(65));
+        assert_eq!(192, round_upto_multiple_of_64(129));
+    }
+
+    #[test]
+    #[cfg(all(any(target_arch = "x86", target_arch = "x86_64")))]
+    fn test_ceil() {
+        assert_eq!(ceil(0, 1), 0);
+        assert_eq!(ceil(1, 1), 1);
+        assert_eq!(ceil(1, 2), 1);
+        assert_eq!(ceil(1, 8), 1);
+        assert_eq!(ceil(7, 8), 1);
+        assert_eq!(ceil(8, 8), 1);
+        assert_eq!(ceil(9, 8), 2);
+        assert_eq!(ceil(9, 9), 1);
+        assert_eq!(ceil(10000000000, 10), 1000000000);
+        assert_eq!(ceil(10, 10000000000), 1);
+        assert_eq!(ceil(10000000000, 1000000000), 10);
+    }
+
+    #[test]
+    #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "simd"))]
+    fn test_bitwise_and_simd() {
+        let buf1 = [0b00110011u8; 64];
+        let buf2 = [0b11110000u8; 64];
+        let mut buf3 = [0b00000000; 64];
+        unsafe { bitwise_bin_op_simd(&buf1, &buf2, &mut buf3, |a, b| a & b) };
+        for i in buf3.iter() {
+            assert_eq!(&0b00110000u8, i);
+        }
+    }
+
+    #[test]
+    #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "simd"))]
+    fn test_bitwise_or_simd() {
+        let buf1 = [0b00110011u8; 64];
+        let buf2 = [0b11110000u8; 64];
+        let mut buf3 = [0b00000000; 64];
+        unsafe { bitwise_bin_op_simd(&buf1, &buf2, &mut buf3, |a, b| a | b) };
+        for i in buf3.iter() {
+            assert_eq!(&0b11110011u8, i);
+        }
+    }
+}
diff --git a/rust/datafusion/tests/sql.rs b/rust/datafusion/tests/sql.rs
index 8dd7afa1d7e..c6ad1b381eb 100644
--- a/rust/datafusion/tests/sql.rs
+++ b/rust/datafusion/tests/sql.rs
@@ -349,7 +349,7 @@ async fn csv_query_avg_sqrt() -> Result<()> {
     let mut actual = execute(&mut ctx, sql).await;
     actual.sort();
     let expected = vec![vec!["0.6706002946036462"]];
-    assert_eq!(actual, expected);
+    assert_float_eq(&expected, &actual);
     Ok(())
 }
 
@@ -363,7 +363,7 @@ async fn csv_query_custom_udf_with_cast() -> Result<()> {
     let sql = "SELECT avg(custom_sqrt(c11)) FROM aggregate_test_100";
     let actual = execute(&mut ctx, sql).await;
     let expected = vec![vec!["0.6584408483418833"]];
-    assert_eq!(actual, expected);
+    assert_float_eq(&expected, &actual);
     Ok(())
 }
 
@@ -377,11 +377,11 @@ async fn sqrt_f32_vs_f64() -> Result<()> {
     let actual = execute(&mut ctx, sql).await;
     let expected = vec![vec!["0.6584408485889435"]];
 
-    assert_eq!(actual, expected);
+    assert_float_eq(&expected, &actual);
     let sql = "SELECT avg(sqrt(CAST(c11 AS double))) FROM aggregate_test_100";
     let actual = execute(&mut ctx, sql).await;
     let expected = vec![vec!["0.6584408483418833"]];
-    assert_eq!(actual, expected);
+    assert_float_eq(&expected, &actual);
     Ok(())
 }
 
@@ -405,7 +405,7 @@ async fn csv_query_sqrt_sqrt() -> Result<()> {
     let actual = execute(&mut ctx, sql).await;
     // sqrt(sqrt(c12=0.9294097332465232)) = 0.9818650561397431
     let expected = vec![vec!["0.9818650561397431"]];
-    assert_eq!(actual, expected);
+    assert_float_eq(&expected, &actual);
     Ok(())
 }
 
@@ -448,7 +448,7 @@ async fn csv_query_avg() -> Result<()> {
     let mut actual = execute(&mut ctx, sql).await;
     actual.sort();
     let expected = vec![vec!["0.5089725099127211"]];
-    assert_eq!(expected, actual);
+    assert_float_eq(&expected, &actual);
     Ok(())
 }
 
@@ -1399,3 +1399,25 @@ async fn query_on_string_dictionary() -> Result<()> {
 
     Ok(())
 }
+
+fn assert_float_eq<T>(expected: &Vec<Vec<T>>, received: &Vec<Vec<String>>)
+where
+    T: AsRef<str>,
+{
+    expected
+        .into_iter()
+        .flatten()
+        .zip(received.into_iter().flatten())
+        .for_each(|(l, r)| {
+            let (l, r) = (
+                l.as_ref().parse::<f64>().unwrap(),
+                r.as_str().parse::<f64>().unwrap(),
+            );
+            // Too small floats are hard to approximate, give them some more error range.
+            if l < 1.0 && r < 1.0 {
+                assert!((l - r).abs() <= 2.0_f64 * f64::EPSILON);
+            } else {
+                assert!((l - r).abs() <= f64::EPSILON);
+            }
+        });
+}
diff --git a/rust/integration-testing/src/bin/arrow-json-integration-test.rs b/rust/integration-testing/src/bin/arrow-json-integration-test.rs
index d4afd13528d..d0fc14794d2 100644
--- a/rust/integration-testing/src/bin/arrow-json-integration-test.rs
+++ b/rust/integration-testing/src/bin/arrow-json-integration-test.rs
@@ -35,7 +35,7 @@ use arrow::{
     buffer::Buffer,
     buffer::MutableBuffer,
     datatypes::ToByteSlice,
-    util::{bit_util, integration_util::*},
+    util::{integration_util::*, utils},
 };
 
 fn main() -> Result<()> {
@@ -588,7 +588,7 @@ fn dictionary_array_from_json(
 
 /// A helper to create a null buffer from a Vec<bool>
 fn create_null_buf(json_col: &ArrowJsonColumn) -> Buffer {
-    let num_bytes = bit_util::ceil(json_col.count, 8);
+    let num_bytes = utils::ceil(json_col.count, 8);
     let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, false);
     json_col
         .validity
@@ -597,9 +597,8 @@ fn create_null_buf(json_col: &ArrowJsonColumn) -> Buffer {
         .iter()
         .enumerate()
         .for_each(|(i, v)| {
-            let null_slice = null_buf.data_mut();
             if *v != 0 {
-                bit_util::set_bit(null_slice, i);
+                null_buf.set_bit(i);
             }
         });
     null_buf.freeze()
diff --git a/rust/parquet/src/arrow/array_reader.rs b/rust/parquet/src/arrow/array_reader.rs
index 00e7c74147f..7a9780595ae 100644
--- a/rust/parquet/src/arrow/array_reader.rs
+++ b/rust/parquet/src/arrow/array_reader.rs
@@ -53,7 +53,7 @@ use arrow::datatypes::{
     UInt16Type as ArrowUInt16Type, UInt32Type as ArrowUInt32Type,
     UInt64Type as ArrowUInt64Type, UInt8Type as ArrowUInt8Type,
 };
-use arrow::util::bit_util;
+use arrow::util::utils;
 
 use crate::arrow::converter::{
     BinaryArrayConverter, BinaryConverter, Converter, FixedLenBinaryConverter,
@@ -904,13 +904,12 @@ impl<OffsetSize: OffsetSizeTrait> ArrayReader for ListArrayReader<OffsetSize> {
         }
         offsets.push(cur_offset);
 
-        let num_bytes = bit_util::ceil(offsets.len(), 8);
+        let num_bytes = utils::ceil(offsets.len(), 8);
         let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, false);
-        let null_slice = null_buf.data_mut();
         let mut list_index = 0;
         for i in 0..rep_levels.len() {
             if rep_levels[i] == 0 && def_levels[i] != 0 {
-                bit_util::set_bit(null_slice, list_index);
+                null_buf.set_bit(list_index);
             }
             if rep_levels[i] == 0 {
                 list_index += 1;

From 4948aa0dbfe35ea2ba77a6bd8bf2674e694fcdde Mon Sep 17 00:00:00 2001
From: Mahmut Bulut <vertexclique@gmail.com>
Date: Sun, 15 Nov 2020 14:12:25 +0100
Subject: [PATCH 02/22] Apply checks

---
 rust/arrow/src/buffer.rs               | 88 ++++++++++++++++++++++----
 rust/arrow/src/compute/kernels/cast.rs |  6 +-
 rust/arrow/src/util/bit_ops.rs         |  6 +-
 3 files changed, 80 insertions(+), 20 deletions(-)

diff --git a/rust/arrow/src/buffer.rs b/rust/arrow/src/buffer.rs
index c9673cbf158..6fe2b4a51fd 100644
--- a/rust/arrow/src/buffer.rs
+++ b/rust/arrow/src/buffer.rs
@@ -1236,43 +1236,103 @@ mod tests {
 
     #[test]
     fn test_count_bits_offset_slice() {
-        assert_eq!(8, Buffer::from(&[0b11111111]).bit_slice().view(0, 8).count_ones());
-        assert_eq!(3, Buffer::from(&[0b11111111]).bit_slice().view(0, 3).count_ones());
-        assert_eq!(5, Buffer::from(&[0b11111111]).bit_slice().view(3, 5).count_ones());
-        assert_eq!(1, Buffer::from(&[0b11111111]).bit_slice().view(3, 1).count_ones());
-        assert_eq!(0, Buffer::from(&[0b11111111]).bit_slice().view(8, 0).count_ones());
-        assert_eq!(2, Buffer::from(&[0b01010101]).bit_slice().view(0, 3).count_ones());
+        assert_eq!(
+            8,
+            Buffer::from(&[0b11111111])
+                .bit_slice()
+                .view(0, 8)
+                .count_ones()
+        );
+        assert_eq!(
+            3,
+            Buffer::from(&[0b11111111])
+                .bit_slice()
+                .view(0, 3)
+                .count_ones()
+        );
+        assert_eq!(
+            5,
+            Buffer::from(&[0b11111111])
+                .bit_slice()
+                .view(3, 5)
+                .count_ones()
+        );
+        assert_eq!(
+            1,
+            Buffer::from(&[0b11111111])
+                .bit_slice()
+                .view(3, 1)
+                .count_ones()
+        );
+        assert_eq!(
+            0,
+            Buffer::from(&[0b11111111])
+                .bit_slice()
+                .view(8, 0)
+                .count_ones()
+        );
+        assert_eq!(
+            2,
+            Buffer::from(&[0b01010101])
+                .bit_slice()
+                .view(0, 3)
+                .count_ones()
+        );
         assert_eq!(
             16,
-            Buffer::from(&[0b11111111, 0b11111111]).bit_slice().view(0, 16).count_ones()
+            Buffer::from(&[0b11111111, 0b11111111])
+                .bit_slice()
+                .view(0, 16)
+                .count_ones()
         );
         assert_eq!(
             10,
-            Buffer::from(&[0b11111111, 0b11111111]).bit_slice().view(0, 10).count_ones()
+            Buffer::from(&[0b11111111, 0b11111111])
+                .bit_slice()
+                .view(0, 10)
+                .count_ones()
         );
         assert_eq!(
             10,
-            Buffer::from(&[0b11111111, 0b11111111]).bit_slice().view(3, 10).count_ones()
+            Buffer::from(&[0b11111111, 0b11111111])
+                .bit_slice()
+                .view(3, 10)
+                .count_ones()
         );
         assert_eq!(
             8,
-            Buffer::from(&[0b11111111, 0b11111111]).bit_slice().view(8, 8).count_ones()
+            Buffer::from(&[0b11111111, 0b11111111])
+                .bit_slice()
+                .view(8, 8)
+                .count_ones()
         );
         assert_eq!(
             5,
-            Buffer::from(&[0b11111111, 0b11111111]).bit_slice().view(11, 5).count_ones()
+            Buffer::from(&[0b11111111, 0b11111111])
+                .bit_slice()
+                .view(11, 5)
+                .count_ones()
         );
         assert_eq!(
             0,
-            Buffer::from(&[0b11111111, 0b11111111]).bit_slice().view(16, 0).count_ones()
+            Buffer::from(&[0b11111111, 0b11111111])
+                .bit_slice()
+                .view(16, 0)
+                .count_ones()
         );
         assert_eq!(
             2,
-            Buffer::from(&[0b01101101, 0b10101010]).bit_slice().view(7, 5).count_ones()
+            Buffer::from(&[0b01101101, 0b10101010])
+                .bit_slice()
+                .view(7, 5)
+                .count_ones()
         );
         assert_eq!(
             4,
-            Buffer::from(&[0b01101101, 0b10101010]).bit_slice().view(7, 9).count_ones()
+            Buffer::from(&[0b01101101, 0b10101010])
+                .bit_slice()
+                .view(7, 9)
+                .count_ones()
         );
     }
 }
diff --git a/rust/arrow/src/compute/kernels/cast.rs b/rust/arrow/src/compute/kernels/cast.rs
index eb1b07d817a..ef79302927f 100644
--- a/rust/arrow/src/compute/kernels/cast.rs
+++ b/rust/arrow/src/compute/kernels/cast.rs
@@ -2740,14 +2740,14 @@ mod tests {
                     (Ok(_), false) => {
                         panic!("Was able to cast array from {:?} to {:?} but can_cast_types reported false",
                                array.data_type(), to_type)
-                    },
+                    }
                     (Err(e), true) => {
                         panic!("Was not able to cast array from {:?} to {:?} but can_cast_types reported true. \
                                 Error was {:?}",
                                array.data_type(), to_type, e)
-                    },
+                    }
                     // otherwise it was a match
-                    _=> {},
+                    _ => {}
                 };
             }
         }
diff --git a/rust/arrow/src/util/bit_ops.rs b/rust/arrow/src/util/bit_ops.rs
index 0dd4571d46e..b9654e4d966 100644
--- a/rust/arrow/src/util/bit_ops.rs
+++ b/rust/arrow/src/util/bit_ops.rs
@@ -46,7 +46,7 @@ impl<'a> BufferBitSlice<'a> {
     ///
     /// Returns immutable view with the given offset in bits and length in bits.
     /// This view have zero-copy representation over the actual data.
-    #[inline(always)]
+    #[inline]
     pub fn view(&self, offset_in_bits: usize, len_in_bits: usize) -> Self {
         Self {
             buffer_data: self.buffer_data,
@@ -58,7 +58,7 @@ impl<'a> BufferBitSlice<'a> {
     /// Returns bit chunks in native 64-bit allocation size.
     /// Native representations in Arrow follows 64-bit convention.
     /// Chunks can still be reinterpreted in any primitive type lower than u64.
-    #[inline(always)]
+    #[inline]
     pub fn chunks<T>(&self) -> BufferBitChunksExact<T>
     where
         T: BitMemory,
@@ -156,7 +156,7 @@ impl<'a> BufferBitSliceMut<'a> {
     ///
     /// Returns mutable view with the given offset in bits and length in bits.
     /// This view have zero-copy representation over the actual data.
-    #[inline(always)]
+    #[inline]
     pub fn view(&'a mut self, offset_in_bits: usize, len_in_bits: usize) -> Self {
         Self {
             bit_slice: &mut self.bit_slice[offset_in_bits..offset_in_bits + len_in_bits],

From 2aef46fd5632a52f49468676b5eace755b64c3e2 Mon Sep 17 00:00:00 2001
From: Mahmut Bulut <vertexclique@gmail.com>
Date: Wed, 18 Nov 2020 11:32:37 +0100
Subject: [PATCH 03/22] Update rust/arrow/src/util/bit_ops.rs

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
---
 rust/arrow/src/util/bit_ops.rs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/rust/arrow/src/util/bit_ops.rs b/rust/arrow/src/util/bit_ops.rs
index b9654e4d966..72de1f61b27 100644
--- a/rust/arrow/src/util/bit_ops.rs
+++ b/rust/arrow/src/util/bit_ops.rs
@@ -23,7 +23,11 @@ use bitvec::slice::ChunksExact;
 use std::fmt::Debug;
 
 ///
-/// Immutable bit slice representation of buffer data
+/// Immutable bit slice view of `Buffer` data. 
+///
+/// `BufferBitSlice` does not own any underlying data, but rather wraps references 
+/// to the underlying data in a `Buffer` and has methods for addressing and interacting with 
+/// individual bits
 #[derive(Debug)]
 pub struct BufferBitSlice<'a> {
     buffer_data: &'a [u8],

From 57b9a3896e5e8343eca30e3934fabe07cfb72700 Mon Sep 17 00:00:00 2001
From: Mahmut Bulut <vertexclique@gmail.com>
Date: Wed, 18 Nov 2020 11:32:55 +0100
Subject: [PATCH 04/22] Update rust/arrow/src/util/bit_ops.rs

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
---
 rust/arrow/src/util/bit_ops.rs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/rust/arrow/src/util/bit_ops.rs b/rust/arrow/src/util/bit_ops.rs
index 72de1f61b27..d61116dc611 100644
--- a/rust/arrow/src/util/bit_ops.rs
+++ b/rust/arrow/src/util/bit_ops.rs
@@ -141,7 +141,10 @@ impl<'a> From<&'a [u8]> for BufferBitSlice<'a> {
 }
 
 ///
-/// Mutable bit slice representation of buffer data
+/// Mutable bit slice view of buffer data
+///
+/// `BufferBitSliceMut` does not own any underlying data, but rather 
+/// has methods for addressing and interacting with individual bits.
 #[derive(Debug)]
 pub struct BufferBitSliceMut<'a> {
     bit_slice: &'a mut BitSlice<LocalBits, u8>,

From ce647aa76a19dc1101eae92a04e8634ada627a64 Mon Sep 17 00:00:00 2001
From: Mahmut Bulut <vertexclique@gmail.com>
Date: Wed, 18 Nov 2020 11:33:58 +0100
Subject: [PATCH 05/22] Update rust/arrow/src/util/bit_ops.rs

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
---
 rust/arrow/src/util/bit_ops.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rust/arrow/src/util/bit_ops.rs b/rust/arrow/src/util/bit_ops.rs
index d61116dc611..5c8f39f8cf0 100644
--- a/rust/arrow/src/util/bit_ops.rs
+++ b/rust/arrow/src/util/bit_ops.rs
@@ -178,7 +178,7 @@ impl<'a> BufferBitSliceMut<'a> {
     }
 
     ///
-    /// Count zeros in the given bit view
+    /// Set given bit at the position to a given value
     #[inline]
     pub fn set_bit(&mut self, index: usize, value: bool) {
         unsafe { self.bit_slice.set_unchecked(index, value) }

From d05cd7d9ec0ec6904e21c849b72087d1aba3d16a Mon Sep 17 00:00:00 2001
From: Mahmut Bulut <vertexclique@gmail.com>
Date: Wed, 18 Nov 2020 11:34:09 +0100
Subject: [PATCH 06/22] Update rust/arrow/src/util/bit_ops.rs

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
---
 rust/arrow/src/util/bit_ops.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rust/arrow/src/util/bit_ops.rs b/rust/arrow/src/util/bit_ops.rs
index 5c8f39f8cf0..839943d25f9 100644
--- a/rust/arrow/src/util/bit_ops.rs
+++ b/rust/arrow/src/util/bit_ops.rs
@@ -171,7 +171,7 @@ impl<'a> BufferBitSliceMut<'a> {
     }
 
     ///
-    /// Set given bit at the position to a given value
+    /// Sets all bits in this slice to the given value
     #[inline]
     pub fn set_bit_all(&mut self, value: bool) {
         self.bit_slice.set_all(value)

From 4c8078462bffa3163de18e0078df9ae95fdaddaa Mon Sep 17 00:00:00 2001
From: Mahmut Bulut <vertexclique@gmail.com>
Date: Wed, 18 Nov 2020 11:40:17 +0100
Subject: [PATCH 07/22] Update rust/arrow/src/util/bit_ops.rs

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
---
 rust/arrow/src/util/bit_ops.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rust/arrow/src/util/bit_ops.rs b/rust/arrow/src/util/bit_ops.rs
index 839943d25f9..6a118a59227 100644
--- a/rust/arrow/src/util/bit_ops.rs
+++ b/rust/arrow/src/util/bit_ops.rs
@@ -83,7 +83,7 @@ impl<'a> BufferBitSlice<'a> {
     }
 
     ///
-    /// Converts the bit view into the Buffer.
+    /// Converts the bit view into a Buffer.
     /// Buffer is always byte-aligned and well-aligned.
     #[inline]
     pub fn as_buffer(&self) -> Buffer {

From d4768835e6ef19b5b0764861ba24977c4df73bd2 Mon Sep 17 00:00:00 2001
From: Mahmut Bulut <vertexclique@gmail.com>
Date: Wed, 18 Nov 2020 11:40:43 +0100
Subject: [PATCH 08/22] Update rust/arrow/src/util/bit_ops.rs

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
---
 rust/arrow/src/util/bit_ops.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rust/arrow/src/util/bit_ops.rs b/rust/arrow/src/util/bit_ops.rs
index 6a118a59227..f87b4476618 100644
--- a/rust/arrow/src/util/bit_ops.rs
+++ b/rust/arrow/src/util/bit_ops.rs
@@ -185,7 +185,7 @@ impl<'a> BufferBitSliceMut<'a> {
     }
 
     ///
-    /// Converts the bit view into the Buffer.
+    /// Converts the bit view into a Buffer.
     /// Buffer is always byte-aligned and well-aligned.
     #[inline]
     pub fn as_buffer(&self) -> Buffer {

From babcbdb4b08f078a994c814b6710c388d1468393 Mon Sep 17 00:00:00 2001
From: Mahmut Bulut <vertexclique@gmail.com>
Date: Wed, 18 Nov 2020 11:41:15 +0100
Subject: [PATCH 09/22] Update rust/arrow/src/util/bit_ops.rs

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
---
 rust/arrow/src/util/bit_ops.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/rust/arrow/src/util/bit_ops.rs b/rust/arrow/src/util/bit_ops.rs
index f87b4476618..466f6ef8291 100644
--- a/rust/arrow/src/util/bit_ops.rs
+++ b/rust/arrow/src/util/bit_ops.rs
@@ -244,6 +244,9 @@ impl<'a> From<&'a mut [u8]> for BufferBitSliceMut<'a> {
 
 ///
 /// Exact chunk view over the bit slice
+///
+/// The view is represented as some number of aligned T-sized chunks, 
+/// followed by some number of remainder bits
 #[derive(Clone, Debug)]
 pub struct BufferBitChunksExact<'a, T>
 where

From 4d8af274fcbd505b1681c1c7638b33ca0261c088 Mon Sep 17 00:00:00 2001
From: Mahmut Bulut <vertexclique@gmail.com>
Date: Wed, 18 Nov 2020 15:14:33 +0100
Subject: [PATCH 10/22] Add doctest for bit slice reinterpretation

---
 rust/arrow/src/array/array_union.rs          |  2 +-
 rust/arrow/src/array/data.rs                 |  2 +-
 rust/arrow/src/array/equal/utils.rs          |  2 +-
 rust/arrow/src/buffer.rs                     | 40 +++++++++---------
 rust/arrow/src/compute/kernels/aggregate.rs  |  4 +-
 rust/arrow/src/compute/kernels/comparison.rs |  2 -
 rust/arrow/src/compute/kernels/filter.rs     |  2 +-
 rust/arrow/src/util/bit_ops.rs               | 43 +++++++++++++++-----
 8 files changed, 58 insertions(+), 39 deletions(-)

diff --git a/rust/arrow/src/array/array_union.rs b/rust/arrow/src/array/array_union.rs
index 3a271d2e4db..926e9aeb91b 100644
--- a/rust/arrow/src/array/array_union.rs
+++ b/rust/arrow/src/array/array_union.rs
@@ -233,7 +233,7 @@ impl UnionArray {
             // In format v4 unions had their own validity bitmap and offsets are compressed by omitting null values
             // Starting with v5 unions don't have a validity bitmap and it's possible to directly index into the offsets buffer
             let valid_slots = match self.data.null_buffer() {
-                Some(b) => b.bit_slice().view(0, index).count_ones(),
+                Some(b) => b.bit_slice().slicing(0, index).count_ones(),
                 None => index,
             };
             self.data().buffers()[1].data()[valid_slots * size_of::<i32>()] as i32
diff --git a/rust/arrow/src/array/data.rs b/rust/arrow/src/array/data.rs
index 9deba00e0fd..a6d6e54cb75 100644
--- a/rust/arrow/src/array/data.rs
+++ b/rust/arrow/src/array/data.rs
@@ -30,7 +30,7 @@ use super::equal::equal;
 #[inline]
 fn count_nulls(null_bit_buffer: Option<&Buffer>, offset: usize, len: usize) -> usize {
     if let Some(ref buf) = null_bit_buffer {
-        let ones = buf.bit_slice().view(offset, len).count_ones();
+        let ones = buf.bit_slice().slicing(offset, len).count_ones();
         len.checked_sub(ones).unwrap()
     } else {
         0
diff --git a/rust/arrow/src/array/equal/utils.rs b/rust/arrow/src/array/equal/utils.rs
index 343c86c3d88..36cdb44e5fd 100644
--- a/rust/arrow/src/array/equal/utils.rs
+++ b/rust/arrow/src/array/equal/utils.rs
@@ -32,7 +32,7 @@ pub(super) fn equal_bits(
         BufferBitSlice::new(lhs_values),
         BufferBitSlice::new(rhs_values),
     );
-    lhs.view(lhs_start, len) == rhs.view(rhs_start, len)
+    lhs.slicing(lhs_start, len) == rhs.slicing(rhs_start, len)
 }
 
 #[inline]
diff --git a/rust/arrow/src/buffer.rs b/rust/arrow/src/buffer.rs
index 35c9f62fd54..a6a3b7bf86e 100644
--- a/rust/arrow/src/buffer.rs
+++ b/rust/arrow/src/buffer.rs
@@ -264,7 +264,7 @@ impl Buffer {
             self.slice(offset_in_bits / 8)
         } else {
             self.bit_slice()
-                .view(offset_in_bits, len_in_bits)
+                .slicing(offset_in_bits, len_in_bits)
                 .as_buffer()
         }
     }
@@ -434,10 +434,10 @@ where
     let mut result =
         MutableBuffer::new(ceil(len_in_bits, 8)).with_bitset(len_in_bits / 64 * 8, false);
 
-    let left_slice = left.bit_slice().view(left_offset_in_bits, len_in_bits);
+    let left_slice = left.bit_slice().slicing(left_offset_in_bits, len_in_bits);
     let left_chunks = left_slice.chunks::<u64>();
 
-    let right_slice = right.bit_slice().view(right_offset_in_bits, len_in_bits);
+    let right_slice = right.bit_slice().slicing(right_offset_in_bits, len_in_bits);
     let right_chunks = right_slice.chunks::<u64>();
 
     let remainder_bytes = ceil(left_chunks.remainder_bit_len(), 8);
@@ -475,7 +475,7 @@ where
     let mut result =
         MutableBuffer::new(ceil(len_in_bits, 8)).with_bitset(len_in_bits / 64 * 8, false);
 
-    let left_slice = left.bit_slice().view(offset_in_bits, len_in_bits);
+    let left_slice = left.bit_slice().slicing(offset_in_bits, len_in_bits);
     let left_chunks = left_slice.chunks::<u64>();
 
     let remainder_bytes = ceil(left_chunks.remainder_bit_len(), 8);
@@ -757,9 +757,9 @@ impl MutableBuffer {
     /// This is useful when one wants to clear (or set) the bits and then manipulate
     /// the buffer directly (e.g., modifying the buffer by holding a mutable reference
     /// from `data_mut()`).
-    pub fn with_bitset(mut self, end: usize, val: bool) -> Self {
+    pub fn with_bitset(mut self, end: usize, initial_value: bool) -> Self {
         assert!(end <= self.capacity);
-        let v = if val { 255 } else { 0 };
+        let v = if initial_value { 0xFF_u8 } else { 0x00_u8 };
         unsafe {
             std::ptr::write_bytes(self.data, v, end);
             self.len = end;
@@ -1328,98 +1328,98 @@ mod tests {
             8,
             Buffer::from(&[0b11111111])
                 .bit_slice()
-                .view(0, 8)
+                .slicing(0, 8)
                 .count_ones()
         );
         assert_eq!(
             3,
             Buffer::from(&[0b11111111])
                 .bit_slice()
-                .view(0, 3)
+                .slicing(0, 3)
                 .count_ones()
         );
         assert_eq!(
             5,
             Buffer::from(&[0b11111111])
                 .bit_slice()
-                .view(3, 5)
+                .slicing(3, 5)
                 .count_ones()
         );
         assert_eq!(
             1,
             Buffer::from(&[0b11111111])
                 .bit_slice()
-                .view(3, 1)
+                .slicing(3, 1)
                 .count_ones()
         );
         assert_eq!(
             0,
             Buffer::from(&[0b11111111])
                 .bit_slice()
-                .view(8, 0)
+                .slicing(8, 0)
                 .count_ones()
         );
         assert_eq!(
             2,
             Buffer::from(&[0b01010101])
                 .bit_slice()
-                .view(0, 3)
+                .slicing(0, 3)
                 .count_ones()
         );
         assert_eq!(
             16,
             Buffer::from(&[0b11111111, 0b11111111])
                 .bit_slice()
-                .view(0, 16)
+                .slicing(0, 16)
                 .count_ones()
         );
         assert_eq!(
             10,
             Buffer::from(&[0b11111111, 0b11111111])
                 .bit_slice()
-                .view(0, 10)
+                .slicing(0, 10)
                 .count_ones()
         );
         assert_eq!(
             10,
             Buffer::from(&[0b11111111, 0b11111111])
                 .bit_slice()
-                .view(3, 10)
+                .slicing(3, 10)
                 .count_ones()
         );
         assert_eq!(
             8,
             Buffer::from(&[0b11111111, 0b11111111])
                 .bit_slice()
-                .view(8, 8)
+                .slicing(8, 8)
                 .count_ones()
         );
         assert_eq!(
             5,
             Buffer::from(&[0b11111111, 0b11111111])
                 .bit_slice()
-                .view(11, 5)
+                .slicing(11, 5)
                 .count_ones()
         );
         assert_eq!(
             0,
             Buffer::from(&[0b11111111, 0b11111111])
                 .bit_slice()
-                .view(16, 0)
+                .slicing(16, 0)
                 .count_ones()
         );
         assert_eq!(
             2,
             Buffer::from(&[0b01101101, 0b10101010])
                 .bit_slice()
-                .view(7, 5)
+                .slicing(7, 5)
                 .count_ones()
         );
         assert_eq!(
             4,
             Buffer::from(&[0b01101101, 0b10101010])
                 .bit_slice()
-                .view(7, 9)
+                .slicing(7, 9)
                 .count_ones()
         );
     }
diff --git a/rust/arrow/src/compute/kernels/aggregate.rs b/rust/arrow/src/compute/kernels/aggregate.rs
index 4a53a7651f4..bfb47da43bf 100644
--- a/rust/arrow/src/compute/kernels/aggregate.rs
+++ b/rust/arrow/src/compute/kernels/aggregate.rs
@@ -152,7 +152,7 @@ where
             let data_chunks = data.chunks_exact(64);
             let remainder = data_chunks.remainder();
 
-            let buffer_slice = buffer.bit_slice().view(array.offset(), array.len());
+            let buffer_slice = buffer.bit_slice().slicing(array.offset(), array.len());
             let buffer_chunks = buffer_slice.chunks::<u64>();
 
             let buffer_remainder_bits: u64 = buffer_chunks.remainder_bits();
@@ -220,7 +220,7 @@ where
             let data_chunks = data.chunks_exact(64);
             let remainder = data_chunks.remainder();
 
-            let bit_slice = buffer.bit_slice().view(array.offset(), array.len());
+            let bit_slice = buffer.bit_slice().slicing(array.offset(), array.len());
             let bit_chunks = bit_slice.chunks::<u64>();
             let remainder_bits = bit_chunks.remainder_bits();
 
diff --git a/rust/arrow/src/compute/kernels/comparison.rs b/rust/arrow/src/compute/kernels/comparison.rs
index 00d0f0bc326..5e2689293b2 100644
--- a/rust/arrow/src/compute/kernels/comparison.rs
+++ b/rust/arrow/src/compute/kernels/comparison.rs
@@ -633,8 +633,6 @@ where
             Some(buff) => buff,
             None => new_all_set_buffer(num_bytes),
         };
-    let _not_both_null_bitmap = not_both_null_bit_buffer.data();
-
     let mut bool_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, false);
 
     // if both array slots are valid, check if list contains primitive
diff --git a/rust/arrow/src/compute/kernels/filter.rs b/rust/arrow/src/compute/kernels/filter.rs
index a171c1fcebe..623c748a322 100644
--- a/rust/arrow/src/compute/kernels/filter.rs
+++ b/rust/arrow/src/compute/kernels/filter.rs
@@ -322,7 +322,7 @@ impl FilterContext {
         let filter_mask: Vec<u64> = (0..64).map(|x| 1u64 << x).collect();
         let filter_bytes = filter_array.data_ref().buffers()[0].data();
         let filtered_count = BufferBitSlice::new(filter_bytes)
-            .view(0, filter_array.len())
+            .slicing(0, filter_array.len())
             .count_ones();
 
         // transmute filter_bytes to &[u64]
diff --git a/rust/arrow/src/util/bit_ops.rs b/rust/arrow/src/util/bit_ops.rs
index 466f6ef8291..c823e1edb6f 100644
--- a/rust/arrow/src/util/bit_ops.rs
+++ b/rust/arrow/src/util/bit_ops.rs
@@ -30,7 +30,6 @@ use std::fmt::Debug;
 /// individual bits
 #[derive(Debug)]
 pub struct BufferBitSlice<'a> {
-    buffer_data: &'a [u8],
     bit_slice: &'a BitSlice<LocalBits, u8>,
 }
 
@@ -42,7 +41,6 @@ impl<'a> BufferBitSlice<'a> {
         let bit_slice = BitSlice::<LocalBits, _>::from_slice(buffer_data).unwrap();
 
         BufferBitSlice {
-            buffer_data,
             bit_slice: &bit_slice,
         }
     }
@@ -51,15 +49,38 @@ impl<'a> BufferBitSlice<'a> {
     /// Returns immutable view with the given offset in bits and length in bits.
     /// This view have zero-copy representation over the actual data.
     #[inline]
-    pub fn view(&self, offset_in_bits: usize, len_in_bits: usize) -> Self {
+    pub fn slicing(&self, offset_in_bits: usize, len_in_bits: usize) -> Self {
         Self {
-            buffer_data: self.buffer_data,
             bit_slice: &self.bit_slice[offset_in_bits..offset_in_bits + len_in_bits],
         }
     }
 
     ///
-    /// Returns bit chunks in native 64-bit allocation size.
+    /// Returns bit chunks in given byte width.
+    /// This can be u64(native Arrow byte representation size) or any other unsigned primitive like:
+    /// u8, u16, u32, u128 and usize.
+    ///
+    /// This method is generic over the given primitives to enable user to filter out
+    /// any upper/lower nibble/s which is not used like:
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # use arrow::buffer::Buffer;
+    /// let input: &[u8] = &[
+    ///     0b11111111, 0b00000000, 0b11111111, 0b00000000,
+    ///     0b11111111, 0b00000000, 0b11111111, 0b00000000,
+    /// ];
+    ///
+    /// let buffer: Buffer = Buffer::from(input);
+    /// let bit_slice = buffer.bit_slice();
+    /// // Interpret bit slice as u8
+    /// let chunks = bit_slice.chunks::<u8>();
+    ///
+    /// // Filter out null bytes for compression
+    /// let bytes = chunks.interpret().filter(|e| *e != 0x00_u8).collect::<Vec<u8>>();
+    /// assert_eq!(bytes.len(), 4);
+    /// ```
     /// Native representations in Arrow follows 64-bit convention.
     /// Chunks can still be reinterpreted in any primitive type lower than u64.
     #[inline]
@@ -164,7 +185,7 @@ impl<'a> BufferBitSliceMut<'a> {
     /// Returns mutable view with the given offset in bits and length in bits.
     /// This view have zero-copy representation over the actual data.
     #[inline]
-    pub fn view(&'a mut self, offset_in_bits: usize, len_in_bits: usize) -> Self {
+    pub fn slicing(&'a mut self, offset_in_bits: usize, len_in_bits: usize) -> Self {
         Self {
             bit_slice: &mut self.bit_slice[offset_in_bits..offset_in_bits + len_in_bits],
         }
@@ -331,7 +352,7 @@ mod tests_bit_slices_little_endian {
         ];
         let buffer: Buffer = Buffer::from(input);
 
-        let bit_slice = buffer.bit_slice().view(4, 64);
+        let bit_slice = buffer.bit_slice().slicing(4, 64);
         let chunks = bit_slice.chunks::<u64>();
 
         assert_eq!(0, chunks.remainder_bit_len());
@@ -353,7 +374,7 @@ mod tests_bit_slices_little_endian {
         ];
         let buffer: Buffer = Buffer::from(input);
 
-        let bit_slice = buffer.bit_slice().view(4, 66);
+        let bit_slice = buffer.bit_slice().slicing(4, 66);
         let chunks = bit_slice.chunks::<u64>();
 
         assert_eq!(2, chunks.remainder_bit_len());
@@ -374,7 +395,7 @@ mod tests_bit_slices_little_endian {
 
         // remainder contains bits from both bytes
         // result should be the highest 2 bits from first byte followed by lowest 5 bits of second bytes
-        let bit_slice = buffer.bit_slice().view(6, 7);
+        let bit_slice = buffer.bit_slice().slicing(6, 7);
         let chunks = bit_slice.chunks::<u64>();
 
         assert_eq!(7, chunks.remainder_bit_len());
@@ -389,7 +410,7 @@ mod tests_bit_slices_little_endian {
         ];
         let buffer: Buffer = Buffer::from(input);
 
-        let bit_slice = buffer.bit_slice().view(2, 63);
+        let bit_slice = buffer.bit_slice().slicing(2, 63);
         let chunks = bit_slice.chunks::<u64>();
 
         assert_eq!(63, chunks.remainder_bit_len());
@@ -407,7 +428,7 @@ mod tests_bit_slices_little_endian {
         let buffer = Buffer::from(buffer_slice);
 
         // Let's get the whole buffer.
-        let bit_slice = buffer.bit_slice().view(0, buffer_slice.len() * 8);
+        let bit_slice = buffer.bit_slice().slicing(0, buffer_slice.len() * 8);
         // Let's also get a chunked bits as u8, not u64 this time...
         let chunks = bit_slice.chunks::<u8>();
 

From 081d365e26447aee3ff3868429a1f67c63c1a0f1 Mon Sep 17 00:00:00 2001
From: Mahmut Bulut <vertexclique@gmail.com>
Date: Wed, 18 Nov 2020 15:18:32 +0100
Subject: [PATCH 11/22] Apply checks and formatting

---
 rust/arrow/src/util/bit_ops.rs | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/rust/arrow/src/util/bit_ops.rs b/rust/arrow/src/util/bit_ops.rs
index c823e1edb6f..3d58fe0c464 100644
--- a/rust/arrow/src/util/bit_ops.rs
+++ b/rust/arrow/src/util/bit_ops.rs
@@ -23,10 +23,10 @@ use bitvec::slice::ChunksExact;
 use std::fmt::Debug;
 
 ///
-/// Immutable bit slice view of `Buffer` data. 
+/// Immutable bit slice view of `Buffer` data.
 ///
-/// `BufferBitSlice` does not own any underlying data, but rather wraps references 
-/// to the underlying data in a `Buffer` and has methods for addressing and interacting with 
+/// `BufferBitSlice` does not own any underlying data, but rather wraps references
+/// to the underlying data in a `Buffer` and has methods for addressing and interacting with
 /// individual bits
 #[derive(Debug)]
 pub struct BufferBitSlice<'a> {
@@ -164,7 +164,7 @@ impl<'a> From<&'a [u8]> for BufferBitSlice<'a> {
 ///
 /// Mutable bit slice view of buffer data
 ///
-/// `BufferBitSliceMut` does not own any underlying data, but rather 
+/// `BufferBitSliceMut` does not own any underlying data, but rather
 /// has methods for addressing and interacting with individual bits.
 #[derive(Debug)]
 pub struct BufferBitSliceMut<'a> {
@@ -266,7 +266,7 @@ impl<'a> From<&'a mut [u8]> for BufferBitSliceMut<'a> {
 ///
 /// Exact chunk view over the bit slice
 ///
-/// The view is represented as some number of aligned T-sized chunks, 
+/// The view is represented as some number of aligned T-sized chunks,
 /// followed by some number of remainder bits
 #[derive(Clone, Debug)]
 pub struct BufferBitChunksExact<'a, T>

From 6613b94b61c30f7f53d27a121336f307c90abeb3 Mon Sep 17 00:00:00 2001
From: Mahmut Bulut <vertexclique@gmail.com>
Date: Wed, 18 Nov 2020 15:38:21 +0100
Subject: [PATCH 12/22] Address test comments

---
 rust/arrow/src/compute/kernels/aggregate.rs |  2 +-
 rust/arrow/src/util/bit_ops.rs              | 16 ++++++++++------
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/rust/arrow/src/compute/kernels/aggregate.rs b/rust/arrow/src/compute/kernels/aggregate.rs
index bfb47da43bf..5b644ecd031 100644
--- a/rust/arrow/src/compute/kernels/aggregate.rs
+++ b/rust/arrow/src/compute/kernels/aggregate.rs
@@ -159,7 +159,7 @@ where
 
             let buffer_chunk_iter = buffer_chunks.interpret();
 
-            &data_chunks
+            data_chunks
                 .zip(buffer_chunk_iter)
                 .for_each(|(chunk, mask)| {
                     chunk.iter().enumerate().for_each(|(i, value)| {
diff --git a/rust/arrow/src/util/bit_ops.rs b/rust/arrow/src/util/bit_ops.rs
index 3d58fe0c464..7c39b110c70 100644
--- a/rust/arrow/src/util/bit_ops.rs
+++ b/rust/arrow/src/util/bit_ops.rs
@@ -91,7 +91,7 @@ impl<'a> BufferBitSlice<'a> {
         let offset_size_in_bits = 8 * std::mem::size_of::<T>();
         let chunks_exact = self.bit_slice.chunks_exact(offset_size_in_bits);
         let remainder_bits = chunks_exact.remainder();
-        let remainder: T = if remainder_bits.len() == 0 {
+        let remainder: T = if remainder_bits.is_empty() {
             T::default()
         } else {
             remainder_bits.load::<T>()
@@ -136,7 +136,7 @@ impl<'a> BufferBitSlice<'a> {
     /// Get bits in this view as vector of booleans
     #[inline]
     pub fn typed_bits(&self) -> Vec<bool> {
-        self.bit_slice.iter().map(|e| *e).collect()
+        self.bit_slice.iter().copied().collect()
     }
 
     ///
@@ -238,7 +238,7 @@ impl<'a> BufferBitSliceMut<'a> {
     /// Get bits in this view as vector of booleans
     #[inline]
     pub fn typed_bits(&self) -> Vec<bool> {
-        self.bit_slice.iter().map(|e| *e).collect()
+        self.bit_slice.iter().copied().collect()
     }
 
     ///
@@ -347,21 +347,23 @@ mod tests_bit_slices_little_endian {
     #[test]
     fn test_bit_slice_iter_unaligned() {
         let input: &[u8] = &[
-            0b00000000, 0b00000001, 0b00000010, 0b00000100, 0b00001000, 0b00010000,
-            0b00100000, 0b01000000, 0b11111111,
+            0b11110000, 0b00000001, 0b00000010, 0b00000100, 0b00001000, 0b00010000,
+            0b00100000, 0b01000000, 0b11111010,
         ];
         let buffer: Buffer = Buffer::from(input);
 
+        // After the 4th bit get a bit slice of 64 bits.
         let bit_slice = buffer.bit_slice().slicing(4, 64);
         let chunks = bit_slice.chunks::<u64>();
 
+        // 64 bits perfectly fits.
         assert_eq!(0, chunks.remainder_bit_len());
         assert_eq!(0, chunks.remainder_bits());
 
         let result = chunks.interpret().collect::<Vec<u64>>();
 
         assert_eq!(
-            vec![0b1111_01000000_00100000_00010000_00001000_00000100_00000010_00000001_0000],
+            vec![0b1010_01000000_00100000_00010000_00001000_00000100_00000010_00000001_1111],
             result
         );
     }
@@ -374,9 +376,11 @@ mod tests_bit_slices_little_endian {
         ];
         let buffer: Buffer = Buffer::from(input);
 
+        // After the 4th bit get a bit slice of 66 bits.
         let bit_slice = buffer.bit_slice().slicing(4, 66);
         let chunks = bit_slice.chunks::<u64>();
 
+        // 66 bits doesn't fit into 64 bits primitive type, so there are 2 remainder bits.
         assert_eq!(2, chunks.remainder_bit_len());
         assert_eq!(0b00000011, chunks.remainder_bits());
 

From 3625692f8eb50a6744e05a82f843ee5a19973822 Mon Sep 17 00:00:00 2001
From: Mahmut Bulut <vertexclique@gmail.com>
Date: Wed, 18 Nov 2020 15:45:38 +0100
Subject: [PATCH 13/22] Remove bit_util.rs

---
 rust/arrow/src/util/bit_util.rs | 328 --------------------------------
 1 file changed, 328 deletions(-)
 delete mode 100644 rust/arrow/src/util/bit_util.rs

diff --git a/rust/arrow/src/util/bit_util.rs b/rust/arrow/src/util/bit_util.rs
deleted file mode 100644
index bf54e22394c..00000000000
--- a/rust/arrow/src/util/bit_util.rs
+++ /dev/null
@@ -1,328 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! Utils for working with bits
-
-#[cfg(feature = "simd")]
-use packed_simd::u8x64;
-
-const BIT_MASK: [u8; 8] = [1, 2, 4, 8, 16, 32, 64, 128];
-
-/// Returns the nearest number that is `>=` than `num` and is a multiple of 64
-#[inline]
-pub fn round_upto_multiple_of_64(num: usize) -> usize {
-    round_upto_power_of_2(num, 64)
-}
-
-/// Returns the nearest multiple of `factor` that is `>=` than `num`. Here `factor` must
-/// be a power of 2.
-pub fn round_upto_power_of_2(num: usize, factor: usize) -> usize {
-    debug_assert!(factor > 0 && (factor & (factor - 1)) == 0);
-    (num + (factor - 1)) & !(factor - 1)
-}
-
-/// Returns whether bit at position `i` in `data` is set or not
-#[inline]
-pub fn get_bit(data: &[u8], i: usize) -> bool {
-    (data[i >> 3] & BIT_MASK[i & 7]) != 0
-}
-
-/// Returns whether bit at position `i` in `data` is set or not.
-///
-/// # Safety
-///
-/// Note this doesn't do any bound checking, for performance reason. The caller is
-/// responsible to guarantee that `i` is within bounds.
-#[inline]
-pub unsafe fn get_bit_raw(data: *const u8, i: usize) -> bool {
-    (*data.add(i >> 3) & BIT_MASK[i & 7]) != 0
-}
-
-/// Sets bit at position `i` for `data`
-#[inline]
-pub fn set_bit(data: &mut [u8], i: usize) {
-    data[i >> 3] |= BIT_MASK[i & 7];
-}
-
-/// Sets bit at position `i` for `data`
-///
-/// # Safety
-///
-/// Note this doesn't do any bound checking, for performance reason. The caller is
-/// responsible to guarantee that `i` is within bounds.
-#[inline]
-pub unsafe fn set_bit_raw(data: *mut u8, i: usize) {
-    *data.add(i >> 3) |= BIT_MASK[i & 7];
-}
-
-/// Sets bit at position `i` for `data` to 0
-#[inline]
-pub fn unset_bit(data: &mut [u8], i: usize) {
-    data[i >> 3] ^= BIT_MASK[i & 7];
-}
-
-/// Sets bit at position `i` for `data` to 0
-///
-/// # Safety
-///
-/// Note this doesn't do any bound checking, for performance reason. The caller is
-/// responsible to guarantee that `i` is within bounds.
-#[inline]
-pub unsafe fn unset_bit_raw(data: *mut u8, i: usize) {
-    *data.add(i >> 3) ^= BIT_MASK[i & 7];
-}
-
-/// Returns the ceil of `value`/`divisor`
-#[inline]
-pub fn ceil(value: usize, divisor: usize) -> usize {
-    let (quot, rem) = (value / divisor, value % divisor);
-    if rem > 0 && divisor > 0 {
-        quot + 1
-    } else {
-        quot
-    }
-}
-
-/// Performs SIMD bitwise binary operations.
-///
-/// # Safety
-///
-/// Note that each slice should be 64 bytes and it is the callers responsibility to ensure
-/// that this is the case.  If passed slices larger than 64 bytes the operation will only
-/// be performed on the first 64 bytes.  Slices less than 64 bytes will panic.
-#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "simd"))]
-pub unsafe fn bitwise_bin_op_simd<F>(left: &[u8], right: &[u8], result: &mut [u8], op: F)
-where
-    F: Fn(u8x64, u8x64) -> u8x64,
-{
-    let left_simd = u8x64::from_slice_unaligned_unchecked(left);
-    let right_simd = u8x64::from_slice_unaligned_unchecked(right);
-    let simd_result = op(left_simd, right_simd);
-    simd_result.write_to_slice_unaligned_unchecked(result);
-}
-
-#[cfg(test)]
-mod tests {
-    use std::collections::HashSet;
-
-    use super::*;
-    use crate::util::test_util::seedable_rng;
-    use rand::Rng;
-
-    #[test]
-    fn test_round_upto_multiple_of_64() {
-        assert_eq!(0, round_upto_multiple_of_64(0));
-        assert_eq!(64, round_upto_multiple_of_64(1));
-        assert_eq!(64, round_upto_multiple_of_64(63));
-        assert_eq!(64, round_upto_multiple_of_64(64));
-        assert_eq!(128, round_upto_multiple_of_64(65));
-        assert_eq!(192, round_upto_multiple_of_64(129));
-    }
-
-    #[test]
-    fn test_get_bit() {
-        // 00001101
-        assert_eq!(true, get_bit(&[0b00001101], 0));
-        assert_eq!(false, get_bit(&[0b00001101], 1));
-        assert_eq!(true, get_bit(&[0b00001101], 2));
-        assert_eq!(true, get_bit(&[0b00001101], 3));
-
-        // 01001001 01010010
-        assert_eq!(true, get_bit(&[0b01001001, 0b01010010], 0));
-        assert_eq!(false, get_bit(&[0b01001001, 0b01010010], 1));
-        assert_eq!(false, get_bit(&[0b01001001, 0b01010010], 2));
-        assert_eq!(true, get_bit(&[0b01001001, 0b01010010], 3));
-        assert_eq!(false, get_bit(&[0b01001001, 0b01010010], 4));
-        assert_eq!(false, get_bit(&[0b01001001, 0b01010010], 5));
-        assert_eq!(true, get_bit(&[0b01001001, 0b01010010], 6));
-        assert_eq!(false, get_bit(&[0b01001001, 0b01010010], 7));
-        assert_eq!(false, get_bit(&[0b01001001, 0b01010010], 8));
-        assert_eq!(true, get_bit(&[0b01001001, 0b01010010], 9));
-        assert_eq!(false, get_bit(&[0b01001001, 0b01010010], 10));
-        assert_eq!(false, get_bit(&[0b01001001, 0b01010010], 11));
-        assert_eq!(true, get_bit(&[0b01001001, 0b01010010], 12));
-        assert_eq!(false, get_bit(&[0b01001001, 0b01010010], 13));
-        assert_eq!(true, get_bit(&[0b01001001, 0b01010010], 14));
-        assert_eq!(false, get_bit(&[0b01001001, 0b01010010], 15));
-    }
-
-    #[test]
-    fn test_get_bit_raw() {
-        const NUM_BYTE: usize = 10;
-        let mut buf = vec![0; NUM_BYTE];
-        let mut expected = vec![];
-        let mut rng = seedable_rng();
-        for i in 0..8 * NUM_BYTE {
-            let b = rng.gen_bool(0.5);
-            expected.push(b);
-            if b {
-                set_bit(&mut buf[..], i)
-            }
-        }
-
-        let raw_ptr = buf.as_ptr();
-        for (i, b) in expected.iter().enumerate() {
-            unsafe {
-                assert_eq!(*b, get_bit_raw(raw_ptr, i));
-            }
-        }
-    }
-
-    #[test]
-    fn test_set_bit() {
-        let mut b = [0b00000000];
-        set_bit(&mut b, 0);
-        assert_eq!([0b00000001], b);
-        set_bit(&mut b, 2);
-        assert_eq!([0b00000101], b);
-        set_bit(&mut b, 5);
-        assert_eq!([0b00100101], b);
-    }
-
-    #[test]
-    fn test_unset_bit() {
-        let mut b = [0b11111111];
-        unset_bit(&mut b, 0);
-        assert_eq!([0b11111110], b);
-        unset_bit(&mut b, 2);
-        assert_eq!([0b11111010], b);
-        unset_bit(&mut b, 5);
-        assert_eq!([0b11011010], b);
-    }
-
-    #[test]
-    fn test_set_bit_raw() {
-        const NUM_BYTE: usize = 10;
-        let mut buf = vec![0; NUM_BYTE];
-        let mut expected = vec![];
-        let mut rng = seedable_rng();
-        for i in 0..8 * NUM_BYTE {
-            let b = rng.gen_bool(0.5);
-            expected.push(b);
-            if b {
-                unsafe {
-                    set_bit_raw(buf.as_mut_ptr(), i);
-                }
-            }
-        }
-
-        let raw_ptr = buf.as_ptr();
-        for (i, b) in expected.iter().enumerate() {
-            unsafe {
-                assert_eq!(*b, get_bit_raw(raw_ptr, i));
-            }
-        }
-    }
-
-    #[test]
-    fn test_unset_bit_raw() {
-        const NUM_BYTE: usize = 10;
-        let mut buf = vec![255; NUM_BYTE];
-        let mut expected = vec![];
-        let mut rng = seedable_rng();
-        for i in 0..8 * NUM_BYTE {
-            let b = rng.gen_bool(0.5);
-            expected.push(b);
-            if !b {
-                unsafe {
-                    unset_bit_raw(buf.as_mut_ptr(), i);
-                }
-            }
-        }
-
-        let raw_ptr = buf.as_ptr();
-        for (i, b) in expected.iter().enumerate() {
-            unsafe {
-                assert_eq!(*b, get_bit_raw(raw_ptr, i));
-            }
-        }
-    }
-
-    #[test]
-    fn test_get_set_bit_roundtrip() {
-        const NUM_BYTES: usize = 10;
-        const NUM_SETS: usize = 10;
-
-        let mut buffer: [u8; NUM_BYTES * 8] = [0; NUM_BYTES * 8];
-        let mut v = HashSet::new();
-        let mut rng = seedable_rng();
-        for _ in 0..NUM_SETS {
-            let offset = rng.gen_range(0, 8 * NUM_BYTES);
-            v.insert(offset);
-            set_bit(&mut buffer[..], offset);
-        }
-        for i in 0..NUM_BYTES * 8 {
-            assert_eq!(v.contains(&i), get_bit(&buffer[..], i));
-        }
-    }
-
-    #[test]
-    #[cfg(all(any(target_arch = "x86", target_arch = "x86_64")))]
-    fn test_ceil() {
-        assert_eq!(ceil(0, 1), 0);
-        assert_eq!(ceil(1, 1), 1);
-        assert_eq!(ceil(1, 2), 1);
-        assert_eq!(ceil(1, 8), 1);
-        assert_eq!(ceil(7, 8), 1);
-        assert_eq!(ceil(8, 8), 1);
-        assert_eq!(ceil(9, 8), 2);
-        assert_eq!(ceil(9, 9), 1);
-        assert_eq!(ceil(10000000000, 10), 1000000000);
-        assert_eq!(ceil(10, 10000000000), 1);
-        assert_eq!(ceil(10000000000, 1000000000), 10);
-    }
-
-    #[test]
-    #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "avx512"))]
-    fn test_bitwise_and_avx512() {
-        use crate::buffer::avx512_bin_and;
-
-        let buf1 = [0b00110011u8; 64];
-        let buf2 = [0b11110000u8; 64];
-        let mut buf3 = [0b00000000; 64];
-        unsafe {
-            avx512_bin_and(&buf1, &buf2, &mut buf3);
-        };
-        for i in buf3.iter() {
-            assert_eq!(&0b00110000u8, i);
-        }
-    }
-
-    #[test]
-    #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "simd"))]
-    fn test_bitwise_and_simd() {
-        let buf1 = [0b00110011u8; 64];
-        let buf2 = [0b11110000u8; 64];
-        let mut buf3 = [0b00000000; 64];
-        unsafe { bitwise_bin_op_simd(&buf1, &buf2, &mut buf3, |a, b| a & b) };
-        for i in buf3.iter() {
-            assert_eq!(&0b00110000u8, i);
-        }
-    }
-
-    #[test]
-    #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "simd"))]
-    fn test_bitwise_or_simd() {
-        let buf1 = [0b00110011u8; 64];
-        let buf2 = [0b11110000u8; 64];
-        let mut buf3 = [0b00000000; 64];
-        unsafe { bitwise_bin_op_simd(&buf1, &buf2, &mut buf3, |a, b| a | b) };
-        for i in buf3.iter() {
-            assert_eq!(&0b11110011u8, i);
-        }
-    }
-}

From 10b0e9caef014ba5267f6c6676546d285f8c1046 Mon Sep 17 00:00:00 2001
From: Mahmut Bulut <vertexclique@gmail.com>
Date: Wed, 18 Nov 2020 15:49:19 +0100
Subject: [PATCH 14/22] Make unnecessary slices immutable

---
 rust/arrow/src/array/array_list.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rust/arrow/src/array/array_list.rs b/rust/arrow/src/array/array_list.rs
index 72a152f6f59..e886d7a4e38 100644
--- a/rust/arrow/src/array/array_list.rs
+++ b/rust/arrow/src/array/array_list.rs
@@ -305,7 +305,7 @@ mod tests {
     };
 
     use super::*;
-    use crate::util::bit_ops::BufferBitSliceMut;
+    use crate::util::bit_ops::{BufferBitSlice, BufferBitSliceMut};
 
     #[test]
     fn test_list_array() {
@@ -722,7 +722,7 @@ mod tests {
         assert_eq!(1, sliced_array.offset());
         assert_eq!(2, sliced_array.null_count());
 
-        let null_bit_slice = BufferBitSliceMut::new(&mut null_bits);
+        let null_bit_slice = BufferBitSlice::new(&null_bits);
         for i in 0..sliced_array.len() {
             if null_bit_slice.get_bit(sliced_array.offset() + i) {
                 assert!(sliced_array.is_valid(i));

From 9946245075a695d06d872609c757a35f05981ddb Mon Sep 17 00:00:00 2001
From: Mahmut Bulut <vertexclique@gmail.com>
Date: Wed, 18 Nov 2020 16:11:01 +0100
Subject: [PATCH 15/22] Rename 'interpret' to 'to_native_iter'.

---
 rust/arrow/src/buffer.rs                    |  6 +++---
 rust/arrow/src/compute/kernels/aggregate.rs |  4 ++--
 rust/arrow/src/util/bit_ops.rs              | 14 +++++++-------
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/rust/arrow/src/buffer.rs b/rust/arrow/src/buffer.rs
index a6a3b7bf86e..6801532b3eb 100644
--- a/rust/arrow/src/buffer.rs
+++ b/rust/arrow/src/buffer.rs
@@ -444,8 +444,8 @@ where
     let rem = op(left_chunks.remainder_bits(), right_chunks.remainder_bits());
     let rem = &rem.to_ne_bytes()[0..remainder_bytes];
 
-    let left_chunk_iter = left_chunks.interpret();
-    let right_chunk_iter = right_chunks.interpret();
+    let left_chunk_iter = left_chunks.to_native_iter();
+    let right_chunk_iter = right_chunks.to_native_iter();
 
     let result_chunks = result.typed_data_mut::<u64>().iter_mut();
 
@@ -482,7 +482,7 @@ where
     let rem = op(left_chunks.remainder_bits());
     let rem = &rem.to_ne_bytes()[0..remainder_bytes];
 
-    let left_chunk_iter = left_chunks.interpret();
+    let left_chunk_iter = left_chunks.to_native_iter();
 
     let result_chunks = result.typed_data_mut::<u64>().iter_mut();
 
diff --git a/rust/arrow/src/compute/kernels/aggregate.rs b/rust/arrow/src/compute/kernels/aggregate.rs
index 5b644ecd031..8540d35c1ef 100644
--- a/rust/arrow/src/compute/kernels/aggregate.rs
+++ b/rust/arrow/src/compute/kernels/aggregate.rs
@@ -157,7 +157,7 @@ where
 
             let buffer_remainder_bits: u64 = buffer_chunks.remainder_bits();
 
-            let buffer_chunk_iter = buffer_chunks.interpret();
+            let buffer_chunk_iter = buffer_chunks.to_native_iter();
 
             data_chunks
                 .zip(buffer_chunk_iter)
@@ -225,7 +225,7 @@ where
             let remainder_bits = bit_chunks.remainder_bits();
 
             data_chunks
-                .zip(bit_chunks.interpret())
+                .zip(bit_chunks.to_native_iter())
                 .for_each(|(chunk, mut mask)| {
                     // split chunks further into slices corresponding to the vector length
                     // the compiler is able to unroll this inner loop and remove bounds checks
diff --git a/rust/arrow/src/util/bit_ops.rs b/rust/arrow/src/util/bit_ops.rs
index 7c39b110c70..ef26d55cbd1 100644
--- a/rust/arrow/src/util/bit_ops.rs
+++ b/rust/arrow/src/util/bit_ops.rs
@@ -78,7 +78,7 @@ impl<'a> BufferBitSlice<'a> {
     /// let chunks = bit_slice.chunks::<u8>();
     ///
     /// // Filter out null bytes for compression
-    /// let bytes = chunks.interpret().filter(|e| *e != 0x00_u8).collect::<Vec<u8>>();
+    /// let bytes = chunks.to_native_iter().filter(|e| *e != 0x00_u8).collect::<Vec<u8>>();
     /// assert_eq!(bytes.len(), 4);
     /// ```
     /// Native representations in Arrow follows 64-bit convention.
@@ -297,9 +297,9 @@ where
     }
 
     ///
-    /// Interprets underlying chunk's view's bits as a given type.
+    /// Returns an iterator which interprets underlying chunk's view's bits as given type.
     #[inline(always)]
-    pub fn interpret(self) -> impl Iterator<Item = T> + 'a
+    pub fn to_native_iter(self) -> impl Iterator<Item = T> + 'a
     where
         T: BitMemory,
     {
@@ -339,7 +339,7 @@ mod tests_bit_slices_little_endian {
         let buffer: Buffer = Buffer::from(input);
 
         let bit_slice = buffer.bit_slice();
-        let result = bit_slice.chunks().interpret().collect::<Vec<u64>>();
+        let result = bit_slice.chunks().to_native_iter().collect::<Vec<u64>>();
 
         assert_eq!(vec![0x0706050403020100], result);
     }
@@ -360,7 +360,7 @@ mod tests_bit_slices_little_endian {
         assert_eq!(0, chunks.remainder_bit_len());
         assert_eq!(0, chunks.remainder_bits());
 
-        let result = chunks.interpret().collect::<Vec<u64>>();
+        let result = chunks.to_native_iter().collect::<Vec<u64>>();
 
         assert_eq!(
             vec![0b1010_01000000_00100000_00010000_00001000_00000100_00000010_00000001_1111],
@@ -384,7 +384,7 @@ mod tests_bit_slices_little_endian {
         assert_eq!(2, chunks.remainder_bit_len());
         assert_eq!(0b00000011, chunks.remainder_bits());
 
-        let result = chunks.interpret().collect::<Vec<u64>>();
+        let result = chunks.to_native_iter().collect::<Vec<u64>>();
 
         assert_eq!(
             vec![0b1111_01000000_00100000_00010000_00001000_00000100_00000010_00000001_0000],
@@ -436,7 +436,7 @@ mod tests_bit_slices_little_endian {
         // Let's also get a chunked bits as u8, not u64 this time...
         let chunks = bit_slice.chunks::<u8>();
 
-        let result = chunks.interpret().collect::<Vec<_>>();
+        let result = chunks.to_native_iter().collect::<Vec<_>>();
         assert_eq!(buffer_slice.to_vec(), result);
     }
 }

From 3e8230c03eab127aea80c73d68b66332ac8d4543 Mon Sep 17 00:00:00 2001
From: Mahmut Bulut <vertexclique@gmail.com>
Date: Wed, 18 Nov 2020 16:14:37 +0100
Subject: [PATCH 16/22] Follow the conventions and call it into_native_iter

---
 rust/arrow/src/buffer.rs                    |  6 +++---
 rust/arrow/src/compute/kernels/aggregate.rs | 10 +++++-----
 rust/arrow/src/util/bit_ops.rs              | 12 ++++++------
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/rust/arrow/src/buffer.rs b/rust/arrow/src/buffer.rs
index 6801532b3eb..ad2b0e36c9e 100644
--- a/rust/arrow/src/buffer.rs
+++ b/rust/arrow/src/buffer.rs
@@ -444,8 +444,8 @@ where
     let rem = op(left_chunks.remainder_bits(), right_chunks.remainder_bits());
     let rem = &rem.to_ne_bytes()[0..remainder_bytes];
 
-    let left_chunk_iter = left_chunks.to_native_iter();
-    let right_chunk_iter = right_chunks.to_native_iter();
+    let left_chunk_iter = left_chunks.into_native_iter();
+    let right_chunk_iter = right_chunks.into_native_iter();
 
     let result_chunks = result.typed_data_mut::<u64>().iter_mut();
 
@@ -482,7 +482,7 @@ where
     let rem = op(left_chunks.remainder_bits());
     let rem = &rem.to_ne_bytes()[0..remainder_bytes];
 
-    let left_chunk_iter = left_chunks.to_native_iter();
+    let left_chunk_iter = left_chunks.into_native_iter();
 
     let result_chunks = result.typed_data_mut::<u64>().iter_mut();
 
diff --git a/rust/arrow/src/compute/kernels/aggregate.rs b/rust/arrow/src/compute/kernels/aggregate.rs
index 8540d35c1ef..da79b6d9a73 100644
--- a/rust/arrow/src/compute/kernels/aggregate.rs
+++ b/rust/arrow/src/compute/kernels/aggregate.rs
@@ -157,7 +157,7 @@ where
 
             let buffer_remainder_bits: u64 = buffer_chunks.remainder_bits();
 
-            let buffer_chunk_iter = buffer_chunks.to_native_iter();
+            let buffer_chunk_iter = buffer_chunks.into_native_iter();
 
             data_chunks
                 .zip(buffer_chunk_iter)
@@ -224,9 +224,8 @@ where
             let bit_chunks = bit_slice.chunks::<u64>();
             let remainder_bits = bit_chunks.remainder_bits();
 
-            data_chunks
-                .zip(bit_chunks.to_native_iter())
-                .for_each(|(chunk, mut mask)| {
+            data_chunks.zip(bit_chunks.into_native_iter()).for_each(
+                |(chunk, mut mask)| {
                     // split chunks further into slices corresponding to the vector length
                     // the compiler is able to unroll this inner loop and remove bounds checks
                     // since the outer chunk size (64) is always a multiple of the number of lanes
@@ -240,7 +239,8 @@ where
 
                         mask = mask >> T::lanes();
                     });
-                });
+                },
+            );
 
             remainder.iter().enumerate().for_each(|(i, value)| {
                 if remainder_bits & (1 << i) != 0 {
diff --git a/rust/arrow/src/util/bit_ops.rs b/rust/arrow/src/util/bit_ops.rs
index ef26d55cbd1..a1973d74e74 100644
--- a/rust/arrow/src/util/bit_ops.rs
+++ b/rust/arrow/src/util/bit_ops.rs
@@ -78,7 +78,7 @@ impl<'a> BufferBitSlice<'a> {
     /// let chunks = bit_slice.chunks::<u8>();
     ///
     /// // Filter out null bytes for compression
-    /// let bytes = chunks.to_native_iter().filter(|e| *e != 0x00_u8).collect::<Vec<u8>>();
+    /// let bytes = chunks.into_native_iter().filter(|e| *e != 0x00_u8).collect::<Vec<u8>>();
     /// assert_eq!(bytes.len(), 4);
     /// ```
     /// Native representations in Arrow follows 64-bit convention.
@@ -299,7 +299,7 @@ where
     ///
     /// Returns an iterator which interprets underlying chunk's view's bits as given type.
     #[inline(always)]
-    pub fn to_native_iter(self) -> impl Iterator<Item = T> + 'a
+    pub fn into_native_iter(self) -> impl Iterator<Item = T> + 'a
     where
         T: BitMemory,
     {
@@ -339,7 +339,7 @@ mod tests_bit_slices_little_endian {
         let buffer: Buffer = Buffer::from(input);
 
         let bit_slice = buffer.bit_slice();
-        let result = bit_slice.chunks().to_native_iter().collect::<Vec<u64>>();
+        let result = bit_slice.chunks().into_native_iter().collect::<Vec<u64>>();
 
         assert_eq!(vec![0x0706050403020100], result);
     }
@@ -360,7 +360,7 @@ mod tests_bit_slices_little_endian {
         assert_eq!(0, chunks.remainder_bit_len());
         assert_eq!(0, chunks.remainder_bits());
 
-        let result = chunks.to_native_iter().collect::<Vec<u64>>();
+        let result = chunks.into_native_iter().collect::<Vec<u64>>();
 
         assert_eq!(
             vec![0b1010_01000000_00100000_00010000_00001000_00000100_00000010_00000001_1111],
@@ -384,7 +384,7 @@ mod tests_bit_slices_little_endian {
         assert_eq!(2, chunks.remainder_bit_len());
         assert_eq!(0b00000011, chunks.remainder_bits());
 
-        let result = chunks.to_native_iter().collect::<Vec<u64>>();
+        let result = chunks.into_native_iter().collect::<Vec<u64>>();
 
         assert_eq!(
             vec![0b1111_01000000_00100000_00010000_00001000_00000100_00000010_00000001_0000],
@@ -436,7 +436,7 @@ mod tests_bit_slices_little_endian {
         // Let's also get a chunked bits as u8, not u64 this time...
         let chunks = bit_slice.chunks::<u8>();
 
-        let result = chunks.to_native_iter().collect::<Vec<_>>();
+        let result = chunks.into_native_iter().collect::<Vec<_>>();
         assert_eq!(buffer_slice.to_vec(), result);
     }
 }

From af0f588e43b8b5947f0e032758b43f45f15d336b Mon Sep 17 00:00:00 2001
From: Mahmut Bulut <vertexclique@gmail.com>
Date: Wed, 18 Nov 2020 19:14:00 +0100
Subject: [PATCH 17/22] Apply clippy feedback

---
 rust/arrow/src/util/bit_ops.rs | 6 +++---
 rust/datafusion/tests/sql.rs   | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/rust/arrow/src/util/bit_ops.rs b/rust/arrow/src/util/bit_ops.rs
index a1973d74e74..29af1371c6c 100644
--- a/rust/arrow/src/util/bit_ops.rs
+++ b/rust/arrow/src/util/bit_ops.rs
@@ -363,7 +363,7 @@ mod tests_bit_slices_little_endian {
         let result = chunks.into_native_iter().collect::<Vec<u64>>();
 
         assert_eq!(
-            vec![0b1010_01000000_00100000_00010000_00001000_00000100_00000010_00000001_1111],
+            vec![0b1010_0100_0000_0010_0000_0001_0000_0000_1000_0000_0100_0000_0010_0000_0001_1111],
             result
         );
     }
@@ -387,7 +387,7 @@ mod tests_bit_slices_little_endian {
         let result = chunks.into_native_iter().collect::<Vec<u64>>();
 
         assert_eq!(
-            vec![0b1111_01000000_00100000_00010000_00001000_00000100_00000010_00000001_0000],
+            vec![0b1111_0100_0000_0010_0000_0001_0000_0000_1000_0000_0100_0000_0010_0000_0001_0000],
             result
         );
     }
@@ -419,7 +419,7 @@ mod tests_bit_slices_little_endian {
 
         assert_eq!(63, chunks.remainder_bit_len());
         assert_eq!(
-            0b1000000_00111111_11000000_00111111_11000000_00111111_11000000_00111111,
+            0b100_0000_0011_1111_1100_0000_0011_1111_1100_0000_0011_1111_1100_0000_0011_1111,
             chunks.remainder_bits()
         );
     }
diff --git a/rust/datafusion/tests/sql.rs b/rust/datafusion/tests/sql.rs
index b1a0ec4189c..16daba0495a 100644
--- a/rust/datafusion/tests/sql.rs
+++ b/rust/datafusion/tests/sql.rs
@@ -1400,14 +1400,14 @@ async fn query_on_string_dictionary() -> Result<()> {
     Ok(())
 }
 
-fn assert_float_eq<T>(expected: &Vec<Vec<T>>, received: &Vec<Vec<String>>)
+fn assert_float_eq<T>(expected: &[Vec<T>], received: &[Vec<String>])
 where
     T: AsRef<str>,
 {
     expected
-        .into_iter()
+        .iter()
         .flatten()
-        .zip(received.into_iter().flatten())
+        .zip(received.iter().flatten())
         .for_each(|(l, r)| {
             let (l, r) = (
                 l.as_ref().parse::<f64>().unwrap(),

From 457d8c7303864673c25bcb4e95e002b9585e9c39 Mon Sep 17 00:00:00 2001
From: Mahmut Bulut <vertexclique@gmail.com>
Date: Wed, 18 Nov 2020 22:08:56 +0100
Subject: [PATCH 18/22] Reword comment and explain 'well-aligned'

---
 rust/arrow/src/util/bit_ops.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rust/arrow/src/util/bit_ops.rs b/rust/arrow/src/util/bit_ops.rs
index 29af1371c6c..20e9051afe8 100644
--- a/rust/arrow/src/util/bit_ops.rs
+++ b/rust/arrow/src/util/bit_ops.rs
@@ -105,7 +105,7 @@ impl<'a> BufferBitSlice<'a> {
 
     ///
     /// Converts the bit view into a Buffer.
-    /// Buffer is always byte-aligned and well-aligned.
+    /// Buffer is always byte-aligned and it's pointer is aligned to size of u64.
     #[inline]
     pub fn as_buffer(&self) -> Buffer {
         Buffer::from(self.bit_slice.as_slice())
@@ -207,7 +207,7 @@ impl<'a> BufferBitSliceMut<'a> {
 
     ///
     /// Converts the bit view into a Buffer.
-    /// Buffer is always byte-aligned and well-aligned.
+    /// Buffer is always byte-aligned and it's pointer is aligned to size of u64.
     #[inline]
     pub fn as_buffer(&self) -> Buffer {
         Buffer::from(self.bit_slice.as_slice())

From e35e0be193da4868b088ba89b9defc25841f07c7 Mon Sep 17 00:00:00 2001
From: Mahmut Bulut <vertexclique@gmail.com>
Date: Tue, 24 Nov 2020 17:28:36 +0100
Subject: [PATCH 19/22] ARROW-10588 - Chunked parallel producer consumer

---
 rust/arrow/Cargo.toml                       |   1 +
 rust/arrow/benches/aggregate_kernels.rs     |  21 +--
 rust/arrow/benches/arithmetic_kernels.rs    |  22 +--
 rust/arrow/src/buffer.rs                    |  29 ++--
 rust/arrow/src/compute/kernels/aggregate.rs |  39 ++++--
 rust/arrow/src/util/bit_ops.rs              | 146 ++++++++++++++++++++
 6 files changed, 206 insertions(+), 52 deletions(-)

diff --git a/rust/arrow/Cargo.toml b/rust/arrow/Cargo.toml
index 696114cd2ea..308887999ec 100644
--- a/rust/arrow/Cargo.toml
+++ b/rust/arrow/Cargo.toml
@@ -36,6 +36,7 @@ name = "arrow"
 path = "src/lib.rs"
 
 [dependencies]
+rayon = "1.5"
 serde = { version = "1.0", features = ["rc"] }
 serde_derive = "1.0"
 serde_json = { version = "1.0", features = ["preserve_order"] }
diff --git a/rust/arrow/benches/aggregate_kernels.rs b/rust/arrow/benches/aggregate_kernels.rs
index 7e8aea3636e..466df050194 100644
--- a/rust/arrow/benches/aggregate_kernels.rs
+++ b/rust/arrow/benches/aggregate_kernels.rs
@@ -74,21 +74,22 @@ fn bench_min_string(arr_a: &StringArray) {
 }
 
 fn add_benchmark(c: &mut Criterion) {
-    let arr_a = create_array(512, false);
+    let size = 2_usize.pow(20);
+    let arr_a = create_array(size, false);
 
-    c.bench_function("sum 512", |b| b.iter(|| bench_sum(&arr_a)));
-    c.bench_function("min 512", |b| b.iter(|| bench_min(&arr_a)));
+    c.bench_function("sum 2^20", |b| b.iter(|| bench_sum(&arr_a)));
+    c.bench_function("min 2^20", |b| b.iter(|| bench_min(&arr_a)));
 
-    let arr_a = create_array(512, true);
+    let arr_a = create_array(size, true);
 
-    c.bench_function("sum nulls 512", |b| b.iter(|| bench_sum(&arr_a)));
-    c.bench_function("min nulls 512", |b| b.iter(|| bench_min(&arr_a)));
+    c.bench_function("sum nulls 2^20", |b| b.iter(|| bench_sum(&arr_a)));
+    c.bench_function("min nulls 2^20", |b| b.iter(|| bench_min(&arr_a)));
 
-    let arr_b = create_string_array(512, false);
-    c.bench_function("min string 512", |b| b.iter(|| bench_min_string(&arr_b)));
+    let arr_b = create_string_array(size, false);
+    c.bench_function("min string 2^20", |b| b.iter(|| bench_min_string(&arr_b)));
 
-    let arr_b = create_string_array(512, true);
-    c.bench_function("min nulls string 512", |b| {
+    let arr_b = create_string_array(size, true);
+    c.bench_function("min nulls string 2^20", |b| {
         b.iter(|| bench_min_string(&arr_b))
     });
 }
diff --git a/rust/arrow/benches/arithmetic_kernels.rs b/rust/arrow/benches/arithmetic_kernels.rs
index b3272d12e46..8538b1ae434 100644
--- a/rust/arrow/benches/arithmetic_kernels.rs
+++ b/rust/arrow/benches/arithmetic_kernels.rs
@@ -73,25 +73,25 @@ fn bench_limit(arr_a: &ArrayRef, max: usize) {
 }
 
 fn add_benchmark(c: &mut Criterion) {
-    let arr_a = create_array(512, false);
-    let arr_b = create_array(512, false);
+    let arr_a = create_array(2_usize.pow(20), false);
+    let arr_b = create_array(2_usize.pow(20), false);
 
-    c.bench_function("add 512", |b| b.iter(|| bench_add(&arr_a, &arr_b)));
-    c.bench_function("subtract 512", |b| {
+    c.bench_function("add 2^20", |b| b.iter(|| bench_add(&arr_a, &arr_b)));
+    c.bench_function("subtract 2^20", |b| {
         b.iter(|| bench_subtract(&arr_a, &arr_b))
     });
-    c.bench_function("multiply 512", |b| {
+    c.bench_function("multiply 2^20", |b| {
         b.iter(|| bench_multiply(&arr_a, &arr_b))
     });
-    c.bench_function("divide 512", |b| b.iter(|| bench_divide(&arr_a, &arr_b)));
-    c.bench_function("limit 512, 512", |b| b.iter(|| bench_limit(&arr_a, 512)));
+    c.bench_function("divide 2^20", |b| b.iter(|| bench_divide(&arr_a, &arr_b)));
+    c.bench_function("limit 2^20, 512", |b| b.iter(|| bench_limit(&arr_a, 512)));
 
-    let arr_a_nulls = create_array(512, false);
-    let arr_b_nulls = create_array(512, false);
-    c.bench_function("add_nulls_512", |b| {
+    let arr_a_nulls = create_array(2_usize.pow(20), false);
+    let arr_b_nulls = create_array(2_usize.pow(20), false);
+    c.bench_function("add_nulls_2^20", |b| {
         b.iter(|| bench_add(&arr_a_nulls, &arr_b_nulls))
     });
-    c.bench_function("divide_nulls_512", |b| {
+    c.bench_function("divide_nulls_2^20", |b| {
         b.iter(|| bench_divide(&arr_a_nulls, &arr_b_nulls))
     });
 }
diff --git a/rust/arrow/src/buffer.rs b/rust/arrow/src/buffer.rs
index ad2b0e36c9e..e9ce8dd4f7d 100644
--- a/rust/arrow/src/buffer.rs
+++ b/rust/arrow/src/buffer.rs
@@ -36,6 +36,8 @@ use crate::memory;
 use crate::util::bit_ops::*;
 use crate::util::utils;
 use crate::util::utils::ceil;
+use bitvec::field::BitField;
+use rayon::prelude::*;
 #[cfg(any(feature = "simd", feature = "avx512"))]
 use std::borrow::BorrowMut;
 
@@ -428,31 +430,28 @@ fn bitwise_bin_op_helper<F>(
     op: F,
 ) -> Buffer
 where
-    F: Fn(u64, u64) -> u64,
+    F: Fn(u64, u64) -> u64 + Send + Sync,
 {
     // reserve capacity and set length so we can get a typed view of u64 chunks
     let mut result =
         MutableBuffer::new(ceil(len_in_bits, 8)).with_bitset(len_in_bits / 64 * 8, false);
 
     let left_slice = left.bit_slice().slicing(left_offset_in_bits, len_in_bits);
-    let left_chunks = left_slice.chunks::<u64>();
+    let left_chunks = left_slice.par_chunks::<u64>();
 
     let right_slice = right.bit_slice().slicing(right_offset_in_bits, len_in_bits);
-    let right_chunks = right_slice.chunks::<u64>();
+    let right_chunks = right_slice.par_chunks::<u64>();
 
     let remainder_bytes = ceil(left_chunks.remainder_bit_len(), 8);
     let rem = op(left_chunks.remainder_bits(), right_chunks.remainder_bits());
     let rem = &rem.to_ne_bytes()[0..remainder_bytes];
 
-    let left_chunk_iter = left_chunks.into_native_iter();
-    let right_chunk_iter = right_chunks.into_native_iter();
-
-    let result_chunks = result.typed_data_mut::<u64>().iter_mut();
+    let result_chunks = result.typed_data_mut::<u64>().par_iter_mut();
 
     result_chunks
-        .zip(left_chunk_iter.zip(right_chunk_iter))
+        .zip(left_chunks.zip(right_chunks))
         .for_each(|(res, (left, right))| {
-            *res = op(left, right);
+            *res = op((*left).load::<u64>(), (*right).load::<u64>());
         });
 
     result.extend_from_slice(rem);
@@ -469,25 +468,23 @@ fn bitwise_unary_op_helper<F>(
     op: F,
 ) -> Buffer
 where
-    F: Fn(u64) -> u64,
+    F: Fn(u64) -> u64 + Send + Sync,
 {
     // reserve capacity and set length so we can get a typed view of u64 chunks
     let mut result =
         MutableBuffer::new(ceil(len_in_bits, 8)).with_bitset(len_in_bits / 64 * 8, false);
 
     let left_slice = left.bit_slice().slicing(offset_in_bits, len_in_bits);
-    let left_chunks = left_slice.chunks::<u64>();
+    let left_chunks = left_slice.par_chunks::<u64>();
 
     let remainder_bytes = ceil(left_chunks.remainder_bit_len(), 8);
     let rem = op(left_chunks.remainder_bits());
     let rem = &rem.to_ne_bytes()[0..remainder_bytes];
 
-    let left_chunk_iter = left_chunks.into_native_iter();
-
-    let result_chunks = result.typed_data_mut::<u64>().iter_mut();
+    let result_chunks = result.typed_data_mut::<u64>().par_iter_mut();
 
-    result_chunks.zip(left_chunk_iter).for_each(|(res, left)| {
-        *res = op(left);
+    result_chunks.zip(left_chunks).for_each(|(res, left)| {
+        *res = op((*left).load::<u64>());
     });
 
     result.extend_from_slice(rem);
diff --git a/rust/arrow/src/compute/kernels/aggregate.rs b/rust/arrow/src/compute/kernels/aggregate.rs
index da79b6d9a73..f4ba54ae284 100644
--- a/rust/arrow/src/compute/kernels/aggregate.rs
+++ b/rust/arrow/src/compute/kernels/aggregate.rs
@@ -21,6 +21,10 @@ use std::ops::Add;
 
 use crate::array::{Array, GenericStringArray, PrimitiveArray, StringOffsetSizeTrait};
 use crate::datatypes::ArrowNumericType;
+use bitvec::prelude::BitField;
+use rayon::iter::IndexedParallelIterator;
+use rayon::prelude::*;
+use std::iter::Sum;
 
 /// Helper macro to perform min/max of strings
 fn min_max_string<T: StringOffsetSizeTrait, F: Fn(&str, &str) -> bool>(
@@ -129,7 +133,7 @@ where
 pub fn sum<T>(array: &PrimitiveArray<T>) -> Option<T::Native>
 where
     T: ArrowNumericType,
-    T::Native: Add<Output = T::Native>,
+    T::Native: Add<Output = T::Native> + Sum,
 {
     let null_count = array.null_count();
 
@@ -141,33 +145,38 @@ where
 
     match array.data().null_buffer() {
         None => {
-            let sum = data.iter().fold(T::default_value(), |accumulator, value| {
-                accumulator + *value
-            });
+            let total = data
+                .par_iter()
+                .map(|value| {
+                    let mut sum = T::default_value();
+                    sum = sum + *value;
+                    sum
+                })
+                .sum();
 
-            Some(sum)
+            Some(total)
         }
         Some(buffer) => {
-            let mut sum = T::default_value();
-            let data_chunks = data.chunks_exact(64);
+            let data_chunks = data.par_chunks_exact(64);
             let remainder = data_chunks.remainder();
 
             let buffer_slice = buffer.bit_slice().slicing(array.offset(), array.len());
-            let buffer_chunks = buffer_slice.chunks::<u64>();
+            let buffer_chunks = buffer_slice.par_chunks::<u64>();
 
             let buffer_remainder_bits: u64 = buffer_chunks.remainder_bits();
 
-            let buffer_chunk_iter = buffer_chunks.into_native_iter();
-
-            data_chunks
-                .zip(buffer_chunk_iter)
-                .for_each(|(chunk, mask)| {
+            let mut sum = data_chunks
+                .zip(buffer_chunks)
+                .map(|(chunk, mask)| {
+                    let mut sum = T::default_value();
                     chunk.iter().enumerate().for_each(|(i, value)| {
-                        if (mask & (1 << i)) != 0 {
+                        if (mask.load::<u64>() & (1 << i)) != 0 {
                             sum = sum + *value;
                         }
                     });
-                });
+                    sum
+                })
+                .sum();
 
             remainder.iter().enumerate().for_each(|(i, value)| {
                 if buffer_remainder_bits & (1 << i) != 0 {
diff --git a/rust/arrow/src/util/bit_ops.rs b/rust/arrow/src/util/bit_ops.rs
index 20e9051afe8..00cdccd7806 100644
--- a/rust/arrow/src/util/bit_ops.rs
+++ b/rust/arrow/src/util/bit_ops.rs
@@ -20,7 +20,10 @@ use crate::buffer::Buffer;
 use bitvec::prelude::*;
 use bitvec::slice::ChunksExact;
 
+use rayon::iter::plumbing::*;
+use rayon::prelude::*;
 use std::fmt::Debug;
+use std::marker::PhantomData as marker;
 
 ///
 /// Immutable bit slice view of `Buffer` data.
@@ -103,6 +106,27 @@ impl<'a> BufferBitSlice<'a> {
         }
     }
 
+    #[inline]
+    pub fn par_chunks<T>(&self) -> ParallelChunksExact<T>
+    where
+        T: BitMemory,
+    {
+        let offset_size_in_bits = 8 * std::mem::size_of::<T>();
+        let chunks_exact = self.bit_slice.chunks_exact(offset_size_in_bits);
+        let remainder_bits = chunks_exact.remainder();
+        let remainder: T = if remainder_bits.is_empty() {
+            T::default()
+        } else {
+            remainder_bits.load::<T>()
+        };
+        ParallelChunksExact {
+            bit_slice: self.bit_slice,
+            chunk_size: offset_size_in_bits,
+            remainder,
+            remainder_len_in_bits: remainder_bits.len(),
+        }
+    }
+
     ///
     /// Converts the bit view into a Buffer.
     /// Buffer is always byte-aligned and it's pointer is aligned to size of u64.
@@ -328,6 +352,128 @@ where
     }
 }
 
+/// Implements parallel iterator over an immutable reference to a `BufferBitChunksExact`
+#[derive(Debug)]
+pub struct ParallelChunksExact<'a, T> {
+    bit_slice: &'a BitSlice<LocalBits, u8>,
+    chunk_size: usize,
+    remainder: T,
+    remainder_len_in_bits: usize,
+}
+
+impl<'a, T> ParallelChunksExact<'a, T>
+where
+    T: BitMemory,
+{
+    ///
+    /// Returns remainder bit length from the exact chunk iterator
+    #[inline(always)]
+    pub fn remainder_bit_len(&self) -> usize {
+        self.remainder_len_in_bits
+    }
+
+    ///
+    /// Returns the remainder bits interpreted as given type.
+    #[inline(always)]
+    pub fn remainder_bits(&self) -> T {
+        self.remainder
+    }
+
+    ///
+    /// Returns an iterator which interprets underlying chunk's view's bits as given type.
+    #[inline(always)]
+    pub fn into_native_iter(self) -> impl Iterator<Item = T> + 'a
+    where
+        T: BitMemory,
+    {
+        self.bit_slice
+            .chunks_exact(self.chunk_size)
+            .map(|e| e.load::<T>())
+    }
+}
+
+impl<'a, T> ParallelIterator for ParallelChunksExact<'a, T>
+where
+    T: BitMemory,
+{
+    type Item = &'a BitSlice<LocalBits, u8>;
+
+    fn drive_unindexed<C>(self, consumer: C) -> C::Result
+    where
+        C: UnindexedConsumer<Self::Item>,
+    {
+        bridge(self, consumer)
+    }
+
+    fn opt_len(&self) -> Option<usize> {
+        Some(self.len())
+    }
+}
+
+impl<'a, T> IndexedParallelIterator for ParallelChunksExact<'a, T>
+where
+    T: BitMemory,
+{
+    fn len(&self) -> usize {
+        self.bit_slice.len()
+    }
+
+    fn drive<C>(self, consumer: C) -> C::Result
+    where
+        C: Consumer<Self::Item>,
+    {
+        bridge(self, consumer)
+    }
+
+    fn with_producer<CB>(self, callback: CB) -> CB::Output
+    where
+        CB: ProducerCallback<Self::Item>,
+    {
+        callback.callback(BufferBitSliceProducer::<T> {
+            inner: self.bit_slice,
+            marker,
+        })
+    }
+}
+
+#[derive(Debug)]
+pub struct BufferBitSliceProducer<'a, T>
+where
+    T: BitMemory,
+{
+    inner: &'a BitSlice<LocalBits, u8>,
+    marker: marker<T>,
+}
+
+impl<'a, T> Producer for BufferBitSliceProducer<'a, T>
+where
+    T: BitMemory,
+{
+    type Item = &'a BitSlice<LocalBits, u8>;
+    type IntoIter = ChunksExact<'a, LocalBits, u8>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        let offset_size_in_bits = 8 * std::mem::size_of::<T>();
+        self.inner.chunks_exact(offset_size_in_bits)
+    }
+
+    fn split_at(self, index: usize) -> (Self, Self) {
+        let offset_size_in_bits = 8 * std::mem::size_of::<T>();
+        let elem_index = index * offset_size_in_bits;
+        let (left, right) = self.inner.split_at(elem_index);
+        (
+            BufferBitSliceProducer {
+                inner: left,
+                marker,
+            },
+            BufferBitSliceProducer {
+                inner: right,
+                marker,
+            },
+        )
+    }
+}
+
 #[cfg(all(test, target_endian = "little"))]
 mod tests_bit_slices_little_endian {
     use super::*;

From 64f0ccc302a061bda2a481dacdffd07dd64b5130 Mon Sep 17 00:00:00 2001
From: Mahmut Bulut <vertexclique@gmail.com>
Date: Tue, 24 Nov 2020 19:27:07 +0100
Subject: [PATCH 20/22] ARROW-10588 - Chunked parallel SIMD producer consumer

---
 rust/arrow/Cargo.toml                       |  2 +-
 rust/arrow/src/buffer.rs                    | 62 ++++++++++-----------
 rust/arrow/src/compute/kernels/aggregate.rs |  3 +-
 3 files changed, 33 insertions(+), 34 deletions(-)

diff --git a/rust/arrow/Cargo.toml b/rust/arrow/Cargo.toml
index 308887999ec..55372481e1e 100644
--- a/rust/arrow/Cargo.toml
+++ b/rust/arrow/Cargo.toml
@@ -54,7 +54,7 @@ hex = "0.4"
 prettytable-rs = { version = "0.8.0", optional = true }
 
 [features]
-default = []
+default = ["simd"]
 avx512 = []
 simd = ["packed_simd"]
 prettyprint = ["prettytable-rs"]
diff --git a/rust/arrow/src/buffer.rs b/rust/arrow/src/buffer.rs
index e9ce8dd4f7d..bc3edf377cb 100644
--- a/rust/arrow/src/buffer.rs
+++ b/rust/arrow/src/buffer.rs
@@ -38,7 +38,7 @@ use crate::util::utils;
 use crate::util::utils::ceil;
 use bitvec::field::BitField;
 use rayon::prelude::*;
-#[cfg(any(feature = "simd", feature = "avx512"))]
+#[cfg(any(feature = "avx512"))]
 use std::borrow::BorrowMut;
 
 /// Buffer is a contiguous memory region of fixed size and is aligned at a 64-byte
@@ -343,32 +343,29 @@ fn bitwise_bin_op_simd_helper<F_SIMD, F_SCALAR>(
     scalar_op: F_SCALAR,
 ) -> Buffer
 where
-    F_SIMD: Fn(u8x64, u8x64) -> u8x64,
-    F_SCALAR: Fn(u8, u8) -> u8,
+    F_SIMD: Fn(u8x64, u8x64) -> u8x64 + Send + Sync,
+    F_SCALAR: Fn(u8, u8) -> u8 + Send + Sync,
 {
     let mut result = MutableBuffer::new(len).with_bitset(len, false);
     let lanes = u8x64::lanes();
 
-    let mut left_chunks = left.data()[left_offset..].chunks_exact(lanes);
-    let mut right_chunks = right.data()[right_offset..].chunks_exact(lanes);
-    let mut result_chunks = result.data_mut().chunks_exact_mut(lanes);
+    let left_chunks = left.data()[left_offset..].par_chunks_exact(lanes);
+    let right_chunks = right.data()[right_offset..].par_chunks_exact(lanes);
+    let mut result_chunks = result.data_mut().par_chunks_exact_mut(lanes);
+
+    let res_remainder = result_chunks.take_remainder();
+    let left_remainder = left_chunks.remainder();
+    let right_remainder = right_chunks.remainder();
 
     result_chunks
-        .borrow_mut()
-        .zip(left_chunks.borrow_mut().zip(right_chunks.borrow_mut()))
+        .zip(left_chunks.zip(right_chunks))
         .for_each(|(res, (left, right))| {
             unsafe { utils::bitwise_bin_op_simd(&left, &right, res, &simd_op) };
         });
 
-    result_chunks
-        .into_remainder()
-        .iter_mut()
-        .zip(
-            left_chunks
-                .remainder()
-                .iter()
-                .zip(right_chunks.remainder().iter()),
-        )
+    res_remainder
+        .into_par_iter()
+        .zip(left_remainder.into_par_iter().zip(right_remainder.into_par_iter()))
         .for_each(|(res, (left, right))| {
             *res = scalar_op(*left, *right);
         });
@@ -390,28 +387,29 @@ fn bitwise_unary_op_simd_helper<F_SIMD, F_SCALAR>(
     scalar_op: F_SCALAR,
 ) -> Buffer
 where
-    F_SIMD: Fn(u8x64) -> u8x64,
-    F_SCALAR: Fn(u8) -> u8,
+    F_SIMD: Fn(u8x64) -> u8x64 + Send + Sync,
+    F_SCALAR: Fn(u8) -> u8 + Send + Sync,
 {
     let mut result = MutableBuffer::new(len).with_bitset(len, false);
     let lanes = u8x64::lanes();
 
-    let mut left_chunks = left.data()[left_offset..].chunks_exact(lanes);
-    let mut result_chunks = result.data_mut().chunks_exact_mut(lanes);
+    let left_chunks = left.data()[left_offset..].par_chunks_exact(lanes);
+    let mut result_chunks = result.data_mut().par_chunks_exact_mut(lanes);
 
-    result_chunks
-        .borrow_mut()
-        .zip(left_chunks.borrow_mut())
-        .for_each(|(res, left)| unsafe {
-            let data_simd = u8x64::from_slice_unaligned_unchecked(left);
-            let simd_result = simd_op(data_simd);
-            simd_result.write_to_slice_unaligned_unchecked(res);
+    let res_remainder = result_chunks.take_remainder();
+    let left_remainder = left_chunks.remainder();
+
+    left_chunks
+        .map(|e| unsafe { u8x64::from_slice_unaligned_unchecked(e) })
+        .map(simd_op)
+        .zip(result_chunks)
+        .for_each(|(e, res)| unsafe {
+            e.write_to_slice_unaligned_unchecked(res)
         });
 
-    result_chunks
-        .into_remainder()
-        .iter_mut()
-        .zip(left_chunks.remainder().iter())
+    res_remainder
+        .into_par_iter()
+        .zip(left_remainder.into_par_iter())
         .for_each(|(res, left)| {
             *res = scalar_op(*left);
         });
diff --git a/rust/arrow/src/compute/kernels/aggregate.rs b/rust/arrow/src/compute/kernels/aggregate.rs
index f4ba54ae284..34e49e33573 100644
--- a/rust/arrow/src/compute/kernels/aggregate.rs
+++ b/rust/arrow/src/compute/kernels/aggregate.rs
@@ -21,9 +21,10 @@ use std::ops::Add;
 
 use crate::array::{Array, GenericStringArray, PrimitiveArray, StringOffsetSizeTrait};
 use crate::datatypes::ArrowNumericType;
-use bitvec::prelude::BitField;
 use rayon::iter::IndexedParallelIterator;
 use rayon::prelude::*;
+
+use bitvec::prelude::BitField;
 use std::iter::Sum;
 
 /// Helper macro to perform min/max of strings

From 0eb35a021f666eb0d7f06aac76845fd402e07a81 Mon Sep 17 00:00:00 2001
From: Mahmut Bulut <vertexclique@gmail.com>
Date: Tue, 24 Nov 2020 22:41:58 +0100
Subject: [PATCH 21/22] ARROW-10588 - Horizontal sum operation

---
 rust/arrow/src/compute/kernels/aggregate.rs | 85 +++++++++------------
 rust/arrow/src/datatypes.rs                 | 70 ++++++++++-------
 2 files changed, 78 insertions(+), 77 deletions(-)

diff --git a/rust/arrow/src/compute/kernels/aggregate.rs b/rust/arrow/src/compute/kernels/aggregate.rs
index 34e49e33573..d04469f9160 100644
--- a/rust/arrow/src/compute/kernels/aggregate.rs
+++ b/rust/arrow/src/compute/kernels/aggregate.rs
@@ -17,7 +17,7 @@
 
 //! Defines aggregations over Arrow arrays.
 
-use std::ops::Add;
+use std::ops::{Add, Sub};
 
 use crate::array::{Array, GenericStringArray, PrimitiveArray, StringOffsetSizeTrait};
 use crate::datatypes::ArrowNumericType;
@@ -194,9 +194,12 @@ where
 ///
 /// Returns `None` if the array is empty or only contains null values.
 #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "simd"))]
-pub fn sum<T: ArrowNumericType>(array: &PrimitiveArray<T>) -> Option<T::Native>
+pub fn sum<T>(array: &PrimitiveArray<T>) -> Option<T::Native>
 where
-    T::Native: Add<Output = T::Native>,
+    T: ArrowNumericType,
+    // T::Native: Add<Output = T::Native> + Sum + Sum<T::Simd>,
+    T::Native: Add<Output = T::Native> + Sum,
+    T::Simd: Send + Sync + Sum,
 {
     let null_count = array.null_count();
 
@@ -206,74 +209,58 @@ where
 
     let data: &[T::Native] = array.value_slice(0, array.len());
 
-    let mut vector_sum = T::init(T::default_value());
-    let mut remainder_sum = T::default_value();
-
     match array.data().null_buffer() {
         None => {
-            let data_chunks = data.chunks_exact(64);
+            let data_chunks = data.par_chunks_exact(T::lanes());
             let remainder = data_chunks.remainder();
 
-            data_chunks.for_each(|chunk| {
-                chunk.chunks_exact(T::lanes()).for_each(|chunk| {
-                    let chunk = T::load(&chunk);
-                    vector_sum = vector_sum + chunk;
-                });
-            });
+            let mut agg_sum = T::horizontal_sum(data_chunks
+                .map(T::load)
+                .sum::<T::Simd>());
 
             remainder.iter().for_each(|value| {
-                remainder_sum = remainder_sum + *value;
+                agg_sum = agg_sum + *value;
             });
+
+            Some(agg_sum)
         }
         Some(buffer) => {
             // process data in chunks of 64 elements since we also get 64 bits of validity information at a time
-            let data_chunks = data.chunks_exact(64);
+            let data_chunks = data.par_chunks_exact(64);
             let remainder = data_chunks.remainder();
 
             let bit_slice = buffer.bit_slice().slicing(array.offset(), array.len());
-            let bit_chunks = bit_slice.chunks::<u64>();
+            let bit_chunks = bit_slice.par_chunks::<u64>();
             let remainder_bits = bit_chunks.remainder_bits();
 
-            data_chunks.zip(bit_chunks.into_native_iter()).for_each(
-                |(chunk, mut mask)| {
-                    // split chunks further into slices corresponding to the vector length
-                    // the compiler is able to unroll this inner loop and remove bounds checks
-                    // since the outer chunk size (64) is always a multiple of the number of lanes
-                    chunk.chunks_exact(T::lanes()).for_each(|chunk| {
-                        let zero = T::init(T::default_value());
-                        let vecmask = T::mask_from_u64(mask);
-                        let chunk = T::load(&chunk);
-                        let blended = T::mask_select(vecmask, chunk, zero);
-
-                        vector_sum = vector_sum + blended;
-
-                        mask = mask >> T::lanes();
-                    });
-                },
-            );
+            let agg_sum: T::Simd = data_chunks.zip(bit_chunks).map(
+                |(chunk, mask)| {
+                    chunk
+                        .par_chunks_exact(T::lanes())
+                        .fold(|| T::init(T::default_value()), |mut acc, chunk| {
+                            let zero = T::init(T::default_value());
+                            let vecmask = T::mask_from_u64(mask.load::<u64>());
+                            let chunk = T::load(&chunk);
+                            let blended = T::mask_select(vecmask, chunk, zero);
+                            acc = acc + blended;
+
+                            // mask = mask >> T::lanes();
+                            acc
+                        })
+                        .sum::<T::Simd>()
+                }).sum::<T::Simd>();
+
+            let mut agg_sum: T::Native = T::horizontal_sum(agg_sum);
 
             remainder.iter().enumerate().for_each(|(i, value)| {
                 if remainder_bits & (1 << i) != 0 {
-                    remainder_sum = remainder_sum + *value;
+                    agg_sum = agg_sum + *value;
                 }
             });
+
+            Some(agg_sum)
         }
     }
-
-    // calculate horizontal sum of accumulator by writing to a temporary
-    // this is probably faster than extracting individual lanes
-    // the compiler is free to optimize this to something faster
-    let tmp = &mut [T::default_value(); 64];
-    T::write(vector_sum, &mut tmp[0..T::lanes()]);
-
-    let mut total_sum = T::default_value();
-    tmp[0..T::lanes()]
-        .iter()
-        .for_each(|lane| total_sum = total_sum + *lane);
-
-    total_sum = total_sum + remainder_sum;
-
-    Some(total_sum)
 }
 
 #[cfg(test)]
diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs
index 4107b4b01de..f5b924c5be7 100644
--- a/rust/arrow/src/datatypes.rs
+++ b/rust/arrow/src/datatypes.rs
@@ -41,6 +41,7 @@ use serde_json::{
 
 use crate::error::{ArrowError, Result};
 use crate::util::bit_ops::BufferBitSlice;
+use std::iter::Sum;
 
 /// The set of datatypes that are supported by this implementation of Apache Arrow.
 ///
@@ -562,6 +563,9 @@ where
         op: F,
     ) -> Self::Simd;
 
+    /// Horizontal sum of SIMD vector elements
+    fn horizontal_sum(vector: Self::Simd) -> Self::Native;
+
     /// SIMD version of equal
     fn eq(left: Self::Simd, right: Self::Simd) -> Self::SimdMask;
 
@@ -591,7 +595,7 @@ where
 pub trait ArrowNumericType: ArrowPrimitiveType {}
 
 macro_rules! make_numeric_type {
-    ($impl_ty:ty, $native_ty:ty, $simd_ty:ident, $simd_mask_ty:ident) => {
+    ($impl_ty:ty, $native_ty:ty, $simd_ty:ident, $simd_mask_ty:ident, $lanes:expr) => {
         #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "simd"))]
         impl ArrowNumericType for $impl_ty {
             type Simd = $simd_ty;
@@ -741,6 +745,15 @@ macro_rules! make_numeric_type {
                 op(left, right)
             }
 
+            #[inline]
+            fn horizontal_sum(vector: Self::Simd) -> Self::Native {
+                let mut x = vector.extract(0) as Self::Native;
+                for i in 1..Self::Simd::lanes() {
+                    x += vector.extract(i) as Self::Native;
+                }
+                x
+            }
+
             #[inline]
             fn eq(left: Self::Simd, right: Self::Simd) -> Self::SimdMask {
                 left.eq(right)
@@ -776,6 +789,7 @@ macro_rules! make_numeric_type {
                 unsafe { simd_result.write_to_slice_unaligned_unchecked(slice) };
             }
         }
+
         #[cfg(any(
             not(any(target_arch = "x86", target_arch = "x86_64")),
             not(feature = "simd")
@@ -784,33 +798,33 @@ macro_rules! make_numeric_type {
     };
 }
 
-make_numeric_type!(Int8Type, i8, i8x64, m8x64);
-make_numeric_type!(Int16Type, i16, i16x32, m16x32);
-make_numeric_type!(Int32Type, i32, i32x16, m32x16);
-make_numeric_type!(Int64Type, i64, i64x8, m64x8);
-make_numeric_type!(UInt8Type, u8, u8x64, m8x64);
-make_numeric_type!(UInt16Type, u16, u16x32, m16x32);
-make_numeric_type!(UInt32Type, u32, u32x16, m32x16);
-make_numeric_type!(UInt64Type, u64, u64x8, m64x8);
-make_numeric_type!(Float32Type, f32, f32x16, m32x16);
-make_numeric_type!(Float64Type, f64, f64x8, m64x8);
-
-make_numeric_type!(TimestampSecondType, i64, i64x8, m64x8);
-make_numeric_type!(TimestampMillisecondType, i64, i64x8, m64x8);
-make_numeric_type!(TimestampMicrosecondType, i64, i64x8, m64x8);
-make_numeric_type!(TimestampNanosecondType, i64, i64x8, m64x8);
-make_numeric_type!(Date32Type, i32, i32x16, m32x16);
-make_numeric_type!(Date64Type, i64, i64x8, m64x8);
-make_numeric_type!(Time32SecondType, i32, i32x16, m32x16);
-make_numeric_type!(Time32MillisecondType, i32, i32x16, m32x16);
-make_numeric_type!(Time64MicrosecondType, i64, i64x8, m64x8);
-make_numeric_type!(Time64NanosecondType, i64, i64x8, m64x8);
-make_numeric_type!(IntervalYearMonthType, i32, i32x16, m32x16);
-make_numeric_type!(IntervalDayTimeType, i64, i64x8, m64x8);
-make_numeric_type!(DurationSecondType, i64, i64x8, m64x8);
-make_numeric_type!(DurationMillisecondType, i64, i64x8, m64x8);
-make_numeric_type!(DurationMicrosecondType, i64, i64x8, m64x8);
-make_numeric_type!(DurationNanosecondType, i64, i64x8, m64x8);
+make_numeric_type!(Int8Type, i8, i8x64, m8x64, 64);
+make_numeric_type!(Int16Type, i16, i16x32, m16x32, 32);
+make_numeric_type!(Int32Type, i32, i32x16, m32x16, 16);
+make_numeric_type!(Int64Type, i64, i64x8, m64x8, 8);
+make_numeric_type!(UInt8Type, u8, u8x64, m8x64, 64);
+make_numeric_type!(UInt16Type, u16, u16x32, m16x32, 32);
+make_numeric_type!(UInt32Type, u32, u32x16, m32x16, 16);
+make_numeric_type!(UInt64Type, u64, u64x8, m64x8, 8);
+make_numeric_type!(Float32Type, f32, f32x16, m32x16, 16);
+make_numeric_type!(Float64Type, f64, f64x8, m64x8, 8);
+
+make_numeric_type!(TimestampSecondType, i64, i64x8, m64x8, 8);
+make_numeric_type!(TimestampMillisecondType, i64, i64x8, m64x8, 8);
+make_numeric_type!(TimestampMicrosecondType, i64, i64x8, m64x8, 8);
+make_numeric_type!(TimestampNanosecondType, i64, i64x8, m64x8, 8);
+make_numeric_type!(Date32Type, i32, i32x16, m32x16, 16);
+make_numeric_type!(Date64Type, i64, i64x8, m64x8, 8);
+make_numeric_type!(Time32SecondType, i32, i32x16, m32x16, 16);
+make_numeric_type!(Time32MillisecondType, i32, i32x16, m32x16, 16);
+make_numeric_type!(Time64MicrosecondType, i64, i64x8, m64x8, 8);
+make_numeric_type!(Time64NanosecondType, i64, i64x8, m64x8, 8);
+make_numeric_type!(IntervalYearMonthType, i32, i32x16, m32x16, 16);
+make_numeric_type!(IntervalDayTimeType, i64, i64x8, m64x8, 8);
+make_numeric_type!(DurationSecondType, i64, i64x8, m64x8, 8);
+make_numeric_type!(DurationMillisecondType, i64, i64x8, m64x8, 8);
+make_numeric_type!(DurationMicrosecondType, i64, i64x8, m64x8, 8);
+make_numeric_type!(DurationNanosecondType, i64, i64x8, m64x8, 8);
 
 /// A subtype of primitive type that represents temporal values.
 pub trait ArrowTemporalType: ArrowPrimitiveType {}

From 92c070ece41634e6a74751404b88dd7e771a61ed Mon Sep 17 00:00:00 2001
From: Mahmut Bulut <vertexclique@gmail.com>
Date: Tue, 24 Nov 2020 22:46:42 +0100
Subject: [PATCH 22/22] ARROW-10588 - Simplify and remove unnecessary
 operations

---
 rust/arrow/src/buffer.rs                    | 10 +++++----
 rust/arrow/src/compute/kernels/aggregate.rs | 23 +++++++++------------
 rust/arrow/src/datatypes.rs                 |  1 -
 3 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/rust/arrow/src/buffer.rs b/rust/arrow/src/buffer.rs
index bc3edf377cb..e6afb331b33 100644
--- a/rust/arrow/src/buffer.rs
+++ b/rust/arrow/src/buffer.rs
@@ -365,7 +365,11 @@ where
 
     res_remainder
         .into_par_iter()
-        .zip(left_remainder.into_par_iter().zip(right_remainder.into_par_iter()))
+        .zip(
+            left_remainder
+                .into_par_iter()
+                .zip(right_remainder.into_par_iter()),
+        )
         .for_each(|(res, (left, right))| {
             *res = scalar_op(*left, *right);
         });
@@ -403,9 +407,7 @@ where
         .map(|e| unsafe { u8x64::from_slice_unaligned_unchecked(e) })
         .map(simd_op)
         .zip(result_chunks)
-        .for_each(|(e, res)| unsafe {
-            e.write_to_slice_unaligned_unchecked(res)
-        });
+        .for_each(|(e, res)| unsafe { e.write_to_slice_unaligned_unchecked(res) });
 
     res_remainder
         .into_par_iter()
diff --git a/rust/arrow/src/compute/kernels/aggregate.rs b/rust/arrow/src/compute/kernels/aggregate.rs
index d04469f9160..61e18076cef 100644
--- a/rust/arrow/src/compute/kernels/aggregate.rs
+++ b/rust/arrow/src/compute/kernels/aggregate.rs
@@ -17,7 +17,7 @@
 
 //! Defines aggregations over Arrow arrays.
 
-use std::ops::{Add, Sub};
+use std::ops::Add;
 
 use crate::array::{Array, GenericStringArray, PrimitiveArray, StringOffsetSizeTrait};
 use crate::datatypes::ArrowNumericType;
@@ -214,9 +214,8 @@ where
             let data_chunks = data.par_chunks_exact(T::lanes());
             let remainder = data_chunks.remainder();
 
-            let mut agg_sum = T::horizontal_sum(data_chunks
-                .map(T::load)
-                .sum::<T::Simd>());
+            let mut agg_sum =
+                T::horizontal_sum(data_chunks.map(T::load).sum::<T::Simd>());
 
             remainder.iter().for_each(|value| {
                 agg_sum = agg_sum + *value;
@@ -233,22 +232,20 @@ where
             let bit_chunks = bit_slice.par_chunks::<u64>();
             let remainder_bits = bit_chunks.remainder_bits();
 
-            let agg_sum: T::Simd = data_chunks.zip(bit_chunks).map(
-                |(chunk, mask)| {
+            let agg_sum: T::Simd = data_chunks
+                .zip(bit_chunks)
+                .map(|(chunk, mask)| {
                     chunk
                         .par_chunks_exact(T::lanes())
-                        .fold(|| T::init(T::default_value()), |mut acc, chunk| {
+                        .map(|chunk| {
                             let zero = T::init(T::default_value());
                             let vecmask = T::mask_from_u64(mask.load::<u64>());
                             let chunk = T::load(&chunk);
-                            let blended = T::mask_select(vecmask, chunk, zero);
-                            acc = acc + blended;
-
-                            // mask = mask >> T::lanes();
-                            acc
+                            T::mask_select(vecmask, chunk, zero)
                         })
                         .sum::<T::Simd>()
-                }).sum::<T::Simd>();
+                })
+                .sum::<T::Simd>();
 
             let mut agg_sum: T::Native = T::horizontal_sum(agg_sum);
 
diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs
index f5b924c5be7..e3e5a4dc247 100644
--- a/rust/arrow/src/datatypes.rs
+++ b/rust/arrow/src/datatypes.rs
@@ -41,7 +41,6 @@ use serde_json::{
 
 use crate::error::{ArrowError, Result};
 use crate::util::bit_ops::BufferBitSlice;
-use std::iter::Sum;
 
 /// The set of datatypes that are supported by this implementation of Apache Arrow.
 ///