From e5656e640fe6796bcd753e49d561d50fe66d3167 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Thu, 29 Apr 2021 18:23:56 +0200 Subject: [PATCH 01/12] Proof of concept --- cpp/src/arrow/python/numpy_to_arrow.cc | 6 +++++- python/pyarrow/tests/test_array.py | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index c17e70823d5..5b07bcfe781 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -594,7 +594,11 @@ Status NumPyConverter::Visit(const FixedSizeBinaryType& type) { if (mask_ != nullptr) { Ndarray1DIndexer mask_values(mask_); - RETURN_NOT_OK(builder.AppendValues(data, length_, mask_values.data())); + std::unique_ptr inverted_mask(new uint8_t[length_]); + for (int64_t i = 0; i < length_; ++i) { + inverted_mask[i] = !mask_values[i]; + } + RETURN_NOT_OK(builder.AppendValues(data, length_, inverted_mask.get())); } else { RETURN_NOT_OK(builder.AppendValues(data, length_)); } diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 086ed4cb160..ac0f430299d 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2714,6 +2714,21 @@ def test_array_masked(): assert arr.type == pa.int64() +def test_binary_array_masked(): + # ARROW-12431 + masked_basic = pa.array([b'\x05'], type=pa.binary(1), + mask=np.array([False])) + assert pa.array([b'\x05']).to_pylist() == masked_basic.to_pylist() + + masked = pa.array(np.array([b'\x05']), type=pa.binary(1), + mask=np.array([False])) + assert pa.array([b'\x05']).to_pylist() == masked.to_pylist() + + masked_nulls = pa.array(np.array([b'\x05']), type=pa.binary(1), + mask=np.array([True])) + assert pa.array([None]).to_pylist() == masked_nulls.to_pylist() + + def test_array_invalid_mask_raises(): # ARROW-10742 cases = [ From 6e5d9527cb1eb07827ff4c94fbe8ccea23718294 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Thu, 29 Apr 2021 18:34:45 +0200 Subject: [PATCH 02/12] Add comment --- cpp/src/arrow/python/numpy_to_arrow.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 5b07bcfe781..652bc8fbd81 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -598,6 +598,8 @@ Status NumPyConverter::Visit(const FixedSizeBinaryType& type) { for (int64_t i = 0; i < length_; ++i) { inverted_mask[i] = !mask_values[i]; } + // AppendValues wants the mask flipped from what we got, + // so we need to provide it the inverted_mask, not the original one. RETURN_NOT_OK(builder.AppendValues(data, length_, inverted_mask.get())); } else { RETURN_NOT_OK(builder.AppendValues(data, length_)); From f8ad0d5e61010f828515d0529138c88eb8e7282f Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Fri, 30 Apr 2021 12:26:25 +0200 Subject: [PATCH 03/12] More complete test to confirm VarLength binaries don't face the issue --- python/pyarrow/tests/test_array.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index ac0f430299d..e9ae6f4c7be 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2720,6 +2720,7 @@ def test_binary_array_masked(): mask=np.array([False])) assert pa.array([b'\x05']).to_pylist() == masked_basic.to_pylist() + # Fixed Length Binary masked = pa.array(np.array([b'\x05']), type=pa.binary(1), mask=np.array([False])) assert pa.array([b'\x05']).to_pylist() == masked.to_pylist() @@ -2728,6 +2729,15 @@ def test_binary_array_masked(): mask=np.array([True])) assert pa.array([None]).to_pylist() == masked_nulls.to_pylist() + # Variable Length Binary + masked = pa.array(np.array([b'\x05']), type=pa.binary(-1), + mask=np.array([False])) + assert pa.array([b'\x05']).to_pylist() == masked.to_pylist() + + masked_nulls = pa.array(np.array([b'\x05']), type=pa.binary(-1), + mask=np.array([True])) + assert pa.array([None]).to_pylist() == masked_nulls.to_pylist() + def test_array_invalid_mask_raises(): # ARROW-10742 From b2f936c454cab20b32d63f99156ce9579c2df58f Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Fri, 30 Apr 2021 12:29:59 +0200 Subject: [PATCH 04/12] linter --- python/pyarrow/tests/test_array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index e9ae6f4c7be..2bae2661087 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2717,7 +2717,7 @@ def test_array_masked(): def test_binary_array_masked(): # ARROW-12431 masked_basic = pa.array([b'\x05'], type=pa.binary(1), - mask=np.array([False])) + mask=np.array([False])) assert pa.array([b'\x05']).to_pylist() == masked_basic.to_pylist() # Fixed Length Binary From 7777a179e6d557060f677e5139008b4dd839558e Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Fri, 30 Apr 2021 12:40:11 +0200 Subject: [PATCH 05/12] lint cpp --- cpp/src/arrow/python/numpy_to_arrow.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 652bc8fbd81..203a6eeec06 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -596,7 +596,7 @@ Status NumPyConverter::Visit(const FixedSizeBinaryType& type) { Ndarray1DIndexer mask_values(mask_); std::unique_ptr inverted_mask(new uint8_t[length_]); for (int64_t i = 0; i < length_; ++i) { - inverted_mask[i] = !mask_values[i]; + inverted_mask[i] = !mask_values[i]; } // AppendValues wants the mask flipped from what we got, // so we need to provide it the inverted_mask, not the original one. From 998ac4e9239fb6f25bfc066c77f86c7c214832e7 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Fri, 30 Apr 2021 14:28:06 +0200 Subject: [PATCH 06/12] Adapt tests --- python/pyarrow/tests/test_array.py | 10 +++++----- python/pyarrow/tests/test_pandas.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 2bae2661087..f4151f4098f 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2718,25 +2718,25 @@ def test_binary_array_masked(): # ARROW-12431 masked_basic = pa.array([b'\x05'], type=pa.binary(1), mask=np.array([False])) - assert pa.array([b'\x05']).to_pylist() == masked_basic.to_pylist() + assert [b'\x05'] == masked_basic.to_pylist() # Fixed Length Binary masked = pa.array(np.array([b'\x05']), type=pa.binary(1), mask=np.array([False])) - assert pa.array([b'\x05']).to_pylist() == masked.to_pylist() + assert [b'\x05'] == masked.to_pylist() masked_nulls = pa.array(np.array([b'\x05']), type=pa.binary(1), mask=np.array([True])) - assert pa.array([None]).to_pylist() == masked_nulls.to_pylist() + assert [None] == masked_nulls.to_pylist() # Variable Length Binary masked = pa.array(np.array([b'\x05']), type=pa.binary(-1), mask=np.array([False])) - assert pa.array([b'\x05']).to_pylist() == masked.to_pylist() + assert [b'\x05'] == masked.to_pylist() masked_nulls = pa.array(np.array([b'\x05']), type=pa.binary(-1), mask=np.array([True])) - assert pa.array([None]).to_pylist() == masked_nulls.to_pylist() + assert [None] == masked_nulls.to_pylist() def test_array_invalid_mask_raises(): diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 77c18b839c6..7f904433fa2 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -1705,7 +1705,7 @@ def test_numpy_string_array_to_fixed_size_binary(self): expected = pa.array(list(arr), type=pa.binary(3)) assert converted.equals(expected) - mask = np.array([True, False, True]) + mask = np.array([False, True, False]) converted = pa.array(arr, type=pa.binary(3), mask=mask) expected = pa.array([b'foo', None, b'baz'], type=pa.binary(3)) assert converted.equals(expected) From 0d7672cf3a795ee65639735ce712b473b38c6ff1 Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Mon, 3 May 2021 16:12:37 +0200 Subject: [PATCH 07/12] do not pass -1 for varlen binaries --- python/pyarrow/tests/test_array.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index f4151f4098f..075c2aa86fc 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2730,11 +2730,11 @@ def test_binary_array_masked(): assert [None] == masked_nulls.to_pylist() # Variable Length Binary - masked = pa.array(np.array([b'\x05']), type=pa.binary(-1), + masked = pa.array(np.array([b'\x05']), type=pa.binary(), mask=np.array([False])) assert [b'\x05'] == masked.to_pylist() - masked_nulls = pa.array(np.array([b'\x05']), type=pa.binary(-1), + masked_nulls = pa.array(np.array([b'\x05']), type=pa.binary(), mask=np.array([True])) assert [None] == masked_nulls.to_pylist() From e06be2e31359028ef35fd7a8d6e678277366a8ed Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Tue, 4 May 2021 11:02:42 +0200 Subject: [PATCH 08/12] Iterate over data, not mask --- cpp/src/arrow/python/numpy_to_arrow.cc | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 203a6eeec06..d9f275a67aa 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -594,13 +594,15 @@ Status NumPyConverter::Visit(const FixedSizeBinaryType& type) { if (mask_ != nullptr) { Ndarray1DIndexer mask_values(mask_); - std::unique_ptr inverted_mask(new uint8_t[length_]); + RETURN_NOT_OK(builder.Reserve(length_)); for (int64_t i = 0; i < length_; ++i) { - inverted_mask[i] = !mask_values[i]; + if (mask_values[i]) { + RETURN_NOT_OK(builder.AppendNull()); + } else { + RETURN_NOT_OK(builder.Append(data)); + } + data += stride_; } - // AppendValues wants the mask flipped from what we got, - // so we need to provide it the inverted_mask, not the original one. - RETURN_NOT_OK(builder.AppendValues(data, length_, inverted_mask.get())); } else { RETURN_NOT_OK(builder.AppendValues(data, length_)); } From 85993b94765dd99c54b546248de3d2ff2e594e8a Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Wed, 5 May 2021 12:47:31 +0200 Subject: [PATCH 09/12] Faster and simplified implementation --- cpp/src/arrow/python/numpy_to_arrow.cc | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index d9f275a67aa..6b958bcf51c 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -589,27 +589,16 @@ Status NumPyConverter::Visit(const FixedSizeBinaryType& type) { byte_width, ")"); } - FixedSizeBinaryBuilder builder(::arrow::fixed_size_binary(byte_width), pool_); - auto data = reinterpret_cast(PyArray_DATA(arr_)); - + RETURN_NOT_OK(InitNullBitmap()); if (mask_ != nullptr) { - Ndarray1DIndexer mask_values(mask_); - RETURN_NOT_OK(builder.Reserve(length_)); - for (int64_t i = 0; i < length_; ++i) { - if (mask_values[i]) { - RETURN_NOT_OK(builder.AppendNull()); - } else { - RETURN_NOT_OK(builder.Append(data)); - } - data += stride_; - } - } else { - RETURN_NOT_OK(builder.AppendValues(data, length_)); + null_count_ = MaskToBitmap(mask_, length_, null_bitmap_data_); } - std::shared_ptr result; - RETURN_NOT_OK(builder.Finish(&result)); - return PushArray(result->data()); + std::shared_ptr data; + RETURN_NOT_OK(PrepareInputData(&data)); + + auto arr_data = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_, 0); + return PushArray(arr_data); } namespace { From 8d6f1bc5f886b139a3ea60ad85a8b81e6d20161d Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Thu, 6 May 2021 12:04:22 +0200 Subject: [PATCH 10/12] Revert "Faster and simplified implementation" This reverts commit b3e0a37049dffdcac63b5ce446907be60523e651. --- cpp/src/arrow/python/numpy_to_arrow.cc | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 6b958bcf51c..d9f275a67aa 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -589,16 +589,27 @@ Status NumPyConverter::Visit(const FixedSizeBinaryType& type) { byte_width, ")"); } - RETURN_NOT_OK(InitNullBitmap()); + FixedSizeBinaryBuilder builder(::arrow::fixed_size_binary(byte_width), pool_); + auto data = reinterpret_cast(PyArray_DATA(arr_)); + if (mask_ != nullptr) { - null_count_ = MaskToBitmap(mask_, length_, null_bitmap_data_); + Ndarray1DIndexer mask_values(mask_); + RETURN_NOT_OK(builder.Reserve(length_)); + for (int64_t i = 0; i < length_; ++i) { + if (mask_values[i]) { + RETURN_NOT_OK(builder.AppendNull()); + } else { + RETURN_NOT_OK(builder.Append(data)); + } + data += stride_; + } + } else { + RETURN_NOT_OK(builder.AppendValues(data, length_)); } - std::shared_ptr data; - RETURN_NOT_OK(PrepareInputData(&data)); - - auto arr_data = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_, 0); - return PushArray(arr_data); + std::shared_ptr result; + RETURN_NOT_OK(builder.Finish(&result)); + return PushArray(result->data()); } namespace { From 7cf6cd10908e005a0973f85f32026cd31327440d Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Thu, 6 May 2021 12:07:33 +0200 Subject: [PATCH 11/12] modify array behaviour --- python/pyarrow/tests/test_array.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 075c2aa86fc..351220bf538 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2738,6 +2738,13 @@ def test_binary_array_masked(): mask=np.array([True])) assert [None] == masked_nulls.to_pylist() + # Fixed Length Binary, copy + npa = np.array([b'aaa', b'bbb', b'ccc']*10) + arrow_array = pa.array(npa, type=pa.binary(3), + mask=np.array([False, False, False]*10)) + npa[npa == b"bbb"] = b"XXX" + assert ([b'aaa', b'bbb', b'ccc']*10) == arrow_array.to_pylist() + def test_array_invalid_mask_raises(): # ARROW-10742 From d0602c73749fab046e4375b5ef2c2b7f19699e0c Mon Sep 17 00:00:00 2001 From: Alessandro Molina Date: Fri, 28 May 2021 08:46:27 +0200 Subject: [PATCH 12/12] Fix conversion of strided binary arrays --- cpp/src/arrow/python/numpy_to_arrow.cc | 5 ++++- python/pyarrow/tests/test_array.py | 13 +++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index d9f275a67aa..a382f766333 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -604,7 +604,10 @@ Status NumPyConverter::Visit(const FixedSizeBinaryType& type) { data += stride_; } } else { - RETURN_NOT_OK(builder.AppendValues(data, length_)); + for (int64_t i = 0; i < length_; ++i) { + RETURN_NOT_OK(builder.Append(data)); + data += stride_; + } } std::shared_ptr result; diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 351220bf538..30500bc3c5b 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2746,6 +2746,19 @@ def test_binary_array_masked(): assert ([b'aaa', b'bbb', b'ccc']*10) == arrow_array.to_pylist() +def test_binary_array_strided(): + # Masked + nparray = np.array([b"ab", b"cd", b"ef"]) + arrow_array = pa.array(nparray[::2], pa.binary(2), + mask=np.array([False, False])) + assert [b"ab", b"ef"] == arrow_array.to_pylist() + + # Unmasked + nparray = np.array([b"ab", b"cd", b"ef"]) + arrow_array = pa.array(nparray[::2], pa.binary(2)) + assert [b"ab", b"ef"] == arrow_array.to_pylist() + + def test_array_invalid_mask_raises(): # ARROW-10742 cases = [