From 1dc641adf2dfb7b0fc66c3973d483f97ec43ec56 Mon Sep 17 00:00:00 2001 From: feik Date: Sun, 16 Jun 2024 11:48:13 -0400 Subject: [PATCH 01/25] Update core.py Added store_decimal_as_integer --- python/pyarrow/parquet/core.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index eaff79c8b13..da6b77501e2 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -873,6 +873,21 @@ def _sanitize_table(table, new_schema, flavor): Specify the sort order of the data being written. The writer does not sort the data nor does it verify that the data is sorted. The sort order is written to the row group metadata, which can then be used by readers. +store_decimal_as_integer : bool or list, default False + Allow decimals with 1 <= precision <= 18 to be stored as integers. + In Parquet, DECIMAL can be stored in any of the following physical types: + - int32: for 1 <= precision <= 9. + - int64: for 10 <= precision <= 18. + - fixed_len_byte_array: precision is limited by the array size. Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits. + - binary: precision is unlimited. The minimum number of bytes to store the unscaled value is used. + + By default, this is DISABLED and all decimal types annotate fixed_len_byte_array. + When enabled, the writer will use following physical types to store decimals: + - int32: for 1 <= precision <= 9. + - int64: for 10 <= precision <= 18. + - fixed_len_byte_array: for precision > 18. + + As a consequence, decimal columns stored in integer types are more compact. """ _parquet_writer_example_doc = """\ @@ -968,6 +983,7 @@ def __init__(self, where, schema, filesystem=None, write_page_index=False, write_page_checksum=False, sorting_columns=None, + store_decimal_as_integer=False, **options): if use_deprecated_int96_timestamps is None: # Use int96 timestamps for Spark @@ -1020,6 +1036,7 @@ def __init__(self, where, schema, filesystem=None, write_page_index=write_page_index, write_page_checksum=write_page_checksum, sorting_columns=sorting_columns, + store_decimal_as_integer=store_decimal_as_integer, **options) self.is_open = True @@ -1873,6 +1890,7 @@ def write_table(table, where, row_group_size=None, version='2.6', write_page_index=False, write_page_checksum=False, sorting_columns=None, + store_decimal_as_integer=False, **kwargs): # Implementor's note: when adding keywords here / updating defaults, also # update it in write_to_dataset and _dataset_parquet.pyx ParquetFileWriteOptions @@ -1903,6 +1921,7 @@ def write_table(table, where, row_group_size=None, version='2.6', write_page_index=write_page_index, write_page_checksum=write_page_checksum, sorting_columns=sorting_columns, + store_decimal_as_integer=store_decimal_as_integer, **kwargs) as writer: writer.write_table(table, row_group_size=row_group_size) except Exception: From 1acb8f6128a01788acd101ad4be8f9b8d30439f1 Mon Sep 17 00:00:00 2001 From: feik Date: Sun, 16 Jun 2024 11:50:25 -0400 Subject: [PATCH 02/25] Update _parquet.pxd Added store_decimal_as_integer --- python/pyarrow/_parquet.pxd | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 1bfa505c544..e1be6e79f70 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -594,6 +594,7 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( write_page_index=*, write_page_checksum=*, sorting_columns=*, + store_decimal_as_integer=*, ) except * From 26515f63c82e5069088be0d21a14461c058d1cc0 Mon Sep 17 00:00:00 2001 From: feik Date: Sun, 16 Jun 2024 11:52:27 -0400 Subject: [PATCH 03/25] Update _dataset_parquet.pyx Added store_decimal_as_integer --- python/pyarrow/_dataset_parquet.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index 4942336a126..b243aa91bee 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -613,6 +613,9 @@ cdef class ParquetFileWriteOptions(FileWriteOptions): write_page_index=self._properties["write_page_index"], write_page_checksum=self._properties["write_page_checksum"], sorting_columns=self._properties["sorting_columns"], + store_decimal_as_integer=( + self._properties["store_decimal_as_integer"] + ), ) def _set_arrow_properties(self): @@ -664,6 +667,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions): encryption_config=None, write_page_checksum=False, sorting_columns=None, + store_decimal_as_integer=False, ) self._set_properties() From 84895d170bd1f8b76ef700e69fcd324b85083fdf Mon Sep 17 00:00:00 2001 From: feik Date: Sun, 16 Jun 2024 12:03:20 -0400 Subject: [PATCH 04/25] Update _parquet.pyx store_decimal_as_integer WIP --- python/pyarrow/_parquet.pyx | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 414f0cef4e5..189a736438d 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1830,7 +1830,9 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( dictionary_pagesize_limit=None, write_page_index=False, write_page_checksum=False, - sorting_columns=None) except *: + sorting_columns=None, + store_decimal_as_integer=False) except *: + """General writer properties""" cdef: shared_ptr[WriterProperties] properties @@ -1941,6 +1943,22 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( "'use_byte_stream_split' cannot be passed" "together with 'column_encoding'") + # store_decimal_as_integer + + if isinstance(store_decimal_as_integer, bool): + if store_decimal_as_integer: + props.encoding(ParquetEncoding_BYTE_STREAM_SPLIT) + elif store_decimal_as_integer is not None: + for column in store_decimal_as_integer: + if column_encoding is None: + column_encoding = {column: 'BYTE_STREAM_SPLIT'} + elif column_encoding.get(column, None) is None: + column_encoding[column] = 'BYTE_STREAM_SPLIT' + else: + raise ValueError( + "'store_decimal_as_integer' cannot be passed" + "together with 'column_encoding'") + # column_encoding # encoding map - encode individual columns @@ -2114,6 +2132,7 @@ cdef class ParquetWriter(_Weakrefable): int64_t write_batch_size int64_t dictionary_pagesize_limit object store_schema + object store_decimal_as_integer def __cinit__(self, where, Schema schema not None, use_dictionary=None, compression=None, version=None, @@ -2135,7 +2154,8 @@ cdef class ParquetWriter(_Weakrefable): store_schema=True, write_page_index=False, write_page_checksum=False, - sorting_columns=None): + sorting_columns=None, + store_decimal_as_integer=False): cdef: shared_ptr[WriterProperties] properties shared_ptr[ArrowWriterProperties] arrow_properties @@ -2169,6 +2189,7 @@ cdef class ParquetWriter(_Weakrefable): write_page_index=write_page_index, write_page_checksum=write_page_checksum, sorting_columns=sorting_columns, + store_decimal_as_integer=store_decimal_as_integer, ) arrow_properties = _create_arrow_writer_properties( use_deprecated_int96_timestamps=use_deprecated_int96_timestamps, From 68a0ed34cfdbcc403a3d6afad63f3364d6a5bb5a Mon Sep 17 00:00:00 2001 From: feik Date: Sun, 16 Jun 2024 13:39:16 -0400 Subject: [PATCH 05/25] Update _parquet.pyx store_decimal_as_integer completed --- python/pyarrow/_parquet.pyx | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 189a736438d..875616d82a0 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1947,17 +1947,9 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( if isinstance(store_decimal_as_integer, bool): if store_decimal_as_integer: - props.encoding(ParquetEncoding_BYTE_STREAM_SPLIT) - elif store_decimal_as_integer is not None: - for column in store_decimal_as_integer: - if column_encoding is None: - column_encoding = {column: 'BYTE_STREAM_SPLIT'} - elif column_encoding.get(column, None) is None: - column_encoding[column] = 'BYTE_STREAM_SPLIT' - else: - raise ValueError( - "'store_decimal_as_integer' cannot be passed" - "together with 'column_encoding'") + props.enable_store_decimal_as_integer() + else: + props.disable_store_decimal_as_integer() # column_encoding # encoding map - encode individual columns From a14d341df1b1096d18dc23266b8bf02a48e49b11 Mon Sep 17 00:00:00 2001 From: feik Date: Sun, 16 Jun 2024 13:41:41 -0400 Subject: [PATCH 06/25] Update core.py Removed list option from store_decimal_as_integer --- python/pyarrow/parquet/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index da6b77501e2..04f46a1876e 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -873,7 +873,7 @@ def _sanitize_table(table, new_schema, flavor): Specify the sort order of the data being written. The writer does not sort the data nor does it verify that the data is sorted. The sort order is written to the row group metadata, which can then be used by readers. -store_decimal_as_integer : bool or list, default False +store_decimal_as_integer : bool, default False Allow decimals with 1 <= precision <= 18 to be stored as integers. In Parquet, DECIMAL can be stored in any of the following physical types: - int32: for 1 <= precision <= 9. From 27305f76026d21fc775b90c4425067b9fa2b4df0 Mon Sep 17 00:00:00 2001 From: feik Date: Sun, 16 Jun 2024 13:53:21 -0400 Subject: [PATCH 07/25] Update test_basic.py WIP --- python/pyarrow/tests/parquet/test_basic.py | 38 ++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 56b967a0595..d2cbc7d7e05 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -357,6 +357,44 @@ def test_byte_stream_split(): use_dictionary=False) +def test_store_decimal_as_integer(): + # This is only a smoke test. + arr_float = pa.array(list(map(float, range(100)))) + arr_int = pa.array(list(map(int, range(100)))) + arr_bool = pa.array([True, False] * 50) + data_float = [arr_float, arr_float] + table = pa.Table.from_arrays(data_float, names=['a', 'b']) + + # Check with byte_stream_split for both columns. + _check_roundtrip(table, expected=table, compression="gzip", + use_dictionary=False, use_byte_stream_split=True) + + # Check with byte_stream_split for column 'b' and dictionary + # for column 'a'. + _check_roundtrip(table, expected=table, compression="gzip", + use_dictionary=['a'], + use_byte_stream_split=['b']) + + # Check with a collision for both columns. + _check_roundtrip(table, expected=table, compression="gzip", + use_dictionary=['a', 'b'], + use_byte_stream_split=['a', 'b']) + + # Check with mixed column types. + mixed_table = pa.Table.from_arrays([arr_float, arr_float, arr_int, arr_int], + names=['a', 'b', 'c', 'd']) + _check_roundtrip(mixed_table, expected=mixed_table, + use_dictionary=['b', 'd'], + use_byte_stream_split=['a', 'c']) + + # Try to use the wrong data type with the byte_stream_split encoding. + # This should throw an exception. + table = pa.Table.from_arrays([arr_bool], names=['tmp']) + with pytest.raises(IOError, match='BYTE_STREAM_SPLIT only supports'): + _check_roundtrip(table, expected=table, use_byte_stream_split=True, + use_dictionary=False) + + def test_column_encoding(): arr_float = pa.array(list(map(float, range(100)))) arr_int = pa.array(list(map(int, range(100)))) From d9ff600ea4d3db7cfe5bde557b1a0981d5b4347f Mon Sep 17 00:00:00 2001 From: feik Date: Sun, 16 Jun 2024 20:08:17 -0400 Subject: [PATCH 08/25] Update test_basic.py WIP --- python/pyarrow/tests/parquet/test_basic.py | 45 +++++++++------------- 1 file changed, 18 insertions(+), 27 deletions(-) diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index d2cbc7d7e05..1feb59f38b1 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -19,6 +19,7 @@ import io import warnings from shutil import copytree +import decimal import numpy as np import pytest @@ -358,41 +359,31 @@ def test_byte_stream_split(): def test_store_decimal_as_integer(): - # This is only a smoke test. - arr_float = pa.array(list(map(float, range(100)))) - arr_int = pa.array(list(map(int, range(100)))) + arr_decimal_1_9 = pa.array(list(map(float, range(100))), type=pa.decimal128(5,2)) + arr_decimal_10_18 = pa.array(list(map(float, range(100))), type=pa.decimal128(16,9)) + arr_decimal_gt18 = pa.array(list(map(float, range(100))), type=pa.decimal128(22,2)) arr_bool = pa.array([True, False] * 50) - data_float = [arr_float, arr_float] - table = pa.Table.from_arrays(data_float, names=['a', 'b']) + data_decimal = [arr_decimal_1_9, arr_decimal_10_18, arr_decimal_gt18] + table = pa.Table.from_arrays(data_decimal, names=['a', 'b', 'c']) - # Check with byte_stream_split for both columns. + # Check with store_decimal_as_integer. _check_roundtrip(table, expected=table, compression="gzip", - use_dictionary=False, use_byte_stream_split=True) + use_dictionary=False, store_decimal_as_integer=True) - # Check with byte_stream_split for column 'b' and dictionary - # for column 'a'. + # Check with store_decimal_as_integer and delta-int encoding. _check_roundtrip(table, expected=table, compression="gzip", - use_dictionary=['a'], - use_byte_stream_split=['b']) - - # Check with a collision for both columns. - _check_roundtrip(table, expected=table, compression="gzip", - use_dictionary=['a', 'b'], - use_byte_stream_split=['a', 'b']) - + use_dictionary=False, + store_decimal_as_integer=True, + column_encoding="DELTA_BINARY_PACKED") + # Check with mixed column types. - mixed_table = pa.Table.from_arrays([arr_float, arr_float, arr_int, arr_int], + mixed_table = pa.Table.from_arrays([arr_decimal_1_9, arr_decimal_10_18, + arr_decimal_gt18, aar_bool], names=['a', 'b', 'c', 'd']) - _check_roundtrip(mixed_table, expected=mixed_table, - use_dictionary=['b', 'd'], - use_byte_stream_split=['a', 'c']) + _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, + store_decimal_as_integer=True) - # Try to use the wrong data type with the byte_stream_split encoding. - # This should throw an exception. - table = pa.Table.from_arrays([arr_bool], names=['tmp']) - with pytest.raises(IOError, match='BYTE_STREAM_SPLIT only supports'): - _check_roundtrip(table, expected=table, use_byte_stream_split=True, - use_dictionary=False) + def test_column_encoding(): From d9c686422e7630c1021f4e62bdffa8ee6ff85208 Mon Sep 17 00:00:00 2001 From: feik Date: Sun, 16 Jun 2024 20:14:12 -0400 Subject: [PATCH 09/25] Update test_basic.py --- python/pyarrow/tests/parquet/test_basic.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 1feb59f38b1..289dd34027b 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -359,9 +359,15 @@ def test_byte_stream_split(): def test_store_decimal_as_integer(): - arr_decimal_1_9 = pa.array(list(map(float, range(100))), type=pa.decimal128(5,2)) - arr_decimal_10_18 = pa.array(list(map(float, range(100))), type=pa.decimal128(16,9)) - arr_decimal_gt18 = pa.array(list(map(float, range(100))), type=pa.decimal128(22,2)) + arr_decimal_1_9 = pa.array(list(map(float, range(100))), + type=pa.decimal128(5,2) + ) + arr_decimal_10_18 = pa.array(list(map(float, range(100))), + type=pa.decimal128(16,9) + ) + arr_decimal_gt18 = pa.array(list(map(float, range(100))), + type=pa.decimal128(22,2) + ) arr_bool = pa.array([True, False] * 50) data_decimal = [arr_decimal_1_9, arr_decimal_10_18, arr_decimal_gt18] table = pa.Table.from_arrays(data_decimal, names=['a', 'b', 'c']) @@ -374,7 +380,9 @@ def test_store_decimal_as_integer(): _check_roundtrip(table, expected=table, compression="gzip", use_dictionary=False, store_decimal_as_integer=True, - column_encoding="DELTA_BINARY_PACKED") + column_encoding={'a': 'DELTA_BINARY_PACKED', + 'b': 'DELTA_BINARY_PACKED'} + ) # Check with mixed column types. mixed_table = pa.Table.from_arrays([arr_decimal_1_9, arr_decimal_10_18, From 77b2260035059e194f564e20ac5b895eeae3d4c1 Mon Sep 17 00:00:00 2001 From: feik Date: Sun, 16 Jun 2024 20:46:47 -0400 Subject: [PATCH 10/25] Update test_basic.py Update tests to use decimal python types --- python/pyarrow/tests/parquet/test_basic.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 289dd34027b..a43f81efec0 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -19,7 +19,7 @@ import io import warnings from shutil import copytree -import decimal +from decimal import Decimal import numpy as np import pytest @@ -359,13 +359,13 @@ def test_byte_stream_split(): def test_store_decimal_as_integer(): - arr_decimal_1_9 = pa.array(list(map(float, range(100))), + arr_decimal_1_9 = pa.array(list(map(Decimal, range(100))), type=pa.decimal128(5,2) ) - arr_decimal_10_18 = pa.array(list(map(float, range(100))), + arr_decimal_10_18 = pa.array(list(map(Decimal, range(100))), type=pa.decimal128(16,9) ) - arr_decimal_gt18 = pa.array(list(map(float, range(100))), + arr_decimal_gt18 = pa.array(list(map(Decimal, range(100))), type=pa.decimal128(22,2) ) arr_bool = pa.array([True, False] * 50) From 6181e55f2d52fddbd7aeb33dcc5691eb36d4a56a Mon Sep 17 00:00:00 2001 From: feik Date: Mon, 17 Jun 2024 09:35:44 -0400 Subject: [PATCH 11/25] Update _parquet.pxd --- python/pyarrow/_parquet.pxd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index e1be6e79f70..c2ee7d3aedf 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -431,6 +431,8 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: Builder* disable_statistics() Builder* enable_statistics() Builder* enable_statistics(const c_string& path) + Builder* enable_store_decimal_as_integer() + Builder* disable_store_decimal_as_integer() Builder* data_pagesize(int64_t size) Builder* encoding(ParquetEncoding encoding) Builder* encoding(const c_string& path, From 4c29b86e7faa66aaec4007e9ff95dbc5d2712e06 Mon Sep 17 00:00:00 2001 From: feik Date: Mon, 17 Jun 2024 10:36:37 -0400 Subject: [PATCH 12/25] Update test_basic.py Fixed variable typo --- python/pyarrow/tests/parquet/test_basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index a43f81efec0..14749058a0c 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -386,7 +386,7 @@ def test_store_decimal_as_integer(): # Check with mixed column types. mixed_table = pa.Table.from_arrays([arr_decimal_1_9, arr_decimal_10_18, - arr_decimal_gt18, aar_bool], + arr_decimal_gt18, arr_bool], names=['a', 'b', 'c', 'd']) _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, store_decimal_as_integer=True) From b916ff00ac67ccb16bd46d5107ac7496e55dccf5 Mon Sep 17 00:00:00 2001 From: feik Date: Mon, 17 Jun 2024 11:21:37 -0400 Subject: [PATCH 13/25] Update core.py Linting fix --- python/pyarrow/parquet/core.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 04f46a1876e..7c5595ebf38 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -878,15 +878,17 @@ def _sanitize_table(table, new_schema, flavor): In Parquet, DECIMAL can be stored in any of the following physical types: - int32: for 1 <= precision <= 9. - int64: for 10 <= precision <= 18. - - fixed_len_byte_array: precision is limited by the array size. Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits. - - binary: precision is unlimited. The minimum number of bytes to store the unscaled value is used. - + - fixed_len_byte_array: precision is limited by the array size. + Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits. + - binary: precision is unlimited. The minimum number of bytes to store the + unscaled value is used. + By default, this is DISABLED and all decimal types annotate fixed_len_byte_array. When enabled, the writer will use following physical types to store decimals: - int32: for 1 <= precision <= 9. - int64: for 10 <= precision <= 18. - fixed_len_byte_array: for precision > 18. - + As a consequence, decimal columns stored in integer types are more compact. """ From 1947987a443e316ef6bb8ed8d5d31c240758d830 Mon Sep 17 00:00:00 2001 From: feik Date: Mon, 17 Jun 2024 11:24:29 -0400 Subject: [PATCH 14/25] Update test_basic.py Linting fix --- python/pyarrow/tests/parquet/test_basic.py | 51 ++++++++++++---------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 14749058a0c..5bbba03b1d1 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -359,39 +359,42 @@ def test_byte_stream_split(): def test_store_decimal_as_integer(): - arr_decimal_1_9 = pa.array(list(map(Decimal, range(100))), - type=pa.decimal128(5,2) - ) - arr_decimal_10_18 = pa.array(list(map(Decimal, range(100))), - type=pa.decimal128(16,9) - ) - arr_decimal_gt18 = pa.array(list(map(Decimal, range(100))), - type=pa.decimal128(22,2) - ) + arr_decimal_1_9 = pa.array(list(map(Decimal, range(100))), + type=pa.decimal128(5, 2)) + arr_decimal_10_18 = pa.array(list(map(Decimal, range(100))), + type=pa.decimal128(16, 9)) + arr_decimal_gt18 = pa.array(list(map(Decimal, range(100))), + type=pa.decimal128(22, 2)) arr_bool = pa.array([True, False] * 50) data_decimal = [arr_decimal_1_9, arr_decimal_10_18, arr_decimal_gt18] table = pa.Table.from_arrays(data_decimal, names=['a', 'b', 'c']) # Check with store_decimal_as_integer. - _check_roundtrip(table, expected=table, compression="gzip", - use_dictionary=False, store_decimal_as_integer=True) + _check_roundtrip(table, + expected=table, + compression="gzip", + use_dictionary=False, + store_decimal_as_integer=True) - # Check with store_decimal_as_integer and delta-int encoding. - _check_roundtrip(table, expected=table, compression="gzip", + # Check with store_decimal_as_integer and delta-int encoding. + _check_roundtrip(table, + expected=table, + compression="gzip", use_dictionary=False, store_decimal_as_integer=True, - column_encoding={'a': 'DELTA_BINARY_PACKED', - 'b': 'DELTA_BINARY_PACKED'} - ) - - # Check with mixed column types. - mixed_table = pa.Table.from_arrays([arr_decimal_1_9, arr_decimal_10_18, - arr_decimal_gt18, arr_bool], - names=['a', 'b', 'c', 'd']) - _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, - store_decimal_as_integer=True) + column_encoding={ + 'a': 'DELTA_BINARY_PACKED', + 'b': 'DELTA_BINARY_PACKED' + }) - + # Check with mixed column types. + mixed_table = pa.Table.from_arrays( + [arr_decimal_1_9, arr_decimal_10_18, arr_decimal_gt18, arr_bool], + names=['a', 'b', 'c', 'd']) + _check_roundtrip(mixed_table, + expected=mixed_table, + use_dictionary=False, + store_decimal_as_integer=True) def test_column_encoding(): From 7520c6304a4399c4c426c600d6f542ae34523f16 Mon Sep 17 00:00:00 2001 From: feik Date: Mon, 17 Jun 2024 12:37:02 -0400 Subject: [PATCH 15/25] Update test_basic.py Last linting nit --- python/pyarrow/tests/parquet/test_basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 5bbba03b1d1..1c336006d0b 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -394,7 +394,7 @@ def test_store_decimal_as_integer(): _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, - store_decimal_as_integer=True) + store_decimal_as_integer=True) def test_column_encoding(): From bc7231baa40a7d4b4c2e4760164d99b5ee60ec38 Mon Sep 17 00:00:00 2001 From: feik Date: Mon, 17 Jun 2024 13:45:18 -0400 Subject: [PATCH 16/25] Update _parquet.pyx Remove extra whitespace --- python/pyarrow/_parquet.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 875616d82a0..1321777fd96 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1832,7 +1832,7 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( write_page_checksum=False, sorting_columns=None, store_decimal_as_integer=False) except *: - + """General writer properties""" cdef: shared_ptr[WriterProperties] properties @@ -1950,7 +1950,7 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( props.enable_store_decimal_as_integer() else: props.disable_store_decimal_as_integer() - + # column_encoding # encoding map - encode individual columns From 6685fb1dd17b3865ef80842bd5bc485d1f94377a Mon Sep 17 00:00:00 2001 From: Brian Kiefer Date: Wed, 26 Jun 2024 19:18:21 -0400 Subject: [PATCH 17/25] * Raise value error if store_decimal_as_integer not boolean * Syntax cleanup --- python/pyarrow/_dataset_parquet.pyx | 4 +--- python/pyarrow/_parquet.pyx | 3 +++ 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index b243aa91bee..a7afd065b59 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -613,9 +613,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions): write_page_index=self._properties["write_page_index"], write_page_checksum=self._properties["write_page_checksum"], sorting_columns=self._properties["sorting_columns"], - store_decimal_as_integer=( - self._properties["store_decimal_as_integer"] - ), + store_decimal_as_integer=self._properties["store_decimal_as_integer"], ) def _set_arrow_properties(self): diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 1321777fd96..615b29a5cc2 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1950,6 +1950,9 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( props.enable_store_decimal_as_integer() else: props.disable_store_decimal_as_integer() + else: + raise ValueError("'store_decimal_as_integer' must type boolean") + # column_encoding # encoding map - encode individual columns From 180545d6923c1c21b4b65d196aeb8a3d84f867c2 Mon Sep 17 00:00:00 2001 From: Brian Kiefer Date: Wed, 26 Jun 2024 19:29:41 -0400 Subject: [PATCH 18/25] Changed exception from ValueError to TypeError --- python/pyarrow/_parquet.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 615b29a5cc2..7ccce2f9f10 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1951,7 +1951,7 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( else: props.disable_store_decimal_as_integer() else: - raise ValueError("'store_decimal_as_integer' must type boolean") + raise TypeError("'store_decimal_as_integer' must type boolean") # column_encoding From c9309582cc9fb92473ca22544d151d84b8ec64f4 Mon Sep 17 00:00:00 2001 From: Brian Kiefer Date: Wed, 26 Jun 2024 19:38:24 -0400 Subject: [PATCH 19/25] Removed extra line (lint) --- python/pyarrow/_parquet.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 7ccce2f9f10..80a85c02fc7 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1953,7 +1953,6 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( else: raise TypeError("'store_decimal_as_integer' must type boolean") - # column_encoding # encoding map - encode individual columns From 828911e46d6c704c4625a75a8ff78eb375753d21 Mon Sep 17 00:00:00 2001 From: feik Date: Thu, 27 Jun 2024 09:31:32 -0400 Subject: [PATCH 20/25] Update python/pyarrow/_parquet.pyx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Grammer/Wording Co-authored-by: Raúl Cumplido --- python/pyarrow/_parquet.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 80a85c02fc7..e942d7bb842 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1951,7 +1951,7 @@ cdef shared_ptr[WriterProperties] _create_writer_properties( else: props.disable_store_decimal_as_integer() else: - raise TypeError("'store_decimal_as_integer' must type boolean") + raise TypeError("'store_decimal_as_integer' must be a boolean") # column_encoding # encoding map - encode individual columns From d4002d49be044eab1feaa06c1fb3ae000bae5bba Mon Sep 17 00:00:00 2001 From: feik Date: Thu, 27 Jun 2024 09:32:38 -0400 Subject: [PATCH 21/25] Update python/pyarrow/parquet/core.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Grammer Co-authored-by: Raúl Cumplido --- python/pyarrow/parquet/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 7c5595ebf38..31b3ad57da6 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -884,7 +884,7 @@ def _sanitize_table(table, new_schema, flavor): unscaled value is used. By default, this is DISABLED and all decimal types annotate fixed_len_byte_array. - When enabled, the writer will use following physical types to store decimals: + When enabled, the writer will use the following physical types to store decimals: - int32: for 1 <= precision <= 9. - int64: for 10 <= precision <= 18. - fixed_len_byte_array: for precision > 18. From 7869fd357ec46664e16a3121c9738376964009c5 Mon Sep 17 00:00:00 2001 From: feik Date: Fri, 28 Jun 2024 09:45:49 -0400 Subject: [PATCH 22/25] Dec as int newtest (#1) * Added explicent physical_type test of parquet schema * Fix whitespace * Fix whitespace --- python/pyarrow/tests/parquet/test_basic.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 1c336006d0b..35aabd159fa 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -15,9 +15,11 @@ # specific language governing permissions and limitations # under the License. +import os from collections import OrderedDict import io import warnings +import tempfile from shutil import copytree from decimal import Decimal @@ -376,6 +378,21 @@ def test_store_decimal_as_integer(): use_dictionary=False, store_decimal_as_integer=True) + # Check physical type in parquet schema + with tempfile.TemporaryDirectory() as tempdir: + pqtestfile_path = os.path.join(tempdir, 'test.parquet') + pq.write_table(table, pqtestfile_path, + compression="gzip", + use_dictionary=False, + store_decimal_as_integer=True) + + pqtestfile = pq.ParquetFile(pqtestfile_path) + pqcol_decimal_1_9 = pqtestfile.schema.column(0) + pqcol_decimal_10_18 = pqtestfile.schema.column(1) + + assert pqcol_decimal_1_9.physical_type == 'INT32' + assert pqcol_decimal_10_18.physical_type == 'INT64' + # Check with store_decimal_as_integer and delta-int encoding. _check_roundtrip(table, expected=table, From c9ae9aa32a38a2665cca77318d989009b473f328 Mon Sep 17 00:00:00 2001 From: Brian Kiefer Date: Fri, 28 Jun 2024 10:38:45 -0400 Subject: [PATCH 23/25] Change tempdir approach --- python/pyarrow/tests/parquet/test_basic.py | 28 ++++++++++------------ 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 35aabd159fa..3fd922f9192 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -19,7 +19,6 @@ from collections import OrderedDict import io import warnings -import tempfile from shutil import copytree from decimal import Decimal @@ -360,7 +359,7 @@ def test_byte_stream_split(): use_dictionary=False) -def test_store_decimal_as_integer(): +def test_store_decimal_as_integer(tempdir): arr_decimal_1_9 = pa.array(list(map(Decimal, range(100))), type=pa.decimal128(5, 2)) arr_decimal_10_18 = pa.array(list(map(Decimal, range(100))), @@ -379,19 +378,18 @@ def test_store_decimal_as_integer(): store_decimal_as_integer=True) # Check physical type in parquet schema - with tempfile.TemporaryDirectory() as tempdir: - pqtestfile_path = os.path.join(tempdir, 'test.parquet') - pq.write_table(table, pqtestfile_path, - compression="gzip", - use_dictionary=False, - store_decimal_as_integer=True) - - pqtestfile = pq.ParquetFile(pqtestfile_path) - pqcol_decimal_1_9 = pqtestfile.schema.column(0) - pqcol_decimal_10_18 = pqtestfile.schema.column(1) - - assert pqcol_decimal_1_9.physical_type == 'INT32' - assert pqcol_decimal_10_18.physical_type == 'INT64' + pqtestfile_path = os.path.join(tempdir, 'test.parquet') + pq.write_table(table, pqtestfile_path, + compression="gzip", + use_dictionary=False, + store_decimal_as_integer=True) + + pqtestfile = pq.ParquetFile(pqtestfile_path) + pqcol_decimal_1_9 = pqtestfile.schema.column(0) + pqcol_decimal_10_18 = pqtestfile.schema.column(1) + + assert pqcol_decimal_1_9.physical_type == 'INT32' + assert pqcol_decimal_10_18.physical_type == 'INT64' # Check with store_decimal_as_integer and delta-int encoding. _check_roundtrip(table, From 458c0e00fd9689a0543a3f5420aeebbead395da3 Mon Sep 17 00:00:00 2001 From: Brian Kiefer Date: Fri, 28 Jun 2024 11:20:49 -0400 Subject: [PATCH 24/25] Fix visual indent --- python/pyarrow/tests/parquet/test_basic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 3fd922f9192..08204933d02 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -380,9 +380,9 @@ def test_store_decimal_as_integer(tempdir): # Check physical type in parquet schema pqtestfile_path = os.path.join(tempdir, 'test.parquet') pq.write_table(table, pqtestfile_path, - compression="gzip", - use_dictionary=False, - store_decimal_as_integer=True) + compression="gzip", + use_dictionary=False, + store_decimal_as_integer=True) pqtestfile = pq.ParquetFile(pqtestfile_path) pqcol_decimal_1_9 = pqtestfile.schema.column(0) From c56dda8ddb84c8acc54762b508eb2fe9bad49fef Mon Sep 17 00:00:00 2001 From: Brian Kiefer Date: Fri, 28 Jun 2024 11:22:57 -0400 Subject: [PATCH 25/25] Added test comment --- python/pyarrow/tests/parquet/test_basic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 08204933d02..194af7415e8 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -392,6 +392,7 @@ def test_store_decimal_as_integer(tempdir): assert pqcol_decimal_10_18.physical_type == 'INT64' # Check with store_decimal_as_integer and delta-int encoding. + # DELTA_BINARY_PACKED requires parquet physical type to be INT64 or INT32 _check_roundtrip(table, expected=table, compression="gzip",