From cb04d9a5cebe55c58c648ec9e04e2378f4fc4882 Mon Sep 17 00:00:00 2001 From: Divyansh200102 Date: Mon, 16 Oct 2023 16:38:22 +0530 Subject: [PATCH 01/11] Update dataset.rst --- docs/source/python/api/dataset.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/python/api/dataset.rst b/docs/source/python/api/dataset.rst index 3575846c353..89bcee9f17c 100644 --- a/docs/source/python/api/dataset.rst +++ b/docs/source/python/api/dataset.rst @@ -47,6 +47,7 @@ Classes IpcFileFormat ParquetFileFormat ParquetReadOptions + ParquetWriteOptions ParquetFragmentScanOptions ParquetFileFragment OrcFileFormat From 6c034ba027ea58684550d3c6af89a556c032f5f2 Mon Sep 17 00:00:00 2001 From: Divyansh200102 Date: Tue, 24 Oct 2023 11:28:43 +0530 Subject: [PATCH 02/11] Updated dataset_parquet.pyx and dataset.rst --- docs/source/python/api/dataset.rst | 2 +- python/pyarrow/_dataset_parquet.pyx | 66 +++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/docs/source/python/api/dataset.rst b/docs/source/python/api/dataset.rst index 89bcee9f17c..8b337b7e075 100644 --- a/docs/source/python/api/dataset.rst +++ b/docs/source/python/api/dataset.rst @@ -47,7 +47,7 @@ Classes IpcFileFormat ParquetFileFormat ParquetReadOptions - ParquetWriteOptions + ParquetFileWriteOptions ParquetFragmentScanOptions ParquetFileFragment OrcFileFormat diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index 58ef6145cf7..d592a9d99bc 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -560,6 +560,72 @@ cdef class ParquetReadOptions(_Weakrefable): cdef class ParquetFileWriteOptions(FileWriteOptions): + """ + Parquet format specific options for writing. + + Parameters + ---------- + use_dictionary: bool, default True + Whether to use dictionary encoding for string and binary columns. + compression: str, default "snappy" + The compression codec to use. Valid options include "snappy", "gzip", + "brotli", and "lz4". + version: str, default "2.6" + The Parquet file version to write. + write_statistics: bool, default None + Whether to write column statistics to the file. + data_page_size: int, default None + The size (in bytes) of data pages. + compression_level: int, default None + The compression level to use. Valid values range from 0 (no + compression) to 9 (highest compression). + use_byte_stream_split: bool, default False + Whether to split byte stream columns (e.g. UTF-8 strings) into + multiple Parquet columns. + column_encoding: dict, default None + A dictionary mapping column names to encoding types. Valid encoding + types include "plain", "plain_dictionary", "rle", and + "bit_packed". + data_page_version: str, default "1.0" + The Parquet data page version to write. + use_deprecated_int96_timestamps: bool, default False + Whether to use the deprecated INT96 format for timestamps. + coerce_timestamps: bool, default None + Whether to coerce timestamps to a particular time zone. If None, + timestamps will be written in their original time zone. + allow_truncated_timestamps: bool, default False + Whether to allow timestamps to be truncated if they are outside of the + representable range of the Parquet format. + use_compliant_nested_type: bool, default True + Whether to use the compliant nested type representation for nested + types. + encryption_config: ParquetEncryptionConfiguration, default None + The encryption configuration to use. 
+ + Examples + -------- + + To write a Parquet file with dictionary encoding and Snappy compression, + use the following options: + + ```python + options = ParquetFileWriteOptions(use_dictionary=True, compression="snappy") + ``` + + To write a Parquet file with column statistics and a data page size of + 1 MB, use the following options: + + ```python + options = ParquetFileWriteOptions(write_statistics=True, data_page_size=1024 * 1024) + ``` + + To write a Parquet file with a custom compression level, use the + `compression_level` option: + + ```python + options = ParquetFileWriteOptions(compression_level=5) + ``` + """ def update(self, **kwargs): """ From 82e83fb77de3b5e1b7ed302de4aa0af4454ad3c8 Mon Sep 17 00:00:00 2001 From: Divyansh200102 Date: Tue, 24 Oct 2023 17:59:17 +0530 Subject: [PATCH 03/11] Update _dataset_parquet.pyx --- python/pyarrow/_dataset_parquet.pyx | 134 ++++++++++++++-------------- 1 file changed, 67 insertions(+), 67 deletions(-) diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index d592a9d99bc..33939ca0d83 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -560,73 +560,73 @@ cdef class ParquetReadOptions(_Weakrefable): cdef class ParquetFileWriteOptions(FileWriteOptions): - """ - Parquet format specific options for writing. - - Parameters - ---------- - use_dictionary: bool, default True - Whether to use dictionary encoding for string and binary columns. - compression: str, default "snappy" - The compression codec to use. Valid options include "snappy", "gzip", - "brotli", and "lz4". - version: str, default "2.6" - The Parquet file version to write. - write_statistics: bool, default None - Whether to write column statistics to the file. - data_page_size: int, default None - The size (in bytes) of data pages. - compression_level: int, default None - The compression level to use. Valid values range from 0 (no - compression) to 9 (highest compression). - use_byte_stream_split: bool, default False - Whether to split byte stream columns (e.g. UTF-8 strings) into - multiple Parquet columns. - column_encoding: dict, default None - A dictionary mapping column names to encoding types. Valid encoding - types include "plain", "plain_dictionary", "rle", and - "bit_packed". - data_page_version: str, default "1.0" - The Parquet data page version to write. - use_deprecated_int96_timestamps: bool, default False - Whether to use the deprecated INT96 format for timestamps. - coerce_timestamps: bool, default None - Whether to coerce timestamps to a particular time zone. If None, - timestamps will be written in their original time zone. - allow_truncated_timestamps: bool, default False - Whether to allow timestamps to be truncated if they are outside of the - representable range of the Parquet format. - use_compliant_nested_type: bool, default True - Whether to use the compliant nested type representation for nested - types. - encryption_config: ParquetEncryptionConfiguration, default None - The encryption configuration to use. 
- - Examples - -------- - - To write a Parquet file with dictionary encoding and Snappy compression, - use the following options: - - ```python - options = ParquetFileWriteOptions(use_dictionary=True, compression="snappy") - ``` - - To write a Parquet file with column statistics and a data page size of - 1 MB, use the following options: - - ```python - options = ParquetFileWriteOptions(write_statistics=True, data_page_size=1024 * 1024) - ``` - - To write a Parquet file with a custom compression level, use the - `compression_level` option: - - ```python - options = ParquetFileWriteOptions(compression_level=5) - ``` - """ - + """ +Parquet format specific options for writing. + +Parameters +---------- +use_dictionary: bool, default True + Whether to use dictionary encoding for string and binary columns. +compression: str, default "snappy" + The compression codec to use. Valid options include "snappy", "gzip", + "brotli", and "lz4". +version: str, default "2.6" + The Parquet file version to write. +write_statistics: bool, default None + Whether to write column statistics to the file. +data_page_size: int, default None + The size (in bytes) of data pages. +compression_level: int, default None + The compression level to use. Valid values range from 0 (no + compression) to 9 (highest compression). +use_byte_stream_split: bool, default False + Whether to split byte stream columns (e.g. UTF-8 strings) into + multiple Parquet columns. +column_encoding: dict, default None + A dictionary mapping column names to encoding types. Valid encoding + types include "plain", "plain_dictionary", "rle", and + "bit_packed". +data_page_version: str, default "1.0" + The Parquet data page version to write. +use_deprecated_int96_timestamps: bool, default False + Whether to use the deprecated INT96 format for timestamps. +coerce_timestamps: bool, default None + Whether to coerce timestamps to a particular time zone. If None, + timestamps will be written in their original time zone. +allow_truncated_timestamps: bool, default False + Whether to allow timestamps to be truncated if they are outside of the + representable range of the Parquet format. +use_compliant_nested_type: bool, default True + Whether to use the compliant nested type representation for nested + types. +encryption_config: ParquetEncryptionConfiguration, default None + The encryption configuration to use. + +Examples +-------- + +>>> import pyarrow as pa +>>> import pyarrow.parquet as pq + +# Create a table +>>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021], +... 'n_legs': [2, 2, 4, 4, 5, 100], +... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", +... 
"Brittle stars", "Centipede"]}) + +# Write a Parquet file with dictionary encoding and Snappy compression +>>> options = pq.ParquetFileWriteOptions(use_dictionary=True, compression="snappy") +>>> pq.write_table(table, "example.parquet", write_options=options) + +# Write a Parquet file with column statistics and a data page size of 1 MB +>>> options = pq.ParquetFileWriteOptions(write_statistics=True, data_page_size=1024 * 1024) +>>> pq.write_table(table, "example_stats.parquet", write_options=options) + +# Write a Parquet file with a custom compression level +>>> options = pq.ParquetFileWriteOptions(compression_level=5) +>>> pq.write_table(table, "example_compression.parquet", write_options=options) + + def update(self, **kwargs): """ Parameters From a7abce0a1a0a9521fe9f9ea19b63e5ce5f312e72 Mon Sep 17 00:00:00 2001 From: Divyansh200102 Date: Tue, 24 Oct 2023 18:14:03 +0530 Subject: [PATCH 04/11] Update _dataset_parquet.pyx --- python/pyarrow/_dataset_parquet.pyx | 130 ++++++++++++++-------------- 1 file changed, 65 insertions(+), 65 deletions(-) diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index 33939ca0d83..51ff4b75d04 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -560,71 +560,71 @@ cdef class ParquetReadOptions(_Weakrefable): cdef class ParquetFileWriteOptions(FileWriteOptions): - """ -Parquet format specific options for writing. - -Parameters ----------- -use_dictionary: bool, default True - Whether to use dictionary encoding for string and binary columns. -compression: str, default "snappy" - The compression codec to use. Valid options include "snappy", "gzip", - "brotli", and "lz4". -version: str, default "2.6" - The Parquet file version to write. -write_statistics: bool, default None - Whether to write column statistics to the file. -data_page_size: int, default None - The size (in bytes) of data pages. -compression_level: int, default None - The compression level to use. Valid values range from 0 (no - compression) to 9 (highest compression). -use_byte_stream_split: bool, default False - Whether to split byte stream columns (e.g. UTF-8 strings) into - multiple Parquet columns. -column_encoding: dict, default None - A dictionary mapping column names to encoding types. Valid encoding - types include "plain", "plain_dictionary", "rle", and - "bit_packed". -data_page_version: str, default "1.0" - The Parquet data page version to write. -use_deprecated_int96_timestamps: bool, default False - Whether to use the deprecated INT96 format for timestamps. -coerce_timestamps: bool, default None - Whether to coerce timestamps to a particular time zone. If None, - timestamps will be written in their original time zone. -allow_truncated_timestamps: bool, default False - Whether to allow timestamps to be truncated if they are outside of the - representable range of the Parquet format. -use_compliant_nested_type: bool, default True - Whether to use the compliant nested type representation for nested - types. -encryption_config: ParquetEncryptionConfiguration, default None - The encryption configuration to use. - -Examples --------- - ->>> import pyarrow as pa ->>> import pyarrow.parquet as pq - -# Create a table ->>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021], -... 'n_legs': [2, 2, 4, 4, 5, 100], -... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", -... 
"Brittle stars", "Centipede"]}) - -# Write a Parquet file with dictionary encoding and Snappy compression ->>> options = pq.ParquetFileWriteOptions(use_dictionary=True, compression="snappy") ->>> pq.write_table(table, "example.parquet", write_options=options) - -# Write a Parquet file with column statistics and a data page size of 1 MB ->>> options = pq.ParquetFileWriteOptions(write_statistics=True, data_page_size=1024 * 1024) ->>> pq.write_table(table, "example_stats.parquet", write_options=options) - -# Write a Parquet file with a custom compression level ->>> options = pq.ParquetFileWriteOptions(compression_level=5) ->>> pq.write_table(table, "example_compression.parquet", write_options=options) + """ + Parquet format specific options for writing. + + Parameters + ---------- + use_dictionary: bool, default True + Whether to use dictionary encoding for string and binary columns. + compression: str, default "snappy" + The compression codec to use. Valid options include "snappy", "gzip", + "brotli", and "lz4". + version: str, default "2.6" + The Parquet file version to write. + write_statistics: bool, default None + Whether to write column statistics to the file. + data_page_size: int, default None + The size (in bytes) of data pages. + compression_level: int, default None + The compression level to use. Valid values range from 0 (no + compression) to 9 (highest compression). + use_byte_stream_split: bool, default False + Whether to split byte stream columns (e.g. UTF-8 strings) into + multiple Parquet columns. + column_encoding: dict, default None + A dictionary mapping column names to encoding types. Valid encoding + types include "plain", "plain_dictionary", "rle", and + "bit_packed". + data_page_version: str, default "1.0" + The Parquet data page version to write. + use_deprecated_int96_timestamps: bool, default False + Whether to use the deprecated INT96 format for timestamps. + coerce_timestamps: bool, default None + Whether to coerce timestamps to a particular time zone. If None, + timestamps will be written in their original time zone. + allow_truncated_timestamps: bool, default False + Whether to allow timestamps to be truncated if they are outside of the + representable range of the Parquet format. + use_compliant_nested_type: bool, default True + Whether to use the compliant nested type representation for nested + types. + encryption_config: ParquetEncryptionConfiguration, default None + The encryption configuration to use. + + Examples + -------- + + >>> import pyarrow as pa + >>> import pyarrow.parquet as pq + + # Create a table + >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021], + ... 'n_legs': [2, 2, 4, 4, 5, 100], + ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", + ... 
"Brittle stars", "Centipede"]}) + + # Write a Parquet file with dictionary encoding and Snappy compression + >>> options = pq.ParquetFileWriteOptions(use_dictionary=True, compression="snappy") + >>> pq.write_table(table, "example.parquet", write_options=options) + + # Write a Parquet file with column statistics and a data page size of 1 MB + >>> options = pq.ParquetFileWriteOptions(write_statistics=True, data_page_size=1024 * 1024) + >>> pq.write_table(table, "example_stats.parquet", write_options=options) + + # Write a Parquet file with a custom compression level + >>> options = pq.ParquetFileWriteOptions(compression_level=5) + >>> pq.write_table(table, "example_compression.parquet", write_options=options) def update(self, **kwargs): From 2ba9439a879021d1e6eb1ed246e96bc190f536c1 Mon Sep 17 00:00:00 2001 From: Divyansh200102 Date: Thu, 22 Feb 2024 17:09:53 +0530 Subject: [PATCH 05/11] Formatting --- python/pyarrow/_dataset_parquet.pyx | 2 +- python/pyarrow/array.pxi | 3 ++- python/pyarrow/table.pxi | 3 ++- python/pyarrow/types.pxi | 3 ++- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index 51ff4b75d04..e24fcf9d2d7 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -625,8 +625,8 @@ cdef class ParquetFileWriteOptions(FileWriteOptions): # Write a Parquet file with a custom compression level >>> options = pq.ParquetFileWriteOptions(compression_level=5) >>> pq.write_table(table, "example_compression.parquet", write_options=options) + """ - def update(self, **kwargs): """ Parameters diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index ad01d45571b..e441278ae67 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1748,7 +1748,8 @@ cdef class Array(_PandasConvertible): inner_array = pyarrow_unwrap_array(casted_array) except ArrowInvalid as e: raise ValueError( - f"Could not cast {self.type} to requested type {target_type}: {e}" + f"Could not cast {self.type} to requested type { + target_type}: {e}" ) else: inner_array = self.sp_array diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index ee3872aa3a2..18b7944d1ca 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -3081,7 +3081,8 @@ cdef class RecordBatch(_Tabular): inner_batch = pyarrow_unwrap_batch(casted_batch) except ArrowInvalid as e: raise ValueError( - f"Could not cast {self.schema} to requested schema {target_schema}: {e}" + f"Could not cast {self.schema} to requested schema { + target_schema}: {e}" ) else: inner_batch = self.sp_batch diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index e9bf56c6213..1f7773174c3 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -133,7 +133,8 @@ cdef void* _as_c_pointer(v, allow_null=False) except *: else: capsule_name_str = capsule_name.decode() raise ValueError( - f"Can't convert PyCapsule with name '{capsule_name_str}' to pointer address" + f"Can't convert PyCapsule with name '{ + capsule_name_str}' to pointer address" ) else: raise TypeError(f"Expected a pointer value, got {type(v)!r}") From a6871a6ad4ccb028914b5a316b160a7782d8090e Mon Sep 17 00:00:00 2001 From: Divyansh200102 Date: Thu, 22 Feb 2024 17:26:17 +0530 Subject: [PATCH 06/11] Revert "Formatting" This reverts commit 2ba9439a879021d1e6eb1ed246e96bc190f536c1. 
--- python/pyarrow/_dataset_parquet.pyx | 2 +- python/pyarrow/array.pxi | 3 +-- python/pyarrow/table.pxi | 3 +-- python/pyarrow/types.pxi | 3 +-- 4 files changed, 4 insertions(+), 7 deletions(-) diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index e24fcf9d2d7..51ff4b75d04 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -625,8 +625,8 @@ cdef class ParquetFileWriteOptions(FileWriteOptions): # Write a Parquet file with a custom compression level >>> options = pq.ParquetFileWriteOptions(compression_level=5) >>> pq.write_table(table, "example_compression.parquet", write_options=options) - """ + def update(self, **kwargs): """ Parameters diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index e441278ae67..ad01d45571b 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1748,8 +1748,7 @@ cdef class Array(_PandasConvertible): inner_array = pyarrow_unwrap_array(casted_array) except ArrowInvalid as e: raise ValueError( - f"Could not cast {self.type} to requested type { - target_type}: {e}" + f"Could not cast {self.type} to requested type {target_type}: {e}" ) else: inner_array = self.sp_array diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 18b7944d1ca..ee3872aa3a2 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -3081,8 +3081,7 @@ cdef class RecordBatch(_Tabular): inner_batch = pyarrow_unwrap_batch(casted_batch) except ArrowInvalid as e: raise ValueError( - f"Could not cast {self.schema} to requested schema { - target_schema}: {e}" + f"Could not cast {self.schema} to requested schema {target_schema}: {e}" ) else: inner_batch = self.sp_batch diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 1f7773174c3..e9bf56c6213 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -133,8 +133,7 @@ cdef void* _as_c_pointer(v, allow_null=False) except *: else: capsule_name_str = capsule_name.decode() raise ValueError( - f"Can't convert PyCapsule with name '{ - capsule_name_str}' to pointer address" + f"Can't convert PyCapsule with name '{capsule_name_str}' to pointer address" ) else: raise TypeError(f"Expected a pointer value, got {type(v)!r}") From 0068d4230c888cc9b62d7036cdfc324382349084 Mon Sep 17 00:00:00 2001 From: Divyansh200102 Date: Thu, 22 Feb 2024 17:29:21 +0530 Subject: [PATCH 07/11] Add missing docstrings --- python/pyarrow/_dataset_parquet.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index 51ff4b75d04..2309c705dfe 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -625,7 +625,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions): # Write a Parquet file with a custom compression level >>> options = pq.ParquetFileWriteOptions(compression_level=5) >>> pq.write_table(table, "example_compression.parquet", write_options=options) - + """ def update(self, **kwargs): """ From 28d864ac71498cdefd6d3f8c5a87415d8c688e06 Mon Sep 17 00:00:00 2001 From: Divyansh200102 Date: Thu, 22 Feb 2024 18:35:20 +0530 Subject: [PATCH 08/11] Update _dataset_parquet.pyx --- python/pyarrow/_dataset_parquet.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index 2309c705dfe..0457cc9e568 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -626,7 +626,6 @@ cdef class 
ParquetFileWriteOptions(FileWriteOptions): >>> options = pq.ParquetFileWriteOptions(compression_level=5) >>> pq.write_table(table, "example_compression.parquet", write_options=options) """ - def update(self, **kwargs): """ Parameters From 1b942599d9e7afc305f1f7c8cc61bfb0637e71e4 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 22 Feb 2024 18:18:35 +0100 Subject: [PATCH 09/11] Update python/pyarrow/_dataset_parquet.pyx --- python/pyarrow/_dataset_parquet.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index 0457cc9e568..e24fcf9d2d7 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -626,6 +626,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions): >>> options = pq.ParquetFileWriteOptions(compression_level=5) >>> pq.write_table(table, "example_compression.parquet", write_options=options) """ + def update(self, **kwargs): """ Parameters From 352a9192244a885d2a86ca609def61957808380f Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 22 Feb 2024 22:36:17 +0100 Subject: [PATCH 10/11] Update python/pyarrow/_dataset_parquet.pyx --- python/pyarrow/_dataset_parquet.pyx | 130 ++++++++++++++-------------- 1 file changed, 65 insertions(+), 65 deletions(-) diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index e24fcf9d2d7..ae57f257f20 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -560,72 +560,72 @@ cdef class ParquetReadOptions(_Weakrefable): cdef class ParquetFileWriteOptions(FileWriteOptions): - """ - Parquet format specific options for writing. + """ + Parquet format specific options for writing. - Parameters - ---------- - use_dictionary: bool, default True - Whether to use dictionary encoding for string and binary columns. - compression: str, default "snappy" - The compression codec to use. Valid options include "snappy", "gzip", - "brotli", and "lz4". - version: str, default "2.6" - The Parquet file version to write. - write_statistics: bool, default None - Whether to write column statistics to the file. - data_page_size: int, default None - The size (in bytes) of data pages. - compression_level: int, default None - The compression level to use. Valid values range from 0 (no - compression) to 9 (highest compression). - use_byte_stream_split: bool, default False - Whether to split byte stream columns (e.g. UTF-8 strings) into - multiple Parquet columns. - column_encoding: dict, default None - A dictionary mapping column names to encoding types. Valid encoding - types include "plain", "plain_dictionary", "rle", and - "bit_packed". - data_page_version: str, default "1.0" - The Parquet data page version to write. - use_deprecated_int96_timestamps: bool, default False - Whether to use the deprecated INT96 format for timestamps. - coerce_timestamps: bool, default None - Whether to coerce timestamps to a particular time zone. If None, - timestamps will be written in their original time zone. - allow_truncated_timestamps: bool, default False - Whether to allow timestamps to be truncated if they are outside of the - representable range of the Parquet format. - use_compliant_nested_type: bool, default True - Whether to use the compliant nested type representation for nested - types. - encryption_config: ParquetEncryptionConfiguration, default None - The encryption configuration to use. 
- - Examples - -------- - - >>> import pyarrow as pa - >>> import pyarrow.parquet as pq - - # Create a table - >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021], - ... 'n_legs': [2, 2, 4, 4, 5, 100], - ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", - ... "Brittle stars", "Centipede"]}) - - # Write a Parquet file with dictionary encoding and Snappy compression - >>> options = pq.ParquetFileWriteOptions(use_dictionary=True, compression="snappy") - >>> pq.write_table(table, "example.parquet", write_options=options) - - # Write a Parquet file with column statistics and a data page size of 1 MB - >>> options = pq.ParquetFileWriteOptions(write_statistics=True, data_page_size=1024 * 1024) - >>> pq.write_table(table, "example_stats.parquet", write_options=options) - - # Write a Parquet file with a custom compression level - >>> options = pq.ParquetFileWriteOptions(compression_level=5) - >>> pq.write_table(table, "example_compression.parquet", write_options=options) - """ + Parameters + ---------- + use_dictionary: bool, default True + Whether to use dictionary encoding for string and binary columns. + compression: str, default "snappy" + The compression codec to use. Valid options include "snappy", "gzip", + "brotli", and "lz4". + version: str, default "2.6" + The Parquet file version to write. + write_statistics: bool, default None + Whether to write column statistics to the file. + data_page_size: int, default None + The size (in bytes) of data pages. + compression_level: int, default None + The compression level to use. Valid values range from 0 (no + compression) to 9 (highest compression). + use_byte_stream_split: bool, default False + Whether to split byte stream columns (e.g. UTF-8 strings) into + multiple Parquet columns. + column_encoding: dict, default None + A dictionary mapping column names to encoding types. Valid encoding + types include "plain", "plain_dictionary", "rle", and + "bit_packed". + data_page_version: str, default "1.0" + The Parquet data page version to write. + use_deprecated_int96_timestamps: bool, default False + Whether to use the deprecated INT96 format for timestamps. + coerce_timestamps: bool, default None + Whether to coerce timestamps to a particular time zone. If None, + timestamps will be written in their original time zone. + allow_truncated_timestamps: bool, default False + Whether to allow timestamps to be truncated if they are outside of the + representable range of the Parquet format. + use_compliant_nested_type: bool, default True + Whether to use the compliant nested type representation for nested + types. + encryption_config: ParquetEncryptionConfiguration, default None + The encryption configuration to use. + + Examples + -------- + + >>> import pyarrow as pa + >>> import pyarrow.parquet as pq + + # Create a table + >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021], + ... 'n_legs': [2, 2, 4, 4, 5, 100], + ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", + ... 
"Brittle stars", "Centipede"]}) + + # Write a Parquet file with dictionary encoding and Snappy compression + >>> options = pq.ParquetFileWriteOptions(use_dictionary=True, compression="snappy") + >>> pq.write_table(table, "example.parquet", write_options=options) + + # Write a Parquet file with column statistics and a data page size of 1 MB + >>> options = pq.ParquetFileWriteOptions(write_statistics=True, data_page_size=1024 * 1024) + >>> pq.write_table(table, "example_stats.parquet", write_options=options) + + # Write a Parquet file with a custom compression level + >>> options = pq.ParquetFileWriteOptions(compression_level=5) + >>> pq.write_table(table, "example_compression.parquet", write_options=options) + """ def update(self, **kwargs): """ From ffcf56cb5e188e0dd441f44c80b1bb11676e1d01 Mon Sep 17 00:00:00 2001 From: Divyansh200102 Date: Sat, 24 Feb 2024 12:28:03 +0530 Subject: [PATCH 11/11] Formatting --- python/pyarrow/_dataset_parquet.pyx | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index ae57f257f20..082d863ee8d 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -565,41 +565,41 @@ cdef class ParquetFileWriteOptions(FileWriteOptions): Parameters ---------- - use_dictionary: bool, default True + use_dictionary : bool, default True Whether to use dictionary encoding for string and binary columns. - compression: str, default "snappy" + compression : str, default "snappy" The compression codec to use. Valid options include "snappy", "gzip", "brotli", and "lz4". - version: str, default "2.6" + version : str, default "2.6" The Parquet file version to write. - write_statistics: bool, default None + write_statistics : bool, default None Whether to write column statistics to the file. - data_page_size: int, default None + data_page_size : int, default None The size (in bytes) of data pages. - compression_level: int, default None + compression_level : int, default None The compression level to use. Valid values range from 0 (no compression) to 9 (highest compression). - use_byte_stream_split: bool, default False + use_byte_stream_split : bool, default False Whether to split byte stream columns (e.g. UTF-8 strings) into multiple Parquet columns. - column_encoding: dict, default None + column_encoding : dict, default None A dictionary mapping column names to encoding types. Valid encoding types include "plain", "plain_dictionary", "rle", and "bit_packed". - data_page_version: str, default "1.0" + data_page_version : str, default "1.0" The Parquet data page version to write. - use_deprecated_int96_timestamps: bool, default False + use_deprecated_int96_timestamps : bool, default False Whether to use the deprecated INT96 format for timestamps. - coerce_timestamps: bool, default None + coerce_timestamps : bool, default None Whether to coerce timestamps to a particular time zone. If None, timestamps will be written in their original time zone. - allow_truncated_timestamps: bool, default False + allow_truncated_timestamps : bool, default False Whether to allow timestamps to be truncated if they are outside of the representable range of the Parquet format. - use_compliant_nested_type: bool, default True + use_compliant_nested_type : bool, default True Whether to use the compliant nested type representation for nested types. 
-    encryption_config: ParquetEncryptionConfiguration, default None
+    encryption_config : ParquetEncryptionConfiguration, default None
         The encryption configuration to use.
 
     Examples