From 05a65c213a9647930799814278102245ccebf790 Mon Sep 17 00:00:00 2001
From: Alenka Frim
Date: Fri, 22 Apr 2022 11:52:12 +0200
Subject: [PATCH 1/6] Support row_group_size/chunk_size keyword in pq.write_to_dataset

---
 python/pyarrow/parquet/__init__.py   |  4 +-
 python/pyarrow/tests/test_dataset.py |  2 +-
 python/run_test.sh                   | 64 ++++++++++++++++++++++++++++
 3 files changed, 68 insertions(+), 2 deletions(-)
 create mode 100755 python/run_test.sh

diff --git a/python/pyarrow/parquet/__init__.py b/python/pyarrow/parquet/__init__.py
index 63b3e9f1a56..39f61503eab 100644
--- a/python/pyarrow/parquet/__init__.py
+++ b/python/pyarrow/parquet/__init__.py
@@ -3109,6 +3109,7 @@ def file_visitor(written_file):
         # extract non-file format options
         schema = kwargs.pop("schema", None)
         use_threads = kwargs.pop("use_threads", True)
+        chunk_size = kwargs.pop("chunk_size", None)
 
         # raise for unsupported keywords
         msg = (
@@ -3147,7 +3148,8 @@ def file_visitor(written_file):
             partitioning=partitioning, use_threads=use_threads,
             file_visitor=file_visitor,
             basename_template=basename_template,
-            existing_data_behavior=existing_data_behavior)
+            existing_data_behavior=existing_data_behavior,
+            max_rows_per_group=chunk_size)
         return
 
     # warnings and errors when using legecy implementation
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index 19448d36870..c27430ab151 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -971,7 +971,7 @@ def _create_dataset_for_fragments(tempdir, chunk_size=None, filesystem=None):
     path = str(tempdir / "test_parquet_dataset")
 
     # write_to_dataset currently requires pandas
-    pq.write_to_dataset(table, path, use_legacy_dataset=True,
+    pq.write_to_dataset(table, path,
                         partition_cols=["part"], chunk_size=chunk_size)
     dataset = ds.dataset(
         path, format="parquet", partitioning="hive", filesystem=filesystem
diff --git a/python/run_test.sh b/python/run_test.sh
new file mode 100755
index 00000000000..5722dbeb643
--- /dev/null
+++ b/python/run_test.sh
@@ -0,0 +1,64 @@
+# -DARROW_USE_ASAN=OFF \
+# -DARROW_USE_UBSAN=OFF \
+# -DARROW_USE_TSAN=OFF \
+
+export ARROW_BUILD_TYPE=debug
+export ARROW_HOME=$CONDA_PREFIX
+export PARQUET_TEST_DATA=`pwd`/../cpp/submodules/parquet-testing/data
+export ARROW_TEST_DATA=`pwd`/../testing/data
+
+export ARROW_HDFS_TEST_HOST=impala
+export ARROW_HDFS_TEST_PORT=8020
+export ARROW_HDFS_TEST_USER=hdfs
+
+mkdir -p ../cpp/build
+pushd ../cpp/build
+
+cmake -GNinja \
+    -DARROW_BUILD_BENCHMARKS=OFF \
+    -DARROW_BUILD_STATIC=ON \
+    -DARROW_BUILD_TESTS=ON \
+    -DARROW_DATASET=OFF \
+    -DARROW_DEPENDENCY_SOURCE=AUTO \
+    -DARROW_EXTRA_ERROR_CONTEXT=ON \
+    -DARROW_BUILD_INTEGRATION=ON \
+    -DARROW_DEPENDENCY_SOURCE=CONDA \
+    -DARROW_FLIGHT=ON \
+    -DARROW_GANDIVA=OFF \
+    -DARROW_JEMALLOC=ON \
+    -DARROW_MIMALLOC=ON \
+    -DARROW_PARQUET=OFF \
+    -DARROW_ORC=OFF \
+    -DARROW_PARQUET=ON \
+    -DARROW_USE_CCACHE=ON \
+    -DARROW_PLASMA=OFF \
+    -DARROW_PYTHON=OFF \
+    -DARROW_S3=OFF \
+    -DARROW_TEST_MEMCHECK=OFF \
+    -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \
+    -DCMAKE_EXPORT_COMPILE_COMMANDS=YES \
+    -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
+    ..
+
+ninja
+ninja install
+
+popd
+
+# export PYARROW_CMAKE_GENERATOR=Ninja
+# export PYARROW_BUILD_TYPE=debug
+# export PYARROW_WITH_PARQUET=1
+# export PYARROW_WITH_PLASMA=1
+# export PYARROW_WITH_HDFS=1
+# export PYARROW_WITH_GANDIVA=0
+# export PYARROW_WITH_DATASET=1
+# export PYARROW_WITH_FLIGHT=1
+# export PYARROW_WITH_S3=1
+# export PYARROW_WITH_ORC=1
+
+# # export DYLD_INSERT_LIBRARIES=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/clang/12.0.0/lib/darwin/libclang_rt.asan_osx_dynamic.dylib
+# # export DYLD_INSERT_LIBRARIES=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/clang/12.0.0/lib/darwin/libclang_rt.tsan_osx_dynamic.dylib
+
+# python setup.py develop
+
+# pytest -sv "$@"

From f69d9b94110f9482be3d4bfbbda3ec6ff13c3b0b Mon Sep 17 00:00:00 2001
From: Alenka Frim
Date: Fri, 22 Apr 2022 14:23:54 +0200
Subject: [PATCH 2/6] Add row_group_size also

---
 python/pyarrow/parquet/__init__.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/pyarrow/parquet/__init__.py b/python/pyarrow/parquet/__init__.py
index 39f61503eab..a9066aee9f4 100644
--- a/python/pyarrow/parquet/__init__.py
+++ b/python/pyarrow/parquet/__init__.py
@@ -3110,6 +3110,10 @@ def file_visitor(written_file):
         schema = kwargs.pop("schema", None)
         use_threads = kwargs.pop("use_threads", True)
         chunk_size = kwargs.pop("chunk_size", None)
+        row_group_size = kwargs.pop("row_group_size", None)
+
+        if row_group_size or chunk_size is not None:
+            row_group_size = row_group_size if row_group_size else chunk_size
 
         # raise for unsupported keywords
         msg = (
@@ -3149,7 +3153,7 @@ def file_visitor(written_file):
             file_visitor=file_visitor,
             basename_template=basename_template,
             existing_data_behavior=existing_data_behavior,
-            max_rows_per_group=chunk_size)
+            max_rows_per_group=row_group_size)
         return
 
     # warnings and errors when using legecy implementation

From 93141c151a43a22bf8d7c4cba6284e1cb495e0f7 Mon Sep 17 00:00:00 2001
From: Alenka Frim
Date: Fri, 22 Apr 2022 14:28:46 +0200
Subject: [PATCH 3/6] Fix precedence

---
 python/pyarrow/parquet/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyarrow/parquet/__init__.py b/python/pyarrow/parquet/__init__.py
index a9066aee9f4..740896e02de 100644
--- a/python/pyarrow/parquet/__init__.py
+++ b/python/pyarrow/parquet/__init__.py
@@ -3112,7 +3112,7 @@ def file_visitor(written_file):
         chunk_size = kwargs.pop("chunk_size", None)
         row_group_size = kwargs.pop("row_group_size", None)
 
-        if row_group_size or chunk_size is not None:
+        if (row_group_size or chunk_size) is not None:
             row_group_size = row_group_size if row_group_size else chunk_size
 
         # raise for unsupported keywords

From edf3e7a8ae78cde69b3e648a73acdec727a6752e Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Fri, 22 Apr 2022 18:16:56 +0200
Subject: [PATCH 4/6] Update python/pyarrow/parquet/__init__.py

---
 python/pyarrow/parquet/__init__.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/pyarrow/parquet/__init__.py b/python/pyarrow/parquet/__init__.py
index 740896e02de..15f0e06ac40 100644
--- a/python/pyarrow/parquet/__init__.py
+++ b/python/pyarrow/parquet/__init__.py
@@ -3112,8 +3112,7 @@ def file_visitor(written_file):
         chunk_size = kwargs.pop("chunk_size", None)
         row_group_size = kwargs.pop("row_group_size", None)
 
-        if (row_group_size or chunk_size) is not None:
-            row_group_size = row_group_size if row_group_size else chunk_size
+        row_group_size = row_group_size if row_group_size is not None else chunk_size
 
         # raise for unsupported keywords
         msg = (
From f370e9f0ebd3bce87923be41bf84edc79627c861 Mon Sep 17 00:00:00 2001
From: Krisztián Szűcs
Date: Fri, 22 Apr 2022 20:13:06 +0200
Subject: [PATCH 5/6] Flake8

---
 python/pyarrow/parquet/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/pyarrow/parquet/__init__.py b/python/pyarrow/parquet/__init__.py
index 15f0e06ac40..3b10dd3b23d 100644
--- a/python/pyarrow/parquet/__init__.py
+++ b/python/pyarrow/parquet/__init__.py
@@ -3112,7 +3112,9 @@ def file_visitor(written_file):
         chunk_size = kwargs.pop("chunk_size", None)
         row_group_size = kwargs.pop("row_group_size", None)
 
-        row_group_size = row_group_size if row_group_size is not None else chunk_size
+        row_group_size = (
+            row_group_size if row_group_size is not None else chunk_size
+        )
 
         # raise for unsupported keywords
         msg = (

From 15f0c39fa83407930b5462e90bd4463449bf1d8d Mon Sep 17 00:00:00 2001
From: Krisztián Szűcs
Date: Fri, 22 Apr 2022 22:49:32 +0200
Subject: [PATCH 6/6] Remove accidentally committed script

---
 python/run_test.sh | 64 ----------------------------------------------
 1 file changed, 64 deletions(-)
 delete mode 100755 python/run_test.sh

diff --git a/python/run_test.sh b/python/run_test.sh
deleted file mode 100755
index 5722dbeb643..00000000000
--- a/python/run_test.sh
+++ /dev/null
@@ -1,64 +0,0 @@
-# -DARROW_USE_ASAN=OFF \
-# -DARROW_USE_UBSAN=OFF \
-# -DARROW_USE_TSAN=OFF \
-
-export ARROW_BUILD_TYPE=debug
-export ARROW_HOME=$CONDA_PREFIX
-export PARQUET_TEST_DATA=`pwd`/../cpp/submodules/parquet-testing/data
-export ARROW_TEST_DATA=`pwd`/../testing/data
-
-export ARROW_HDFS_TEST_HOST=impala
-export ARROW_HDFS_TEST_PORT=8020
-export ARROW_HDFS_TEST_USER=hdfs
-
-mkdir -p ../cpp/build
-pushd ../cpp/build
-
-cmake -GNinja \
-    -DARROW_BUILD_BENCHMARKS=OFF \
-    -DARROW_BUILD_STATIC=ON \
-    -DARROW_BUILD_TESTS=ON \
-    -DARROW_DATASET=OFF \
-    -DARROW_DEPENDENCY_SOURCE=AUTO \
-    -DARROW_EXTRA_ERROR_CONTEXT=ON \
-    -DARROW_BUILD_INTEGRATION=ON \
-    -DARROW_DEPENDENCY_SOURCE=CONDA \
-    -DARROW_FLIGHT=ON \
-    -DARROW_GANDIVA=OFF \
-    -DARROW_JEMALLOC=ON \
-    -DARROW_MIMALLOC=ON \
-    -DARROW_PARQUET=OFF \
-    -DARROW_ORC=OFF \
-    -DARROW_PARQUET=ON \
-    -DARROW_USE_CCACHE=ON \
-    -DARROW_PLASMA=OFF \
-    -DARROW_PYTHON=OFF \
-    -DARROW_S3=OFF \
-    -DARROW_TEST_MEMCHECK=OFF \
-    -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \
-    -DCMAKE_EXPORT_COMPILE_COMMANDS=YES \
-    -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
-    ..
-
-ninja
-ninja install
-
-popd
-
-# export PYARROW_CMAKE_GENERATOR=Ninja
-# export PYARROW_BUILD_TYPE=debug
-# export PYARROW_WITH_PARQUET=1
-# export PYARROW_WITH_PLASMA=1
-# export PYARROW_WITH_HDFS=1
-# export PYARROW_WITH_GANDIVA=0
-# export PYARROW_WITH_DATASET=1
-# export PYARROW_WITH_FLIGHT=1
-# export PYARROW_WITH_S3=1
-# export PYARROW_WITH_ORC=1
-
-# # export DYLD_INSERT_LIBRARIES=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/clang/12.0.0/lib/darwin/libclang_rt.asan_osx_dynamic.dylib
-# # export DYLD_INSERT_LIBRARIES=/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/lib/clang/12.0.0/lib/darwin/libclang_rt.tsan_osx_dynamic.dylib
-
-# python setup.py develop
-
-# pytest -sv "$@"
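For reference, a minimal usage sketch of the behaviour this series enables. The example table, the "example_dataset" path, and the read-back check are illustrative assumptions and are not part of the patches themselves; only the row_group_size/chunk_size keyword forwarding is taken from the series.

import glob

import pyarrow as pa
import pyarrow.parquet as pq

# Hypothetical example table with a partition column.
table = pa.table({
    "part": ["a"] * 10 + ["b"] * 10,
    "value": list(range(20)),
})

# With this series applied, row_group_size (or its alias chunk_size) is
# forwarded to ds.write_dataset as max_rows_per_group, so each written
# Parquet file should contain row groups of at most 5 rows.
pq.write_to_dataset(table, "example_dataset",
                    partition_cols=["part"],
                    use_legacy_dataset=False,
                    row_group_size=5)

# Illustrative check: read back one written file and count its row groups.
path = glob.glob("example_dataset/part=a/*.parquet")[0]
print(pq.ParquetFile(path).metadata.num_row_groups)  # expected: 2 (10 rows, max 5 per group)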