From bb4860478390b320c22dddc6ac4a8f381144077f Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Thu, 11 Jan 2024 15:56:41 +0100
Subject: [PATCH 1/4] temp: remove usage of logging fs from crashing test

---
 python/pyarrow/tests/test_dataset.py | 32 ++++++++++++++--------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index ae2146c0bda..eb882d5ce37 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -3720,8 +3720,8 @@ def test_parquet_dataset_factory_metadata(tempdir):
 
 @pytest.mark.parquet
 @pytest.mark.pandas
-def test_parquet_dataset_lazy_filtering(tempdir, open_logging_fs):
-    fs, assert_opens = open_logging_fs
+def test_parquet_dataset_lazy_filtering(tempdir):
+    filesystem = fs.LocalFileSystem()
 
     # Test to ensure that no IO happens when filtering a dataset
     # created with ParquetDatasetFactory from a _metadata file
@@ -3730,28 +3730,28 @@ def test_parquet_dataset_lazy_filtering(tempdir):
     metadata_path, _ = _create_parquet_dataset_simple(root_path)
 
     # creating the dataset should only open the metadata file
-    with assert_opens([metadata_path]):
-        dataset = ds.parquet_dataset(
-            metadata_path,
-            partitioning=ds.partitioning(flavor="hive"),
-            filesystem=fs)
+    # with assert_opens([metadata_path]):
+    dataset = ds.parquet_dataset(
+        metadata_path,
+        partitioning=ds.partitioning(flavor="hive"),
+        filesystem=filesystem)
 
     # materializing fragments should not open any file
-    with assert_opens([]):
-        fragments = list(dataset.get_fragments())
+    # with assert_opens([]):
+    fragments = list(dataset.get_fragments())
 
     # filtering fragments should not open any file
-    with assert_opens([]):
-        list(dataset.get_fragments(ds.field("f1") > 15))
+    # with assert_opens([]):
+    list(dataset.get_fragments(ds.field("f1") > 15))
 
     # splitting by row group should still not open any file
-    with assert_opens([]):
-        fragments[0].split_by_row_group(ds.field("f1") > 15)
+    # with assert_opens([]):
+    fragments[0].split_by_row_group(ds.field("f1") > 15)
 
     # ensuring metadata of split fragment should also not open any file
-    with assert_opens([]):
-        rg_fragments = fragments[0].split_by_row_group()
-        rg_fragments[0].ensure_complete_metadata()
+    # with assert_opens([]):
+    rg_fragments = fragments[0].split_by_row_group()
+    rg_fragments[0].ensure_complete_metadata()
 
     # FIXME(bkietz) on Windows this results in FileNotFoundErrors.
     # but actually scanning does open files
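The open_logging_fs fixture that this patch temporarily drops is defined elsewhere in pyarrow's test suite. For context, here is a rough standalone sketch of the idea, assuming fsspec is installed; the names below are illustrative, not the real fixture. The point is to route Arrow's C++ IO through a Python-level filesystem so that every file open becomes observable, and to expose an assert_opens context manager over the recorded paths.

import contextlib

import fsspec
from pyarrow.fs import FSSpecHandler, PyFileSystem


def make_logging_fs():
    # fsspec caches filesystem instances, so this patch is process-global
    local = fsspec.filesystem("file")
    opened = []

    original_open = local._open

    def logging_open(path, *args, **kwargs):
        opened.append(path)  # record every file Arrow asks us to open
        return original_open(path, *args, **kwargs)

    local._open = logging_open

    @contextlib.contextmanager
    def assert_opens(expected):
        opened.clear()
        yield
        # a real fixture would normalize paths before comparing
        assert sorted(opened) == sorted(str(p) for p in expected)

    # PyFileSystem makes the Python-level handler callable from Arrow's C++ IO
    return PyFileSystem(FSSpecHandler(local)), assert_opens

A test would then unpack fs, assert_opens = make_logging_fs() and wrap IO-sensitive blocks in with assert_opens([...]), exactly as the test above does before this temporary change.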
From a60c5b31edb652eb10292bc62543538be5e60a7e Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Fri, 12 Jan 2024 09:58:29 +0100
Subject: [PATCH 2/4] add print

---
 ci/scripts/python_wheel_unix_test.sh  | 2 +-
 cpp/src/arrow/dataset/file_parquet.cc | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/ci/scripts/python_wheel_unix_test.sh b/ci/scripts/python_wheel_unix_test.sh
index 01250ff7ef4..e4bee4cb565 100755
--- a/ci/scripts/python_wheel_unix_test.sh
+++ b/ci/scripts/python_wheel_unix_test.sh
@@ -91,5 +91,5 @@ if [ "${CHECK_UNITTESTS}" == "ON" ]; then
 
   # Execute unittest, test dependencies must be installed
   python -c 'import pyarrow; pyarrow.create_library_symlinks()'
-  python -m pytest -r s --pyargs pyarrow
+  python -m pytest -r s -v -s --pyargs pyarrow
 fi
diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc
index 0ce08502921..e1ba1c0ae6c 100644
--- a/cpp/src/arrow/dataset/file_parquet.cc
+++ b/cpp/src/arrow/dataset/file_parquet.cc
@@ -17,6 +17,7 @@
 
 #include "arrow/dataset/file_parquet.h"
 
+#include <iostream>
 #include <memory>
 #include <mutex>
 #include <unordered_map>
@@ -820,6 +821,7 @@ Status ParquetFileFragment::SetMetadata(
   manifest_ = std::move(manifest);
 
   statistics_expressions_.resize(row_groups_->size(), compute::literal(true));
+  std::cout << "Manifest number of fields: " << manifest_->descr->num_columns() << "\n";
   statistics_expressions_complete_.resize(manifest_->descr->num_columns(), false);
 
   for (int row_group : *row_groups_) {

From 86c30180e62654078e8febcf4f819ead5067c583 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Fri, 12 Jan 2024 13:56:02 +0100
Subject: [PATCH 3/4] return error instead of debug print

---
 cpp/src/arrow/dataset/file_parquet.cc | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc
index e1ba1c0ae6c..0b08fa7c77b 100644
--- a/cpp/src/arrow/dataset/file_parquet.cc
+++ b/cpp/src/arrow/dataset/file_parquet.cc
@@ -17,7 +17,6 @@
 
 #include "arrow/dataset/file_parquet.h"
 
-#include <iostream>
 #include <memory>
 #include <mutex>
 #include <unordered_map>
@@ -821,8 +820,11 @@ Status ParquetFileFragment::SetMetadata(
   manifest_ = std::move(manifest);
 
   statistics_expressions_.resize(row_groups_->size(), compute::literal(true));
-  std::cout << "Manifest number of fields: " << manifest_->descr->num_columns() << "\n";
-  statistics_expressions_complete_.resize(manifest_->descr->num_columns(), false);
+  auto num_columns = manifest_->descr->num_columns();
+  if (num_columns < 0) {
+    return Status::Invalid("Problem with manifest: negative number of columns");
+  }
+  statistics_expressions_complete_.resize(num_columns, false);
 
   for (int row_group : *row_groups_) {
     // Ensure RowGroups are indexing valid RowGroups before augmenting.
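Patch 3/4 above replaces the stdout debug print with Arrow's Status-based error handling, so a corrupt manifest surfaces as a catchable error (pyarrow.lib.ArrowInvalid on the Python side) instead of silently resizing with a bogus count. A self-contained sketch of that pattern follows; ResizeStatistics and main are illustrative stand-ins, not Arrow code.

#include <iostream>

#include "arrow/status.h"

// Mirrors the check added in SetMetadata: a negative column count means the
// manifest is corrupt, so return an error the caller can propagate instead
// of printing and carrying on with a bogus size.
arrow::Status ResizeStatistics(int num_columns) {
  if (num_columns < 0) {
    return arrow::Status::Invalid("Problem with manifest: negative number of columns");
  }
  // ... resize the statistics bookkeeping vectors here ...
  return arrow::Status::OK();
}

int main() {
  // At the top level there is no Status to return into, so report manually.
  arrow::Status st = ResizeStatistics(-1);
  if (!st.ok()) {
    std::cerr << st.ToString() << std::endl;  // Invalid: Problem with manifest: ...
  }
  return st.ok() ? 0 : 1;
}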
From 124e1dc5f3636a5de48e6dc99fb81de036db041a Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Tue, 16 Jan 2024 08:17:49 +0100
Subject: [PATCH 4/4] undo temp changes

---
 ci/scripts/python_wheel_unix_test.sh |  2 +-
 python/pyarrow/tests/test_dataset.py | 32 ++++++++++++++--------------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/ci/scripts/python_wheel_unix_test.sh b/ci/scripts/python_wheel_unix_test.sh
index e4bee4cb565..01250ff7ef4 100755
--- a/ci/scripts/python_wheel_unix_test.sh
+++ b/ci/scripts/python_wheel_unix_test.sh
@@ -91,5 +91,5 @@ if [ "${CHECK_UNITTESTS}" == "ON" ]; then
 
   # Execute unittest, test dependencies must be installed
   python -c 'import pyarrow; pyarrow.create_library_symlinks()'
-  python -m pytest -r s -v -s --pyargs pyarrow
+  python -m pytest -r s --pyargs pyarrow
 fi
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index eb882d5ce37..ae2146c0bda 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -3720,8 +3720,8 @@ def test_parquet_dataset_factory_metadata(tempdir):
 
 @pytest.mark.parquet
 @pytest.mark.pandas
-def test_parquet_dataset_lazy_filtering(tempdir):
-    filesystem = fs.LocalFileSystem()
+def test_parquet_dataset_lazy_filtering(tempdir, open_logging_fs):
+    fs, assert_opens = open_logging_fs
 
     # Test to ensure that no IO happens when filtering a dataset
     # created with ParquetDatasetFactory from a _metadata file
@@ -3730,28 +3730,28 @@ def test_parquet_dataset_lazy_filtering(tempdir):
     metadata_path, _ = _create_parquet_dataset_simple(root_path)
 
     # creating the dataset should only open the metadata file
-    # with assert_opens([metadata_path]):
-    dataset = ds.parquet_dataset(
-        metadata_path,
-        partitioning=ds.partitioning(flavor="hive"),
-        filesystem=filesystem)
+    with assert_opens([metadata_path]):
+        dataset = ds.parquet_dataset(
+            metadata_path,
+            partitioning=ds.partitioning(flavor="hive"),
+            filesystem=fs)
 
     # materializing fragments should not open any file
-    # with assert_opens([]):
-    fragments = list(dataset.get_fragments())
+    with assert_opens([]):
+        fragments = list(dataset.get_fragments())
 
     # filtering fragments should not open any file
-    # with assert_opens([]):
-    list(dataset.get_fragments(ds.field("f1") > 15))
+    with assert_opens([]):
+        list(dataset.get_fragments(ds.field("f1") > 15))
 
     # splitting by row group should still not open any file
-    # with assert_opens([]):
-    fragments[0].split_by_row_group(ds.field("f1") > 15)
+    with assert_opens([]):
+        fragments[0].split_by_row_group(ds.field("f1") > 15)
 
     # ensuring metadata of split fragment should also not open any file
-    # with assert_opens([]):
-    rg_fragments = fragments[0].split_by_row_group()
-    rg_fragments[0].ensure_complete_metadata()
+    with assert_opens([]):
+        rg_fragments = fragments[0].split_by_row_group()
+        rg_fragments[0].ensure_complete_metadata()
 
     # FIXME(bkietz) on Windows this results in FileNotFoundErrors.
     # but actually scanning does open files
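For reference, a minimal end-to-end illustration of the behavior the restored test asserts. File and column names here are illustrative; the test itself builds its fixture through the _create_parquet_dataset_simple helper and proves the "no IO" claim with the logging filesystem rather than by inspection.

import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq

table = pa.table({"f1": list(range(100)), "f2": [1.0] * 100})
pq.write_table(table, "part-0.parquet")

# build a _metadata sidecar carrying the row-group statistics of every file
metadata = pq.read_metadata("part-0.parquet")
metadata.set_file_path("part-0.parquet")
pq.write_metadata(table.schema, "_metadata", metadata_collector=[metadata])

# ds.parquet_dataset reads only _metadata; the filter below is answered from
# the row-group statistics parsed there, without opening the data file itself
dataset = ds.parquet_dataset("_metadata")
print(len(list(dataset.get_fragments(ds.field("f1") > 15))))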