From 48b7ea5c67ecbeddfbdbb5e81c709e4153e4bb53 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 28 Jan 2020 15:43:29 +0100 Subject: [PATCH 01/26] POC: use dataset API in existing parquet tests --- python/pyarrow/_dataset.pyx | 3 +- python/pyarrow/parquet.py | 19 +++++++- python/pyarrow/tests/test_parquet.py | 71 ++++++++++++++++++++++++---- 3 files changed, 83 insertions(+), 10 deletions(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 556faa74539..71595baa339 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -1191,7 +1191,8 @@ cdef class FileSystemDatasetFactory(DatasetFactory): c_options ) else: - raise TypeError('Must pass either paths or a FileSelector') + raise TypeError('Must pass either paths or a FileSelector, but ' + 'passed {}'.format(type(paths_or_selector))) self.init(GetResultValue(result)) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 656321646a3..332c59cf59c 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -1285,7 +1285,24 @@ def _make_manifest(path_or_paths, fs, pathsep='/', metadata_nthreads=1, def read_table(source, columns=None, use_threads=True, metadata=None, use_pandas_metadata=False, memory_map=False, read_dictionary=None, filesystem=None, filters=None, - buffer_size=0): + buffer_size=0, use_datasets=True): + if use_datasets: + import pyarrow.dataset as ds + + dataset = ds.dataset(source, filesystem=filesystem, format="parquet") + # TODO implement filter (tuple -> expression conversion) + table = dataset.to_table(columns=columns) + + # remove ARROW:schema metadata, current parquet version doesn't + # preserve this + metadata = table.schema.metadata + if metadata: + metadata.pop(b"ARROW:schema", None) + if len(metadata) == 0: + metadata = None + table = table.replace_schema_metadata(metadata) + return table + if _is_path_like(source): pf = ParquetDataset(source, metadata=metadata, memory_map=memory_map, read_dictionary=read_dictionary, diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 7e900f0af8c..ee1ea64e42f 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -76,10 +76,11 @@ def _roundtrip_table(table, read_table_kwargs=None, read_table_kwargs = read_table_kwargs or {} write_table_kwargs = write_table_kwargs or {} - buf = io.BytesIO() - _write_table(table, buf, **write_table_kwargs) - buf.seek(0) - return _read_table(buf, **read_table_kwargs) + from pyarrow.fs import _MockFileSystem + mockfs = _MockFileSystem() + with mockfs.open_output_stream("test") as out: + _write_table(table, out, **write_table_kwargs) + return _read_table("test", filesystem=mockfs, **read_table_kwargs) def _check_roundtrip(table, expected=None, read_table_kwargs=None, @@ -101,11 +102,17 @@ def _check_roundtrip(table, expected=None, read_table_kwargs=None, def _roundtrip_pandas_dataframe(df, write_kwargs): table = pa.Table.from_pandas(df) - buf = io.BytesIO() - _write_table(table, buf, **write_kwargs) + # buf = io.BytesIO() + # _write_table(table, buf, **write_kwargs) - buf.seek(0) - table1 = _read_table(buf) + # buf.seek(0) + + from pyarrow.fs import _MockFileSystem + mockfs = _MockFileSystem() + with mockfs.open_output_stream("test") as out: + _write_table(table, out, **write_kwargs) + + table1 = _read_table("test", filesystem=mockfs) return table1.to_pandas() @@ -155,6 +162,8 @@ def alltypes_sample(size=10000, seed=0, categorical=False): return pd.DataFrame(arrays) +# TODO 
non-deterministic order +@pytest.mark.skip @pytest.mark.pandas @pytest.mark.parametrize('chunk_size', [None, 1000]) def test_pandas_parquet_2_0_roundtrip(tempdir, chunk_size): @@ -287,6 +296,8 @@ def test_nested_list_nonnullable_roundtrip_bug(): _check_roundtrip(t, data_page_size=4096) +# TODO BytesIO +@pytest.mark.skip @pytest.mark.pandas def test_pandas_parquet_datetime_tz(): s = pd.Series([datetime.datetime(2017, 9, 6)]) @@ -435,6 +446,8 @@ def test_multiple_path_types(tempdir): tm.assert_frame_equal(df, df_read) +# TODO duplicate column selection actually gives duplicate columns now +@pytest.mark.skip @pytest.mark.pandas def test_pandas_column_selection(tempdir): size = 10000 @@ -491,6 +504,8 @@ def _test_dataframe(size=10000, seed=0): return df +# TODO NativeFile support +@pytest.mark.skip @pytest.mark.pandas def test_pandas_parquet_native_file_roundtrip(tempdir): df = _test_dataframe(10000) @@ -503,6 +518,8 @@ def test_pandas_parquet_native_file_roundtrip(tempdir): tm.assert_frame_equal(df, df_read) +# TODO buffer +@pytest.mark.skip @pytest.mark.pandas def test_parquet_incremental_file_build(tempdir): df = _test_dataframe(100) @@ -530,6 +547,8 @@ def test_parquet_incremental_file_build(tempdir): tm.assert_frame_equal(result.to_pandas(), expected) +# TODO buffer +@pytest.mark.skip @pytest.mark.pandas def test_read_pandas_column_subset(tempdir): df = _test_dataframe(10000) @@ -542,6 +561,8 @@ def test_read_pandas_column_subset(tempdir): tm.assert_frame_equal(df[['strings', 'uint8']], df_read) +# TODO buffer +@pytest.mark.skip @pytest.mark.pandas def test_pandas_parquet_empty_roundtrip(tempdir): df = _test_dataframe(0) @@ -554,6 +575,8 @@ def test_pandas_parquet_empty_roundtrip(tempdir): tm.assert_frame_equal(df, df_read) +# TODO buffer +@pytest.mark.skip @pytest.mark.pandas def test_pandas_can_write_nested_data(tempdir): data = { @@ -1355,6 +1378,8 @@ def test_fixed_size_binary(): _check_roundtrip(table) +# TODO buffer +@pytest.mark.skip @pytest.mark.pandas def test_multithreaded_read(): df = alltypes_sample(size=10000) @@ -1373,6 +1398,8 @@ def test_multithreaded_read(): assert table1.equals(table2) +# TODO buffer +@pytest.mark.skip @pytest.mark.pandas def test_min_chunksize(): data = pd.DataFrame([np.arange(4)], columns=['A', 'B', 'C', 'D']) @@ -1870,6 +1897,8 @@ def test_invalid_pred_op(tempdir): ]) +# TODO implement filters +@pytest.mark.skip @pytest.mark.pandas def test_filters_read_table(tempdir): # test that filters keyword is passed through in read_table @@ -2934,6 +2963,8 @@ def test_decimal_roundtrip_negative_scale(tempdir): tm.assert_frame_equal(result, expected) +# TODO buffer +@pytest.mark.skip @pytest.mark.pandas def test_parquet_writer_context_obj(tempdir): df = _test_dataframe(100) @@ -2959,6 +2990,8 @@ def test_parquet_writer_context_obj(tempdir): tm.assert_frame_equal(result.to_pandas(), expected) +# TODO buffer +@pytest.mark.skip @pytest.mark.pandas def test_parquet_writer_context_obj_with_exception(tempdir): df = _test_dataframe(100) @@ -2991,6 +3024,8 @@ def test_parquet_writer_context_obj_with_exception(tempdir): tm.assert_frame_equal(result.to_pandas(), expected) +# TODO buffer +@pytest.mark.skip @pytest.mark.pandas def test_zlib_compression_bug(): # ARROW-3514: "zlib deflate failed, output buffer too small" @@ -3050,6 +3085,8 @@ def test_empty_row_groups(tempdir): assert reader.read_row_group(i).equals(table) +# TODO buffer +@pytest.mark.skip @pytest.mark.pandas def test_parquet_writer_with_caller_provided_filesystem(): out = pa.BufferOutputStream() @@ -3155,6 
+3192,8 @@ def test_read_column_invalid_index(): f.reader.read_column(index) +# TODO buffer +@pytest.mark.skip @pytest.mark.pandas def test_direct_read_dictionary(): # ARROW-3325 @@ -3202,6 +3241,8 @@ def test_dataset_read_dictionary(tempdir): assert c1.equals(ex_chunks[0]) +# TODO buffer +@pytest.mark.skip @pytest.mark.pandas def test_direct_read_dictionary_subfield(): repeats = 10 @@ -3298,6 +3339,8 @@ def test_write_to_dataset_metadata(tempdir): assert d1 == d2 +# TODO better error message for invalid files (certainly if it is the only one) +@pytest.mark.skip def test_parquet_file_too_small(tempdir): path = str(tempdir / "test.parquet") with pytest.raises(pa.ArrowInvalid, @@ -3313,6 +3356,8 @@ def test_parquet_file_too_small(tempdir): pq.read_table(path) +# TODO buffer +@pytest.mark.skip @pytest.mark.pandas def test_categorical_index_survives_roundtrip(): # ARROW-3652, addressed by ARROW-3246 @@ -3328,6 +3373,8 @@ def test_categorical_index_survives_roundtrip(): assert ref_df.index.equals(df.index) +# TODO buffer +@pytest.mark.skip @pytest.mark.pandas def test_categorical_order_survives_roundtrip(): # ARROW-6302 @@ -3351,6 +3398,8 @@ def _simple_table_write_read(table): return pq.read_table(pa.BufferReader(contents)) +# TODO buffer +@pytest.mark.skip def test_dictionary_array_automatically_read(): # ARROW-3246 @@ -3415,6 +3464,8 @@ def test_field_id_metadata(): assert schema[2].metadata[field_name] == b'5' +# TODO buffer +@pytest.mark.skip @pytest.mark.pandas def test_pandas_categorical_na_type_row_groups(): # ARROW-5085 @@ -3433,6 +3484,8 @@ def test_pandas_categorical_na_type_row_groups(): assert result[1].equals(table[1]) +# TODO buffer +@pytest.mark.skip @pytest.mark.pandas def test_pandas_categorical_roundtrip(): # ARROW-5480, this was enabled by ARROW-3246 @@ -3495,6 +3548,8 @@ def test_multi_dataset_metadata(tempdir): assert md['serialized_size'] > 0 +# WONTFIX schema unification now happens when dataset is created +@pytest.mark.skip @pytest.mark.pandas def test_filter_before_validate_schema(tempdir): # ARROW-4076 apply filter before schema validation From 7dcd9603352ed3b1758789afb5d999208bebbd7c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 6 Feb 2020 16:11:59 +0100 Subject: [PATCH 02/26] support old-style filters --- python/pyarrow/parquet.py | 73 ++++++++++++++++++++++++++-- python/pyarrow/tests/test_parquet.py | 2 - 2 files changed, 70 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 332c59cf59c..fcfc2aafe62 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -102,6 +102,66 @@ def _check_filters(filters): ) return filters + +def _filters_to_expression(filters): + """ + Check if filters are well-formed. + + Predicates are expressed in disjunctive normal form (DNF). This means + that the innermost tuple describe a single column predicate. These + inner predicate make are all combined with a conjunction (AND) into a + larger predicate. The most outer list then combines all filters + with a disjunction (OR). By this, we should be able to express all + kinds of filters that are possible using boolean logic. 
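+
+    For example (an illustrative sketch, with made-up column names), the
+    filters::
+
+        [[('x', '>', 5), ('y', '=', 'a')], [('z', 'in', {1, 2})]]
+
+    read as ``(x > 5 AND y == 'a') OR z IN {1, 2}`` and are converted into
+    an expression equivalent to::
+
+        ds.OrExpression(ds.AndExpression(ds.field('x') > 5,
+                                         ds.field('y') == 'a'),
+                        ds.field('z').isin({1, 2}))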
+ """ + import pyarrow.dataset as ds + + filters = _check_filters(filters) + + def convert_single_predicate(col, op, val): + field = ds.field(col) + + if op == "=" or op == "==": + return field == val + elif op == "!=": + return field != val + elif op == '<': + return field < val + elif op == '>': + return field > val + elif op == '<=': + return field <= val + elif op == '>=': + return field >= val + elif op == 'in': + return field.isin(val) + elif op == 'not in': + return ~field.isin(val) + else: + raise ValueError( + '"{0}" is not a valid operator in predicates.'.format( + (col, op, val))) + + or_exprs = [] + + for conjunction in filters: + and_exprs = [] + for col, op, val in conjunction: + and_exprs.append(convert_single_predicate(col, op, val)) + if len(and_exprs) > 1: + expr = ds.AndExpression(*and_exprs) + else: + expr = and_exprs[0] + or_exprs.append(expr) + + if len(or_exprs) > 1: + expr = ds.OrExpression(*or_exprs) + else: + expr = or_exprs[0] + + return expr + + # ---------------------------------------------------------------------- # Reading a single Parquet file @@ -1288,10 +1348,17 @@ def read_table(source, columns=None, use_threads=True, metadata=None, buffer_size=0, use_datasets=True): if use_datasets: import pyarrow.dataset as ds + import pyarrow.fs + + # map old filesystems to new one + if isinstance(filesystem, LocalFileSystem): + filesystem = pyarrow.fs.LocalFileSystem() - dataset = ds.dataset(source, filesystem=filesystem, format="parquet") - # TODO implement filter (tuple -> expression conversion) - table = dataset.to_table(columns=columns) + dataset = ds.dataset(source, filesystem=filesystem, format="parquet", + partitioning="hive") + if filters is not None and not isinstance(filters, ds.Expression): + filters = _filters_to_expression(filters) + table = dataset.to_table(columns=columns, filter=filters) # remove ARROW:schema metadata, current parquet version doesn't # preserve this diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index ee1ea64e42f..cb94c8d28fa 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -1897,8 +1897,6 @@ def test_invalid_pred_op(tempdir): ]) -# TODO implement filters -@pytest.mark.skip @pytest.mark.pandas def test_filters_read_table(tempdir): # test that filters keyword is passed through in read_table From e50273507eae507a11339ebd69c906527b95e408 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 23 Mar 2020 15:41:33 +0100 Subject: [PATCH 03/26] add ParquetDatasetV2 shim and use in tests --- python/pyarrow/parquet.py | 61 ++++++- python/pyarrow/tests/test_parquet.py | 228 +++++++++++++++++++-------- 2 files changed, 218 insertions(+), 71 deletions(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index fcfc2aafe62..af5c45cfb10 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -1086,10 +1086,23 @@ class ParquetDataset: {} """.format(_read_docstring_common) + def __new__(cls, path_or_paths=None, filesystem=None, schema=None, + metadata=None, split_row_groups=False, validate_schema=True, + filters=None, metadata_nthreads=1, read_dictionary=None, + memory_map=False, buffer_size=0, use_dataset=False): + if use_dataset: + # TODO raise warning on unsupported keywords + return ParquetDatasetV2(path_or_paths, filesystem=filesystem, + filters=filters, + read_dictionary=read_dictionary, + buffer_size=buffer_size) + self = object.__new__(cls) + return self + def __init__(self, path_or_paths, filesystem=None, 
schema=None, metadata=None, split_row_groups=False, validate_schema=True, filters=None, metadata_nthreads=1, read_dictionary=None, - memory_map=False, buffer_size=0): + memory_map=False, buffer_size=0, use_dataset=False): self._metadata = _ParquetDatasetMetadata() a_path = path_or_paths if isinstance(a_path, list): @@ -1312,6 +1325,52 @@ def _make_manifest(path_or_paths, fs, pathsep='/', metadata_nthreads=1, return pieces, partitions, common_metadata_path, metadata_path +class ParquetDatasetV2: + """ + ParquetDataset shim using the Dataset API under the hood. + """ + def __init__(self, path_or_paths, filesystem=None, filters=None, + read_dictionary=None, buffer_size=None): + import pyarrow.dataset as ds + import pyarrow.fs + + # map old filesystems to new one + if isinstance(filesystem, LocalFileSystem): + filesystem = pyarrow.fs.LocalFileSystem() + + reader_options = {} + if buffer_size: + reader_options.update(use_buffered_stream=True, + buffer_size=buffer_size) + if read_dictionary is not None: + reader_options.update(dict_columns=read_dictionary) + parquat_format = ds.ParquetFileFormat(reader_options=reader_options) + + dataset = ds.dataset(path_or_paths, filesystem=filesystem, + format=parquat_format, partitioning="hive") + self._dataset = dataset + self.filters = filters + if filters is not None: + self.filter_expression = _filters_to_expression(filters) + else: + self.filter_expression = None + + @property + def schema(self): + return self._dataset.schema + + def read(self, columns=None, use_threads=False): + return self._dataset.to_table( + columns=columns, filter=self.filter_expression, + use_threads=use_threads + ) + + @property + def pieces(self): + # TODO raise deprecation warning + return list(self._dataset.get_fragments()) + + _read_table_docstring = """ {0} diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index cb94c8d28fa..7e4836cf228 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -56,6 +56,16 @@ def datadir(datadir): return datadir / 'parquet' +parametrize_use_dataset = pytest.mark.parametrize("use_dataset", [False, True]) +parametrize_use_dataset_not_supported = pytest.mark.parametrize( + "use_dataset", [False, pytest.param(True, marks=pytest.mark.skip)]) + + +# @pytest.fixture(params=[False, True]) +# def use_dataset(request): +# return request.param + + def _write_table(table, path, **kwargs): # So we see the ImportError somewhere import pyarrow.parquet as pq @@ -1608,9 +1618,10 @@ def test_partition_set_dictionary_type(): @pytest.mark.pandas -def test_read_partitioned_directory(tempdir): +@parametrize_use_dataset +def test_read_partitioned_directory(tempdir, use_dataset): fs = LocalFileSystem.get_instance() - _partition_test_for_filesystem(fs, tempdir) + _partition_test_for_filesystem(fs, tempdir, use_dataset) @pytest.mark.pandas @@ -1631,7 +1642,8 @@ def test_create_parquet_dataset_multi_threaded(tempdir): @pytest.mark.pandas -def test_equivalency(tempdir): +@parametrize_use_dataset +def test_equivalency(tempdir, use_dataset): fs = LocalFileSystem.get_instance() base_path = tempdir @@ -1658,7 +1670,8 @@ def test_equivalency(tempdir): dataset = pq.ParquetDataset( base_path, filesystem=fs, filters=[('integer', '=', 1), ('string', '!=', 'b'), - ('boolean', '==', True)] + ('boolean', '==', True)], + use_dataset=use_dataset, ) table = dataset.read() result_df = (table.to_pandas().reset_index(drop=True)) @@ -1679,7 +1692,8 @@ def test_equivalency(tempdir): ], [('integer', '=', 0), 
('boolean', '==', 'False')] ] - dataset = pq.ParquetDataset(base_path, filesystem=fs, filters=filters) + dataset = pq.ParquetDataset( + base_path, filesystem=fs, filters=filters, use_dataset=use_dataset) table = dataset.read() result_df = table.to_pandas().reset_index(drop=True) @@ -1700,14 +1714,17 @@ def test_equivalency(tempdir): # current code. with pytest.raises(NotImplementedError): filters = [[('string', '==', b'1\0a')]] - pq.ParquetDataset(base_path, filesystem=fs, filters=filters) + pq.ParquetDataset( + base_path, filesystem=fs, filters=filters, use_dataset=use_dataset) with pytest.raises(NotImplementedError): filters = [[('string', '==', '1\0a')]] - pq.ParquetDataset(base_path, filesystem=fs, filters=filters) + pq.ParquetDataset( + base_path, filesystem=fs, filters=filters, use_dataset=use_dataset) @pytest.mark.pandas -def test_cutoff_exclusive_integer(tempdir): +@parametrize_use_dataset +def test_cutoff_exclusive_integer(tempdir, use_dataset): fs = LocalFileSystem.get_instance() base_path = tempdir @@ -1729,7 +1746,8 @@ def test_cutoff_exclusive_integer(tempdir): filters=[ ('integers', '<', 4), ('integers', '>', 1), - ] + ], + use_dataset=use_dataset ) table = dataset.read() result_df = (table.to_pandas() @@ -1741,11 +1759,14 @@ def test_cutoff_exclusive_integer(tempdir): @pytest.mark.pandas +@parametrize_use_dataset @pytest.mark.xfail( - raises=TypeError, + # different error with use_datasets because result_df is no longer + # categorical + raises=(TypeError, AssertionError), reason='Loss of type information in creation of categoricals.' ) -def test_cutoff_exclusive_datetime(tempdir): +def test_cutoff_exclusive_datetime(tempdir, use_dataset): fs = LocalFileSystem.get_instance() base_path = tempdir @@ -1773,7 +1794,8 @@ def test_cutoff_exclusive_datetime(tempdir): filters=[ ('dates', '<', "2018-04-12"), ('dates', '>', "2018-04-10") - ] + ], + use_dataset=use_dataset ) table = dataset.read() result_df = (table.to_pandas() @@ -1788,7 +1810,8 @@ def test_cutoff_exclusive_datetime(tempdir): @pytest.mark.pandas -def test_inclusive_integer(tempdir): +@parametrize_use_dataset +def test_inclusive_integer(tempdir, use_dataset): fs = LocalFileSystem.get_instance() base_path = tempdir @@ -1810,7 +1833,8 @@ def test_inclusive_integer(tempdir): filters=[ ('integers', '<=', 3), ('integers', '>=', 2), - ] + ], + use_dataset=use_dataset ) table = dataset.read() result_df = (table.to_pandas() @@ -1822,7 +1846,8 @@ def test_inclusive_integer(tempdir): @pytest.mark.pandas -def test_inclusive_set(tempdir): +@parametrize_use_dataset +def test_inclusive_set(tempdir, use_dataset): fs = LocalFileSystem.get_instance() base_path = tempdir @@ -1847,7 +1872,8 @@ def test_inclusive_set(tempdir): dataset = pq.ParquetDataset( base_path, filesystem=fs, filters=[('integer', 'in', {1}), ('string', 'in', {'a', 'b'}), - ('boolean', 'in', {True})] + ('boolean', 'in', {True})], + use_dataset=use_dataset ) table = dataset.read() result_df = (table.to_pandas().reset_index(drop=True)) @@ -1858,7 +1884,8 @@ def test_inclusive_set(tempdir): @pytest.mark.pandas -def test_invalid_pred_op(tempdir): +@parametrize_use_dataset +def test_invalid_pred_op(tempdir, use_dataset): fs = LocalFileSystem.get_instance() base_path = tempdir @@ -1878,23 +1905,39 @@ def test_invalid_pred_op(tempdir): with pytest.raises(ValueError): pq.ParquetDataset(base_path, filesystem=fs, - filters=[ - ('integers', '=<', 3), - ]) + filters=[('integers', '=<', 3), ], + use_dataset=use_dataset) - with pytest.raises(ValueError): - 
pq.ParquetDataset(base_path, - filesystem=fs, - filters=[ - ('integers', 'in', set()), - ]) - - with pytest.raises(ValueError): - pq.ParquetDataset(base_path, - filesystem=fs, - filters=[ - ('integers', '!=', {3}), - ]) + if not use_dataset: + with pytest.raises(ValueError): + pq.ParquetDataset(base_path, + filesystem=fs, + filters=[('integers', 'in', set()), ], + use_dataset=use_dataset) + else: + # Dataset API returns empty table instead + dataset = pq.ParquetDataset(base_path, + filesystem=fs, + filters=[('integers', 'in', set()), ], + use_dataset=use_dataset) + assert dataset.read().num_rows == 0 + + if not use_dataset: + with pytest.raises(ValueError): + pq.ParquetDataset(base_path, + filesystem=fs, + filters=[('integers', '!=', {3})], + use_dataset=use_dataset) + else: + # TODO(dataset) ARROW-8186: + # `ds.field('int') != {3}` returns bool instead of expression + with pytest.raises(TypeError): + # Datasets API gives filter expression that is always true + dataset = pq.ParquetDataset(base_path, + filesystem=fs, + filters=[('integers', '!=', {3})], + use_dataset=use_dataset) + assert dataset.read().num_rows == 5 @pytest.mark.pandas @@ -1983,7 +2026,7 @@ def test_read_partitioned_directory_s3fs(s3_example): dataset.read() -def _partition_test_for_filesystem(fs, base_path): +def _partition_test_for_filesystem(fs, base_path, use_dataset=False): foo_keys = [0, 1] bar_keys = ['a', 'b', 'c'] partition_spec = [ @@ -2001,7 +2044,8 @@ def _partition_test_for_filesystem(fs, base_path): _generate_partition_directories(fs, base_path, partition_spec, df) - dataset = pq.ParquetDataset(base_path, filesystem=fs) + dataset = pq.ParquetDataset( + base_path, filesystem=fs, use_dataset=use_dataset) table = dataset.read() result_df = (table.to_pandas() .sort_values(by='index') @@ -2010,8 +2054,11 @@ def _partition_test_for_filesystem(fs, base_path): expected_df = (df.sort_values(by='index') .reset_index(drop=True) .reindex(columns=result_df.columns)) - expected_df['foo'] = pd.Categorical(df['foo'], categories=foo_keys) - expected_df['bar'] = pd.Categorical(df['bar'], categories=bar_keys) + if not use_dataset: + # TODO(dataset) Dataset API does not create categorical columns + # for partition keys + expected_df['foo'] = pd.Categorical(df['foo'], categories=foo_keys) + expected_df['bar'] = pd.Categorical(df['bar'], categories=bar_keys) assert (result_df.columns == ['index', 'values', 'foo', 'bar']).all() @@ -2160,7 +2207,8 @@ def _filter_partition(df, part_keys): @pytest.mark.pandas -def test_read_multiple_files(tempdir): +@parametrize_use_dataset +def test_read_multiple_files(tempdir, use_dataset): nfiles = 10 size = 5 @@ -2187,7 +2235,7 @@ def test_read_multiple_files(tempdir): (dirpath / '_SUCCESS.crc').touch() def read_multiple_files(paths, columns=None, use_threads=True, **kwargs): - dataset = pq.ParquetDataset(paths, **kwargs) + dataset = pq.ParquetDataset(paths, use_dataset=use_dataset, **kwargs) return dataset.read(columns=columns, use_threads=use_threads) result = read_multiple_files(paths) @@ -2224,6 +2272,10 @@ def read_multiple_files(paths, columns=None, use_threads=True, **kwargs): t = pa.Table.from_pandas(bad_apple) _write_table(t, bad_apple_path) + if use_dataset: + # TODO(dataset) Dataset API skips bad files + return + bad_meta = pq.read_metadata(bad_apple_path) with pytest.raises(ValueError): @@ -2265,6 +2317,7 @@ def test_dataset_read_pandas(tempdir): frames.append(df) paths.append(path) + # TODO check read_pandas semantics dataset = pq.ParquetDataset(dirpath) columns = ['uint8', 'strings'] 
result = dataset.read_pandas(columns=columns).to_pandas() @@ -2274,7 +2327,8 @@ def test_dataset_read_pandas(tempdir): @pytest.mark.pandas -def test_dataset_memory_map(tempdir): +@parametrize_use_dataset_not_supported # TODO(dataset) support memory map +def test_dataset_memory_map(tempdir, use_dataset): # ARROW-2627: Check that we can use ParquetDataset with memory-mapping dirpath = tempdir / guid() dirpath.mkdir() @@ -2284,12 +2338,14 @@ def test_dataset_memory_map(tempdir): table = pa.Table.from_pandas(df) _write_table(table, path, version='2.0') - dataset = pq.ParquetDataset(dirpath, memory_map=True) + dataset = pq.ParquetDataset( + dirpath, memory_map=True, use_dataset=use_dataset) assert dataset.pieces[0].read().equals(table) @pytest.mark.pandas -def test_dataset_enable_buffered_stream(tempdir): +@parametrize_use_dataset +def test_dataset_enable_buffered_stream(tempdir, use_dataset): dirpath = tempdir / guid() dirpath.mkdir() @@ -2298,12 +2354,20 @@ def test_dataset_enable_buffered_stream(tempdir): table = pa.Table.from_pandas(df) _write_table(table, path, version='2.0') - with pytest.raises(ValueError): - pq.ParquetDataset(dirpath, buffer_size=-64) + # TODO(dataset) raises an OSError instead of ValueError + with pytest.raises((ValueError, OSError)): + if use_dataset: + # Dataset API only raises when reading + pq.ParquetDataset( + dirpath, buffer_size=-64, use_dataset=True).read() + else: + pq.ParquetDataset( + dirpath, buffer_size=-64, use_dataset=use_dataset) for buffer_size in [128, 1024]: - dataset = pq.ParquetDataset(dirpath, buffer_size=buffer_size) - assert dataset.pieces[0].read().equals(table) + dataset = pq.ParquetDataset( + dirpath, buffer_size=buffer_size, use_dataset=use_dataset) + assert dataset.read().equals(table) @pytest.mark.pandas @@ -2364,8 +2428,9 @@ def _make_example_multifile_dataset(base_path, nfiles=10, file_nrows=5): @pytest.mark.pandas +@parametrize_use_dataset @pytest.mark.parametrize('dir_prefix', ['_', '.']) -def test_ignore_private_directories(tempdir, dir_prefix): +def test_ignore_private_directories(tempdir, dir_prefix, use_dataset): dirpath = tempdir / guid() dirpath.mkdir() @@ -2375,12 +2440,13 @@ def test_ignore_private_directories(tempdir, dir_prefix): # private directory (dirpath / '{}staging'.format(dir_prefix)).mkdir() - dataset = pq.ParquetDataset(dirpath) + dataset = pq.ParquetDataset(dirpath, use_dataset=use_dataset) assert set(map(str, paths)) == {x.path for x in dataset.pieces} @pytest.mark.pandas -def test_ignore_hidden_files_dot(tempdir): +@parametrize_use_dataset +def test_ignore_hidden_files_dot(tempdir, use_dataset): dirpath = tempdir / guid() dirpath.mkdir() @@ -2393,12 +2459,13 @@ def test_ignore_hidden_files_dot(tempdir): with (dirpath / '.private').open('wb') as f: f.write(b'gibberish') - dataset = pq.ParquetDataset(dirpath) + dataset = pq.ParquetDataset(dirpath, use_dataset=use_dataset) assert set(map(str, paths)) == {x.path for x in dataset.pieces} @pytest.mark.pandas -def test_ignore_hidden_files_underscore(tempdir): +@parametrize_use_dataset +def test_ignore_hidden_files_underscore(tempdir, use_dataset): dirpath = tempdir / guid() dirpath.mkdir() @@ -2411,7 +2478,7 @@ def test_ignore_hidden_files_underscore(tempdir): with (dirpath / '_started_321').open('wb') as f: f.write(b'abcd') - dataset = pq.ParquetDataset(dirpath) + dataset = pq.ParquetDataset(dirpath, use_dataset=use_dataset) assert set(map(str, paths)) == {x.path for x in dataset.pieces} @@ -2503,6 +2570,7 @@ def test_read_table_doesnt_warn(datadir): def 
_test_write_to_dataset_with_partitions(base_path, + use_dataset=False, filesystem=None, schema=None, index_name=None): @@ -2532,9 +2600,15 @@ def _test_write_to_dataset_with_partitions(base_path, # partitioned dataset dataset = pq.ParquetDataset(base_path, filesystem=filesystem, - validate_schema=True) + validate_schema=True, + use_dataset=use_dataset) # ARROW-2209: Ensure the dataset schema also includes the partition columns - dataset_cols = set(dataset.schema.to_arrow_schema().names) + if not use_dataset: + dataset_cols = set(dataset.schema.to_arrow_schema().names) + else: + # TODO(dataset) schema property is an arrow and not parquet schema + dataset_cols = set(dataset.schema.names) + assert dataset_cols == set(output_table.schema.names) input_table = dataset.read() @@ -2547,12 +2621,15 @@ def _test_write_to_dataset_with_partitions(base_path, # Partitioned columns become 'categorical' dtypes input_df = input_df[cols] - for col in partition_by: - output_df[col] = output_df[col].astype('category') + if not use_dataset: + for col in partition_by: + output_df[col] = output_df[col].astype('category') assert output_df.equals(input_df) -def _test_write_to_dataset_no_partitions(base_path, filesystem=None): +def _test_write_to_dataset_no_partitions(base_path, + use_dataset=False, + filesystem=None): # ARROW-1400 output_df = pd.DataFrame({'group1': list('aaabbbbccc'), 'group2': list('eefeffgeee'), @@ -2577,7 +2654,8 @@ def _test_write_to_dataset_no_partitions(base_path, filesystem=None): # Deduplicated incoming DataFrame should match # original outgoing Dataframe input_table = pq.ParquetDataset(base_path, - filesystem=filesystem).read() + filesystem=filesystem, + use_dataset=use_dataset).read() input_df = input_table.to_pandas() input_df = input_df.drop_duplicates() input_df = input_df[cols] @@ -2585,28 +2663,33 @@ def _test_write_to_dataset_no_partitions(base_path, filesystem=None): @pytest.mark.pandas -def test_write_to_dataset_with_partitions(tempdir): - _test_write_to_dataset_with_partitions(str(tempdir)) +@parametrize_use_dataset +def test_write_to_dataset_with_partitions(tempdir, use_dataset): + _test_write_to_dataset_with_partitions(str(tempdir), use_dataset) @pytest.mark.pandas -def test_write_to_dataset_with_partitions_and_schema(tempdir): +@parametrize_use_dataset +def test_write_to_dataset_with_partitions_and_schema(tempdir, use_dataset): schema = pa.schema([pa.field('group1', type=pa.string()), pa.field('group2', type=pa.string()), pa.field('num', type=pa.int64()), pa.field('nan', type=pa.int32()), pa.field('date', type=pa.timestamp(unit='us'))]) - _test_write_to_dataset_with_partitions(str(tempdir), schema=schema) + _test_write_to_dataset_with_partitions( + str(tempdir), use_dataset, schema=schema) @pytest.mark.pandas -def test_write_to_dataset_with_partitions_and_index_name(tempdir): - _test_write_to_dataset_with_partitions(str(tempdir), - index_name='index_name') +@parametrize_use_dataset +def test_write_to_dataset_with_partitions_and_index_name(tempdir, use_dataset): + _test_write_to_dataset_with_partitions( + str(tempdir), use_dataset, index_name='index_name') @pytest.mark.pandas -def test_write_to_dataset_no_partitions(tempdir): +@parametrize_use_dataset +def test_write_to_dataset_no_partitions(tempdir, use_dataset): _test_write_to_dataset_no_partitions(str(tempdir)) @@ -3161,7 +3244,8 @@ def test_write_nested_zero_length_array_chunk_failure(): @pytest.mark.pandas -def test_partitioned_dataset(tempdir): +@parametrize_use_dataset +def test_partitioned_dataset(tempdir, 
use_dataset): # ARROW-3208: Segmentation fault when reading a Parquet partitioned dataset # to a Parquet file path = tempdir / "ARROW-3208" @@ -3173,7 +3257,7 @@ def test_partitioned_dataset(tempdir): table = pa.Table.from_pandas(df) pq.write_to_dataset(table, root_path=str(path), partition_cols=['one', 'two']) - table = pq.ParquetDataset(path).read() + table = pq.ParquetDataset(path, use_dataset=use_dataset).read() pq.write_table(table, path / "output.parquet") @@ -3217,14 +3301,16 @@ def test_direct_read_dictionary(): @pytest.mark.pandas -def test_dataset_read_dictionary(tempdir): +@parametrize_use_dataset +def test_dataset_read_dictionary(tempdir, use_dataset): path = tempdir / "ARROW-3325-dataset" t1 = pa.table([[util.rands(10) for i in range(5)] * 10], names=['f0']) t2 = pa.table([[util.rands(10) for i in range(5)] * 10], names=['f0']) pq.write_to_dataset(t1, root_path=str(path)) pq.write_to_dataset(t2, root_path=str(path)) - result = pq.ParquetDataset(path, read_dictionary=['f0']).read() + result = pq.ParquetDataset( + path, read_dictionary=['f0'], use_dataset=use_dataset).read() # The order of the chunks is non-deterministic ex_chunks = [t1[0].chunk(0).dictionary_encode(), @@ -3609,6 +3695,8 @@ def test_fastparquet_cross_compatibility(tempdir): tm.assert_frame_equal(table_fp.to_pandas(), df) +# TODO buffer +@pytest.mark.skip @pytest.mark.parametrize('array_factory', [ lambda: pa.array([0, None] * 10), lambda: pa.array([0, None] * 10).dictionary_encode(), From 81314f72d88dffd4f3c03f8526aed2547eef7dea Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 23 Mar 2020 16:43:37 +0100 Subject: [PATCH 04/26] parametrize read_table tests --- python/pyarrow/parquet.py | 21 +- python/pyarrow/tests/test_parquet.py | 389 +++++++++++++++------------ 2 files changed, 233 insertions(+), 177 deletions(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index af5c45cfb10..59e1c61a161 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -1404,8 +1404,8 @@ def pieces(self): def read_table(source, columns=None, use_threads=True, metadata=None, use_pandas_metadata=False, memory_map=False, read_dictionary=None, filesystem=None, filters=None, - buffer_size=0, use_datasets=True): - if use_datasets: + buffer_size=0, use_dataset=False): + if use_dataset: import pyarrow.dataset as ds import pyarrow.fs @@ -1413,8 +1413,18 @@ def read_table(source, columns=None, use_threads=True, metadata=None, if isinstance(filesystem, LocalFileSystem): filesystem = pyarrow.fs.LocalFileSystem() - dataset = ds.dataset(source, filesystem=filesystem, format="parquet", - partitioning="hive") + # map additional arguments + # TODO raise warning when unsupported arguments are passed + reader_options = {} + if buffer_size: + reader_options.update(use_buffered_stream=True, + buffer_size=buffer_size) + if read_dictionary is not None: + reader_options.update(dict_columns=read_dictionary) + parquat_format = ds.ParquetFileFormat(reader_options=reader_options) + + dataset = ds.dataset(source, filesystem=filesystem, + format=parquat_format, partitioning="hive") if filters is not None and not isinstance(filters, ds.Expression): filters = _filters_to_expression(filters) table = dataset.to_table(columns=columns, filter=filters) @@ -1454,7 +1464,7 @@ def read_table(source, columns=None, use_threads=True, metadata=None, def read_pandas(source, columns=None, use_threads=True, memory_map=False, - metadata=None, filters=None, buffer_size=0): + metadata=None, filters=None, buffer_size=0, 
use_dataset=False): return read_table( source, columns=columns, @@ -1464,6 +1474,7 @@ def read_pandas(source, columns=None, use_threads=True, memory_map=False, memory_map=memory_map, buffer_size=buffer_size, use_pandas_metadata=True, + use_dataset=use_dataset, ) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 7e4836cf228..8763838f0ca 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -59,6 +59,8 @@ def datadir(datadir): parametrize_use_dataset = pytest.mark.parametrize("use_dataset", [False, True]) parametrize_use_dataset_not_supported = pytest.mark.parametrize( "use_dataset", [False, pytest.param(True, marks=pytest.mark.skip)]) +parametrize_use_dataset_skip_buffer = pytest.mark.parametrize( + "use_dataset", [False, pytest.param(True, marks=pytest.mark.skip)]) # @pytest.fixture(params=[False, True]) @@ -82,19 +84,26 @@ def _read_table(*args, **kwargs): def _roundtrip_table(table, read_table_kwargs=None, - write_table_kwargs=None): + write_table_kwargs=None, use_dataset=False): read_table_kwargs = read_table_kwargs or {} write_table_kwargs = write_table_kwargs or {} - from pyarrow.fs import _MockFileSystem - mockfs = _MockFileSystem() - with mockfs.open_output_stream("test") as out: - _write_table(table, out, **write_table_kwargs) - return _read_table("test", filesystem=mockfs, **read_table_kwargs) + if use_dataset: + from pyarrow.fs import _MockFileSystem + mockfs = _MockFileSystem() + with mockfs.open_output_stream("test") as out: + _write_table(table, out, **write_table_kwargs) + return _read_table("test", filesystem=mockfs, use_dataset=True, + **read_table_kwargs) + else: + buf = io.BytesIO() + _write_table(table, buf, **write_table_kwargs) + buf.seek(0) + return _read_table(buf, **read_table_kwargs) def _check_roundtrip(table, expected=None, read_table_kwargs=None, - **write_table_kwargs): + use_dataset=False, **write_table_kwargs): if expected is None: expected = table @@ -102,37 +111,43 @@ def _check_roundtrip(table, expected=None, read_table_kwargs=None, # intentionally check twice result = _roundtrip_table(table, read_table_kwargs=read_table_kwargs, - write_table_kwargs=write_table_kwargs) + write_table_kwargs=write_table_kwargs, + use_dataset=use_dataset) assert result.equals(expected) result = _roundtrip_table(result, read_table_kwargs=read_table_kwargs, - write_table_kwargs=write_table_kwargs) + write_table_kwargs=write_table_kwargs, + use_dataset=use_dataset) assert result.equals(expected) -def _roundtrip_pandas_dataframe(df, write_kwargs): +def _roundtrip_pandas_dataframe(df, write_kwargs, use_dataset=False): table = pa.Table.from_pandas(df) - # buf = io.BytesIO() - # _write_table(table, buf, **write_kwargs) + if use_dataset: + from pyarrow.fs import _MockFileSystem + mockfs = _MockFileSystem() + with mockfs.open_output_stream("test") as out: + _write_table(table, out, **write_kwargs) - # buf.seek(0) + table1 = _read_table("test", filesystem=mockfs, use_dataset=True) + else: + buf = io.BytesIO() + _write_table(table, buf, **write_kwargs) - from pyarrow.fs import _MockFileSystem - mockfs = _MockFileSystem() - with mockfs.open_output_stream("test") as out: - _write_table(table, out, **write_kwargs) + buf.seek(0) + table1 = _read_table(buf) - table1 = _read_table("test", filesystem=mockfs) return table1.to_pandas() +@parametrize_use_dataset @pytest.mark.parametrize('dtype', [int, float]) -def test_single_pylist_column_roundtrip(tempdir, dtype): +def 
test_single_pylist_column_roundtrip(tempdir, dtype, use_dataset): filename = tempdir / 'single_{}_column.parquet'.format(dtype.__name__) data = [pa.array(list(map(dtype, range(5))))] table = pa.Table.from_arrays(data, names=['a']) _write_table(table, filename) - table_read = _read_table(filename) + table_read = _read_table(filename, use_dataset=use_dataset) for i in range(table.num_columns): col_written = table[i] col_read = table_read[i] @@ -172,11 +187,11 @@ def alltypes_sample(size=10000, seed=0, categorical=False): return pd.DataFrame(arrays) -# TODO non-deterministic order -@pytest.mark.skip +# TODO(dataset) non-deterministic order +@parametrize_use_dataset_not_supported @pytest.mark.pandas @pytest.mark.parametrize('chunk_size', [None, 1000]) -def test_pandas_parquet_2_0_roundtrip(tempdir, chunk_size): +def test_pandas_parquet_2_0_roundtrip(tempdir, chunk_size, use_dataset): df = alltypes_sample(size=10000, categorical=True) filename = tempdir / 'pandas_roundtrip.parquet' @@ -200,39 +215,45 @@ def test_parquet_invalid_version(tempdir): _write_table(table, tempdir / 'test_version.parquet', version="2.2") -def test_set_data_page_size(): +@parametrize_use_dataset +def test_set_data_page_size(use_dataset): arr = pa.array([1, 2, 3] * 100000) t = pa.Table.from_arrays([arr], names=['f0']) # 128K, 512K page_sizes = [2 << 16, 2 << 18] for target_page_size in page_sizes: - _check_roundtrip(t, data_page_size=target_page_size) + _check_roundtrip(t, data_page_size=target_page_size, + use_dataset=use_dataset) @pytest.mark.pandas -def test_chunked_table_write(): +@parametrize_use_dataset +def test_chunked_table_write(use_dataset): # ARROW-232 df = alltypes_sample(size=10) batch = pa.RecordBatch.from_pandas(df) table = pa.Table.from_batches([batch] * 3) - _check_roundtrip(table, version='2.0') + _check_roundtrip(table, version='2.0', use_dataset=use_dataset) df, _ = dataframe_with_lists() batch = pa.RecordBatch.from_pandas(df) table = pa.Table.from_batches([batch] * 3) - _check_roundtrip(table, version='2.0') + _check_roundtrip(table, version='2.0', use_dataset=use_dataset) @pytest.mark.pandas -def test_memory_map(tempdir): +@parametrize_use_dataset +def test_memory_map(tempdir, use_dataset): df = alltypes_sample(size=10) table = pa.Table.from_pandas(df) + # TODO(dataset) memory_map is still ignored for now _check_roundtrip(table, read_table_kwargs={'memory_map': True}, - version='2.0') + version='2.0', use_dataset=use_dataset) + # TODO add use_dataset to read_pandas as well filename = str(tempdir / 'tmp_file') with open(filename, 'wb') as f: _write_table(table, f, version='2.0') @@ -241,13 +262,15 @@ def test_memory_map(tempdir): @pytest.mark.pandas -def test_enable_buffered_stream(tempdir): +@parametrize_use_dataset +def test_enable_buffered_stream(tempdir, use_dataset): df = alltypes_sample(size=10) table = pa.Table.from_pandas(df) _check_roundtrip(table, read_table_kwargs={'buffer_size': 1025}, - version='2.0') + version='2.0', use_dataset=use_dataset) + # TODO add use_dataset to read_pandas as well filename = str(tempdir / 'tmp_file') with open(filename, 'wb') as f: _write_table(table, f, version='2.0') @@ -255,19 +278,21 @@ def test_enable_buffered_stream(tempdir): assert table_read.equals(table) -def test_special_chars_filename(tempdir): +@parametrize_use_dataset +def test_special_chars_filename(tempdir, use_dataset): table = pa.Table.from_arrays([pa.array([42])], ["ints"]) filename = "foo # bar" path = tempdir / filename assert not path.exists() _write_table(table, str(path)) assert 
path.exists() - table_read = _read_table(str(path)) + table_read = _read_table(str(path), use_dataset=use_dataset) assert table_read.equals(table) @pytest.mark.pandas -def test_empty_table_roundtrip(): +@parametrize_use_dataset +def test_empty_table_roundtrip(use_dataset): df = alltypes_sample(size=10) # Create a non-empty table to infer the types correctly, then slice to 0 @@ -278,24 +303,27 @@ def test_empty_table_roundtrip(): assert table.schema.field('null').type == pa.null() assert table.schema.field('null_list').type == pa.list_(pa.null()) - _check_roundtrip(table, version='2.0') + _check_roundtrip(table, version='2.0', use_dataset=use_dataset) @pytest.mark.pandas -def test_empty_table_no_columns(): +@parametrize_use_dataset +def test_empty_table_no_columns(use_dataset): df = pd.DataFrame() empty = pa.Table.from_pandas(df, preserve_index=False) - _check_roundtrip(empty) + _check_roundtrip(empty, use_dataset=use_dataset) -def test_empty_lists_table_roundtrip(): +@parametrize_use_dataset +def test_empty_lists_table_roundtrip(use_dataset): # ARROW-2744: Shouldn't crash when writing an array of empty lists arr = pa.array([[], []], type=pa.list_(pa.int32())) table = pa.Table.from_arrays([arr], ["A"]) - _check_roundtrip(table) + _check_roundtrip(table, use_dataset=use_dataset) -def test_nested_list_nonnullable_roundtrip_bug(): +@parametrize_use_dataset +def test_nested_list_nonnullable_roundtrip_bug(use_dataset): # Reproduce failure in ARROW-5630 typ = pa.list_(pa.field("item", pa.float32(), False)) num_rows = 10000 @@ -303,13 +331,12 @@ def test_nested_list_nonnullable_roundtrip_bug(): pa.array(([[0] * ((i + 5) % 10) for i in range(0, 10)] * (num_rows // 10)), type=typ) ], ['a']) - _check_roundtrip(t, data_page_size=4096) + _check_roundtrip(t, data_page_size=4096, use_dataset=use_dataset) -# TODO BytesIO -@pytest.mark.skip @pytest.mark.pandas -def test_pandas_parquet_datetime_tz(): +@parametrize_use_dataset_skip_buffer +def test_pandas_parquet_datetime_tz(use_dataset): s = pd.Series([datetime.datetime(2017, 9, 6)]) s = s.dt.tz_localize('utc') @@ -334,12 +361,13 @@ def test_pandas_parquet_datetime_tz(): @pytest.mark.pandas -def test_datetime_timezone_tzinfo(): +@parametrize_use_dataset +def test_datetime_timezone_tzinfo(use_dataset): value = datetime.datetime(2018, 1, 1, 1, 23, 45, tzinfo=datetime.timezone.utc) df = pd.DataFrame({'foo': [value]}) - _roundtrip_pandas_dataframe(df, write_kwargs={}) + _roundtrip_pandas_dataframe(df, write_kwargs={}, use_dataset=use_dataset) @pytest.mark.pandas @@ -362,6 +390,7 @@ def test_pandas_parquet_custom_metadata(tempdir): 'step': 1}] +# TODO support read_pandas for use_dataset @pytest.mark.pandas def test_pandas_parquet_column_multiindex(tempdir): df = alltypes_sample(size=10) @@ -381,6 +410,7 @@ def test_pandas_parquet_column_multiindex(tempdir): tm.assert_frame_equal(df, df_read) +# TODO support read_pandas for use_dataset @pytest.mark.pandas def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written(tempdir): df = alltypes_sample(size=10000) @@ -406,7 +436,8 @@ def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written(tempdir): @pytest.mark.pandas -def test_pandas_parquet_1_0_roundtrip(tempdir): +@parametrize_use_dataset +def test_pandas_parquet_1_0_roundtrip(tempdir, use_dataset): size = 10000 np.random.seed(0) df = pd.DataFrame({ @@ -428,7 +459,7 @@ def test_pandas_parquet_1_0_roundtrip(tempdir): filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df) _write_table(arrow_table, filename, 
version='1.0') - table_read = _read_table(filename) + table_read = _read_table(filename, use_dataset=use_dataset) df_read = table_read.to_pandas() # We pass uint32_t as int64_t if we write Parquet version 1.0 @@ -438,12 +469,13 @@ def test_pandas_parquet_1_0_roundtrip(tempdir): @pytest.mark.pandas -def test_multiple_path_types(tempdir): +@parametrize_use_dataset +def test_multiple_path_types(tempdir, use_dataset): # Test compatibility with PEP 519 path-like objects path = tempdir / 'zzz.parquet' df = pd.DataFrame({'x': np.arange(10, dtype=np.int64)}) _write_table(df, path) - table_read = _read_table(path) + table_read = _read_table(path, use_dataset=use_dataset) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @@ -451,15 +483,15 @@ def test_multiple_path_types(tempdir): path = str(tempdir) + 'zzz.parquet' df = pd.DataFrame({'x': np.arange(10, dtype=np.int64)}) _write_table(df, path) - table_read = _read_table(path) + table_read = _read_table(path, use_dataset=use_dataset) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) -# TODO duplicate column selection actually gives duplicate columns now -@pytest.mark.skip +# TODO(dataset) duplicate column selection actually gives duplicate columns now @pytest.mark.pandas -def test_pandas_column_selection(tempdir): +@parametrize_use_dataset_not_supported +def test_pandas_column_selection(tempdir, use_dataset): size = 10000 np.random.seed(0) df = pd.DataFrame({ @@ -469,14 +501,16 @@ def test_pandas_column_selection(tempdir): filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df) _write_table(arrow_table, filename) - table_read = _read_table(filename, columns=['uint8']) + table_read = _read_table( + filename, columns=['uint8'], use_dataset=use_dataset) df_read = table_read.to_pandas() tm.assert_frame_equal(df[['uint8']], df_read) # ARROW-4267: Selection of duplicate columns still leads to these columns # being read uniquely. 
- table_read = _read_table(filename, columns=['uint8', 'uint8']) + table_read = _read_table( + filename, columns=['uint8', 'uint8'], use_dataset=use_dataset) df_read = table_read.to_pandas() tm.assert_frame_equal(df[['uint8']], df_read) @@ -515,23 +549,22 @@ def _test_dataframe(size=10000, seed=0): # TODO NativeFile support -@pytest.mark.skip @pytest.mark.pandas -def test_pandas_parquet_native_file_roundtrip(tempdir): +@parametrize_use_dataset_skip_buffer +def test_pandas_parquet_native_file_roundtrip(tempdir, use_dataset): df = _test_dataframe(10000) arrow_table = pa.Table.from_pandas(df) imos = pa.BufferOutputStream() _write_table(arrow_table, imos, version="2.0") buf = imos.getvalue() reader = pa.BufferReader(buf) - df_read = _read_table(reader).to_pandas() + df_read = _read_table(reader, use_dataset=use_dataset).to_pandas() tm.assert_frame_equal(df, df_read) -# TODO buffer -@pytest.mark.skip @pytest.mark.pandas -def test_parquet_incremental_file_build(tempdir): +@parametrize_use_dataset_skip_buffer +def test_parquet_incremental_file_build(tempdir, use_dataset): df = _test_dataframe(100) df['unique_id'] = 0 @@ -551,42 +584,40 @@ def test_parquet_incremental_file_build(tempdir): writer.close() buf = out.getvalue() - result = _read_table(pa.BufferReader(buf)) + result = _read_table(pa.BufferReader(buf), use_dataset=use_dataset) expected = pd.concat(frames, ignore_index=True) tm.assert_frame_equal(result.to_pandas(), expected) -# TODO buffer -@pytest.mark.skip @pytest.mark.pandas -def test_read_pandas_column_subset(tempdir): +@parametrize_use_dataset_skip_buffer +def test_read_pandas_column_subset(tempdir, use_dataset): df = _test_dataframe(10000) arrow_table = pa.Table.from_pandas(df) imos = pa.BufferOutputStream() _write_table(arrow_table, imos, version="2.0") buf = imos.getvalue() reader = pa.BufferReader(buf) - df_read = pq.read_pandas(reader, columns=['strings', 'uint8']).to_pandas() + df_read = pq.read_pandas( + reader, columns=['strings', 'uint8'], use_dataset=use_dataset + ).to_pandas() tm.assert_frame_equal(df[['strings', 'uint8']], df_read) -# TODO buffer -@pytest.mark.skip @pytest.mark.pandas -def test_pandas_parquet_empty_roundtrip(tempdir): +@parametrize_use_dataset_skip_buffer +def test_pandas_parquet_empty_roundtrip(tempdir, use_dataset): df = _test_dataframe(0) arrow_table = pa.Table.from_pandas(df) imos = pa.BufferOutputStream() _write_table(arrow_table, imos, version="2.0") buf = imos.getvalue() reader = pa.BufferReader(buf) - df_read = _read_table(reader).to_pandas() + df_read = _read_table(reader, use_dataset=use_dataset).to_pandas() tm.assert_frame_equal(df, df_read) -# TODO buffer -@pytest.mark.skip @pytest.mark.pandas def test_pandas_can_write_nested_data(tempdir): data = { @@ -613,7 +644,8 @@ def test_pandas_can_write_nested_data(tempdir): @pytest.mark.pandas -def test_pandas_parquet_pyfile_roundtrip(tempdir): +@parametrize_use_dataset_skip_buffer +def test_pandas_parquet_pyfile_roundtrip(tempdir, use_dataset): filename = tempdir / 'pandas_pyfile_roundtrip.parquet' size = 5 df = pd.DataFrame({ @@ -631,13 +663,14 @@ def test_pandas_parquet_pyfile_roundtrip(tempdir): data = io.BytesIO(filename.read_bytes()) - table_read = _read_table(data) + table_read = _read_table(data, use_dataset=use_dataset) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @pytest.mark.pandas -def test_pandas_parquet_configuration_options(tempdir): +@parametrize_use_dataset +def test_pandas_parquet_configuration_options(tempdir, use_dataset): size = 10000 np.random.seed(0) df = 
pd.DataFrame({ @@ -659,14 +692,14 @@ def test_pandas_parquet_configuration_options(tempdir): for use_dictionary in [True, False]: _write_table(arrow_table, filename, version='2.0', use_dictionary=use_dictionary) - table_read = _read_table(filename) + table_read = _read_table(filename, use_dataset=use_dataset) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) for write_statistics in [True, False]: _write_table(arrow_table, filename, version='2.0', write_statistics=write_statistics) - table_read = _read_table(filename) + table_read = _read_table(filename, use_dataset=use_dataset) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @@ -676,7 +709,7 @@ def test_pandas_parquet_configuration_options(tempdir): continue _write_table(arrow_table, filename, version='2.0', compression=compression) - table_read = _read_table(filename) + table_read = _read_table(filename, use_dataset=use_dataset) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @@ -695,7 +728,8 @@ def make_sample_file(table_or_df): return pq.ParquetFile(buf) -def test_byte_stream_split(): +@parametrize_use_dataset +def test_byte_stream_split(use_dataset): # This is only a smoke test. arr_float = pa.array(list(map(float, range(100)))) arr_int = pa.array(list(map(int, range(100)))) @@ -729,26 +763,28 @@ def test_byte_stream_split(): table = pa.Table.from_arrays([arr_int], names=['tmp']) with pytest.raises(IOError): _check_roundtrip(table, expected=table, use_byte_stream_split=True, - use_dictionary=False) + use_dictionary=False, use_dataset=use_dataset) -def test_compression_level(): +@parametrize_use_dataset +def test_compression_level(use_dataset): arr = pa.array(list(map(int, range(1000)))) data = [arr, arr] table = pa.Table.from_arrays(data, names=['a', 'b']) # Check one compression level. _check_roundtrip(table, expected=table, compression="gzip", - compression_level=1) + compression_level=1, use_dataset=use_dataset) # Check another one to make sure that compression_level=1 does not # coincide with the default one in Arrow. _check_roundtrip(table, expected=table, compression="gzip", - compression_level=5) + compression_level=5, use_dataset=use_dataset) # Check that the user can provide a compression level per column _check_roundtrip(table, expected=table, compression="gzip", - compression_level={'a': 2, 'b': 3}) + compression_level={'a': 2, 'b': 3}, + use_dataset=use_dataset) # Check that specifying a compression level for a codec which does allow # specifying one, results into an error. 
@@ -1388,10 +1424,9 @@ def test_fixed_size_binary(): _check_roundtrip(table) -# TODO buffer -@pytest.mark.skip @pytest.mark.pandas -def test_multithreaded_read(): +@parametrize_use_dataset_skip_buffer +def test_multithreaded_read(use_dataset): df = alltypes_sample(size=10000) table = pa.Table.from_pandas(df) @@ -1400,18 +1435,17 @@ def test_multithreaded_read(): _write_table(table, buf, compression='SNAPPY', version='2.0') buf.seek(0) - table1 = _read_table(buf, use_threads=True) + table1 = _read_table(buf, use_threads=True, use_dataset=use_dataset) buf.seek(0) - table2 = _read_table(buf, use_threads=False) + table2 = _read_table(buf, use_threads=False, use_dataset=use_dataset) assert table1.equals(table2) -# TODO buffer -@pytest.mark.skip @pytest.mark.pandas -def test_min_chunksize(): +@parametrize_use_dataset_skip_buffer +def test_min_chunksize(use_dataset): data = pd.DataFrame([np.arange(4)], columns=['A', 'B', 'C', 'D']) table = pa.Table.from_pandas(data.reset_index()) @@ -1419,7 +1453,7 @@ def test_min_chunksize(): _write_table(table, buf, chunk_size=-1) buf.seek(0) - result = _read_table(buf) + result = _read_table(buf, use_dataset=use_dataset) assert result.equals(table) @@ -2483,7 +2517,8 @@ def test_ignore_hidden_files_underscore(tempdir, use_dataset): @pytest.mark.pandas -def test_multiindex_duplicate_values(tempdir): +@parametrize_use_dataset +def test_multiindex_duplicate_values(tempdir, use_dataset): num_rows = 3 numbers = list(range(num_rows)) index = pd.MultiIndex.from_arrays( @@ -2497,7 +2532,7 @@ def test_multiindex_duplicate_values(tempdir): filename = tempdir / 'dup_multi_index_levels.parquet' _write_table(table, filename) - result_table = _read_table(filename) + result_table = _read_table(filename, use_dataset=use_dataset) assert table.equals(result_table) result_df = result_table.to_pandas() @@ -2554,17 +2589,19 @@ def test_noncoerced_nanoseconds_written_without_exception(tempdir): pq.write_table(tb, filename, coerce_timestamps='ms', version='2.0') -def test_read_non_existent_file(tempdir): +@parametrize_use_dataset +def test_read_non_existent_file(tempdir, use_dataset): path = 'non-existent-file.parquet' try: - pq.read_table(path) + pq.read_table(path, use_dataset=use_dataset) except Exception as e: assert path in e.args[0] -def test_read_table_doesnt_warn(datadir): +@parametrize_use_dataset +def test_read_table_doesnt_warn(datadir, use_dataset): with pytest.warns(None) as record: - pq.read_table(datadir / 'v0.7.1.parquet') + pq.read_table(datadir / 'v0.7.1.parquet', use_dataset=use_dataset) assert len(record) == 0 @@ -2738,6 +2775,7 @@ def test_large_table_int32_overflow(): _write_table(table, f) +# TODO buffer support def _simple_table_roundtrip(table, **write_kwargs): stream = pa.BufferOutputStream() _write_table(table, stream, **write_kwargs) @@ -2806,7 +2844,8 @@ def test_list_of_binary_large_cell(): @pytest.mark.pandas -def test_index_column_name_duplicate(tempdir): +@parametrize_use_dataset +def test_index_column_name_duplicate(tempdir, use_dataset): data = { 'close': { pd.Timestamp('2017-06-30 01:31:00'): 154.99958999999998, @@ -2825,13 +2864,14 @@ def test_index_column_name_duplicate(tempdir): dfx = pd.DataFrame(data).set_index('time', drop=False) tdfx = pa.Table.from_pandas(dfx) _write_table(tdfx, path) - arrow_table = _read_table(path) + arrow_table = _read_table(path, use_dataset=use_dataset) result_df = arrow_table.to_pandas() tm.assert_frame_equal(result_df, dfx) @pytest.mark.pandas -def test_parquet_nested_convenience(tempdir): +@parametrize_use_dataset 
+def test_parquet_nested_convenience(tempdir, use_dataset): # ARROW-1684 df = pd.DataFrame({ 'a': [[1, 2, 3], None, [4, 5], []], @@ -2843,15 +2883,16 @@ def test_parquet_nested_convenience(tempdir): table = pa.Table.from_pandas(df, preserve_index=False) _write_table(table, path) - read = pq.read_table(path, columns=['a']) + read = pq.read_table(path, columns=['a'], use_dataset=use_dataset) tm.assert_frame_equal(read.to_pandas(), df[['a']]) - read = pq.read_table(path, columns=['a', 'b']) + read = pq.read_table(path, columns=['a', 'b'], use_dataset=use_dataset) tm.assert_frame_equal(read.to_pandas(), df) @pytest.mark.pandas -def test_backwards_compatible_index_naming(datadir): +@parametrize_use_dataset +def test_backwards_compatible_index_naming(datadir, use_dataset): expected_string = b"""\ carat cut color clarity depth table price x y z 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 @@ -2866,13 +2907,14 @@ def test_backwards_compatible_index_naming(datadir): 0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39""" expected = pd.read_csv(io.BytesIO(expected_string), sep=r'\s{2,}', index_col=None, header=0, engine='python') - table = _read_table(datadir / 'v0.7.1.parquet') + table = _read_table(datadir / 'v0.7.1.parquet', use_dataset=use_dataset) result = table.to_pandas() tm.assert_frame_equal(result, expected) @pytest.mark.pandas -def test_backwards_compatible_index_multi_level_named(datadir): +@parametrize_use_dataset +def test_backwards_compatible_index_multi_level_named(datadir, use_dataset): expected_string = b"""\ carat cut color clarity depth table price x y z 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 @@ -2891,13 +2933,17 @@ def test_backwards_compatible_index_multi_level_named(datadir): header=0, engine='python' ).sort_index() - table = _read_table(datadir / 'v0.7.1.all-named-index.parquet') + table = _read_table(datadir / 'v0.7.1.all-named-index.parquet', + use_dataset=use_dataset) result = table.to_pandas() tm.assert_frame_equal(result, expected) @pytest.mark.pandas -def test_backwards_compatible_index_multi_level_some_named(datadir): +@parametrize_use_dataset +def test_backwards_compatible_index_multi_level_some_named( + datadir, use_dataset +): expected_string = b"""\ carat cut color clarity depth table price x y z 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 @@ -2917,13 +2963,15 @@ def test_backwards_compatible_index_multi_level_some_named(datadir): ).sort_index() expected.index = expected.index.set_names(['cut', None, 'clarity']) - table = _read_table(datadir / 'v0.7.1.some-named-index.parquet') + table = _read_table(datadir / 'v0.7.1.some-named-index.parquet', + use_dataset=use_dataset) result = table.to_pandas() tm.assert_frame_equal(result, expected) @pytest.mark.pandas -def test_backwards_compatible_column_metadata_handling(datadir): +@parametrize_use_dataset +def test_backwards_compatible_column_metadata_handling(datadir, use_dataset): expected = pd.DataFrame( {'a': [1, 2, 3], 'b': [.1, .2, .3], 'c': pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')}) @@ -2933,15 +2981,16 @@ def test_backwards_compatible_column_metadata_handling(datadir): names=['index', None]) path = datadir / 'v0.7.1.column-metadata-handling.parquet' - table = _read_table(path) + table = _read_table(path, use_dataset=use_dataset) result = table.to_pandas() tm.assert_frame_equal(result, expected) - table = _read_table(path, columns=['a']) + table = _read_table(path, columns=['a'], use_dataset=use_dataset) result = table.to_pandas() tm.assert_frame_equal(result, 
expected[['a']].reset_index(drop=True)) +# TODO(dataset) support pickling def _make_dataset_for_pickling(tempdir, N=100): path = tempdir / 'data.parquet' fs = LocalFileSystem.get_instance() @@ -3004,7 +3053,8 @@ def test_cloudpickle_dataset(tempdir, datadir): @pytest.mark.pandas -def test_decimal_roundtrip(tempdir): +@parametrize_use_dataset +def test_decimal_roundtrip(tempdir, use_dataset): num_values = 10 columns = {} @@ -3024,7 +3074,7 @@ def test_decimal_roundtrip(tempdir): string_filename = str(filename) table = pa.Table.from_pandas(expected) _write_table(table, string_filename) - result_table = _read_table(string_filename) + result_table = _read_table(string_filename, use_dataset=use_dataset) result = result_table.to_pandas() tm.assert_frame_equal(result, expected) @@ -3044,10 +3094,9 @@ def test_decimal_roundtrip_negative_scale(tempdir): tm.assert_frame_equal(result, expected) -# TODO buffer -@pytest.mark.skip @pytest.mark.pandas -def test_parquet_writer_context_obj(tempdir): +@parametrize_use_dataset_skip_buffer +def test_parquet_writer_context_obj(tempdir, use_dataset): df = _test_dataframe(100) df['unique_id'] = 0 @@ -3065,16 +3114,15 @@ def test_parquet_writer_context_obj(tempdir): frames.append(df.copy()) buf = out.getvalue() - result = _read_table(pa.BufferReader(buf)) + result = _read_table(pa.BufferReader(buf), use_dataset=use_dataset) expected = pd.concat(frames, ignore_index=True) tm.assert_frame_equal(result.to_pandas(), expected) -# TODO buffer -@pytest.mark.skip @pytest.mark.pandas -def test_parquet_writer_context_obj_with_exception(tempdir): +@parametrize_use_dataset_skip_buffer +def test_parquet_writer_context_obj_with_exception(tempdir, use_dataset): df = _test_dataframe(100) df['unique_id'] = 0 @@ -3099,23 +3147,22 @@ def test_parquet_writer_context_obj_with_exception(tempdir): assert str(e) == error_text buf = out.getvalue() - result = _read_table(pa.BufferReader(buf)) + result = _read_table(pa.BufferReader(buf), use_dataset=use_dataset) expected = pd.concat(frames, ignore_index=True) tm.assert_frame_equal(result.to_pandas(), expected) -# TODO buffer -@pytest.mark.skip @pytest.mark.pandas -def test_zlib_compression_bug(): +@parametrize_use_dataset_skip_buffer +def test_zlib_compression_bug(use_dataset): # ARROW-3514: "zlib deflate failed, output buffer too small" table = pa.Table.from_arrays([pa.array(['abc', 'def'])], ['some_col']) f = io.BytesIO() pq.write_table(table, f, compression='gzip') f.seek(0) - roundtrip = pq.read_table(f) + roundtrip = pq.read_table(f, use_dataset=use_dataset) tm.assert_frame_equal(roundtrip.to_pandas(), table.to_pandas()) @@ -3166,10 +3213,9 @@ def test_empty_row_groups(tempdir): assert reader.read_row_group(i).equals(table) -# TODO buffer -@pytest.mark.skip @pytest.mark.pandas -def test_parquet_writer_with_caller_provided_filesystem(): +@parametrize_use_dataset_skip_buffer +def test_parquet_writer_with_caller_provided_filesystem(use_dataset): out = pa.BufferOutputStream() class CustomFS(FileSystem): @@ -3196,7 +3242,7 @@ def open(self, path, mode='rb'): assert out.closed buf = out.getvalue() - table_read = _read_table(pa.BufferReader(buf)) + table_read = _read_table(pa.BufferReader(buf), use_dataset=use_dataset) df_read = table_read.to_pandas() tm.assert_frame_equal(df_read, df) @@ -3215,7 +3261,8 @@ def test_writing_empty_lists(): _check_roundtrip(table) -def test_write_nested_zero_length_array_chunk_failure(): +@parametrize_use_dataset +def test_write_nested_zero_length_array_chunk_failure(use_dataset): # Bug report in ARROW-3792 
cols = OrderedDict( int32=pa.int32(), @@ -3240,7 +3287,7 @@ def test_write_nested_zero_length_array_chunk_failure(): my_batches = [pa.RecordBatch.from_arrays(batch, schema=pa.schema(cols)) for batch in my_arrays] tbl = pa.Table.from_batches(my_batches, pa.schema(cols)) - _check_roundtrip(tbl) + _check_roundtrip(tbl, use_dataset=use_dataset) @pytest.mark.pandas @@ -3274,10 +3321,9 @@ def test_read_column_invalid_index(): f.reader.read_column(index) -# TODO buffer -@pytest.mark.skip @pytest.mark.pandas -def test_direct_read_dictionary(): +@parametrize_use_dataset_skip_buffer +def test_direct_read_dictionary(use_dataset): # ARROW-3325 repeats = 10 nunique = 5 @@ -3293,7 +3339,7 @@ def test_direct_read_dictionary(): contents = bio.getvalue() result = pq.read_table(pa.BufferReader(contents), - read_dictionary=['f0']) + read_dictionary=['f0'], use_dataset=use_dataset) # Compute dictionary-encoded subfield expected = pa.table([table[0].dictionary_encode()], names=['f0']) @@ -3325,10 +3371,9 @@ def test_dataset_read_dictionary(tempdir, use_dataset): assert c1.equals(ex_chunks[0]) -# TODO buffer -@pytest.mark.skip @pytest.mark.pandas -def test_direct_read_dictionary_subfield(): +@parametrize_use_dataset_skip_buffer +def test_direct_read_dictionary_subfield(use_dataset): repeats = 10 nunique = 5 @@ -3341,7 +3386,8 @@ def test_direct_read_dictionary_subfield(): pq.write_table(table, bio) contents = bio.getvalue() result = pq.read_table(pa.BufferReader(contents), - read_dictionary=['f0.list.item']) + read_dictionary=['f0.list.item'], + use_dataset=use_dataset) arr = pa.array(data[0]) values_as_dict = arr.values.dictionary_encode() @@ -3423,9 +3469,10 @@ def test_write_to_dataset_metadata(tempdir): assert d1 == d2 -# TODO better error message for invalid files (certainly if it is the only one) -@pytest.mark.skip -def test_parquet_file_too_small(tempdir): +# TODO(dataset) better error message for invalid files (certainly if it +# is the only one) +@parametrize_use_dataset_not_supported +def test_parquet_file_too_small(tempdir, use_dataset): path = str(tempdir / "test.parquet") with pytest.raises(pa.ArrowInvalid, match='size is 0 bytes'): @@ -3440,10 +3487,9 @@ def test_parquet_file_too_small(tempdir): pq.read_table(path) -# TODO buffer -@pytest.mark.skip @pytest.mark.pandas -def test_categorical_index_survives_roundtrip(): +@parametrize_use_dataset_skip_buffer +def test_categorical_index_survives_roundtrip(use_dataset): # ARROW-3652, addressed by ARROW-3246 df = pd.DataFrame([['a', 'b'], ['c', 'd']], columns=['c1', 'c2']) df['c1'] = df['c1'].astype('category') @@ -3452,15 +3498,15 @@ def test_categorical_index_survives_roundtrip(): table = pa.Table.from_pandas(df) bos = pa.BufferOutputStream() pq.write_table(table, bos) - ref_df = pq.read_pandas(bos.getvalue()).to_pandas() + ref_df = pq.read_pandas( + bos.getvalue(), use_dataset=use_dataset).to_pandas() assert isinstance(ref_df.index, pd.CategoricalIndex) assert ref_df.index.equals(df.index) -# TODO buffer -@pytest.mark.skip @pytest.mark.pandas -def test_categorical_order_survives_roundtrip(): +@parametrize_use_dataset_skip_buffer +def test_categorical_order_survives_roundtrip(use_dataset): # ARROW-6302 df = pd.DataFrame({"a": pd.Categorical( ["a", "b", "c", "a"], categories=["b", "c", "d"], ordered=True)}) @@ -3470,7 +3516,7 @@ def test_categorical_order_survives_roundtrip(): pq.write_table(table, bos) contents = bos.getvalue() - result = pq.read_pandas(contents).to_pandas() + result = pq.read_pandas(contents, use_dataset=use_dataset).to_pandas() 
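    # (pq.read_pandas above is shorthand for pq.read_table with
    # use_pandas_metadata=True, per the read_pandas wrapper in parquet.py.)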
tm.assert_frame_equal(result, df) @@ -3482,9 +3528,8 @@ def _simple_table_write_read(table): return pq.read_table(pa.BufferReader(contents)) -# TODO buffer -@pytest.mark.skip -def test_dictionary_array_automatically_read(): +@parametrize_use_dataset_skip_buffer +def test_dictionary_array_automatically_read(use_dataset): # ARROW-3246 # Make a large dictionary, a little over 4MB of data @@ -3548,10 +3593,9 @@ def test_field_id_metadata(): assert schema[2].metadata[field_name] == b'5' -# TODO buffer -@pytest.mark.skip @pytest.mark.pandas -def test_pandas_categorical_na_type_row_groups(): +@parametrize_use_dataset_skip_buffer +def test_pandas_categorical_na_type_row_groups(use_dataset): # ARROW-5085 df = pd.DataFrame({"col": [None] * 100, "int": [1.0] * 100}) df_category = df.astype({"col": "category", "int": "category"}) @@ -3561,17 +3605,16 @@ def test_pandas_categorical_na_type_row_groups(): # it works pq.write_table(table_cat, buf, version="2.0", chunk_size=10) - result = pq.read_table(buf.getvalue()) + result = pq.read_table(buf.getvalue(), use_dataset=use_dataset) # Result is non-categorical assert result[0].equals(table[0]) assert result[1].equals(table[1]) -# TODO buffer -@pytest.mark.skip @pytest.mark.pandas -def test_pandas_categorical_roundtrip(): +@parametrize_use_dataset_skip_buffer +def test_pandas_categorical_roundtrip(use_dataset): # ARROW-5480, this was enabled by ARROW-3246 # Have one of the categories unobserved and include a null (-1) @@ -3583,7 +3626,7 @@ def test_pandas_categorical_roundtrip(): buf = pa.BufferOutputStream() pq.write_table(pa.table(df), buf) - result = pq.read_table(buf.getvalue()).to_pandas() + result = pq.read_table(buf.getvalue(), use_dataset=use_dataset).to_pandas() assert result.x.dtype == 'category' assert (result.x.cat.categories == categories).all() tm.assert_frame_equal(result, df) @@ -3632,10 +3675,10 @@ def test_multi_dataset_metadata(tempdir): assert md['serialized_size'] > 0 -# WONTFIX schema unification now happens when dataset is created -@pytest.mark.skip +# WONTFIX(dataset) schema unification now happens when dataset is created +@parametrize_use_dataset_not_supported @pytest.mark.pandas -def test_filter_before_validate_schema(tempdir): +def test_filter_before_validate_schema(tempdir, use_dataset): # ARROW-4076 apply filter before schema validation # to avoid checking unneeded schemas @@ -3695,8 +3738,7 @@ def test_fastparquet_cross_compatibility(tempdir): tm.assert_frame_equal(table_fp.to_pandas(), df) -# TODO buffer -@pytest.mark.skip +@parametrize_use_dataset_skip_buffer @pytest.mark.parametrize('array_factory', [ lambda: pa.array([0, None] * 10), lambda: pa.array([0, None] * 10).dictionary_encode(), @@ -3705,7 +3747,9 @@ def test_fastparquet_cross_compatibility(tempdir): ]) @pytest.mark.parametrize('use_dictionary', [False, True]) @pytest.mark.parametrize('read_dictionary', [False, True]) -def test_buffer_contents(array_factory, use_dictionary, read_dictionary): +def test_buffer_contents( + array_factory, use_dictionary, read_dictionary, use_dataset +): # Test that null values are deterministically initialized to zero # after a roundtrip through Parquet. # See ARROW-8006 and ARROW-8011. 
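The skipped variants above lean on the parametrize_use_dataset* helpers, which use pytest.param to attach marks to a single parameter value so the dataset-backed run of a test can be tagged or skipped without duplicating the test body. A minimal, self-contained sketch of that pattern follows; parametrize_reader, test_roundtrip_smoke and the skip reason are illustrative names, and the flag is not wired into read_table here.

import pytest
import pyarrow as pa
import pyarrow.parquet as pq

# Each test runs twice: once against the legacy reader and once against
# the dataset-backed one.  pytest.param lets the second value carry extra
# marks, so that variant can be skipped wholesale while unsupported.
parametrize_reader = pytest.mark.parametrize(
    "use_dataset",
    [False, pytest.param(True, marks=pytest.mark.skip(reason="not ready"))])


@parametrize_reader
def test_roundtrip_smoke(tmp_path, use_dataset):
    # The real tests forward `use_dataset` to read_table/ParquetDataset;
    # this stand-in only exercises the parametrization mechanics.
    table = pa.table({"a": [1, 2, 3]})
    path = str(tmp_path / "t.parquet")
    pq.write_table(table, path)
    assert pq.read_table(path).equals(table)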
@@ -3715,7 +3759,8 @@ def test_buffer_contents(array_factory, use_dictionary, read_dictionary): bio.seek(0) read_dictionary = ['col'] if read_dictionary else None table = pq.read_table(bio, use_threads=False, - read_dictionary=read_dictionary) + read_dictionary=read_dictionary, + use_dataset=use_dataset) for col in table.columns: [chunk] = col.chunks From d0e33ecd9076caf91a2901a7575e4b7871b09468 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 24 Mar 2020 15:58:48 +0100 Subject: [PATCH 05/26] do not disallow null characters in strings in filters when use_dataset=True --- python/pyarrow/parquet.py | 27 ++++++++++++++------------- python/pyarrow/tests/test_parquet.py | 27 ++++++++++++++++----------- 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 59e1c61a161..1c59b98b238 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -77,7 +77,7 @@ def _check_contains_null(val): return False -def _check_filters(filters): +def _check_filters(filters, check_null_strings=True): """ Check if filters are well-formed. """ @@ -89,17 +89,18 @@ def _check_filters(filters): # too few: # We have [(,,), ..] instead of [[(,,), ..]] filters = [filters] - for conjunction in filters: - for col, op, val in conjunction: - if ( - isinstance(val, list) - and all(_check_contains_null(v) for v in val) - or _check_contains_null(val) - ): - raise NotImplementedError( - "Null-terminated binary strings are not supported as" - " filter values." - ) + if check_null_strings: + for conjunction in filters: + for col, op, val in conjunction: + if ( + isinstance(val, list) + and all(_check_contains_null(v) for v in val) + or _check_contains_null(val) + ): + raise NotImplementedError( + "Null-terminated binary strings are not supported " + "as filter values." + ) return filters @@ -116,7 +117,7 @@ def _filters_to_expression(filters): """ import pyarrow.dataset as ds - filters = _check_filters(filters) + filters = _check_filters(filters, check_null_strings=False) def convert_single_predicate(col, op, val): field = ds.field(col) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 8763838f0ca..ddc0a62ef6c 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -1743,17 +1743,22 @@ def test_equivalency(tempdir, use_dataset): assert df_filter_2.sum() > 0 assert result_df.shape[0] == (df_filter_1.sum() + df_filter_2.sum()) - # Check for \0 in predicate values. Until they are correctly implemented - # in ARROW-3391, they would otherwise lead to weird results with the - # current code. - with pytest.raises(NotImplementedError): - filters = [[('string', '==', b'1\0a')]] - pq.ParquetDataset( - base_path, filesystem=fs, filters=filters, use_dataset=use_dataset) - with pytest.raises(NotImplementedError): - filters = [[('string', '==', '1\0a')]] - pq.ParquetDataset( - base_path, filesystem=fs, filters=filters, use_dataset=use_dataset) + if not use_dataset: + # Check for \0 in predicate values. Until they are correctly + # implemented in ARROW-3391, they would otherwise lead to weird + # results with the current code. 
+ with pytest.raises(NotImplementedError): + filters = [[('string', '==', b'1\0a')]] + pq.ParquetDataset(base_path, filesystem=fs, filters=filters) + with pytest.raises(NotImplementedError): + filters = [[('string', '==', '1\0a')]] + pq.ParquetDataset(base_path, filesystem=fs, filters=filters) + else: + for filters in [[[('string', '==', b'1\0a')]], + [[('string', '==', '1\0a')]]]: + dataset = pq.ParquetDataset( + base_path, filesystem=fs, filters=filters, use_dataset=True) + assert dataset.read().num_rows == 0 @pytest.mark.pandas From 9d02fde62997b53dbc76e87d5309b92d1c6995e2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 24 Mar 2020 17:26:16 +0100 Subject: [PATCH 06/26] add pytest.mark.dataset mark --- python/pyarrow/tests/test_parquet.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index ddc0a62ef6c..6c969697f7c 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -56,7 +56,8 @@ def datadir(datadir): return datadir / 'parquet' -parametrize_use_dataset = pytest.mark.parametrize("use_dataset", [False, True]) +parametrize_use_dataset = pytest.mark.parametrize( + "use_dataset", [False, pytest.param(True, marks=pytest.mark.dataset)]) parametrize_use_dataset_not_supported = pytest.mark.parametrize( "use_dataset", [False, pytest.param(True, marks=pytest.mark.skip)]) parametrize_use_dataset_skip_buffer = pytest.mark.parametrize( From 5fd6d9e091ba4c25d8730809df0e3e3473650902 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 24 Mar 2020 17:11:40 +0100 Subject: [PATCH 07/26] non-deterministic cases due to use_threads --- python/pyarrow/parquet.py | 3 ++- python/pyarrow/tests/test_parquet.py | 17 ++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 1c59b98b238..03d5e3a397d 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -1428,7 +1428,8 @@ def read_table(source, columns=None, use_threads=True, metadata=None, format=parquat_format, partitioning="hive") if filters is not None and not isinstance(filters, ds.Expression): filters = _filters_to_expression(filters) - table = dataset.to_table(columns=columns, filter=filters) + table = dataset.to_table(columns=columns, filter=filters, + use_threads=use_threads) # remove ARROW:schema metadata, current parquet version doesn't # preserve this diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 6c969697f7c..bb2391e30db 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -188,9 +188,8 @@ def alltypes_sample(size=10000, seed=0, categorical=False): return pd.DataFrame(arrays) -# TODO(dataset) non-deterministic order -@parametrize_use_dataset_not_supported @pytest.mark.pandas +@parametrize_use_dataset @pytest.mark.parametrize('chunk_size', [None, 1000]) def test_pandas_parquet_2_0_roundtrip(tempdir, chunk_size, use_dataset): df = alltypes_sample(size=10000, categorical=True) @@ -201,7 +200,10 @@ def test_pandas_parquet_2_0_roundtrip(tempdir, chunk_size, use_dataset): _write_table(arrow_table, filename, version="2.0", coerce_timestamps='ms', chunk_size=chunk_size) - table_read = pq.read_pandas(filename) + # TODO(datasets) + use_threads = False if use_dataset and chunk_size is not None else True + table_read = pq.read_pandas( + filename, use_dataset=use_dataset, use_threads=use_threads) assert 
table_read.schema.pandas_metadata is not None assert arrow_table.schema.metadata == table_read.schema.metadata @@ -2274,7 +2276,9 @@ def test_read_multiple_files(tempdir, use_dataset): # Write a _SUCCESS.crc file (dirpath / '_SUCCESS.crc').touch() - def read_multiple_files(paths, columns=None, use_threads=True, **kwargs): + # TODO(datasets) changed to use_threads=False because otherwise the + # row order is not deterministic + def read_multiple_files(paths, columns=None, use_threads=False, **kwargs): dataset = pq.ParquetDataset(paths, use_dataset=use_dataset, **kwargs) return dataset.read(columns=columns, use_threads=use_threads) @@ -2654,7 +2658,10 @@ def _test_write_to_dataset_with_partitions(base_path, assert dataset_cols == set(output_table.schema.names) - input_table = dataset.read() + # TODO(datasets) changed to use_threads=False because otherwise the + # row order is not deterministic + kwargs = dict(use_threads=False) if use_dataset else {} + input_table = dataset.read(**kwargs) input_df = input_table.to_pandas() # Read data back in and compare with original DataFrame From 599192cc68a04636be003434ce85074bcd08af52 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 24 Mar 2020 20:37:16 +0100 Subject: [PATCH 08/26] move dataset creation into helper function --- python/pyarrow/parquet.py | 73 ++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 03d5e3a397d..feee6457459 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -163,6 +163,35 @@ def convert_single_predicate(col, op, val): return expr +def _dataset_from_legacy_args( + path_or_paths, filesystem=None, read_dictionary=None, buffer_size=None +): + """ + Create a pyarrow.dataset.FileSystemDataset to use inside read_table + and ParquetDataset. 
+ """ + import pyarrow.dataset as ds + import pyarrow.fs + + # map old filesystems to new one + if isinstance(filesystem, LocalFileSystem): + filesystem = pyarrow.fs.LocalFileSystem() + + # map additional arguments + # TODO raise warning when unsupported arguments are passed + reader_options = {} + if buffer_size: + reader_options.update(use_buffered_stream=True, + buffer_size=buffer_size) + if read_dictionary is not None: + reader_options.update(dict_columns=read_dictionary) + parquet_format = ds.ParquetFileFormat(reader_options=reader_options) + + dataset = ds.dataset(path_or_paths, filesystem=filesystem, + format=parquet_format, partitioning="hive") + return dataset + + # ---------------------------------------------------------------------- # Reading a single Parquet file @@ -1332,23 +1361,9 @@ class ParquetDatasetV2: """ def __init__(self, path_or_paths, filesystem=None, filters=None, read_dictionary=None, buffer_size=None): - import pyarrow.dataset as ds - import pyarrow.fs - - # map old filesystems to new one - if isinstance(filesystem, LocalFileSystem): - filesystem = pyarrow.fs.LocalFileSystem() - - reader_options = {} - if buffer_size: - reader_options.update(use_buffered_stream=True, - buffer_size=buffer_size) - if read_dictionary is not None: - reader_options.update(dict_columns=read_dictionary) - parquat_format = ds.ParquetFileFormat(reader_options=reader_options) - - dataset = ds.dataset(path_or_paths, filesystem=filesystem, - format=parquat_format, partitioning="hive") + dataset = _dataset_from_legacy_args( + path_or_paths, filesystem=filesystem, + read_dictionary=read_dictionary, buffer_size=buffer_size) self._dataset = dataset self.filters = filters if filters is not None: @@ -1360,7 +1375,7 @@ def __init__(self, path_or_paths, filesystem=None, filters=None, def schema(self): return self._dataset.schema - def read(self, columns=None, use_threads=False): + def read(self, columns=None, use_threads=True): return self._dataset.to_table( columns=columns, filter=self.filter_expression, use_threads=use_threads @@ -1408,24 +1423,10 @@ def read_table(source, columns=None, use_threads=True, metadata=None, buffer_size=0, use_dataset=False): if use_dataset: import pyarrow.dataset as ds - import pyarrow.fs - - # map old filesystems to new one - if isinstance(filesystem, LocalFileSystem): - filesystem = pyarrow.fs.LocalFileSystem() - - # map additional arguments - # TODO raise warning when unsupported arguments are passed - reader_options = {} - if buffer_size: - reader_options.update(use_buffered_stream=True, - buffer_size=buffer_size) - if read_dictionary is not None: - reader_options.update(dict_columns=read_dictionary) - parquat_format = ds.ParquetFileFormat(reader_options=reader_options) - - dataset = ds.dataset(source, filesystem=filesystem, - format=parquat_format, partitioning="hive") + dataset = _dataset_from_legacy_args( + source, filesystem=filesystem, read_dictionary=read_dictionary, + buffer_size=buffer_size) + if filters is not None and not isinstance(filters, ds.Expression): filters = _filters_to_expression(filters) table = dataset.to_table(columns=columns, filter=filters, From 8a780d1d240aad26c2ab5ef73a4bc7256d7b649f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 26 Mar 2020 16:28:02 +0100 Subject: [PATCH 09/26] add support for use_pandas_metadata + some cleanup of the tests --- python/pyarrow/parquet.py | 45 +++++++++++++++--- python/pyarrow/tests/test_parquet.py | 71 ++++++++++++++-------------- 2 files changed, 74 insertions(+), 42 deletions(-) diff 
--git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index feee6457459..9a422728bee 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -174,6 +174,7 @@ def _dataset_from_legacy_args( import pyarrow.fs # map old filesystems to new one + # TODO(dataset) deal with other file systems if isinstance(filesystem, LocalFileSystem): filesystem = pyarrow.fs.LocalFileSystem() @@ -1122,10 +1123,10 @@ def __new__(cls, path_or_paths=None, filesystem=None, schema=None, memory_map=False, buffer_size=0, use_dataset=False): if use_dataset: # TODO raise warning on unsupported keywords - return ParquetDatasetV2(path_or_paths, filesystem=filesystem, - filters=filters, - read_dictionary=read_dictionary, - buffer_size=buffer_size) + return _ParquetDatasetV2(path_or_paths, filesystem=filesystem, + filters=filters, + read_dictionary=read_dictionary, + buffer_size=buffer_size) self = object.__new__(cls) return self @@ -1355,7 +1356,7 @@ def _make_manifest(path_or_paths, fs, pathsep='/', metadata_nthreads=1, return pieces, partitions, common_metadata_path, metadata_path -class ParquetDatasetV2: +class _ParquetDatasetV2: """ ParquetDataset shim using the Dataset API under the hood. """ @@ -1375,12 +1376,42 @@ def __init__(self, path_or_paths, filesystem=None, filters=None, def schema(self): return self._dataset.schema - def read(self, columns=None, use_threads=True): - return self._dataset.to_table( + def read(self, columns=None, use_threads=True, use_pandas_metadata=False): + + # if use_pandas_metadata, we need to include index columns in the + # column selection, to be able to restore those in the pandas DataFrame + metadata = self._dataset.schema.metadata + if use_pandas_metadata: + if metadata and b'pandas' in metadata: + index_columns = _get_pandas_index_columns(metadata) + + columns = list(columns) + for index_col in index_columns: + if index_col not in columns: + columns += [index_col] + + table = self._dataset.to_table( columns=columns, filter=self.filter_expression, use_threads=use_threads ) + # if use_pandas_metadata, restore the pandas metadata (which gets + # lost if doing a specific `columns` selection in to_table) + if use_pandas_metadata: + if metadata and b"pandas" in metadata: + new_metadata = table.schema.metadata or {} + new_metadata.update({b"pandas": metadata[b"pandas"]}) + table = table.replace_schema_metadata(new_metadata) + + return table + + def read_pandas(self, **kwargs): + """ + Read dataset including pandas metadata, if any. Other arguments passed + through to ParquetDataset.read, see docstring for further details. 
+ """ + return self.read(use_pandas_metadata=True, **kwargs) + @property def pieces(self): # TODO raise deprecation warning diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index bb2391e30db..f82e3ae185e 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -256,11 +256,11 @@ def test_memory_map(tempdir, use_dataset): _check_roundtrip(table, read_table_kwargs={'memory_map': True}, version='2.0', use_dataset=use_dataset) - # TODO add use_dataset to read_pandas as well filename = str(tempdir / 'tmp_file') with open(filename, 'wb') as f: _write_table(table, f, version='2.0') - table_read = pq.read_pandas(filename, memory_map=True) + table_read = pq.read_pandas(filename, memory_map=True, + use_dataset=use_dataset) assert table_read.equals(table) @@ -273,11 +273,11 @@ def test_enable_buffered_stream(tempdir, use_dataset): _check_roundtrip(table, read_table_kwargs={'buffer_size': 1025}, version='2.0', use_dataset=use_dataset) - # TODO add use_dataset to read_pandas as well filename = str(tempdir / 'tmp_file') with open(filename, 'wb') as f: _write_table(table, f, version='2.0') - table_read = pq.read_pandas(filename, buffer_size=4096) + table_read = pq.read_pandas(filename, buffer_size=4096, + use_dataset=use_dataset) assert table_read.equals(table) @@ -393,9 +393,9 @@ def test_pandas_parquet_custom_metadata(tempdir): 'step': 1}] -# TODO support read_pandas for use_dataset @pytest.mark.pandas -def test_pandas_parquet_column_multiindex(tempdir): +@parametrize_use_dataset +def test_pandas_parquet_column_multiindex(tempdir, use_dataset): df = alltypes_sample(size=10) df.columns = pd.MultiIndex.from_tuples( list(zip(df.columns, df.columns[::-1])), @@ -408,14 +408,16 @@ def test_pandas_parquet_column_multiindex(tempdir): _write_table(arrow_table, filename, version='2.0', coerce_timestamps='ms') - table_read = pq.read_pandas(filename) + table_read = pq.read_pandas(filename, use_dataset=use_dataset) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) -# TODO support read_pandas for use_dataset @pytest.mark.pandas -def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written(tempdir): +@parametrize_use_dataset +def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written( + tempdir, use_dataset +): df = alltypes_sample(size=10000) filename = tempdir / 'pandas_roundtrip.parquet' @@ -427,7 +429,7 @@ def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written(tempdir): assert js['columns'] _write_table(arrow_table, filename, version='2.0', coerce_timestamps='ms') - table_read = pq.read_pandas(filename) + table_read = pq.read_pandas(filename, use_dataset=use_dataset) js = table_read.schema.pandas_metadata assert not js['index_columns'] @@ -1964,22 +1966,12 @@ def test_invalid_pred_op(tempdir, use_dataset): use_dataset=use_dataset) assert dataset.read().num_rows == 0 - if not use_dataset: - with pytest.raises(ValueError): - pq.ParquetDataset(base_path, - filesystem=fs, - filters=[('integers', '!=', {3})], - use_dataset=use_dataset) - else: - # TODO(dataset) ARROW-8186: - # `ds.field('int') != {3}` returns bool instead of expression - with pytest.raises(TypeError): - # Datasets API gives filter expression that is always true - dataset = pq.ParquetDataset(base_path, - filesystem=fs, - filters=[('integers', '!=', {3})], - use_dataset=use_dataset) - assert dataset.read().num_rows == 5 + with pytest.raises((ValueError, TypeError)): + # dataset API returns TypeError when trying create invalid 
comparison + pq.ParquetDataset(base_path, + filesystem=fs, + filters=[('integers', '!=', {3})], + use_dataset=use_dataset) @pytest.mark.pandas @@ -2338,7 +2330,8 @@ def read_multiple_files(paths, columns=None, use_threads=False, **kwargs): @pytest.mark.pandas -def test_dataset_read_pandas(tempdir): +@parametrize_use_dataset +def test_dataset_read_pandas(tempdir, use_dataset): nfiles = 5 size = 5 @@ -2361,8 +2354,7 @@ def test_dataset_read_pandas(tempdir): frames.append(df) paths.append(path) - # TODO check read_pandas semantics - dataset = pq.ParquetDataset(dirpath) + dataset = pq.ParquetDataset(dirpath, use_dataset=use_dataset) columns = ['uint8', 'strings'] result = dataset.read_pandas(columns=columns).to_pandas() expected = pd.concat([x[columns] for x in frames]) @@ -2485,7 +2477,10 @@ def test_ignore_private_directories(tempdir, dir_prefix, use_dataset): (dirpath / '{}staging'.format(dir_prefix)).mkdir() dataset = pq.ParquetDataset(dirpath, use_dataset=use_dataset) - assert set(map(str, paths)) == {x.path for x in dataset.pieces} + if not use_dataset: + assert set(map(str, paths)) == {x.path for x in dataset.pieces} + else: + assert set(map(str, paths)) == set(dataset._dataset.files) @pytest.mark.pandas @@ -2504,7 +2499,10 @@ def test_ignore_hidden_files_dot(tempdir, use_dataset): f.write(b'gibberish') dataset = pq.ParquetDataset(dirpath, use_dataset=use_dataset) - assert set(map(str, paths)) == {x.path for x in dataset.pieces} + if not use_dataset: + assert set(map(str, paths)) == {x.path for x in dataset.pieces} + else: + assert set(map(str, paths)) == set(dataset._dataset.files) @pytest.mark.pandas @@ -2523,7 +2521,10 @@ def test_ignore_hidden_files_underscore(tempdir, use_dataset): f.write(b'abcd') dataset = pq.ParquetDataset(dirpath, use_dataset=use_dataset) - assert set(map(str, paths)) == {x.path for x in dataset.pieces} + if not use_dataset: + assert set(map(str, paths)) == {x.path for x in dataset.pieces} + else: + assert set(map(str, paths)) == set(dataset._dataset.files) @pytest.mark.pandas @@ -3688,8 +3689,7 @@ def test_multi_dataset_metadata(tempdir): assert md['serialized_size'] > 0 -# WONTFIX(dataset) schema unification now happens when dataset is created -@parametrize_use_dataset_not_supported +@parametrize_use_dataset @pytest.mark.pandas def test_filter_before_validate_schema(tempdir, use_dataset): # ARROW-4076 apply filter before schema validation @@ -3708,7 +3708,8 @@ def test_filter_before_validate_schema(tempdir, use_dataset): pq.write_table(table2, dir2 / 'data.parquet') # read single file using filter - table = pq.read_table(tempdir, filters=[[('A', '==', 0)]]) + table = pq.read_table(tempdir, filters=[[('A', '==', 0)]], + use_dataset=use_dataset) assert table.column('B').equals(pa.chunked_array([[1, 2, 3]])) From 31a2c8f8a2ef7a9a26f5e68e74e69c1762f20402 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 26 Mar 2020 18:00:23 +0100 Subject: [PATCH 10/26] rename use_dataset -> use_legacy_dataset --- python/pyarrow/parquet.py | 15 +- python/pyarrow/tests/test_parquet.py | 624 +++++++++++++++------------ 2 files changed, 345 insertions(+), 294 deletions(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 9a422728bee..92ae2f0d9a7 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -1120,8 +1120,8 @@ class ParquetDataset: def __new__(cls, path_or_paths=None, filesystem=None, schema=None, metadata=None, split_row_groups=False, validate_schema=True, filters=None, metadata_nthreads=1, read_dictionary=None, - 
memory_map=False, buffer_size=0, use_dataset=False): - if use_dataset: + memory_map=False, buffer_size=0, use_legacy_dataset=True): + if not use_legacy_dataset: # TODO raise warning on unsupported keywords return _ParquetDatasetV2(path_or_paths, filesystem=filesystem, filters=filters, @@ -1133,7 +1133,7 @@ def __new__(cls, path_or_paths=None, filesystem=None, schema=None, def __init__(self, path_or_paths, filesystem=None, schema=None, metadata=None, split_row_groups=False, validate_schema=True, filters=None, metadata_nthreads=1, read_dictionary=None, - memory_map=False, buffer_size=0, use_dataset=False): + memory_map=False, buffer_size=0, use_legacy_dataset=True): self._metadata = _ParquetDatasetMetadata() a_path = path_or_paths if isinstance(a_path, list): @@ -1451,8 +1451,8 @@ def pieces(self): def read_table(source, columns=None, use_threads=True, metadata=None, use_pandas_metadata=False, memory_map=False, read_dictionary=None, filesystem=None, filters=None, - buffer_size=0, use_dataset=False): - if use_dataset: + buffer_size=0, use_legacy_dataset=True): + if not use_legacy_dataset: import pyarrow.dataset as ds dataset = _dataset_from_legacy_args( source, filesystem=filesystem, read_dictionary=read_dictionary, @@ -1498,7 +1498,8 @@ def read_table(source, columns=None, use_threads=True, metadata=None, def read_pandas(source, columns=None, use_threads=True, memory_map=False, - metadata=None, filters=None, buffer_size=0, use_dataset=False): + metadata=None, filters=None, buffer_size=0, + use_legacy_dataset=True): return read_table( source, columns=columns, @@ -1508,7 +1509,7 @@ def read_pandas(source, columns=None, use_threads=True, memory_map=False, memory_map=memory_map, buffer_size=buffer_size, use_pandas_metadata=True, - use_dataset=use_dataset, + use_legacy_dataset=use_legacy_dataset, ) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index f82e3ae185e..bfa105a7dfc 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -56,17 +56,13 @@ def datadir(datadir): return datadir / 'parquet' -parametrize_use_dataset = pytest.mark.parametrize( - "use_dataset", [False, pytest.param(True, marks=pytest.mark.dataset)]) -parametrize_use_dataset_not_supported = pytest.mark.parametrize( - "use_dataset", [False, pytest.param(True, marks=pytest.mark.skip)]) -parametrize_use_dataset_skip_buffer = pytest.mark.parametrize( - "use_dataset", [False, pytest.param(True, marks=pytest.mark.skip)]) - - -# @pytest.fixture(params=[False, True]) -# def use_dataset(request): -# return request.param +parametrize_legacy_dataset = pytest.mark.parametrize( + "use_legacy_dataset", + [True, pytest.param(False, marks=pytest.mark.dataset)]) +parametrize_legacy_dataset_not_supported = pytest.mark.parametrize( + "use_legacy_dataset", [True, pytest.param(False, marks=pytest.mark.skip)]) +parametrize_legacy_dataset_skip_buffer = pytest.mark.parametrize( + "use_legacy_dataset", [True, pytest.param(False, marks=pytest.mark.skip)]) def _write_table(table, path, **kwargs): @@ -85,26 +81,26 @@ def _read_table(*args, **kwargs): def _roundtrip_table(table, read_table_kwargs=None, - write_table_kwargs=None, use_dataset=False): + write_table_kwargs=None, use_legacy_dataset=True): read_table_kwargs = read_table_kwargs or {} write_table_kwargs = write_table_kwargs or {} - if use_dataset: + if use_legacy_dataset: + buf = io.BytesIO() + _write_table(table, buf, **write_table_kwargs) + buf.seek(0) + return _read_table(buf, **read_table_kwargs) + else: from 
pyarrow.fs import _MockFileSystem mockfs = _MockFileSystem() with mockfs.open_output_stream("test") as out: _write_table(table, out, **write_table_kwargs) - return _read_table("test", filesystem=mockfs, use_dataset=True, + return _read_table("test", filesystem=mockfs, use_legacy_dataset=False, **read_table_kwargs) - else: - buf = io.BytesIO() - _write_table(table, buf, **write_table_kwargs) - buf.seek(0) - return _read_table(buf, **read_table_kwargs) def _check_roundtrip(table, expected=None, read_table_kwargs=None, - use_dataset=False, **write_table_kwargs): + use_legacy_dataset=True, **write_table_kwargs): if expected is None: expected = table @@ -113,42 +109,43 @@ def _check_roundtrip(table, expected=None, read_table_kwargs=None, # intentionally check twice result = _roundtrip_table(table, read_table_kwargs=read_table_kwargs, write_table_kwargs=write_table_kwargs, - use_dataset=use_dataset) + use_legacy_dataset=use_legacy_dataset) assert result.equals(expected) result = _roundtrip_table(result, read_table_kwargs=read_table_kwargs, write_table_kwargs=write_table_kwargs, - use_dataset=use_dataset) + use_legacy_dataset=use_legacy_dataset) assert result.equals(expected) -def _roundtrip_pandas_dataframe(df, write_kwargs, use_dataset=False): +def _roundtrip_pandas_dataframe(df, write_kwargs, use_legacy_dataset=True): table = pa.Table.from_pandas(df) - if use_dataset: - from pyarrow.fs import _MockFileSystem - mockfs = _MockFileSystem() - with mockfs.open_output_stream("test") as out: - _write_table(table, out, **write_kwargs) - - table1 = _read_table("test", filesystem=mockfs, use_dataset=True) - else: + if use_legacy_dataset: buf = io.BytesIO() _write_table(table, buf, **write_kwargs) buf.seek(0) table1 = _read_table(buf) + else: + from pyarrow.fs import _MockFileSystem + mockfs = _MockFileSystem() + with mockfs.open_output_stream("test") as out: + _write_table(table, out, **write_kwargs) + + table1 = _read_table( + "test", filesystem=mockfs, use_legacy_dataset=False) return table1.to_pandas() -@parametrize_use_dataset +@parametrize_legacy_dataset @pytest.mark.parametrize('dtype', [int, float]) -def test_single_pylist_column_roundtrip(tempdir, dtype, use_dataset): +def test_single_pylist_column_roundtrip(tempdir, dtype, use_legacy_dataset): filename = tempdir / 'single_{}_column.parquet'.format(dtype.__name__) data = [pa.array(list(map(dtype, range(5))))] table = pa.Table.from_arrays(data, names=['a']) _write_table(table, filename) - table_read = _read_table(filename, use_dataset=use_dataset) + table_read = _read_table(filename, use_legacy_dataset=use_legacy_dataset) for i in range(table.num_columns): col_written = table[i] col_read = table_read[i] @@ -189,9 +186,9 @@ def alltypes_sample(size=10000, seed=0, categorical=False): @pytest.mark.pandas -@parametrize_use_dataset +@parametrize_legacy_dataset @pytest.mark.parametrize('chunk_size', [None, 1000]) -def test_pandas_parquet_2_0_roundtrip(tempdir, chunk_size, use_dataset): +def test_pandas_parquet_2_0_roundtrip(tempdir, chunk_size, use_legacy_dataset): df = alltypes_sample(size=10000, categorical=True) filename = tempdir / 'pandas_roundtrip.parquet' @@ -201,9 +198,11 @@ def test_pandas_parquet_2_0_roundtrip(tempdir, chunk_size, use_dataset): _write_table(arrow_table, filename, version="2.0", coerce_timestamps='ms', chunk_size=chunk_size) # TODO(datasets) - use_threads = False if use_dataset and chunk_size is not None else True + use_threads = ( + False if not use_legacy_dataset and chunk_size is not None else True) table_read = 
pq.read_pandas( - filename, use_dataset=use_dataset, use_threads=use_threads) + filename, use_legacy_dataset=use_legacy_dataset, + use_threads=use_threads) assert table_read.schema.pandas_metadata is not None assert arrow_table.schema.metadata == table_read.schema.metadata @@ -218,8 +217,8 @@ def test_parquet_invalid_version(tempdir): _write_table(table, tempdir / 'test_version.parquet', version="2.2") -@parametrize_use_dataset -def test_set_data_page_size(use_dataset): +@parametrize_legacy_dataset +def test_set_data_page_size(use_legacy_dataset): arr = pa.array([1, 2, 3] * 100000) t = pa.Table.from_arrays([arr], names=['f0']) @@ -227,75 +226,77 @@ def test_set_data_page_size(use_dataset): page_sizes = [2 << 16, 2 << 18] for target_page_size in page_sizes: _check_roundtrip(t, data_page_size=target_page_size, - use_dataset=use_dataset) + use_legacy_dataset=use_legacy_dataset) @pytest.mark.pandas -@parametrize_use_dataset -def test_chunked_table_write(use_dataset): +@parametrize_legacy_dataset +def test_chunked_table_write(use_legacy_dataset): # ARROW-232 df = alltypes_sample(size=10) batch = pa.RecordBatch.from_pandas(df) table = pa.Table.from_batches([batch] * 3) - _check_roundtrip(table, version='2.0', use_dataset=use_dataset) + _check_roundtrip( + table, version='2.0', use_legacy_dataset=use_legacy_dataset) df, _ = dataframe_with_lists() batch = pa.RecordBatch.from_pandas(df) table = pa.Table.from_batches([batch] * 3) - _check_roundtrip(table, version='2.0', use_dataset=use_dataset) + _check_roundtrip( + table, version='2.0', use_legacy_dataset=use_legacy_dataset) @pytest.mark.pandas -@parametrize_use_dataset -def test_memory_map(tempdir, use_dataset): +@parametrize_legacy_dataset +def test_memory_map(tempdir, use_legacy_dataset): df = alltypes_sample(size=10) table = pa.Table.from_pandas(df) # TODO(dataset) memory_map is still ignored for now _check_roundtrip(table, read_table_kwargs={'memory_map': True}, - version='2.0', use_dataset=use_dataset) + version='2.0', use_legacy_dataset=use_legacy_dataset) filename = str(tempdir / 'tmp_file') with open(filename, 'wb') as f: _write_table(table, f, version='2.0') table_read = pq.read_pandas(filename, memory_map=True, - use_dataset=use_dataset) + use_legacy_dataset=use_legacy_dataset) assert table_read.equals(table) @pytest.mark.pandas -@parametrize_use_dataset -def test_enable_buffered_stream(tempdir, use_dataset): +@parametrize_legacy_dataset +def test_enable_buffered_stream(tempdir, use_legacy_dataset): df = alltypes_sample(size=10) table = pa.Table.from_pandas(df) _check_roundtrip(table, read_table_kwargs={'buffer_size': 1025}, - version='2.0', use_dataset=use_dataset) + version='2.0', use_legacy_dataset=use_legacy_dataset) filename = str(tempdir / 'tmp_file') with open(filename, 'wb') as f: _write_table(table, f, version='2.0') table_read = pq.read_pandas(filename, buffer_size=4096, - use_dataset=use_dataset) + use_legacy_dataset=use_legacy_dataset) assert table_read.equals(table) -@parametrize_use_dataset -def test_special_chars_filename(tempdir, use_dataset): +@parametrize_legacy_dataset +def test_special_chars_filename(tempdir, use_legacy_dataset): table = pa.Table.from_arrays([pa.array([42])], ["ints"]) filename = "foo # bar" path = tempdir / filename assert not path.exists() _write_table(table, str(path)) assert path.exists() - table_read = _read_table(str(path), use_dataset=use_dataset) + table_read = _read_table(str(path), use_legacy_dataset=use_legacy_dataset) assert table_read.equals(table) @pytest.mark.pandas 
-@parametrize_use_dataset -def test_empty_table_roundtrip(use_dataset): +@parametrize_legacy_dataset +def test_empty_table_roundtrip(use_legacy_dataset): df = alltypes_sample(size=10) # Create a non-empty table to infer the types correctly, then slice to 0 @@ -306,27 +307,28 @@ def test_empty_table_roundtrip(use_dataset): assert table.schema.field('null').type == pa.null() assert table.schema.field('null_list').type == pa.list_(pa.null()) - _check_roundtrip(table, version='2.0', use_dataset=use_dataset) + _check_roundtrip( + table, version='2.0', use_legacy_dataset=use_legacy_dataset) @pytest.mark.pandas -@parametrize_use_dataset -def test_empty_table_no_columns(use_dataset): +@parametrize_legacy_dataset +def test_empty_table_no_columns(use_legacy_dataset): df = pd.DataFrame() empty = pa.Table.from_pandas(df, preserve_index=False) - _check_roundtrip(empty, use_dataset=use_dataset) + _check_roundtrip(empty, use_legacy_dataset=use_legacy_dataset) -@parametrize_use_dataset -def test_empty_lists_table_roundtrip(use_dataset): +@parametrize_legacy_dataset +def test_empty_lists_table_roundtrip(use_legacy_dataset): # ARROW-2744: Shouldn't crash when writing an array of empty lists arr = pa.array([[], []], type=pa.list_(pa.int32())) table = pa.Table.from_arrays([arr], ["A"]) - _check_roundtrip(table, use_dataset=use_dataset) + _check_roundtrip(table, use_legacy_dataset=use_legacy_dataset) -@parametrize_use_dataset -def test_nested_list_nonnullable_roundtrip_bug(use_dataset): +@parametrize_legacy_dataset +def test_nested_list_nonnullable_roundtrip_bug(use_legacy_dataset): # Reproduce failure in ARROW-5630 typ = pa.list_(pa.field("item", pa.float32(), False)) num_rows = 10000 @@ -334,12 +336,13 @@ def test_nested_list_nonnullable_roundtrip_bug(use_dataset): pa.array(([[0] * ((i + 5) % 10) for i in range(0, 10)] * (num_rows // 10)), type=typ) ], ['a']) - _check_roundtrip(t, data_page_size=4096, use_dataset=use_dataset) + _check_roundtrip( + t, data_page_size=4096, use_legacy_dataset=use_legacy_dataset) @pytest.mark.pandas -@parametrize_use_dataset_skip_buffer -def test_pandas_parquet_datetime_tz(use_dataset): +@parametrize_legacy_dataset_skip_buffer +def test_pandas_parquet_datetime_tz(use_legacy_dataset): s = pd.Series([datetime.datetime(2017, 9, 6)]) s = s.dt.tz_localize('utc') @@ -364,13 +367,14 @@ def test_pandas_parquet_datetime_tz(use_dataset): @pytest.mark.pandas -@parametrize_use_dataset -def test_datetime_timezone_tzinfo(use_dataset): +@parametrize_legacy_dataset +def test_datetime_timezone_tzinfo(use_legacy_dataset): value = datetime.datetime(2018, 1, 1, 1, 23, 45, tzinfo=datetime.timezone.utc) df = pd.DataFrame({'foo': [value]}) - _roundtrip_pandas_dataframe(df, write_kwargs={}, use_dataset=use_dataset) + _roundtrip_pandas_dataframe( + df, write_kwargs={}, use_legacy_dataset=use_legacy_dataset) @pytest.mark.pandas @@ -394,8 +398,8 @@ def test_pandas_parquet_custom_metadata(tempdir): @pytest.mark.pandas -@parametrize_use_dataset -def test_pandas_parquet_column_multiindex(tempdir, use_dataset): +@parametrize_legacy_dataset +def test_pandas_parquet_column_multiindex(tempdir, use_legacy_dataset): df = alltypes_sample(size=10) df.columns = pd.MultiIndex.from_tuples( list(zip(df.columns, df.columns[::-1])), @@ -408,15 +412,16 @@ def test_pandas_parquet_column_multiindex(tempdir, use_dataset): _write_table(arrow_table, filename, version='2.0', coerce_timestamps='ms') - table_read = pq.read_pandas(filename, use_dataset=use_dataset) + table_read = pq.read_pandas( + filename, 
use_legacy_dataset=use_legacy_dataset) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @pytest.mark.pandas -@parametrize_use_dataset +@parametrize_legacy_dataset def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written( - tempdir, use_dataset + tempdir, use_legacy_dataset ): df = alltypes_sample(size=10000) @@ -429,7 +434,8 @@ def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written( assert js['columns'] _write_table(arrow_table, filename, version='2.0', coerce_timestamps='ms') - table_read = pq.read_pandas(filename, use_dataset=use_dataset) + table_read = pq.read_pandas( + filename, use_legacy_dataset=use_legacy_dataset) js = table_read.schema.pandas_metadata assert not js['index_columns'] @@ -441,8 +447,8 @@ def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written( @pytest.mark.pandas -@parametrize_use_dataset -def test_pandas_parquet_1_0_roundtrip(tempdir, use_dataset): +@parametrize_legacy_dataset +def test_pandas_parquet_1_0_roundtrip(tempdir, use_legacy_dataset): size = 10000 np.random.seed(0) df = pd.DataFrame({ @@ -464,7 +470,7 @@ def test_pandas_parquet_1_0_roundtrip(tempdir, use_dataset): filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df) _write_table(arrow_table, filename, version='1.0') - table_read = _read_table(filename, use_dataset=use_dataset) + table_read = _read_table(filename, use_legacy_dataset=use_legacy_dataset) df_read = table_read.to_pandas() # We pass uint32_t as int64_t if we write Parquet version 1.0 @@ -474,13 +480,13 @@ def test_pandas_parquet_1_0_roundtrip(tempdir, use_dataset): @pytest.mark.pandas -@parametrize_use_dataset -def test_multiple_path_types(tempdir, use_dataset): +@parametrize_legacy_dataset +def test_multiple_path_types(tempdir, use_legacy_dataset): # Test compatibility with PEP 519 path-like objects path = tempdir / 'zzz.parquet' df = pd.DataFrame({'x': np.arange(10, dtype=np.int64)}) _write_table(df, path) - table_read = _read_table(path, use_dataset=use_dataset) + table_read = _read_table(path, use_legacy_dataset=use_legacy_dataset) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @@ -488,15 +494,15 @@ def test_multiple_path_types(tempdir, use_dataset): path = str(tempdir) + 'zzz.parquet' df = pd.DataFrame({'x': np.arange(10, dtype=np.int64)}) _write_table(df, path) - table_read = _read_table(path, use_dataset=use_dataset) + table_read = _read_table(path, use_legacy_dataset=use_legacy_dataset) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) # TODO(dataset) duplicate column selection actually gives duplicate columns now @pytest.mark.pandas -@parametrize_use_dataset_not_supported -def test_pandas_column_selection(tempdir, use_dataset): +@parametrize_legacy_dataset_not_supported +def test_pandas_column_selection(tempdir, use_legacy_dataset): size = 10000 np.random.seed(0) df = pd.DataFrame({ @@ -507,7 +513,7 @@ def test_pandas_column_selection(tempdir, use_dataset): arrow_table = pa.Table.from_pandas(df) _write_table(arrow_table, filename) table_read = _read_table( - filename, columns=['uint8'], use_dataset=use_dataset) + filename, columns=['uint8'], use_legacy_dataset=use_legacy_dataset) df_read = table_read.to_pandas() tm.assert_frame_equal(df[['uint8']], df_read) @@ -515,7 +521,8 @@ def test_pandas_column_selection(tempdir, use_dataset): # ARROW-4267: Selection of duplicate columns still leads to these columns # being read uniquely. 
table_read = _read_table( - filename, columns=['uint8', 'uint8'], use_dataset=use_dataset) + filename, columns=['uint8', 'uint8'], + use_legacy_dataset=use_legacy_dataset) df_read = table_read.to_pandas() tm.assert_frame_equal(df[['uint8']], df_read) @@ -555,21 +562,22 @@ def _test_dataframe(size=10000, seed=0): # TODO NativeFile support @pytest.mark.pandas -@parametrize_use_dataset_skip_buffer -def test_pandas_parquet_native_file_roundtrip(tempdir, use_dataset): +@parametrize_legacy_dataset_skip_buffer +def test_pandas_parquet_native_file_roundtrip(tempdir, use_legacy_dataset): df = _test_dataframe(10000) arrow_table = pa.Table.from_pandas(df) imos = pa.BufferOutputStream() _write_table(arrow_table, imos, version="2.0") buf = imos.getvalue() reader = pa.BufferReader(buf) - df_read = _read_table(reader, use_dataset=use_dataset).to_pandas() + df_read = _read_table( + reader, use_legacy_dataset=use_legacy_dataset).to_pandas() tm.assert_frame_equal(df, df_read) @pytest.mark.pandas -@parametrize_use_dataset_skip_buffer -def test_parquet_incremental_file_build(tempdir, use_dataset): +@parametrize_legacy_dataset_skip_buffer +def test_parquet_incremental_file_build(tempdir, use_legacy_dataset): df = _test_dataframe(100) df['unique_id'] = 0 @@ -589,15 +597,16 @@ def test_parquet_incremental_file_build(tempdir, use_dataset): writer.close() buf = out.getvalue() - result = _read_table(pa.BufferReader(buf), use_dataset=use_dataset) + result = _read_table( + pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset) expected = pd.concat(frames, ignore_index=True) tm.assert_frame_equal(result.to_pandas(), expected) @pytest.mark.pandas -@parametrize_use_dataset_skip_buffer -def test_read_pandas_column_subset(tempdir, use_dataset): +@parametrize_legacy_dataset_skip_buffer +def test_read_pandas_column_subset(tempdir, use_legacy_dataset): df = _test_dataframe(10000) arrow_table = pa.Table.from_pandas(df) imos = pa.BufferOutputStream() @@ -605,21 +614,23 @@ def test_read_pandas_column_subset(tempdir, use_dataset): buf = imos.getvalue() reader = pa.BufferReader(buf) df_read = pq.read_pandas( - reader, columns=['strings', 'uint8'], use_dataset=use_dataset + reader, columns=['strings', 'uint8'], + use_legacy_dataset=use_legacy_dataset ).to_pandas() tm.assert_frame_equal(df[['strings', 'uint8']], df_read) @pytest.mark.pandas -@parametrize_use_dataset_skip_buffer -def test_pandas_parquet_empty_roundtrip(tempdir, use_dataset): +@parametrize_legacy_dataset_skip_buffer +def test_pandas_parquet_empty_roundtrip(tempdir, use_legacy_dataset): df = _test_dataframe(0) arrow_table = pa.Table.from_pandas(df) imos = pa.BufferOutputStream() _write_table(arrow_table, imos, version="2.0") buf = imos.getvalue() reader = pa.BufferReader(buf) - df_read = _read_table(reader, use_dataset=use_dataset).to_pandas() + df_read = _read_table( + reader, use_legacy_dataset=use_legacy_dataset).to_pandas() tm.assert_frame_equal(df, df_read) @@ -649,8 +660,8 @@ def test_pandas_can_write_nested_data(tempdir): @pytest.mark.pandas -@parametrize_use_dataset_skip_buffer -def test_pandas_parquet_pyfile_roundtrip(tempdir, use_dataset): +@parametrize_legacy_dataset_skip_buffer +def test_pandas_parquet_pyfile_roundtrip(tempdir, use_legacy_dataset): filename = tempdir / 'pandas_pyfile_roundtrip.parquet' size = 5 df = pd.DataFrame({ @@ -668,14 +679,14 @@ def test_pandas_parquet_pyfile_roundtrip(tempdir, use_dataset): data = io.BytesIO(filename.read_bytes()) - table_read = _read_table(data, use_dataset=use_dataset) + table_read = _read_table(data, 
use_legacy_dataset=use_legacy_dataset) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @pytest.mark.pandas -@parametrize_use_dataset -def test_pandas_parquet_configuration_options(tempdir, use_dataset): +@parametrize_legacy_dataset +def test_pandas_parquet_configuration_options(tempdir, use_legacy_dataset): size = 10000 np.random.seed(0) df = pd.DataFrame({ @@ -697,14 +708,16 @@ def test_pandas_parquet_configuration_options(tempdir, use_dataset): for use_dictionary in [True, False]: _write_table(arrow_table, filename, version='2.0', use_dictionary=use_dictionary) - table_read = _read_table(filename, use_dataset=use_dataset) + table_read = _read_table( + filename, use_legacy_dataset=use_legacy_dataset) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) for write_statistics in [True, False]: _write_table(arrow_table, filename, version='2.0', write_statistics=write_statistics) - table_read = _read_table(filename, use_dataset=use_dataset) + table_read = _read_table(filename, + use_legacy_dataset=use_legacy_dataset) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @@ -714,7 +727,8 @@ def test_pandas_parquet_configuration_options(tempdir, use_dataset): continue _write_table(arrow_table, filename, version='2.0', compression=compression) - table_read = _read_table(filename, use_dataset=use_dataset) + table_read = _read_table( + filename, use_legacy_dataset=use_legacy_dataset) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @@ -733,8 +747,8 @@ def make_sample_file(table_or_df): return pq.ParquetFile(buf) -@parametrize_use_dataset -def test_byte_stream_split(use_dataset): +@parametrize_legacy_dataset +def test_byte_stream_split(use_legacy_dataset): # This is only a smoke test. arr_float = pa.array(list(map(float, range(100)))) arr_int = pa.array(list(map(int, range(100)))) @@ -768,28 +782,31 @@ def test_byte_stream_split(use_dataset): table = pa.Table.from_arrays([arr_int], names=['tmp']) with pytest.raises(IOError): _check_roundtrip(table, expected=table, use_byte_stream_split=True, - use_dictionary=False, use_dataset=use_dataset) + use_dictionary=False, + use_legacy_dataset=use_legacy_dataset) -@parametrize_use_dataset -def test_compression_level(use_dataset): +@parametrize_legacy_dataset +def test_compression_level(use_legacy_dataset): arr = pa.array(list(map(int, range(1000)))) data = [arr, arr] table = pa.Table.from_arrays(data, names=['a', 'b']) # Check one compression level. _check_roundtrip(table, expected=table, compression="gzip", - compression_level=1, use_dataset=use_dataset) + compression_level=1, + use_legacy_dataset=use_legacy_dataset) # Check another one to make sure that compression_level=1 does not # coincide with the default one in Arrow. _check_roundtrip(table, expected=table, compression="gzip", - compression_level=5, use_dataset=use_dataset) + compression_level=5, + use_legacy_dataset=use_legacy_dataset) # Check that the user can provide a compression level per column _check_roundtrip(table, expected=table, compression="gzip", compression_level={'a': 2, 'b': 3}, - use_dataset=use_dataset) + use_legacy_dataset=use_legacy_dataset) # Check that specifying a compression level for a codec which does allow # specifying one, results into an error. 
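test_compression_level above drives the per-column compression_level mapping through _check_roundtrip; written out against pyarrow.parquet directly, the same call pattern looks roughly like the sketch below (the temporary path and column names are illustrative).

import tempfile

import pyarrow as pa
import pyarrow.parquet as pq

# Per-column compression levels, as exercised through _check_roundtrip
# above; the directory and column names are illustrative.
table = pa.table({"a": list(range(1000)), "b": list(range(1000))})
with tempfile.TemporaryDirectory() as tmp:
    path = tmp + "/compression_level.parquet"
    pq.write_table(table, path, compression="gzip",
                   compression_level={"a": 2, "b": 3})
    assert pq.read_table(path).equals(table)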
@@ -1430,8 +1447,8 @@ def test_fixed_size_binary(): @pytest.mark.pandas -@parametrize_use_dataset_skip_buffer -def test_multithreaded_read(use_dataset): +@parametrize_legacy_dataset_skip_buffer +def test_multithreaded_read(use_legacy_dataset): df = alltypes_sample(size=10000) table = pa.Table.from_pandas(df) @@ -1440,17 +1457,19 @@ def test_multithreaded_read(use_dataset): _write_table(table, buf, compression='SNAPPY', version='2.0') buf.seek(0) - table1 = _read_table(buf, use_threads=True, use_dataset=use_dataset) + table1 = _read_table( + buf, use_threads=True, use_legacy_dataset=use_legacy_dataset) buf.seek(0) - table2 = _read_table(buf, use_threads=False, use_dataset=use_dataset) + table2 = _read_table( + buf, use_threads=False, use_legacy_dataset=use_legacy_dataset) assert table1.equals(table2) @pytest.mark.pandas -@parametrize_use_dataset_skip_buffer -def test_min_chunksize(use_dataset): +@parametrize_legacy_dataset_skip_buffer +def test_min_chunksize(use_legacy_dataset): data = pd.DataFrame([np.arange(4)], columns=['A', 'B', 'C', 'D']) table = pa.Table.from_pandas(data.reset_index()) @@ -1458,7 +1477,7 @@ def test_min_chunksize(use_dataset): _write_table(table, buf, chunk_size=-1) buf.seek(0) - result = _read_table(buf, use_dataset=use_dataset) + result = _read_table(buf, use_legacy_dataset=use_legacy_dataset) assert result.equals(table) @@ -1657,10 +1676,10 @@ def test_partition_set_dictionary_type(): @pytest.mark.pandas -@parametrize_use_dataset -def test_read_partitioned_directory(tempdir, use_dataset): +@parametrize_legacy_dataset +def test_read_partitioned_directory(tempdir, use_legacy_dataset): fs = LocalFileSystem.get_instance() - _partition_test_for_filesystem(fs, tempdir, use_dataset) + _partition_test_for_filesystem(fs, tempdir, use_legacy_dataset) @pytest.mark.pandas @@ -1681,8 +1700,8 @@ def test_create_parquet_dataset_multi_threaded(tempdir): @pytest.mark.pandas -@parametrize_use_dataset -def test_equivalency(tempdir, use_dataset): +@parametrize_legacy_dataset +def test_equivalency(tempdir, use_legacy_dataset): fs = LocalFileSystem.get_instance() base_path = tempdir @@ -1710,7 +1729,7 @@ def test_equivalency(tempdir, use_dataset): base_path, filesystem=fs, filters=[('integer', '=', 1), ('string', '!=', 'b'), ('boolean', '==', True)], - use_dataset=use_dataset, + use_legacy_dataset=use_legacy_dataset, ) table = dataset.read() result_df = (table.to_pandas().reset_index(drop=True)) @@ -1732,7 +1751,8 @@ def test_equivalency(tempdir, use_dataset): [('integer', '=', 0), ('boolean', '==', 'False')] ] dataset = pq.ParquetDataset( - base_path, filesystem=fs, filters=filters, use_dataset=use_dataset) + base_path, filesystem=fs, filters=filters, + use_legacy_dataset=use_legacy_dataset) table = dataset.read() result_df = table.to_pandas().reset_index(drop=True) @@ -1748,7 +1768,7 @@ def test_equivalency(tempdir, use_dataset): assert df_filter_2.sum() > 0 assert result_df.shape[0] == (df_filter_1.sum() + df_filter_2.sum()) - if not use_dataset: + if use_legacy_dataset: # Check for \0 in predicate values. Until they are correctly # implemented in ARROW-3391, they would otherwise lead to weird # results with the current code. 
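
The filter syntax exercised by test_equivalency above follows disjunctive normal form: the outer list is an OR of inner lists, each inner list an AND of single-column predicates. A minimal sketch with an illustrative dataset path:

    import pyarrow.parquet as pq

    # Keep rows where integer == 1 AND string != 'b' AND boolean == True.
    # With use_legacy_dataset=False the predicate applies to the files
    # themselves, not only to the hive partition keys.
    dataset = pq.ParquetDataset(
        'path/to/partitioned_dataset',
        filters=[[('integer', '=', 1), ('string', '!=', 'b'),
                  ('boolean', '==', True)]],
        use_legacy_dataset=False)
    table = dataset.read()
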
@@ -1762,13 +1782,14 @@ def test_equivalency(tempdir, use_dataset): for filters in [[[('string', '==', b'1\0a')]], [[('string', '==', '1\0a')]]]: dataset = pq.ParquetDataset( - base_path, filesystem=fs, filters=filters, use_dataset=True) + base_path, filesystem=fs, filters=filters, + use_legacy_dataset=False) assert dataset.read().num_rows == 0 @pytest.mark.pandas -@parametrize_use_dataset -def test_cutoff_exclusive_integer(tempdir, use_dataset): +@parametrize_legacy_dataset +def test_cutoff_exclusive_integer(tempdir, use_legacy_dataset): fs = LocalFileSystem.get_instance() base_path = tempdir @@ -1791,7 +1812,7 @@ def test_cutoff_exclusive_integer(tempdir, use_dataset): ('integers', '<', 4), ('integers', '>', 1), ], - use_dataset=use_dataset + use_legacy_dataset=use_legacy_dataset ) table = dataset.read() result_df = (table.to_pandas() @@ -1803,14 +1824,14 @@ def test_cutoff_exclusive_integer(tempdir, use_dataset): @pytest.mark.pandas -@parametrize_use_dataset +@parametrize_legacy_dataset @pytest.mark.xfail( - # different error with use_datasets because result_df is no longer + # different error with use_legacy_datasets because result_df is no longer # categorical raises=(TypeError, AssertionError), reason='Loss of type information in creation of categoricals.' ) -def test_cutoff_exclusive_datetime(tempdir, use_dataset): +def test_cutoff_exclusive_datetime(tempdir, use_legacy_dataset): fs = LocalFileSystem.get_instance() base_path = tempdir @@ -1839,7 +1860,7 @@ def test_cutoff_exclusive_datetime(tempdir, use_dataset): ('dates', '<', "2018-04-12"), ('dates', '>', "2018-04-10") ], - use_dataset=use_dataset + use_legacy_dataset=use_legacy_dataset ) table = dataset.read() result_df = (table.to_pandas() @@ -1854,8 +1875,8 @@ def test_cutoff_exclusive_datetime(tempdir, use_dataset): @pytest.mark.pandas -@parametrize_use_dataset -def test_inclusive_integer(tempdir, use_dataset): +@parametrize_legacy_dataset +def test_inclusive_integer(tempdir, use_legacy_dataset): fs = LocalFileSystem.get_instance() base_path = tempdir @@ -1878,7 +1899,7 @@ def test_inclusive_integer(tempdir, use_dataset): ('integers', '<=', 3), ('integers', '>=', 2), ], - use_dataset=use_dataset + use_legacy_dataset=use_legacy_dataset ) table = dataset.read() result_df = (table.to_pandas() @@ -1890,8 +1911,8 @@ def test_inclusive_integer(tempdir, use_dataset): @pytest.mark.pandas -@parametrize_use_dataset -def test_inclusive_set(tempdir, use_dataset): +@parametrize_legacy_dataset +def test_inclusive_set(tempdir, use_legacy_dataset): fs = LocalFileSystem.get_instance() base_path = tempdir @@ -1917,7 +1938,7 @@ def test_inclusive_set(tempdir, use_dataset): base_path, filesystem=fs, filters=[('integer', 'in', {1}), ('string', 'in', {'a', 'b'}), ('boolean', 'in', {True})], - use_dataset=use_dataset + use_legacy_dataset=use_legacy_dataset ) table = dataset.read() result_df = (table.to_pandas().reset_index(drop=True)) @@ -1928,8 +1949,8 @@ def test_inclusive_set(tempdir, use_dataset): @pytest.mark.pandas -@parametrize_use_dataset -def test_invalid_pred_op(tempdir, use_dataset): +@parametrize_legacy_dataset +def test_invalid_pred_op(tempdir, use_legacy_dataset): fs = LocalFileSystem.get_instance() base_path = tempdir @@ -1950,20 +1971,20 @@ def test_invalid_pred_op(tempdir, use_dataset): pq.ParquetDataset(base_path, filesystem=fs, filters=[('integers', '=<', 3), ], - use_dataset=use_dataset) + use_legacy_dataset=use_legacy_dataset) - if not use_dataset: + if use_legacy_dataset: with pytest.raises(ValueError): 
pq.ParquetDataset(base_path, filesystem=fs, filters=[('integers', 'in', set()), ], - use_dataset=use_dataset) + use_legacy_dataset=use_legacy_dataset) else: # Dataset API returns empty table instead dataset = pq.ParquetDataset(base_path, filesystem=fs, filters=[('integers', 'in', set()), ], - use_dataset=use_dataset) + use_legacy_dataset=use_legacy_dataset) assert dataset.read().num_rows == 0 with pytest.raises((ValueError, TypeError)): @@ -1971,7 +1992,7 @@ def test_invalid_pred_op(tempdir, use_dataset): pq.ParquetDataset(base_path, filesystem=fs, filters=[('integers', '!=', {3})], - use_dataset=use_dataset) + use_legacy_dataset=use_legacy_dataset) @pytest.mark.pandas @@ -2060,7 +2081,7 @@ def test_read_partitioned_directory_s3fs(s3_example): dataset.read() -def _partition_test_for_filesystem(fs, base_path, use_dataset=False): +def _partition_test_for_filesystem(fs, base_path, use_legacy_dataset=True): foo_keys = [0, 1] bar_keys = ['a', 'b', 'c'] partition_spec = [ @@ -2079,7 +2100,7 @@ def _partition_test_for_filesystem(fs, base_path, use_dataset=False): _generate_partition_directories(fs, base_path, partition_spec, df) dataset = pq.ParquetDataset( - base_path, filesystem=fs, use_dataset=use_dataset) + base_path, filesystem=fs, use_legacy_dataset=use_legacy_dataset) table = dataset.read() result_df = (table.to_pandas() .sort_values(by='index') @@ -2088,7 +2109,7 @@ def _partition_test_for_filesystem(fs, base_path, use_dataset=False): expected_df = (df.sort_values(by='index') .reset_index(drop=True) .reindex(columns=result_df.columns)) - if not use_dataset: + if use_legacy_dataset: # TODO(dataset) Dataset API does not create categorical columns # for partition keys expected_df['foo'] = pd.Categorical(df['foo'], categories=foo_keys) @@ -2241,8 +2262,8 @@ def _filter_partition(df, part_keys): @pytest.mark.pandas -@parametrize_use_dataset -def test_read_multiple_files(tempdir, use_dataset): +@parametrize_legacy_dataset +def test_read_multiple_files(tempdir, use_legacy_dataset): nfiles = 10 size = 5 @@ -2271,7 +2292,8 @@ def test_read_multiple_files(tempdir, use_dataset): # TODO(datasets) changed to use_threads=False because otherwise the # row order is not deterministic def read_multiple_files(paths, columns=None, use_threads=False, **kwargs): - dataset = pq.ParquetDataset(paths, use_dataset=use_dataset, **kwargs) + dataset = pq.ParquetDataset( + paths, use_legacy_dataset=use_legacy_dataset, **kwargs) return dataset.read(columns=columns, use_threads=use_threads) result = read_multiple_files(paths) @@ -2308,7 +2330,7 @@ def read_multiple_files(paths, columns=None, use_threads=False, **kwargs): t = pa.Table.from_pandas(bad_apple) _write_table(t, bad_apple_path) - if use_dataset: + if not use_legacy_dataset: # TODO(dataset) Dataset API skips bad files return @@ -2330,8 +2352,8 @@ def read_multiple_files(paths, columns=None, use_threads=False, **kwargs): @pytest.mark.pandas -@parametrize_use_dataset -def test_dataset_read_pandas(tempdir, use_dataset): +@parametrize_legacy_dataset +def test_dataset_read_pandas(tempdir, use_legacy_dataset): nfiles = 5 size = 5 @@ -2354,7 +2376,7 @@ def test_dataset_read_pandas(tempdir, use_dataset): frames.append(df) paths.append(path) - dataset = pq.ParquetDataset(dirpath, use_dataset=use_dataset) + dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) columns = ['uint8', 'strings'] result = dataset.read_pandas(columns=columns).to_pandas() expected = pd.concat([x[columns] for x in frames]) @@ -2363,8 +2385,8 @@ def 
test_dataset_read_pandas(tempdir, use_dataset): @pytest.mark.pandas -@parametrize_use_dataset_not_supported # TODO(dataset) support memory map -def test_dataset_memory_map(tempdir, use_dataset): +@parametrize_legacy_dataset_not_supported # TODO(dataset) support memory map +def test_dataset_memory_map(tempdir, use_legacy_dataset): # ARROW-2627: Check that we can use ParquetDataset with memory-mapping dirpath = tempdir / guid() dirpath.mkdir() @@ -2375,13 +2397,13 @@ def test_dataset_memory_map(tempdir, use_dataset): _write_table(table, path, version='2.0') dataset = pq.ParquetDataset( - dirpath, memory_map=True, use_dataset=use_dataset) + dirpath, memory_map=True, use_legacy_dataset=use_legacy_dataset) assert dataset.pieces[0].read().equals(table) @pytest.mark.pandas -@parametrize_use_dataset -def test_dataset_enable_buffered_stream(tempdir, use_dataset): +@parametrize_legacy_dataset +def test_dataset_enable_buffered_stream(tempdir, use_legacy_dataset): dirpath = tempdir / guid() dirpath.mkdir() @@ -2392,17 +2414,19 @@ def test_dataset_enable_buffered_stream(tempdir, use_dataset): # TODO(dataset) raises an OSError instead of ValueError with pytest.raises((ValueError, OSError)): - if use_dataset: - # Dataset API only raises when reading + if use_legacy_dataset: pq.ParquetDataset( - dirpath, buffer_size=-64, use_dataset=True).read() + dirpath, buffer_size=-64, + use_legacy_dataset=use_legacy_dataset) else: + # Dataset API only raises when reading pq.ParquetDataset( - dirpath, buffer_size=-64, use_dataset=use_dataset) + dirpath, buffer_size=-64, use_legacy_dataset=False).read() for buffer_size in [128, 1024]: dataset = pq.ParquetDataset( - dirpath, buffer_size=buffer_size, use_dataset=use_dataset) + dirpath, buffer_size=buffer_size, + use_legacy_dataset=use_legacy_dataset) assert dataset.read().equals(table) @@ -2464,9 +2488,9 @@ def _make_example_multifile_dataset(base_path, nfiles=10, file_nrows=5): @pytest.mark.pandas -@parametrize_use_dataset +@parametrize_legacy_dataset @pytest.mark.parametrize('dir_prefix', ['_', '.']) -def test_ignore_private_directories(tempdir, dir_prefix, use_dataset): +def test_ignore_private_directories(tempdir, dir_prefix, use_legacy_dataset): dirpath = tempdir / guid() dirpath.mkdir() @@ -2476,16 +2500,16 @@ def test_ignore_private_directories(tempdir, dir_prefix, use_dataset): # private directory (dirpath / '{}staging'.format(dir_prefix)).mkdir() - dataset = pq.ParquetDataset(dirpath, use_dataset=use_dataset) - if not use_dataset: + dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) + if use_legacy_dataset: assert set(map(str, paths)) == {x.path for x in dataset.pieces} else: assert set(map(str, paths)) == set(dataset._dataset.files) @pytest.mark.pandas -@parametrize_use_dataset -def test_ignore_hidden_files_dot(tempdir, use_dataset): +@parametrize_legacy_dataset +def test_ignore_hidden_files_dot(tempdir, use_legacy_dataset): dirpath = tempdir / guid() dirpath.mkdir() @@ -2498,16 +2522,16 @@ def test_ignore_hidden_files_dot(tempdir, use_dataset): with (dirpath / '.private').open('wb') as f: f.write(b'gibberish') - dataset = pq.ParquetDataset(dirpath, use_dataset=use_dataset) - if not use_dataset: + dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) + if use_legacy_dataset: assert set(map(str, paths)) == {x.path for x in dataset.pieces} else: assert set(map(str, paths)) == set(dataset._dataset.files) @pytest.mark.pandas -@parametrize_use_dataset -def test_ignore_hidden_files_underscore(tempdir, use_dataset): 
+@parametrize_legacy_dataset +def test_ignore_hidden_files_underscore(tempdir, use_legacy_dataset): dirpath = tempdir / guid() dirpath.mkdir() @@ -2520,16 +2544,16 @@ def test_ignore_hidden_files_underscore(tempdir, use_dataset): with (dirpath / '_started_321').open('wb') as f: f.write(b'abcd') - dataset = pq.ParquetDataset(dirpath, use_dataset=use_dataset) - if not use_dataset: + dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) + if use_legacy_dataset: assert set(map(str, paths)) == {x.path for x in dataset.pieces} else: assert set(map(str, paths)) == set(dataset._dataset.files) @pytest.mark.pandas -@parametrize_use_dataset -def test_multiindex_duplicate_values(tempdir, use_dataset): +@parametrize_legacy_dataset +def test_multiindex_duplicate_values(tempdir, use_legacy_dataset): num_rows = 3 numbers = list(range(num_rows)) index = pd.MultiIndex.from_arrays( @@ -2543,7 +2567,7 @@ def test_multiindex_duplicate_values(tempdir, use_dataset): filename = tempdir / 'dup_multi_index_levels.parquet' _write_table(table, filename) - result_table = _read_table(filename, use_dataset=use_dataset) + result_table = _read_table(filename, use_legacy_dataset=use_legacy_dataset) assert table.equals(result_table) result_df = result_table.to_pandas() @@ -2600,25 +2624,26 @@ def test_noncoerced_nanoseconds_written_without_exception(tempdir): pq.write_table(tb, filename, coerce_timestamps='ms', version='2.0') -@parametrize_use_dataset -def test_read_non_existent_file(tempdir, use_dataset): +@parametrize_legacy_dataset +def test_read_non_existent_file(tempdir, use_legacy_dataset): path = 'non-existent-file.parquet' try: - pq.read_table(path, use_dataset=use_dataset) + pq.read_table(path, use_legacy_dataset=use_legacy_dataset) except Exception as e: assert path in e.args[0] -@parametrize_use_dataset -def test_read_table_doesnt_warn(datadir, use_dataset): +@parametrize_legacy_dataset +def test_read_table_doesnt_warn(datadir, use_legacy_dataset): with pytest.warns(None) as record: - pq.read_table(datadir / 'v0.7.1.parquet', use_dataset=use_dataset) + pq.read_table(datadir / 'v0.7.1.parquet', + use_legacy_dataset=use_legacy_dataset) assert len(record) == 0 def _test_write_to_dataset_with_partitions(base_path, - use_dataset=False, + use_legacy_dataset=True, filesystem=None, schema=None, index_name=None): @@ -2649,9 +2674,9 @@ def _test_write_to_dataset_with_partitions(base_path, dataset = pq.ParquetDataset(base_path, filesystem=filesystem, validate_schema=True, - use_dataset=use_dataset) + use_legacy_dataset=use_legacy_dataset) # ARROW-2209: Ensure the dataset schema also includes the partition columns - if not use_dataset: + if use_legacy_dataset: dataset_cols = set(dataset.schema.to_arrow_schema().names) else: # TODO(dataset) schema property is an arrow and not parquet schema @@ -2661,7 +2686,7 @@ def _test_write_to_dataset_with_partitions(base_path, # TODO(datasets) changed to use_threads=False because otherwise the # row order is not deterministic - kwargs = dict(use_threads=False) if use_dataset else {} + kwargs = dict(use_threads=False) if not use_legacy_dataset else {} input_table = dataset.read(**kwargs) input_df = input_table.to_pandas() @@ -2672,14 +2697,14 @@ def _test_write_to_dataset_with_partitions(base_path, # Partitioned columns become 'categorical' dtypes input_df = input_df[cols] - if not use_dataset: + if use_legacy_dataset: for col in partition_by: output_df[col] = output_df[col].astype('category') assert output_df.equals(input_df) def 
_test_write_to_dataset_no_partitions(base_path, - use_dataset=False, + use_legacy_dataset=True, filesystem=None): # ARROW-1400 output_df = pd.DataFrame({'group1': list('aaabbbbccc'), @@ -2704,9 +2729,10 @@ def _test_write_to_dataset_no_partitions(base_path, # Deduplicated incoming DataFrame should match # original outgoing Dataframe - input_table = pq.ParquetDataset(base_path, - filesystem=filesystem, - use_dataset=use_dataset).read() + input_table = pq.ParquetDataset( + base_path, filesystem=filesystem, + use_legacy_dataset=use_legacy_dataset + ).read() input_df = input_table.to_pandas() input_df = input_df.drop_duplicates() input_df = input_df[cols] @@ -2714,33 +2740,37 @@ def _test_write_to_dataset_no_partitions(base_path, @pytest.mark.pandas -@parametrize_use_dataset -def test_write_to_dataset_with_partitions(tempdir, use_dataset): - _test_write_to_dataset_with_partitions(str(tempdir), use_dataset) +@parametrize_legacy_dataset +def test_write_to_dataset_with_partitions(tempdir, use_legacy_dataset): + _test_write_to_dataset_with_partitions(str(tempdir), use_legacy_dataset) @pytest.mark.pandas -@parametrize_use_dataset -def test_write_to_dataset_with_partitions_and_schema(tempdir, use_dataset): +@parametrize_legacy_dataset +def test_write_to_dataset_with_partitions_and_schema( + tempdir, use_legacy_dataset +): schema = pa.schema([pa.field('group1', type=pa.string()), pa.field('group2', type=pa.string()), pa.field('num', type=pa.int64()), pa.field('nan', type=pa.int32()), pa.field('date', type=pa.timestamp(unit='us'))]) _test_write_to_dataset_with_partitions( - str(tempdir), use_dataset, schema=schema) + str(tempdir), use_legacy_dataset, schema=schema) @pytest.mark.pandas -@parametrize_use_dataset -def test_write_to_dataset_with_partitions_and_index_name(tempdir, use_dataset): +@parametrize_legacy_dataset +def test_write_to_dataset_with_partitions_and_index_name( + tempdir, use_legacy_dataset +): _test_write_to_dataset_with_partitions( - str(tempdir), use_dataset, index_name='index_name') + str(tempdir), use_legacy_dataset, index_name='index_name') @pytest.mark.pandas -@parametrize_use_dataset -def test_write_to_dataset_no_partitions(tempdir, use_dataset): +@parametrize_legacy_dataset +def test_write_to_dataset_no_partitions(tempdir, use_legacy_dataset): _test_write_to_dataset_no_partitions(str(tempdir)) @@ -2858,8 +2888,8 @@ def test_list_of_binary_large_cell(): @pytest.mark.pandas -@parametrize_use_dataset -def test_index_column_name_duplicate(tempdir, use_dataset): +@parametrize_legacy_dataset +def test_index_column_name_duplicate(tempdir, use_legacy_dataset): data = { 'close': { pd.Timestamp('2017-06-30 01:31:00'): 154.99958999999998, @@ -2878,14 +2908,14 @@ def test_index_column_name_duplicate(tempdir, use_dataset): dfx = pd.DataFrame(data).set_index('time', drop=False) tdfx = pa.Table.from_pandas(dfx) _write_table(tdfx, path) - arrow_table = _read_table(path, use_dataset=use_dataset) + arrow_table = _read_table(path, use_legacy_dataset=use_legacy_dataset) result_df = arrow_table.to_pandas() tm.assert_frame_equal(result_df, dfx) @pytest.mark.pandas -@parametrize_use_dataset -def test_parquet_nested_convenience(tempdir, use_dataset): +@parametrize_legacy_dataset +def test_parquet_nested_convenience(tempdir, use_legacy_dataset): # ARROW-1684 df = pd.DataFrame({ 'a': [[1, 2, 3], None, [4, 5], []], @@ -2897,16 +2927,18 @@ def test_parquet_nested_convenience(tempdir, use_dataset): table = pa.Table.from_pandas(df, preserve_index=False) _write_table(table, path) - read = pq.read_table(path, 
columns=['a'], use_dataset=use_dataset) + read = pq.read_table( + path, columns=['a'], use_legacy_dataset=use_legacy_dataset) tm.assert_frame_equal(read.to_pandas(), df[['a']]) - read = pq.read_table(path, columns=['a', 'b'], use_dataset=use_dataset) + read = pq.read_table( + path, columns=['a', 'b'], use_legacy_dataset=use_legacy_dataset) tm.assert_frame_equal(read.to_pandas(), df) @pytest.mark.pandas -@parametrize_use_dataset -def test_backwards_compatible_index_naming(datadir, use_dataset): +@parametrize_legacy_dataset +def test_backwards_compatible_index_naming(datadir, use_legacy_dataset): expected_string = b"""\ carat cut color clarity depth table price x y z 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 @@ -2921,14 +2953,17 @@ def test_backwards_compatible_index_naming(datadir, use_dataset): 0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39""" expected = pd.read_csv(io.BytesIO(expected_string), sep=r'\s{2,}', index_col=None, header=0, engine='python') - table = _read_table(datadir / 'v0.7.1.parquet', use_dataset=use_dataset) + table = _read_table( + datadir / 'v0.7.1.parquet', use_legacy_dataset=use_legacy_dataset) result = table.to_pandas() tm.assert_frame_equal(result, expected) @pytest.mark.pandas -@parametrize_use_dataset -def test_backwards_compatible_index_multi_level_named(datadir, use_dataset): +@parametrize_legacy_dataset +def test_backwards_compatible_index_multi_level_named( + datadir, use_legacy_dataset +): expected_string = b"""\ carat cut color clarity depth table price x y z 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 @@ -2948,15 +2983,15 @@ def test_backwards_compatible_index_multi_level_named(datadir, use_dataset): ).sort_index() table = _read_table(datadir / 'v0.7.1.all-named-index.parquet', - use_dataset=use_dataset) + use_legacy_dataset=use_legacy_dataset) result = table.to_pandas() tm.assert_frame_equal(result, expected) @pytest.mark.pandas -@parametrize_use_dataset +@parametrize_legacy_dataset def test_backwards_compatible_index_multi_level_some_named( - datadir, use_dataset + datadir, use_legacy_dataset ): expected_string = b"""\ carat cut color clarity depth table price x y z @@ -2978,14 +3013,16 @@ def test_backwards_compatible_index_multi_level_some_named( expected.index = expected.index.set_names(['cut', None, 'clarity']) table = _read_table(datadir / 'v0.7.1.some-named-index.parquet', - use_dataset=use_dataset) + use_legacy_dataset=use_legacy_dataset) result = table.to_pandas() tm.assert_frame_equal(result, expected) @pytest.mark.pandas -@parametrize_use_dataset -def test_backwards_compatible_column_metadata_handling(datadir, use_dataset): +@parametrize_legacy_dataset +def test_backwards_compatible_column_metadata_handling( + datadir, use_legacy_dataset +): expected = pd.DataFrame( {'a': [1, 2, 3], 'b': [.1, .2, .3], 'c': pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')}) @@ -2995,11 +3032,12 @@ def test_backwards_compatible_column_metadata_handling(datadir, use_dataset): names=['index', None]) path = datadir / 'v0.7.1.column-metadata-handling.parquet' - table = _read_table(path, use_dataset=use_dataset) + table = _read_table(path, use_legacy_dataset=use_legacy_dataset) result = table.to_pandas() tm.assert_frame_equal(result, expected) - table = _read_table(path, columns=['a'], use_dataset=use_dataset) + table = _read_table( + path, columns=['a'], use_legacy_dataset=use_legacy_dataset) result = table.to_pandas() tm.assert_frame_equal(result, expected[['a']].reset_index(drop=True)) @@ -3067,8 +3105,8 @@ def test_cloudpickle_dataset(tempdir, 
datadir): @pytest.mark.pandas -@parametrize_use_dataset -def test_decimal_roundtrip(tempdir, use_dataset): +@parametrize_legacy_dataset +def test_decimal_roundtrip(tempdir, use_legacy_dataset): num_values = 10 columns = {} @@ -3088,7 +3126,8 @@ def test_decimal_roundtrip(tempdir, use_dataset): string_filename = str(filename) table = pa.Table.from_pandas(expected) _write_table(table, string_filename) - result_table = _read_table(string_filename, use_dataset=use_dataset) + result_table = _read_table( + string_filename, use_legacy_dataset=use_legacy_dataset) result = result_table.to_pandas() tm.assert_frame_equal(result, expected) @@ -3109,8 +3148,8 @@ def test_decimal_roundtrip_negative_scale(tempdir): @pytest.mark.pandas -@parametrize_use_dataset_skip_buffer -def test_parquet_writer_context_obj(tempdir, use_dataset): +@parametrize_legacy_dataset_skip_buffer +def test_parquet_writer_context_obj(tempdir, use_legacy_dataset): df = _test_dataframe(100) df['unique_id'] = 0 @@ -3128,15 +3167,18 @@ def test_parquet_writer_context_obj(tempdir, use_dataset): frames.append(df.copy()) buf = out.getvalue() - result = _read_table(pa.BufferReader(buf), use_dataset=use_dataset) + result = _read_table( + pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset) expected = pd.concat(frames, ignore_index=True) tm.assert_frame_equal(result.to_pandas(), expected) @pytest.mark.pandas -@parametrize_use_dataset_skip_buffer -def test_parquet_writer_context_obj_with_exception(tempdir, use_dataset): +@parametrize_legacy_dataset_skip_buffer +def test_parquet_writer_context_obj_with_exception( + tempdir, use_legacy_dataset +): df = _test_dataframe(100) df['unique_id'] = 0 @@ -3161,22 +3203,23 @@ def test_parquet_writer_context_obj_with_exception(tempdir, use_dataset): assert str(e) == error_text buf = out.getvalue() - result = _read_table(pa.BufferReader(buf), use_dataset=use_dataset) + result = _read_table( + pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset) expected = pd.concat(frames, ignore_index=True) tm.assert_frame_equal(result.to_pandas(), expected) @pytest.mark.pandas -@parametrize_use_dataset_skip_buffer -def test_zlib_compression_bug(use_dataset): +@parametrize_legacy_dataset_skip_buffer +def test_zlib_compression_bug(use_legacy_dataset): # ARROW-3514: "zlib deflate failed, output buffer too small" table = pa.Table.from_arrays([pa.array(['abc', 'def'])], ['some_col']) f = io.BytesIO() pq.write_table(table, f, compression='gzip') f.seek(0) - roundtrip = pq.read_table(f, use_dataset=use_dataset) + roundtrip = pq.read_table(f, use_legacy_dataset=use_legacy_dataset) tm.assert_frame_equal(roundtrip.to_pandas(), table.to_pandas()) @@ -3228,8 +3271,8 @@ def test_empty_row_groups(tempdir): @pytest.mark.pandas -@parametrize_use_dataset_skip_buffer -def test_parquet_writer_with_caller_provided_filesystem(use_dataset): +@parametrize_legacy_dataset_skip_buffer +def test_parquet_writer_with_caller_provided_filesystem(use_legacy_dataset): out = pa.BufferOutputStream() class CustomFS(FileSystem): @@ -3256,7 +3299,8 @@ def open(self, path, mode='rb'): assert out.closed buf = out.getvalue() - table_read = _read_table(pa.BufferReader(buf), use_dataset=use_dataset) + table_read = _read_table( + pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset) df_read = table_read.to_pandas() tm.assert_frame_equal(df_read, df) @@ -3275,8 +3319,8 @@ def test_writing_empty_lists(): _check_roundtrip(table) -@parametrize_use_dataset -def test_write_nested_zero_length_array_chunk_failure(use_dataset): 
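
The writer-as-context-manager pattern that test_parquet_writer_context_obj relies on, reduced to a sketch; the in-memory sink and the single-column frame are illustrative, not taken from the test:

    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq

    df = pd.DataFrame({'x': list(range(10))})
    table = pa.Table.from_pandas(df)

    out = pa.BufferOutputStream()
    # The writer is closed automatically when the block exits,
    # including when an exception is raised inside it.
    with pq.ParquetWriter(out, table.schema, version='2.0') as writer:
        writer.write_table(table)

    result = pq.read_table(pa.BufferReader(out.getvalue()))
    assert result.to_pandas().equals(df)
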
+@parametrize_legacy_dataset +def test_write_nested_zero_length_array_chunk_failure(use_legacy_dataset): # Bug report in ARROW-3792 cols = OrderedDict( int32=pa.int32(), @@ -3301,12 +3345,12 @@ def test_write_nested_zero_length_array_chunk_failure(use_dataset): my_batches = [pa.RecordBatch.from_arrays(batch, schema=pa.schema(cols)) for batch in my_arrays] tbl = pa.Table.from_batches(my_batches, pa.schema(cols)) - _check_roundtrip(tbl, use_dataset=use_dataset) + _check_roundtrip(tbl, use_legacy_dataset=use_legacy_dataset) @pytest.mark.pandas -@parametrize_use_dataset -def test_partitioned_dataset(tempdir, use_dataset): +@parametrize_legacy_dataset +def test_partitioned_dataset(tempdir, use_legacy_dataset): # ARROW-3208: Segmentation fault when reading a Parquet partitioned dataset # to a Parquet file path = tempdir / "ARROW-3208" @@ -3318,7 +3362,8 @@ def test_partitioned_dataset(tempdir, use_dataset): table = pa.Table.from_pandas(df) pq.write_to_dataset(table, root_path=str(path), partition_cols=['one', 'two']) - table = pq.ParquetDataset(path, use_dataset=use_dataset).read() + table = pq.ParquetDataset( + path, use_legacy_dataset=use_legacy_dataset).read() pq.write_table(table, path / "output.parquet") @@ -3336,8 +3381,8 @@ def test_read_column_invalid_index(): @pytest.mark.pandas -@parametrize_use_dataset_skip_buffer -def test_direct_read_dictionary(use_dataset): +@parametrize_legacy_dataset_skip_buffer +def test_direct_read_dictionary(use_legacy_dataset): # ARROW-3325 repeats = 10 nunique = 5 @@ -3353,7 +3398,8 @@ def test_direct_read_dictionary(use_dataset): contents = bio.getvalue() result = pq.read_table(pa.BufferReader(contents), - read_dictionary=['f0'], use_dataset=use_dataset) + read_dictionary=['f0'], + use_legacy_dataset=use_legacy_dataset) # Compute dictionary-encoded subfield expected = pa.table([table[0].dictionary_encode()], names=['f0']) @@ -3361,8 +3407,8 @@ def test_direct_read_dictionary(use_dataset): @pytest.mark.pandas -@parametrize_use_dataset -def test_dataset_read_dictionary(tempdir, use_dataset): +@parametrize_legacy_dataset +def test_dataset_read_dictionary(tempdir, use_legacy_dataset): path = tempdir / "ARROW-3325-dataset" t1 = pa.table([[util.rands(10) for i in range(5)] * 10], names=['f0']) t2 = pa.table([[util.rands(10) for i in range(5)] * 10], names=['f0']) @@ -3370,7 +3416,8 @@ def test_dataset_read_dictionary(tempdir, use_dataset): pq.write_to_dataset(t2, root_path=str(path)) result = pq.ParquetDataset( - path, read_dictionary=['f0'], use_dataset=use_dataset).read() + path, read_dictionary=['f0'], + use_legacy_dataset=use_legacy_dataset).read() # The order of the chunks is non-deterministic ex_chunks = [t1[0].chunk(0).dictionary_encode(), @@ -3386,8 +3433,8 @@ def test_dataset_read_dictionary(tempdir, use_dataset): @pytest.mark.pandas -@parametrize_use_dataset_skip_buffer -def test_direct_read_dictionary_subfield(use_dataset): +@parametrize_legacy_dataset_skip_buffer +def test_direct_read_dictionary_subfield(use_legacy_dataset): repeats = 10 nunique = 5 @@ -3401,7 +3448,7 @@ def test_direct_read_dictionary_subfield(use_dataset): contents = bio.getvalue() result = pq.read_table(pa.BufferReader(contents), read_dictionary=['f0.list.item'], - use_dataset=use_dataset) + use_legacy_dataset=use_legacy_dataset) arr = pa.array(data[0]) values_as_dict = arr.values.dictionary_encode() @@ -3485,8 +3532,8 @@ def test_write_to_dataset_metadata(tempdir): # TODO(dataset) better error message for invalid files (certainly if it # is the only one) 
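
The read_dictionary behaviour exercised by test_direct_read_dictionary above, as a self-contained sketch; the data and column name are illustrative:

    import io

    import pyarrow as pa
    import pyarrow.parquet as pq

    # Ask the reader to return 'f0' dictionary-encoded instead of
    # decoding it back to plain strings.
    table = pa.table([['foo', 'bar', 'baz'] * 10], names=['f0'])
    bio = io.BytesIO()
    pq.write_table(table, bio)

    result = pq.read_table(pa.BufferReader(bio.getvalue()),
                           read_dictionary=['f0'])
    expected = pa.table([table[0].dictionary_encode()], names=['f0'])
    assert result.equals(expected)
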
-@parametrize_use_dataset_not_supported -def test_parquet_file_too_small(tempdir, use_dataset): +@parametrize_legacy_dataset_not_supported +def test_parquet_file_too_small(tempdir, use_legacy_dataset): path = str(tempdir / "test.parquet") with pytest.raises(pa.ArrowInvalid, match='size is 0 bytes'): @@ -3502,8 +3549,8 @@ def test_parquet_file_too_small(tempdir, use_dataset): @pytest.mark.pandas -@parametrize_use_dataset_skip_buffer -def test_categorical_index_survives_roundtrip(use_dataset): +@parametrize_legacy_dataset_skip_buffer +def test_categorical_index_survives_roundtrip(use_legacy_dataset): # ARROW-3652, addressed by ARROW-3246 df = pd.DataFrame([['a', 'b'], ['c', 'd']], columns=['c1', 'c2']) df['c1'] = df['c1'].astype('category') @@ -3513,14 +3560,14 @@ def test_categorical_index_survives_roundtrip(use_dataset): bos = pa.BufferOutputStream() pq.write_table(table, bos) ref_df = pq.read_pandas( - bos.getvalue(), use_dataset=use_dataset).to_pandas() + bos.getvalue(), use_legacy_dataset=use_legacy_dataset).to_pandas() assert isinstance(ref_df.index, pd.CategoricalIndex) assert ref_df.index.equals(df.index) @pytest.mark.pandas -@parametrize_use_dataset_skip_buffer -def test_categorical_order_survives_roundtrip(use_dataset): +@parametrize_legacy_dataset_skip_buffer +def test_categorical_order_survives_roundtrip(use_legacy_dataset): # ARROW-6302 df = pd.DataFrame({"a": pd.Categorical( ["a", "b", "c", "a"], categories=["b", "c", "d"], ordered=True)}) @@ -3530,7 +3577,8 @@ def test_categorical_order_survives_roundtrip(use_dataset): pq.write_table(table, bos) contents = bos.getvalue() - result = pq.read_pandas(contents, use_dataset=use_dataset).to_pandas() + result = pq.read_pandas( + contents, use_legacy_dataset=use_legacy_dataset).to_pandas() tm.assert_frame_equal(result, df) @@ -3542,8 +3590,8 @@ def _simple_table_write_read(table): return pq.read_table(pa.BufferReader(contents)) -@parametrize_use_dataset_skip_buffer -def test_dictionary_array_automatically_read(use_dataset): +@parametrize_legacy_dataset_skip_buffer +def test_dictionary_array_automatically_read(use_legacy_dataset): # ARROW-3246 # Make a large dictionary, a little over 4MB of data @@ -3608,8 +3656,8 @@ def test_field_id_metadata(): @pytest.mark.pandas -@parametrize_use_dataset_skip_buffer -def test_pandas_categorical_na_type_row_groups(use_dataset): +@parametrize_legacy_dataset_skip_buffer +def test_pandas_categorical_na_type_row_groups(use_legacy_dataset): # ARROW-5085 df = pd.DataFrame({"col": [None] * 100, "int": [1.0] * 100}) df_category = df.astype({"col": "category", "int": "category"}) @@ -3619,7 +3667,8 @@ def test_pandas_categorical_na_type_row_groups(use_dataset): # it works pq.write_table(table_cat, buf, version="2.0", chunk_size=10) - result = pq.read_table(buf.getvalue(), use_dataset=use_dataset) + result = pq.read_table( + buf.getvalue(), use_legacy_dataset=use_legacy_dataset) # Result is non-categorical assert result[0].equals(table[0]) @@ -3627,8 +3676,8 @@ def test_pandas_categorical_na_type_row_groups(use_dataset): @pytest.mark.pandas -@parametrize_use_dataset_skip_buffer -def test_pandas_categorical_roundtrip(use_dataset): +@parametrize_legacy_dataset_skip_buffer +def test_pandas_categorical_roundtrip(use_legacy_dataset): # ARROW-5480, this was enabled by ARROW-3246 # Have one of the categories unobserved and include a null (-1) @@ -3640,7 +3689,8 @@ def test_pandas_categorical_roundtrip(use_dataset): buf = pa.BufferOutputStream() pq.write_table(pa.table(df), buf) - result = 
pq.read_table(buf.getvalue(), use_dataset=use_dataset).to_pandas() + result = pq.read_table( + buf.getvalue(), use_legacy_dataset=use_legacy_dataset).to_pandas() assert result.x.dtype == 'category' assert (result.x.cat.categories == categories).all() tm.assert_frame_equal(result, df) @@ -3689,9 +3739,9 @@ def test_multi_dataset_metadata(tempdir): assert md['serialized_size'] > 0 -@parametrize_use_dataset +@parametrize_legacy_dataset @pytest.mark.pandas -def test_filter_before_validate_schema(tempdir, use_dataset): +def test_filter_before_validate_schema(tempdir, use_legacy_dataset): # ARROW-4076 apply filter before schema validation # to avoid checking unneeded schemas @@ -3709,7 +3759,7 @@ def test_filter_before_validate_schema(tempdir, use_dataset): # read single file using filter table = pq.read_table(tempdir, filters=[[('A', '==', 0)]], - use_dataset=use_dataset) + use_legacy_dataset=use_legacy_dataset) assert table.column('B').equals(pa.chunked_array([[1, 2, 3]])) @@ -3752,7 +3802,7 @@ def test_fastparquet_cross_compatibility(tempdir): tm.assert_frame_equal(table_fp.to_pandas(), df) -@parametrize_use_dataset_skip_buffer +@parametrize_legacy_dataset_skip_buffer @pytest.mark.parametrize('array_factory', [ lambda: pa.array([0, None] * 10), lambda: pa.array([0, None] * 10).dictionary_encode(), @@ -3762,7 +3812,7 @@ def test_fastparquet_cross_compatibility(tempdir): @pytest.mark.parametrize('use_dictionary', [False, True]) @pytest.mark.parametrize('read_dictionary', [False, True]) def test_buffer_contents( - array_factory, use_dictionary, read_dictionary, use_dataset + array_factory, use_dictionary, read_dictionary, use_legacy_dataset ): # Test that null values are deterministically initialized to zero # after a roundtrip through Parquet. @@ -3774,7 +3824,7 @@ def test_buffer_contents( read_dictionary = ['col'] if read_dictionary else None table = pq.read_table(bio, use_threads=False, read_dictionary=read_dictionary, - use_dataset=use_dataset) + use_legacy_dataset=use_legacy_dataset) for col in table.columns: [chunk] = col.chunks From 86498a199f441771d233fae58523cb20bc904c45 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 30 Mar 2020 10:32:52 +0200 Subject: [PATCH 11/26] consolidate read_table/ParquetDataset code + add errors for unsupported keywords --- python/pyarrow/parquet.py | 112 +++++++++++++++------------ python/pyarrow/tests/test_parquet.py | 46 +++++++++-- 2 files changed, 101 insertions(+), 57 deletions(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 92ae2f0d9a7..9df6f229d3f 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -117,6 +117,9 @@ def _filters_to_expression(filters): """ import pyarrow.dataset as ds + if isinstance(filters, ds.Expression): + return filters + filters = _check_filters(filters, check_null_strings=False) def convert_single_predicate(col, op, val): @@ -163,36 +166,6 @@ def convert_single_predicate(col, op, val): return expr -def _dataset_from_legacy_args( - path_or_paths, filesystem=None, read_dictionary=None, buffer_size=None -): - """ - Create a pyarrow.dataset.FileSystemDataset to use inside read_table - and ParquetDataset. 
- """ - import pyarrow.dataset as ds - import pyarrow.fs - - # map old filesystems to new one - # TODO(dataset) deal with other file systems - if isinstance(filesystem, LocalFileSystem): - filesystem = pyarrow.fs.LocalFileSystem() - - # map additional arguments - # TODO raise warning when unsupported arguments are passed - reader_options = {} - if buffer_size: - reader_options.update(use_buffered_stream=True, - buffer_size=buffer_size) - if read_dictionary is not None: - reader_options.update(dict_columns=read_dictionary) - parquet_format = ds.ParquetFileFormat(reader_options=reader_options) - - dataset = ds.dataset(path_or_paths, filesystem=filesystem, - format=parquet_format, partitioning="hive") - return dataset - - # ---------------------------------------------------------------------- # Reading a single Parquet file @@ -1122,11 +1095,16 @@ def __new__(cls, path_or_paths=None, filesystem=None, schema=None, filters=None, metadata_nthreads=1, read_dictionary=None, memory_map=False, buffer_size=0, use_legacy_dataset=True): if not use_legacy_dataset: - # TODO raise warning on unsupported keywords return _ParquetDatasetV2(path_or_paths, filesystem=filesystem, filters=filters, read_dictionary=read_dictionary, - buffer_size=buffer_size) + buffer_size=buffer_size, + # unsupported keywords + schema=schema, metadata=metadata, + split_row_groups=split_row_groups, + validate_schema=validate_schema, + metadata_nthreads=metadata_nthreads, + memory_map=memory_map) self = object.__new__(cls) return self @@ -1361,16 +1339,43 @@ class _ParquetDatasetV2: ParquetDataset shim using the Dataset API under the hood. """ def __init__(self, path_or_paths, filesystem=None, filters=None, - read_dictionary=None, buffer_size=None): - dataset = _dataset_from_legacy_args( - path_or_paths, filesystem=filesystem, - read_dictionary=read_dictionary, buffer_size=buffer_size) + read_dictionary=None, buffer_size=None, **kwargs): + import pyarrow.dataset as ds + import pyarrow.fs + + # Raise error for not supported keywords + for keyword, default in [ + ("schema", None), ("metadata", None), + ("split_row_groups", False), ("validate_schema", True), + ("metadata_nthreads", 1), ("memory_map", False)]: + if keyword in kwargs and kwargs[keyword] is not default: + raise ValueError( + "Keyword '{0}' is not yet supported with the new " + "Dataset API".format(keyword)) + + # map old filesystems to new one + # TODO(dataset) deal with other file systems + if isinstance(filesystem, LocalFileSystem): + filesystem = pyarrow.fs.LocalFileSystem() + + # map additional arguments + reader_options = {} + if buffer_size: + reader_options.update(use_buffered_stream=True, + buffer_size=buffer_size) + if read_dictionary is not None: + reader_options.update(dict_columns=read_dictionary) + parquet_format = ds.ParquetFileFormat(reader_options=reader_options) + + dataset = ds.dataset(path_or_paths, filesystem=filesystem, + format=parquet_format, partitioning="hive") + self._dataset = dataset - self.filters = filters + self._filters = filters if filters is not None: - self.filter_expression = _filters_to_expression(filters) + self._filter_expression = _filters_to_expression(filters) else: - self.filter_expression = None + self._filter_expression = None @property def schema(self): @@ -1381,7 +1386,7 @@ def read(self, columns=None, use_threads=True, use_pandas_metadata=False): # if use_pandas_metadata, we need to include index columns in the # column selection, to be able to restore those in the pandas DataFrame metadata = self._dataset.schema.metadata - 
if use_pandas_metadata: + if columns is not None and use_pandas_metadata: if metadata and b'pandas' in metadata: index_columns = _get_pandas_index_columns(metadata) @@ -1391,7 +1396,7 @@ def read(self, columns=None, use_threads=True, use_pandas_metadata=False): columns += [index_col] table = self._dataset.to_table( - columns=columns, filter=self.filter_expression, + columns=columns, filter=self._filter_expression, use_threads=use_threads ) @@ -1453,15 +1458,22 @@ def read_table(source, columns=None, use_threads=True, metadata=None, read_dictionary=None, filesystem=None, filters=None, buffer_size=0, use_legacy_dataset=True): if not use_legacy_dataset: - import pyarrow.dataset as ds - dataset = _dataset_from_legacy_args( - source, filesystem=filesystem, read_dictionary=read_dictionary, - buffer_size=buffer_size) - - if filters is not None and not isinstance(filters, ds.Expression): - filters = _filters_to_expression(filters) - table = dataset.to_table(columns=columns, filter=filters, - use_threads=use_threads) + if not _is_path_like(source): + raise ValueError("File-like objects are not yet supported with " + "the new Dataset API") + + dataset = _ParquetDatasetV2( + source, + filesystem=filesystem, + read_dictionary=read_dictionary, + buffer_size=buffer_size, + filters=filters, + # unsupported keywords + metadata=metadata, + memory_map=memory_map + ) + table = dataset.read(columns=columns, use_threads=use_threads, + use_pandas_metadata=use_pandas_metadata) # remove ARROW:schema metadata, current parquet version doesn't # preserve this diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index bfa105a7dfc..52b8eafe298 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -247,8 +247,9 @@ def test_chunked_table_write(use_legacy_dataset): table, version='2.0', use_legacy_dataset=use_legacy_dataset) +# TODO(dataset) support memory map @pytest.mark.pandas -@parametrize_legacy_dataset +@parametrize_legacy_dataset_not_supported def test_memory_map(tempdir, use_legacy_dataset): df = alltypes_sample(size=10) @@ -2302,13 +2303,15 @@ def read_multiple_files(paths, columns=None, use_threads=False, **kwargs): assert result.equals(expected) # Read with provided metadata - metadata = pq.read_metadata(paths[0]) + # TODO(dataset) specifying metadata not yet supported + if use_legacy_dataset: + metadata = pq.read_metadata(paths[0]) - result2 = read_multiple_files(paths, metadata=metadata) - assert result2.equals(expected) + result2 = read_multiple_files(paths, metadata=metadata) + assert result2.equals(expected) - result3 = pa.localfs.read_parquet(dirpath, schema=metadata.schema) - assert result3.equals(expected) + result3 = pa.localfs.read_parquet(dirpath, schema=metadata.schema) + assert result3.equals(expected) # Read column subset to_read = [0, 2, 6, result.num_columns - 1] @@ -2384,8 +2387,9 @@ def test_dataset_read_pandas(tempdir, use_legacy_dataset): tm.assert_frame_equal(result, expected) +# TODO(dataset) support memory map @pytest.mark.pandas -@parametrize_legacy_dataset_not_supported # TODO(dataset) support memory map +@parametrize_legacy_dataset_not_supported def test_dataset_memory_map(tempdir, use_legacy_dataset): # ARROW-2627: Check that we can use ParquetDataset with memory-mapping dirpath = tempdir / guid() @@ -3830,3 +3834,31 @@ def test_buffer_contents( [chunk] = col.chunks buf = chunk.buffers()[1] assert buf.to_pybytes() == buf.size * b"\0" + + +@pytest.mark.dataset +def test_dataset_unsupported_keywords(): + + 
with pytest.raises(ValueError, match="not yet supported with the new"): + pq.ParquetDataset("", use_legacy_dataset=False, schema=pa.schema([])) + + with pytest.raises(ValueError, match="not yet supported with the new"): + pq.ParquetDataset("", use_legacy_dataset=False, metadata=pa.schema([])) + + with pytest.raises(ValueError, match="not yet supported with the new"): + pq.ParquetDataset("", use_legacy_dataset=False, validate_schema=False) + + with pytest.raises(ValueError, match="not yet supported with the new"): + pq.ParquetDataset("", use_legacy_dataset=False, split_row_groups=True) + + with pytest.raises(ValueError, match="not yet supported with the new"): + pq.ParquetDataset("", use_legacy_dataset=False, metadata_nthreads=4) + + with pytest.raises(ValueError, match="not yet supported with the new"): + pq.ParquetDataset("", use_legacy_dataset=False, memory_map=True) + + with pytest.raises(ValueError, match="not yet supported with the new"): + pq.read_table("", use_legacy_dataset=False, metadata=pa.schema([])) + + with pytest.raises(ValueError, match="not yet supported with the new"): + pq.read_table("", use_legacy_dataset=False, memory_map=True) From 22c0e54d31c05c707c3d4b2624b3e020e3062e33 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 30 Mar 2020 11:06:05 +0200 Subject: [PATCH 12/26] fix expression syntax + add docstring --- python/pyarrow/parquet.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 9df6f229d3f..e01b912fd39 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -152,16 +152,19 @@ def convert_single_predicate(col, op, val): and_exprs = [] for col, op, val in conjunction: and_exprs.append(convert_single_predicate(col, op, val)) + + expr = and_exprs[0] if len(and_exprs) > 1: - expr = ds.AndExpression(*and_exprs) - else: - expr = and_exprs[0] + for and_expr in and_exprs[1:]: + expr = ds.AndExpression(expr, and_expr) + or_exprs.append(expr) + expr = or_exprs[0] if len(or_exprs) > 1: expr = ds.OrExpression(*or_exprs) - else: - expr = or_exprs[0] + for or_expr in or_exprs[1:]: + expr = ds.OrExpression(expr, or_expr) return expr @@ -1043,7 +1046,11 @@ def _open_dataset_file(dataset, path, meta=None): improve performance in some environments. buffer_size : int, default 0 If positive, perform read buffering when deserializing individual - column chunks. Otherwise IO calls are unbuffered.""" + column chunks. Otherwise IO calls are unbuffered. +use_legacy_dataset : bool, default True + Set to False to enable the new code path (experimental, using the + new Arrow Dataset API). 
This allows to pass `filters` for all columns + and not only the partition keys.""" class ParquetDataset: From c63d185c981811ff9dbf4ba7f3c3da3f7e7a7056 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 30 Mar 2020 11:47:06 +0200 Subject: [PATCH 13/26] fix paths test on Windows --- python/pyarrow/tests/test_parquet.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 52b8eafe298..aa31727be6c 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -2505,10 +2505,12 @@ def test_ignore_private_directories(tempdir, dir_prefix, use_legacy_dataset): (dirpath / '{}staging'.format(dir_prefix)).mkdir() dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) + if use_legacy_dataset: assert set(map(str, paths)) == {x.path for x in dataset.pieces} else: - assert set(map(str, paths)) == set(dataset._dataset.files) + paths = [str(path.as_posix()) for path in paths] + assert set(paths) == set(dataset._dataset.files) @pytest.mark.pandas @@ -2527,10 +2529,12 @@ def test_ignore_hidden_files_dot(tempdir, use_legacy_dataset): f.write(b'gibberish') dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) + if use_legacy_dataset: assert set(map(str, paths)) == {x.path for x in dataset.pieces} else: - assert set(map(str, paths)) == set(dataset._dataset.files) + paths = [str(path.as_posix()) for path in paths] + assert set(paths) == set(dataset._dataset.files) @pytest.mark.pandas @@ -2549,10 +2553,12 @@ def test_ignore_hidden_files_underscore(tempdir, use_legacy_dataset): f.write(b'abcd') dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) + if use_legacy_dataset: assert set(map(str, paths)) == {x.path for x in dataset.pieces} else: - assert set(map(str, paths)) == set(dataset._dataset.files) + paths = [str(path.as_posix()) for path in paths] + assert set(paths) == set(dataset._dataset.files) @pytest.mark.pandas From 63d5acdd94917e0490e3d6d387fbefb6f5c22ac7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 31 Mar 2020 13:04:15 +0200 Subject: [PATCH 14/26] Update python/pyarrow/parquet.py Co-Authored-By: Benjamin Kietzman --- python/pyarrow/parquet.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index e01b912fd39..935a9effad3 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -108,12 +108,17 @@ def _filters_to_expression(filters): """ Check if filters are well-formed. - Predicates are expressed in disjunctive normal form (DNF). This means - that the innermost tuple describe a single column predicate. These - inner predicate make are all combined with a conjunction (AND) into a - larger predicate. The most outer list then combines all filters - with a disjunction (OR). By this, we should be able to express all - kinds of filters that are possible using boolean logic. + Predicates are expressed in disjunctive normal form (DNF), like + ``[[('x', '=', 0), ...], ...]``. DNF allows + arbitrary boolean logical combinations of single column predicates. The + innermost tuples each describe a single column predicate. The list + of inner predicates is interpreted as a conjunction (AND), forming a + more selective and multiple column predicate. Finally, the most outer + list combines these filters as a disjunction (OR). + + Predicates may also be passed as List[Tuple]. 
This form is interpreted + as a single conjunction. To express OR in predicates, one must + use the (preferred) List[List[Tuple]] notation. """ import pyarrow.dataset as ds From ce5166c2acc6a84481fe75dea537f4013503fd21 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 31 Mar 2020 13:05:00 +0200 Subject: [PATCH 15/26] Update python/pyarrow/parquet.py Co-Authored-By: Benjamin Kietzman --- python/pyarrow/parquet.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 935a9effad3..5795302459d 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -1400,12 +1400,8 @@ def read(self, columns=None, use_threads=True, use_pandas_metadata=False): metadata = self._dataset.schema.metadata if columns is not None and use_pandas_metadata: if metadata and b'pandas' in metadata: - index_columns = _get_pandas_index_columns(metadata) - - columns = list(columns) - for index_col in index_columns: - if index_col not in columns: - columns += [index_col] + index_columns = set(_get_pandas_index_columns(metadata)) + columns = columns + list(index_columns - set(columns)) table = self._dataset.to_table( columns=columns, filter=self._filter_expression, From cd972baffbf580891f5d46681990d94e8a496319 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 31 Mar 2020 13:05:39 +0200 Subject: [PATCH 16/26] Update python/pyarrow/tests/test_parquet.py Co-Authored-By: Benjamin Kietzman --- python/pyarrow/tests/test_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index aa31727be6c..70af8dbc85a 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -561,7 +561,7 @@ def _test_dataframe(size=10000, seed=0): return df -# TODO NativeFile support +# TODO(ARROW-8074) NativeFile support @pytest.mark.pandas @parametrize_legacy_dataset_skip_buffer def test_pandas_parquet_native_file_roundtrip(tempdir, use_legacy_dataset): From be7125b3658c595aa41b11aa45cdfbd75f57233d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 31 Mar 2020 13:06:18 +0200 Subject: [PATCH 17/26] Update python/pyarrow/tests/test_parquet.py Co-Authored-By: Benjamin Kietzman --- python/pyarrow/tests/test_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 70af8dbc85a..3395c337b11 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -1988,7 +1988,7 @@ def test_invalid_pred_op(tempdir, use_legacy_dataset): use_legacy_dataset=use_legacy_dataset) assert dataset.read().num_rows == 0 - with pytest.raises((ValueError, TypeError)): + with pytest.raises(ValueError if use_legacy_dataset else TypeError): # dataset API returns TypeError when trying create invalid comparison pq.ParquetDataset(base_path, filesystem=fs, From c5176d76f768bd72bbf91d09c854f855a0b093e4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 31 Mar 2020 13:34:56 +0200 Subject: [PATCH 18/26] consolidate filters docstring --- python/pyarrow/parquet.py | 61 +++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 5795302459d..c291e1c7451 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -104,21 +104,24 @@ def _check_filters(filters, 
check_null_strings=True): return filters +_DNF_filter_doc = """Predicates are expressed in disjunctive normal form (DNF), like + ``[[('x', '=', 0), ...], ...]``. DNF allows arbitrary boolean logical + combinations of single column predicates. The innermost tuples each + describe a single column predicate. The list of inner predicates is + interpreted as a conjunction (AND), forming a more selective and + multiple column predicate. Finally, the most outer list combines these + filters as a disjunction (OR). + + Predicates may also be passed as List[Tuple]. This form is interpreted + as a single conjunction. To express OR in predicates, one must + use the (preferred) List[List[Tuple]] notation.""" + + def _filters_to_expression(filters): """ Check if filters are well-formed. - Predicates are expressed in disjunctive normal form (DNF), like - ``[[('x', '=', 0), ...], ...]``. DNF allows - arbitrary boolean logical combinations of single column predicates. The - innermost tuples each describe a single column predicate. The list - of inner predicates is interpreted as a conjunction (AND), forming a - more selective and multiple column predicate. Finally, the most outer - list combines these filters as a disjunction (OR). - - Predicates may also be passed as List[Tuple]. This form is interpreted - as a single conjunction. To express OR in predicates, one must - use the (preferred) List[List[Tuple]] notation. + See _DNF_filter_doc above for more details. """ import pyarrow.dataset as ds @@ -1082,25 +1085,17 @@ class ParquetDataset: Check that individual file schemas are all the same / compatible. filters : List[Tuple] or List[List[Tuple]] or None (default) List of filters to apply, like ``[[('x', '=', 0), ...], ...]``. This - implements partition-level (hive) filtering only, i.e., to prevent the - loading of some files of the dataset. - - Predicates are expressed in disjunctive normal form (DNF). This means - that the innermost tuple describe a single column predicate. These - inner predicate make are all combined with a conjunction (AND) into a - larger predicate. The most outer list then combines all filters - with a disjunction (OR). By this, we should be able to express all - kinds of filters that are possible using boolean logic. - - This function also supports passing in as List[Tuple]. These predicates - are evaluated as a conjunction. To express OR in predicates, one must - use the (preferred) List[List[Tuple]] notation. + implements partition-level (hive) filtering, i.e., to prevent the + loading of some files of the dataset, as well as file-level filtering + (if `use_legacy_dataset` is set to False). + + {1} metadata_nthreads: int, default 1 How many threads to allow the thread pool which is used to read the dataset metadata. Increasing this is helpful to read partitioned datasets. -{} -""".format(_read_docstring_common) +{0} +""".format(_read_docstring_common, _DNF_filter_doc) def __new__(cls, path_or_paths=None, filesystem=None, schema=None, metadata=None, split_row_groups=False, validate_schema=True, @@ -1451,9 +1446,11 @@ def pieces(self): {1} filters : List[Tuple] or List[List[Tuple]] or None (default) List of filters to apply, like ``[[('x', '=', 0), ...], ...]``. This - implements partition-level (hive) filtering only, i.e., to prevent the - loading of some files of the dataset if `source` is a directory. - See the docstring of ParquetDataset for more details. 
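
With the keyword introduced in this series, the file-level filtering described in this docstring looks like the following sketch; the path and column name are illustrative:

    import pyarrow.parquet as pq

    # A List[Tuple] filter is read as a single conjunction (AND). With
    # use_legacy_dataset=False it is no longer limited to hive
    # partition keys.
    table = pq.read_table('path/to/dataset',
                          filters=[('x', '=', 0)],
                          use_legacy_dataset=False)
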
+ implements partition-level (hive) filtering, i.e., to prevent the + loading of some files of the dataset if `source` is a directory, as well + as file-level filtering (if `use_legacy_dataset` is set to False). + + {3} Returns ------- @@ -1514,7 +1511,8 @@ def read_table(source, columns=None, use_threads=True, metadata=None, If True and file has custom pandas schema metadata, ensure that index columns are also loaded""")), """pyarrow.Table - Content of the file as a table (of columns)""") + Content of the file as a table (of columns)""", + _DNF_filter_doc) def read_pandas(source, columns=None, use_threads=True, memory_map=False, @@ -1539,7 +1537,8 @@ def read_pandas(source, columns=None, use_threads=True, memory_map=False, _read_docstring_common, """pyarrow.Table Content of the file as a Table of Columns, including DataFrame - indexes as columns""") + indexes as columns""", + _DNF_filter_doc) def write_table(table, where, row_group_size=None, version='1.0', From 9e028be36167044430a6aea1ca66537f5e784d96 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 31 Mar 2020 13:47:02 +0200 Subject: [PATCH 19/26] support memory_map --- python/pyarrow/parquet.py | 27 +++++++++++++++------------ python/pyarrow/tests/test_parquet.py | 17 +++++------------ 2 files changed, 20 insertions(+), 24 deletions(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index c291e1c7451..a53144aa95f 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -1105,13 +1105,13 @@ def __new__(cls, path_or_paths=None, filesystem=None, schema=None, return _ParquetDatasetV2(path_or_paths, filesystem=filesystem, filters=filters, read_dictionary=read_dictionary, + memory_map=memory_map, buffer_size=buffer_size, # unsupported keywords schema=schema, metadata=metadata, split_row_groups=split_row_groups, validate_schema=validate_schema, - metadata_nthreads=metadata_nthreads, - memory_map=memory_map) + metadata_nthreads=metadata_nthreads) self = object.__new__(cls) return self @@ -1346,7 +1346,8 @@ class _ParquetDatasetV2: ParquetDataset shim using the Dataset API under the hood. 
""" def __init__(self, path_or_paths, filesystem=None, filters=None, - read_dictionary=None, buffer_size=None, **kwargs): + read_dictionary=None, buffer_size=None, memory_map=False, + **kwargs): import pyarrow.dataset as ds import pyarrow.fs @@ -1354,7 +1355,7 @@ def __init__(self, path_or_paths, filesystem=None, filters=None, for keyword, default in [ ("schema", None), ("metadata", None), ("split_row_groups", False), ("validate_schema", True), - ("metadata_nthreads", 1), ("memory_map", False)]: + ("metadata_nthreads", 1)]: if keyword in kwargs and kwargs[keyword] is not default: raise ValueError( "Keyword '{0}' is not yet supported with the new " @@ -1363,16 +1364,18 @@ def __init__(self, path_or_paths, filesystem=None, filters=None, # map old filesystems to new one # TODO(dataset) deal with other file systems if isinstance(filesystem, LocalFileSystem): - filesystem = pyarrow.fs.LocalFileSystem() + filesystem = pyarrow.fs.LocalFileSystem(use_mmap=memory_map) + elif filesystem is None and memory_map: + filesystem = pyarrow.fs.LocalFileSystem(use_mmap=True) # map additional arguments - reader_options = {} + read_options = {} if buffer_size: - reader_options.update(use_buffered_stream=True, - buffer_size=buffer_size) + read_options.update(use_buffered_stream=True, + buffer_size=buffer_size) if read_dictionary is not None: - reader_options.update(dict_columns=read_dictionary) - parquet_format = ds.ParquetFileFormat(reader_options=reader_options) + read_options.update(dict_columns=read_dictionary) + parquet_format = ds.ParquetFileFormat(read_options=read_options) dataset = ds.dataset(path_or_paths, filesystem=filesystem, format=parquet_format, partitioning="hive") @@ -1470,12 +1473,12 @@ def read_table(source, columns=None, use_threads=True, metadata=None, dataset = _ParquetDatasetV2( source, filesystem=filesystem, + memory_map=memory_map, read_dictionary=read_dictionary, buffer_size=buffer_size, filters=filters, # unsupported keywords - metadata=metadata, - memory_map=memory_map + metadata=metadata ) table = dataset.read(columns=columns, use_threads=use_threads, use_pandas_metadata=use_pandas_metadata) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 3395c337b11..4195f4b386b 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -247,14 +247,12 @@ def test_chunked_table_write(use_legacy_dataset): table, version='2.0', use_legacy_dataset=use_legacy_dataset) -# TODO(dataset) support memory map @pytest.mark.pandas -@parametrize_legacy_dataset_not_supported +@parametrize_legacy_dataset def test_memory_map(tempdir, use_legacy_dataset): df = alltypes_sample(size=10) table = pa.Table.from_pandas(df) - # TODO(dataset) memory_map is still ignored for now _check_roundtrip(table, read_table_kwargs={'memory_map': True}, version='2.0', use_legacy_dataset=use_legacy_dataset) @@ -2387,9 +2385,8 @@ def test_dataset_read_pandas(tempdir, use_legacy_dataset): tm.assert_frame_equal(result, expected) -# TODO(dataset) support memory map @pytest.mark.pandas -@parametrize_legacy_dataset_not_supported +@parametrize_legacy_dataset def test_dataset_memory_map(tempdir, use_legacy_dataset): # ARROW-2627: Check that we can use ParquetDataset with memory-mapping dirpath = tempdir / guid() @@ -2402,7 +2399,9 @@ def test_dataset_memory_map(tempdir, use_legacy_dataset): dataset = pq.ParquetDataset( dirpath, memory_map=True, use_legacy_dataset=use_legacy_dataset) - assert dataset.pieces[0].read().equals(table) + assert 
dataset.read().equals(table) + if use_legacy_dataset: + assert dataset.pieces[0].read().equals(table) @pytest.mark.pandas @@ -3860,11 +3859,5 @@ def test_dataset_unsupported_keywords(): with pytest.raises(ValueError, match="not yet supported with the new"): pq.ParquetDataset("", use_legacy_dataset=False, metadata_nthreads=4) - with pytest.raises(ValueError, match="not yet supported with the new"): - pq.ParquetDataset("", use_legacy_dataset=False, memory_map=True) - with pytest.raises(ValueError, match="not yet supported with the new"): pq.read_table("", use_legacy_dataset=False, metadata=pa.schema([])) - - with pytest.raises(ValueError, match="not yet supported with the new"): - pq.read_table("", use_legacy_dataset=False, memory_map=True) From 126e023b2455eac4c067292512212de8b6c99c99 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 31 Mar 2020 14:15:29 +0200 Subject: [PATCH 20/26] feedback --- python/pyarrow/_dataset.pyx | 6 ++- python/pyarrow/parquet.py | 12 +++--- python/pyarrow/tests/test_parquet.py | 61 +++++++++------------------- 3 files changed, 28 insertions(+), 51 deletions(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 71595baa339..a16eb9780dc 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -623,7 +623,7 @@ cdef class ParquetReadOptions: buffer_size : int, default 8192 Size of buffered stream, if enabled. Default is 8KB. dictionary_columns : list of string, default None - Names of columns which should be read as dictionaries. + Names of columns which should be read as dictionary type. """ cdef public: @@ -632,9 +632,11 @@ cdef class ParquetReadOptions: set dictionary_columns def __init__(self, bint use_buffered_stream=False, - uint32_t buffer_size=8192, + buffer_size=8192, dictionary_columns=None): self.use_buffered_stream = use_buffered_stream + if buffer_size <= 0: + raise ValueError("Buffer size must be larger than zero") self.buffer_size = buffer_size self.dictionary_columns = set(dictionary_columns or set()) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index a53144aa95f..5d468880667 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -1057,8 +1057,8 @@ def _open_dataset_file(dataset, path, meta=None): column chunks. Otherwise IO calls are unbuffered. use_legacy_dataset : bool, default True Set to False to enable the new code path (experimental, using the - new Arrow Dataset API). This allows to pass `filters` for all columns - and not only the partition keys.""" + new Arrow Dataset API). 
Among other things, this allows to pass + `filters` for all columns and not only the partition keys.""" class ParquetDataset: @@ -1374,13 +1374,11 @@ def __init__(self, path_or_paths, filesystem=None, filters=None, read_options.update(use_buffered_stream=True, buffer_size=buffer_size) if read_dictionary is not None: - read_options.update(dict_columns=read_dictionary) + read_options.update(dictionary_columns=read_dictionary) parquet_format = ds.ParquetFileFormat(read_options=read_options) - dataset = ds.dataset(path_or_paths, filesystem=filesystem, - format=parquet_format, partitioning="hive") - - self._dataset = dataset + self._dataset = ds.dataset(path_or_paths, filesystem=filesystem, + format=parquet_format, partitioning="hive") self._filters = filters if filters is not None: self._filter_expression = _filters_to_expression(filters) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 4195f4b386b..41d90d989d4 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -119,23 +119,10 @@ def _check_roundtrip(table, expected=None, read_table_kwargs=None, def _roundtrip_pandas_dataframe(df, write_kwargs, use_legacy_dataset=True): table = pa.Table.from_pandas(df) - - if use_legacy_dataset: - buf = io.BytesIO() - _write_table(table, buf, **write_kwargs) - - buf.seek(0) - table1 = _read_table(buf) - else: - from pyarrow.fs import _MockFileSystem - mockfs = _MockFileSystem() - with mockfs.open_output_stream("test") as out: - _write_table(table, out, **write_kwargs) - - table1 = _read_table( - "test", filesystem=mockfs, use_legacy_dataset=False) - - return table1.to_pandas() + result = _roundtrip_table( + table, write_table_kwargs=write_kwargs, + use_legacy_dataset=use_legacy_dataset) + return result.to_pandas() @parametrize_legacy_dataset @@ -2415,16 +2402,10 @@ def test_dataset_enable_buffered_stream(tempdir, use_legacy_dataset): table = pa.Table.from_pandas(df) _write_table(table, path, version='2.0') - # TODO(dataset) raises an OSError instead of ValueError - with pytest.raises((ValueError, OSError)): - if use_legacy_dataset: - pq.ParquetDataset( - dirpath, buffer_size=-64, - use_legacy_dataset=use_legacy_dataset) - else: - # Dataset API only raises when reading - pq.ParquetDataset( - dirpath, buffer_size=-64, use_legacy_dataset=False).read() + with pytest.raises(ValueError): + pq.ParquetDataset( + dirpath, buffer_size=-64, + use_legacy_dataset=use_legacy_dataset) for buffer_size in [128, 1024]: dataset = pq.ParquetDataset( @@ -2490,6 +2471,14 @@ def _make_example_multifile_dataset(base_path, nfiles=10, file_nrows=5): return paths +def _assert_dataset_paths(dataset, paths, use_legacy_dataset): + if use_legacy_dataset: + assert set(map(str, paths)) == {x.path for x in dataset.pieces} + else: + paths = [str(path.as_posix()) for path in paths] + assert set(paths) == set(dataset._dataset.files) + + @pytest.mark.pandas @parametrize_legacy_dataset @pytest.mark.parametrize('dir_prefix', ['_', '.']) @@ -2505,11 +2494,7 @@ def test_ignore_private_directories(tempdir, dir_prefix, use_legacy_dataset): dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) - if use_legacy_dataset: - assert set(map(str, paths)) == {x.path for x in dataset.pieces} - else: - paths = [str(path.as_posix()) for path in paths] - assert set(paths) == set(dataset._dataset.files) + _assert_dataset_paths(dataset, paths, use_legacy_dataset) @pytest.mark.pandas @@ -2529,11 +2514,7 @@ def test_ignore_hidden_files_dot(tempdir, 
use_legacy_dataset): dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) - if use_legacy_dataset: - assert set(map(str, paths)) == {x.path for x in dataset.pieces} - else: - paths = [str(path.as_posix()) for path in paths] - assert set(paths) == set(dataset._dataset.files) + _assert_dataset_paths(dataset, paths, use_legacy_dataset) @pytest.mark.pandas @@ -2553,11 +2534,7 @@ def test_ignore_hidden_files_underscore(tempdir, use_legacy_dataset): dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) - if use_legacy_dataset: - assert set(map(str, paths)) == {x.path for x in dataset.pieces} - else: - paths = [str(path.as_posix()) for path in paths] - assert set(paths) == set(dataset._dataset.files) + _assert_dataset_paths(dataset, paths, use_legacy_dataset) @pytest.mark.pandas From 16de7765818da1ab8f022f12b0d18635cd7d2a68 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 2 Apr 2020 10:33:02 +0200 Subject: [PATCH 21/26] enable different partitioning schemes --- python/pyarrow/parquet.py | 35 +++++++++++++++++++++------- python/pyarrow/tests/test_parquet.py | 34 +++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 8 deletions(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 5d468880667..32512cd1ad7 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -1055,10 +1055,17 @@ def _open_dataset_file(dataset, path, meta=None): buffer_size : int, default 0 If positive, perform read buffering when deserializing individual column chunks. Otherwise IO calls are unbuffered. +partitioning : Partitioning or str or list of str, default "hive" + The partitioning scheme for a partitioned dataset. The default of "hive" + assumes directory names with key=value pairs like "/year=2009/month=11". + In addition, a scheme like "/2009/11" is also supported, in which case + you need to specify the field names or a full schema. See the + ``pyarrow.dataset.partitioning()`` function for more details. use_legacy_dataset : bool, default True Set to False to enable the new code path (experimental, using the new Arrow Dataset API). 
Among other things, this allows to pass - `filters` for all columns and not only the partition keys.""" + `filters` for all columns and not only the partition keys, enables + different partitioning schemes, etc.""" class ParquetDataset: @@ -1100,10 +1107,12 @@ class ParquetDataset: def __new__(cls, path_or_paths=None, filesystem=None, schema=None, metadata=None, split_row_groups=False, validate_schema=True, filters=None, metadata_nthreads=1, read_dictionary=None, - memory_map=False, buffer_size=0, use_legacy_dataset=True): + memory_map=False, buffer_size=0, partitioning="hive", + use_legacy_dataset=True): if not use_legacy_dataset: return _ParquetDatasetV2(path_or_paths, filesystem=filesystem, filters=filters, + partitioning=partitioning, read_dictionary=read_dictionary, memory_map=memory_map, buffer_size=buffer_size, @@ -1118,7 +1127,12 @@ def __new__(cls, path_or_paths=None, filesystem=None, schema=None, def __init__(self, path_or_paths, filesystem=None, schema=None, metadata=None, split_row_groups=False, validate_schema=True, filters=None, metadata_nthreads=1, read_dictionary=None, - memory_map=False, buffer_size=0, use_legacy_dataset=True): + memory_map=False, buffer_size=0, partitioning="hive", + use_legacy_dataset=True): + if partitioning != "hive": + raise ValueError( + 'Only "hive" for hive-like partitioning is supported when ' + 'using use_legacy_dataset=True') self._metadata = _ParquetDatasetMetadata() a_path = path_or_paths if isinstance(a_path, list): @@ -1346,8 +1360,8 @@ class _ParquetDatasetV2: ParquetDataset shim using the Dataset API under the hood. """ def __init__(self, path_or_paths, filesystem=None, filters=None, - read_dictionary=None, buffer_size=None, memory_map=False, - **kwargs): + partitioning="hive", read_dictionary=None, buffer_size=None, + memory_map=False, **kwargs): import pyarrow.dataset as ds import pyarrow.fs @@ -1366,6 +1380,8 @@ def __init__(self, path_or_paths, filesystem=None, filters=None, if isinstance(filesystem, LocalFileSystem): filesystem = pyarrow.fs.LocalFileSystem(use_mmap=memory_map) elif filesystem is None and memory_map: + # if memory_map is specified, assume local file system (string + # path can in principle be URI for any filesystem) filesystem = pyarrow.fs.LocalFileSystem(use_mmap=True) # map additional arguments @@ -1378,7 +1394,8 @@ def __init__(self, path_or_paths, filesystem=None, filters=None, parquet_format = ds.ParquetFileFormat(read_options=read_options) self._dataset = ds.dataset(path_or_paths, filesystem=filesystem, - format=parquet_format, partitioning="hive") + format=parquet_format, + partitioning=partitioning) self._filters = filters if filters is not None: self._filter_expression = _filters_to_expression(filters) @@ -1462,7 +1479,7 @@ def pieces(self): def read_table(source, columns=None, use_threads=True, metadata=None, use_pandas_metadata=False, memory_map=False, read_dictionary=None, filesystem=None, filters=None, - buffer_size=0, use_legacy_dataset=True): + buffer_size=0, partitioning="hive", use_legacy_dataset=True): if not use_legacy_dataset: if not _is_path_like(source): raise ValueError("File-like objects are not yet supported with " @@ -1471,6 +1488,7 @@ def read_table(source, columns=None, use_threads=True, metadata=None, dataset = _ParquetDatasetV2( source, filesystem=filesystem, + partitioning=partitioning, memory_map=memory_map, read_dictionary=read_dictionary, buffer_size=buffer_size, @@ -1495,7 +1513,8 @@ def read_table(source, columns=None, use_threads=True, metadata=None, pf = ParquetDataset(source, 
metadata=metadata, memory_map=memory_map, read_dictionary=read_dictionary, buffer_size=buffer_size, - filesystem=filesystem, filters=filters) + filesystem=filesystem, filters=filters, + partitioning=partitioning) else: pf = ParquetFile(source, metadata=metadata, read_dictionary=read_dictionary, diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 41d90d989d4..615668d8e7c 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -3838,3 +3838,37 @@ def test_dataset_unsupported_keywords(): with pytest.raises(ValueError, match="not yet supported with the new"): pq.read_table("", use_legacy_dataset=False, metadata=pa.schema([])) + + +@pytest.mark.dataset +def test_dataset_partitioning(tempdir): + import pyarrow.dataset as ds + + # create small dataset with directory partitioning + root_path = tempdir / "test_partitioning" + (root_path / "2012" / "10" / "01").mkdir(parents=True) + + table = pa.table({'a': [1, 2, 3]}) + pq.write_table( + table, str(root_path / "2012" / "10" / "01" / "data.parquet")) + + # This works with new dataset API + + # read_table + part = ds.partitioning(field_names=["year", "month", "day"]) + result = pq.read_table( + str(root_path), partitioning=part, use_legacy_dataset=False) + assert result.column_names == ["a", "year", "month", "day"] + + result = pq.ParquetDataset( + str(root_path), partitioning=part, use_legacy_dataset=False).read() + assert result.column_names == ["a", "year", "month", "day"] + + # This raises an error for legacy dataset + with pytest.raises(ValueError): + pq.read_table( + str(root_path), partitioning=part, use_legacy_dataset=True) + + with pytest.raises(ValueError): + pq.ParquetDataset( + str(root_path), partitioning=part, use_legacy_dataset=True) From 9650f655f6b27a3585f19d28b65dc1c1f86f771f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 2 Apr 2020 10:44:33 +0200 Subject: [PATCH 22/26] remove ARROW:schema removal from metadata in read_table for new API --- python/pyarrow/parquet.py | 14 ++------------ python/pyarrow/tests/test_parquet.py | 12 ++++++++++-- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 32512cd1ad7..b4af232cdc4 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -1496,18 +1496,8 @@ def read_table(source, columns=None, use_threads=True, metadata=None, # unsupported keywords metadata=metadata ) - table = dataset.read(columns=columns, use_threads=use_threads, - use_pandas_metadata=use_pandas_metadata) - - # remove ARROW:schema metadata, current parquet version doesn't - # preserve this - metadata = table.schema.metadata - if metadata: - metadata.pop(b"ARROW:schema", None) - if len(metadata) == 0: - metadata = None - table = table.replace_schema_metadata(metadata) - return table + return dataset.read(columns=columns, use_threads=use_threads, + use_pandas_metadata=use_pandas_metadata) if _is_path_like(source): pf = ParquetDataset(source, metadata=metadata, memory_map=memory_map, diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 615668d8e7c..71ece06bd5c 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -192,7 +192,11 @@ def test_pandas_parquet_2_0_roundtrip(tempdir, chunk_size, use_legacy_dataset): use_threads=use_threads) assert table_read.schema.pandas_metadata is not None - assert arrow_table.schema.metadata == table_read.schema.metadata + 
read_metadata = table_read.schema.metadata + if not use_legacy_dataset: + read_metadata.pop(b"ARROW:schema") + + assert arrow_table.schema.metadata == read_metadata df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @@ -426,7 +430,11 @@ def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written( js = table_read.schema.pandas_metadata assert not js['index_columns'] - assert arrow_table.schema.metadata == table_read.schema.metadata + read_metadata = table_read.schema.metadata + if not use_legacy_dataset: + read_metadata.pop(b"ARROW:schema") + + assert arrow_table.schema.metadata == read_metadata df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) From 608b6d4717b05c280cf8db50a7c283c5a8858f32 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 9 Apr 2020 13:52:06 +0200 Subject: [PATCH 23/26] Apply suggestions from code review Co-Authored-By: Benjamin Kietzman --- python/pyarrow/_dataset.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index a16eb9780dc..231588ab074 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -623,7 +623,8 @@ cdef class ParquetReadOptions: buffer_size : int, default 8192 Size of buffered stream, if enabled. Default is 8KB. dictionary_columns : list of string, default None - Names of columns which should be read as dictionary type. + Names of columns which should be dictionary encoded as + they are read. """ cdef public: From a2c80f83bc988efca7b0611026a54d3196c3fb23 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 9 Apr 2020 14:03:48 +0200 Subject: [PATCH 24/26] Apply suggestions from code review Co-Authored-By: Benjamin Kietzman --- python/pyarrow/tests/test_parquet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 71ece06bd5c..7ff4df0dd21 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -2673,7 +2673,7 @@ def _test_write_to_dataset_with_partitions(base_path, if use_legacy_dataset: dataset_cols = set(dataset.schema.to_arrow_schema().names) else: - # TODO(dataset) schema property is an arrow and not parquet schema + # NB schema property is an arrow and not parquet schema dataset_cols = set(dataset.schema.names) assert dataset_cols == set(output_table.schema.names) @@ -2813,7 +2813,7 @@ def test_large_table_int32_overflow(): _write_table(table, f) -# TODO buffer support +# TODO(ARROW-8074) buffer support def _simple_table_roundtrip(table, **write_kwargs): stream = pa.BufferOutputStream() _write_table(table, stream, **write_kwargs) From 9e721f4ae070268c51944c3140913c1b46da1a84 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 9 Apr 2020 14:29:25 +0200 Subject: [PATCH 25/26] update docstrings --- python/pyarrow/parquet.py | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index b4af232cdc4..8586366d940 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -1091,10 +1091,13 @@ class ParquetDataset: validate_schema : bool, default True Check that individual file schemas are all the same / compatible. filters : List[Tuple] or List[List[Tuple]] or None (default) - List of filters to apply, like ``[[('x', '=', 0), ...], ...]``. 
This - implements partition-level (hive) filtering, i.e., to prevent the - loading of some files of the dataset, as well as file-level filtering - (if `use_legacy_dataset` is set to False). + Rows which do not match the filter predicate will be removed from scanned + data. Partition keys embedded in a nested directory structure will be + exploited to avoid loading files at all if they contain no matching rows. + If `use_legacy_dataset` is True, filters can only reference partition + keys and only a hive-style directory structure is supported. When + setting `use_legacy_dataset` to False, also within-file level filtering + and different partitioning schemes are supported. {1} metadata_nthreads: int, default 1 @@ -1407,7 +1410,24 @@ def schema(self): return self._dataset.schema def read(self, columns=None, use_threads=True, use_pandas_metadata=False): + """ + Read (multiple) Parquet files as a single pyarrow.Table. + Parameters + ---------- + columns : List[str] + Names of columns to read from the dataset. + use_threads : bool, default True + Perform multi-threaded column reads. + use_pandas_metadata : bool, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + + Returns + ------- + pyarrow.Table + Content of the file as a table (of columns). + """ # if use_pandas_metadata, we need to include index columns in the # column selection, to be able to restore those in the pandas DataFrame metadata = self._dataset.schema.metadata @@ -1463,10 +1483,13 @@ def pieces(self): If separately computed {1} filters : List[Tuple] or List[List[Tuple]] or None (default) - List of filters to apply, like ``[[('x', '=', 0), ...], ...]``. This - implements partition-level (hive) filtering, i.e., to prevent the - loading of some files of the dataset if `source` is a directory, as well - as file-level filtering (if `use_legacy_dataset` is set to False). + Rows which do not match the filter predicate will be removed from scanned + data. Partition keys embedded in a nested directory structure will be + exploited to avoid loading files at all if they contain no matching rows. + If `use_legacy_dataset` is True, filters can only reference partition + keys and only a hive-style directory structure is supported. When + setting `use_legacy_dataset` to False, also within-file level filtering + and different partitioning schemes are supported. 
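# Illustrative sketch (not taken from this patch) of the DNF filter notation
# described above; the dataset path "dataset_root/", the partition key "year"
# and the column "x" are hypothetical.
import pyarrow.parquet as pq

filters = [
    [("year", "=", 2020), ("x", ">", 5)],  # inner list: year == 2020 AND x > 5
    [("year", "=", 2019)],                 # outer list: ... OR year == 2019
]
# With use_legacy_dataset=False the filter may reference any column, not only
# partition keys, and is also applied within files.
table = pq.read_table("dataset_root/", filters=filters,
                      use_legacy_dataset=False)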
{3} From 9cbaf3c71909d660030e2f9f1a1afd8462c2c932 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 9 Apr 2020 14:37:00 +0200 Subject: [PATCH 26/26] deterministic_row_order helper function --- python/pyarrow/tests/test_parquet.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 7ff4df0dd21..89131b428da 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -65,6 +65,12 @@ def datadir(datadir): "use_legacy_dataset", [True, pytest.param(False, marks=pytest.mark.skip)]) +def deterministic_row_order(use_legacy_dataset, chunk_size=None): + # TODO(datasets) ensure to use use_threads=False with the new dataset API + # in the tests because otherwise the row order is not deterministic + return False if not use_legacy_dataset and chunk_size is not None else True + + def _write_table(table, path, **kwargs): # So we see the ImportError somewhere import pyarrow.parquet as pq @@ -184,9 +190,7 @@ def test_pandas_parquet_2_0_roundtrip(tempdir, chunk_size, use_legacy_dataset): _write_table(arrow_table, filename, version="2.0", coerce_timestamps='ms', chunk_size=chunk_size) - # TODO(datasets) - use_threads = ( - False if not use_legacy_dataset and chunk_size is not None else True) + use_threads = deterministic_row_order(use_legacy_dataset, chunk_size) table_read = pq.read_pandas( filename, use_legacy_dataset=use_legacy_dataset, use_threads=use_threads) @@ -2678,10 +2682,8 @@ def _test_write_to_dataset_with_partitions(base_path, assert dataset_cols == set(output_table.schema.names) - # TODO(datasets) changed to use_threads=False because otherwise the - # row order is not deterministic - kwargs = dict(use_threads=False) if not use_legacy_dataset else {} - input_table = dataset.read(**kwargs) + use_threads = deterministic_row_order(use_legacy_dataset) + input_table = dataset.read(use_threads=use_threads) input_df = input_table.to_pandas() # Read data back in and compare with original DataFrame
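# Illustrative sketch (not taken from this patch) of the behaviour the
# deterministic_row_order() helper above accounts for: with the new dataset
# API, multi-threaded reads over several files may return rows in a
# non-deterministic order, so the tests compare with use_threads=False.
# The temporary directory and column name below are hypothetical.
import os
import tempfile

import pyarrow as pa
import pyarrow.parquet as pq

d = tempfile.mkdtemp()
pq.write_table(pa.table({"a": [1, 2]}), os.path.join(d, "part-0.parquet"))
pq.write_table(pa.table({"a": [3, 4]}), os.path.join(d, "part-1.parquet"))

# Single-threaded read keeps a stable file order, so comparisons against the
# written data are reproducible.
result = pq.read_table(d, use_threads=False, use_legacy_dataset=False)
assert result.num_rows == 4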