From 457fb4462d2b4df899d00ac5a417c7c571bfa992 Mon Sep 17 00:00:00 2001 From: Dmitry Mottl Date: Tue, 6 May 2025 14:40:32 +0700 Subject: [PATCH 1/6] Fix docstring for pyarrow.parquet.read_table --- python/pyarrow/parquet/core.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index f5a472c9a9b..5a36d578bdc 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -1626,10 +1626,12 @@ def partitioning(self): Parameters ---------- -source : str, pyarrow.NativeFile, or file-like object - If a string passed, can be a single file name or directory name. For - file-like objects, only read a single file. Use pyarrow.BufferReader to - read a file contained in a bytes or buffer-like object. +source : str, List[str], pyarrow.NativeFile, or file-like object + The source to read data from. + If a single string is passed, it can be a single file name or directory name. + If a list of strings is passed, each string should be a file name. + For file-like objects, only read a single file. + Use pyarrow.BufferReader to read a file contained in a bytes or buffer-like object. columns : list If not None, only these columns will be read from the file. A column name may be a prefix of a nested field, e.g. 'a' will select 'a.b', From 22b4a85606505ec81853b4caec94250b3ae07e4e Mon Sep 17 00:00:00 2001 From: Dmitry Mottl Date: Tue, 6 May 2025 19:58:27 +0700 Subject: [PATCH 2/6] Fix docstring --- python/pyarrow/parquet/core.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 5a36d578bdc..a658dfaef66 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -1626,11 +1626,10 @@ def partitioning(self): Parameters ---------- -source : str, List[str], pyarrow.NativeFile, or file-like object - The source to read data from. 
- If a single string is passed, it can be a single file name or directory name. - If a list of strings is passed, each string should be a file name. - For file-like objects, only read a single file. +source : str, pyarrow.NativeFile, or file-like object + If a string is passed, it should be a single file name. + If the dataset module is enabled, you can also pass a directory name or a list + of file names. Use pyarrow.BufferReader to read a file contained in a bytes or buffer-like object. columns : list If not None, only these columns will be read from the file. A column From db591783513c4eb38c5015c072ffca8ae3af29e0 Mon Sep 17 00:00:00 2001 From: Dmitry Mottl Date: Tue, 6 May 2025 20:39:35 +0700 Subject: [PATCH 3/6] Add type checks for source argument in read_table --- python/pyarrow/parquet/core.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index a658dfaef66..e91ce07a235 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -1826,7 +1826,14 @@ def read_table(source, *, columns=None, use_threads=True, filesystem, path = _resolve_filesystem_and_path(source, filesystem) if filesystem is not None: source = filesystem.open_input_file(path) - # TODO test that source is not a directory or a list + if not ( + isinstance(source, str) + or isinstance(source, pa.NativeFile) + or hasattr(source, "read") + ): + raise ValueError( + "source should be a file name, a pyarrow.NativeFile or a file-like object" + ) dataset = ParquetFile( source, read_dictionary=read_dictionary, memory_map=memory_map, buffer_size=buffer_size, From e93d845000ebe14e60df11b016bcd951b3742193 Mon Sep 17 00:00:00 2001 From: Dmitry Mottl Date: Fri, 9 May 2025 17:41:07 +0700 Subject: [PATCH 4/6] Update python/pyarrow/parquet/core.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Raúl Cumplido --- python/pyarrow/parquet/core.py | 3 
++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index e91ce07a235..1c58ce9c007 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -1832,7 +1832,8 @@ def read_table(source, *, columns=None, use_threads=True, or hasattr(source, "read") ): raise ValueError( - "source should be a file name, a pyarrow.NativeFile or a file-like object" + "source should be a file name, a pyarrow.NativeFile or a file-like object " + "when the pyarrow.dataset module is not available" ) dataset = ParquetFile( source, read_dictionary=read_dictionary, From 06ba561316c98c0b3ff79b4a768e51018aaf048b Mon Sep 17 00:00:00 2001 From: Dmitry Mottl Date: Fri, 9 May 2025 18:17:22 +0700 Subject: [PATCH 5/6] Add directory check --- python/pyarrow/parquet/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 1c58ce9c007..0b89948ef36 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -1827,7 +1827,7 @@ def read_table(source, *, columns=None, use_threads=True, if filesystem is not None: source = filesystem.open_input_file(path) if not ( - isinstance(source, str) + (isinstance(source, str) and not os.path.isdir(source)) or isinstance(source, pa.NativeFile) or hasattr(source, "read") ): From a7818902ea6e7b46b7faccd4ff3f8477727ec129 Mon Sep 17 00:00:00 2001 From: Dmitry Mottl Date: Fri, 9 May 2025 18:32:57 +0700 Subject: [PATCH 6/6] linting --- python/pyarrow/parquet/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 0b89948ef36..4c2c8cba0b4 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -1832,8 +1832,8 @@ def read_table(source, *, columns=None, use_threads=True, or hasattr(source, "read") ): raise ValueError( - "source should be a file name, a 
pyarrow.NativeFile or a file-like object " - "when the pyarrow.dataset module is not available" + "source should be a file name, a pyarrow.NativeFile or a file-like " + "object when the pyarrow.dataset module is not available" ) dataset = ParquetFile( source, read_dictionary=read_dictionary,