diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt
index 9f6c99b132e..e18773a662a 100644
--- a/ci/conda_env_sphinx.txt
+++ b/ci/conda_env_sphinx.txt
@@ -19,8 +19,9 @@
 breathe
 doxygen
 ipython
-sphinx>=4.2
+numpydoc
 pydata-sphinx-theme
+sphinx>=4.2
 # Unable to install sphinx-tabs from conda-forge due to:
 # - package sphinx-tabs-1.2.1-py_0 requires sphinx >=2,<4, but none of the providers can be installed
diff --git a/docs/source/conf.py b/docs/source/conf.py
index f77ecd61fed..2b0afcdcbf1 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -61,17 +61,18 @@
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
 extensions = [
+    'breathe',
+    'IPython.sphinxext.ipython_console_highlighting',
+    'IPython.sphinxext.ipython_directive',
+    'numpydoc',
+    'sphinx_tabs.tabs',
     'sphinx.ext.autodoc',
     'sphinx.ext.autosummary',
     'sphinx.ext.doctest',
     'sphinx.ext.ifconfig',
+    'sphinx.ext.intersphinx',
     'sphinx.ext.mathjax',
     'sphinx.ext.viewcode',
-    'sphinx.ext.napoleon',
-    'IPython.sphinxext.ipython_directive',
-    'IPython.sphinxext.ipython_console_highlighting',
-    'breathe',
-    'sphinx_tabs.tabs'
 ]
 
 # Show members for classes in .. autosummary
@@ -93,7 +94,35 @@
 ipython_mplbackend = ''
 
 # numpydoc configuration
-napoleon_use_rtype = False
+numpydoc_xref_param_type = True
+numpydoc_show_class_members = False
+numpydoc_xref_ignore = {
+    "or", "and", "of", "if", "default", "optional", "object",
+    "dicts", "rows", "Python", "source", "filesystem",
+    "dataset", "datasets",
+    # TODO those one could be linked to a glossary or python docs?
+    "file", "path", "paths", "mapping", "Mapping", "URI", "function",
+    "iterator", "Iterator",
+    # TODO this term is used regularly, but isn't actually exposed (base class)
+    "RecordBatchReader",
+    # additional ignores that could be fixed by rewriting the docstrings
+    "other", "supporting", "buffer", "protocol",  # from Codec / pa.compress
+    "depends", "on", "inputs",  # pyarrow.compute
+    "values", "coercible", "to", "arrays",  # pa.chunked_array, Table methods
+    "depending",  # to_pandas
+}
+numpydoc_xref_aliases = {
+    "array-like": ":func:`array-like `",
+    "Array": "pyarrow.Array",
+    "Schema": "pyarrow.Schema",
+    "RecordBatch": "pyarrow.RecordBatch",
+    "Table": "pyarrow.Table",
+    "MemoryPool": "pyarrow.MemoryPool",
+    "NativeFile": "pyarrow.NativeFile",
+    "FileSystem": "pyarrow.fs.FileSystem",
+    "FileType": "pyarrow.fs.FileType",
+}
+
 
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
@@ -184,6 +213,12 @@
 # If true, `todo` and `todoList` produce output, else they produce nothing.
 todo_include_todos = False
 
+intersphinx_mapping = {
+    'python': ('https://docs.python.org/3', None),
+    'numpy': ('https://numpy.org/doc/stable/', None),
+    'pandas': ('https://pandas.pydata.org/docs/', None)
+}
+
 
 # -- Options for HTML output ----------------------------------------------
diff --git a/docs/source/developers/guide/step_by_step/building.rst b/docs/source/developers/guide/step_by_step/building.rst
index 46b15f1891c..372d9582016 100644
--- a/docs/source/developers/guide/step_by_step/building.rst
+++ b/docs/source/developers/guide/step_by_step/building.rst
@@ -84,7 +84,7 @@ documentation for any similar error advice. Also changing the CMake flags for
 compiling Arrow could be useful.
 
 CMake presets
-^^^^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^
 
 You could also try to build with CMake presets which are a collection of
 build and test recipes for Arrow's CMake. They are a very useful
@@ -135,6 +135,7 @@ Building other Arrow libraries
 process.
 
 .. seealso::
+   Follow the instructions to build PyArrow together with the C++ library
 
    - :ref:`build_pyarrow`
diff --git a/docs/source/python/api/dataset.rst b/docs/source/python/api/dataset.rst
index 2211cad0e1c..821b536c856 100644
--- a/docs/source/python/api/dataset.rst
+++ b/docs/source/python/api/dataset.rst
@@ -51,6 +51,7 @@ Classes
     CsvFragmentScanOptions
     IpcFileFormat
     ParquetFileFormat
+    ParquetReadOptions
     ParquetFragmentScanOptions
     ORCFileFormat
     Partitioning
@@ -62,5 +63,7 @@ Classes
     FileSystemFactoryOptions
     FileSystemDatasetFactory
     UnionDataset
+    Fragment
+    FragmentScanOptions
     Scanner
     Expression
diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx
index 8a41e79c259..5cf14230c2a 100644
--- a/python/pyarrow/_csv.pyx
+++ b/python/pyarrow/_csv.pyx
@@ -493,7 +493,7 @@ cdef class ConvertOptions(_Weakrefable):
     ----------
     check_utf8 : bool, optional (default True)
         Whether to check UTF8 validity of string columns.
-    column_types : pa.Schema or dict, optional
+    column_types : pyarrow.Schema or dict, optional
         Explicitly map column names to column types. Passing this argument
         disables type inference on the defined columns.
     null_values : list, optional
diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index db18e59b47d..f3c2220a55d 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -318,7 +318,7 @@ cdef class Dataset(_Weakrefable):
 
         Returns
         -------
-        table : Table instance
+        Table
         """
         return self.scanner(**kwargs).to_table()
 
@@ -329,7 +329,7 @@
 
         Returns
         -------
-        table : Table instance
+        Table
         """
         return self.scanner(**kwargs).take(indices)
 
@@ -340,7 +340,7 @@
 
         Returns
         -------
-        table : Table instance
+        Table
         """
         return self.scanner(**kwargs).head(num_rows)
 
@@ -578,7 +578,7 @@ cdef class FileSystemDataset(Dataset):
         ParquetFileFormat, IpcFileFormat, and CsvFileFormat are supported.
     filesystem : FileSystem
         The filesystem which files are from.
-    partitions : List[Expression], optional
+    partitions : list[Expression], optional
         Attach additional partition information for the file paths.
     root_partition : Expression, optional
         The top-level partition of the DataDataset.
@@ -886,7 +886,7 @@ cdef class Fragment(_Weakrefable):
 
         Returns
         -------
-        table : Table instance
+        Table
         """
         return self.scanner(**kwargs).take(indices)
 
@@ -897,7 +897,7 @@
 
         Returns
         -------
-        table : Table instance
+        Table
         """
         return self.scanner(**kwargs).head(num_rows)
 
@@ -1059,11 +1059,11 @@ cdef class CsvFileFormat(FileFormat):
 
     Parameters
     ----------
-    parse_options : csv.ParseOptions
+    parse_options : pyarrow.csv.ParseOptions
         Options regarding CSV parsing.
-    convert_options : csv.ConvertOptions
+    convert_options : pyarrow.csv.ConvertOptions
         Options regarding value conversion.
-    read_options : csv.ReadOptions
+    read_options : pyarrow.csv.ReadOptions
         General read options.
     default_fragment_scan_options : CsvFragmentScanOptions
         Default options for fragments scan.
@@ -1142,9 +1142,9 @@ cdef class CsvFragmentScanOptions(FragmentScanOptions):
 
     Parameters
     ----------
-    convert_options : csv.ConvertOptions
+    convert_options : pyarrow.csv.ConvertOptions
         Options regarding value conversion.
-    read_options : csv.ReadOptions
+    read_options : pyarrow.csv.ReadOptions
         General read options.
""" @@ -1313,7 +1313,7 @@ cdef class DirectoryPartitioning(Partitioning): ---------- schema : Schema The schema that describes the partitions present in the file path. - dictionaries : Dict[str, Array] + dictionaries : dict[str, Array] If the type of any field of `schema` is a dictionary type, the corresponding entry of `dictionaries` must be an array containing every value which may be taken by the corresponding column or an @@ -1459,7 +1459,7 @@ cdef class HivePartitioning(Partitioning): ---------- schema : Schema The schema that describes the partitions present in the file path. - dictionaries : Dict[str, Array] + dictionaries : dict[str, Array] If the type of any field of `schema` is a dictionary type, the corresponding entry of `dictionaries` must be an array containing every value which may be taken by the corresponding column or an @@ -1802,7 +1802,7 @@ cdef class FileSystemDatasetFactory(DatasetFactory): ---------- filesystem : pyarrow.fs.FileSystem Filesystem to discover. - paths_or_selector : pyarrow.fs.Selector or list of path-likes + paths_or_selector : pyarrow.fs.FileSelector or list of path-likes Either a Selector object or a list of path-like objects. format : FileFormat Currently only ParquetFileFormat and IpcFileFormat are supported. @@ -2301,7 +2301,7 @@ cdef class Scanner(_Weakrefable): Returns ------- - table : Table + Table """ cdef CResult[shared_ptr[CTable]] result @@ -2319,7 +2319,7 @@ cdef class Scanner(_Weakrefable): Returns ------- - table : Table + Table """ cdef CResult[shared_ptr[CTable]] result cdef shared_ptr[CArray] c_indices @@ -2337,7 +2337,7 @@ cdef class Scanner(_Weakrefable): Returns ------- - table : Table instance + Table """ cdef CResult[shared_ptr[CTable]] result with nogil: diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index bb59d1852c4..c58e032213c 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -332,8 +332,9 @@ cdef class FileSystem(_Weakrefable): Returns ------- - With (filesystem, path) tuple where path is the abstract path inside - the FileSystem instance. + tuple of (FileSystem, str path) + With (filesystem, path) tuple where path is the abstract path + inside the FileSystem instance. """ cdef: c_string c_path @@ -945,7 +946,8 @@ class FileSystemHandler(ABC): Parameters ---------- - paths : paths for which we want to retrieve the info. + paths : list of str + paths for which we want to retrieve the info. """ @abstractmethod @@ -955,7 +957,8 @@ class FileSystemHandler(ABC): Parameters ---------- - selector : selector for which we want to retrieve the info. + selector : FileSelector + selector for which we want to retrieve the info. """ @abstractmethod @@ -965,8 +968,10 @@ class FileSystemHandler(ABC): Parameters ---------- - path : path of the directory. - recursive : if the parent directories should be created too. + path : str + path of the directory. + recursive : bool + if the parent directories should be created too. """ @abstractmethod @@ -976,7 +981,8 @@ class FileSystemHandler(ABC): Parameters ---------- - path : path of the directory. + path : str + path of the directory. """ @abstractmethod @@ -986,7 +992,8 @@ class FileSystemHandler(ABC): Parameters ---------- - path : path of the directory. + path : str + path of the directory. """ @abstractmethod @@ -1002,7 +1009,8 @@ class FileSystemHandler(ABC): Parameters ---------- - path : path of the file. + path : str + path of the file. 
""" @abstractmethod @@ -1012,8 +1020,10 @@ class FileSystemHandler(ABC): Parameters ---------- - src : path of what should be moved. - dest : path of where it should be moved to. + src : str + path of what should be moved. + dest : str + path of where it should be moved to. """ @abstractmethod @@ -1023,8 +1033,10 @@ class FileSystemHandler(ABC): Parameters ---------- - src : path of what should be copied. - dest : path of where it should be copied to. + src : str + path of what should be copied. + dest : str + path of where it should be copied to. """ @abstractmethod @@ -1034,7 +1046,8 @@ class FileSystemHandler(ABC): Parameters ---------- - path : path of what should be opened. + path : str + path of what should be opened. """ @abstractmethod @@ -1044,7 +1057,8 @@ class FileSystemHandler(ABC): Parameters ---------- - path : path of what should be opened. + path : str + path of what should be opened. """ @abstractmethod @@ -1054,8 +1068,10 @@ class FileSystemHandler(ABC): Parameters ---------- - path : path of what should be opened. - metadata : mapping of string keys to string values. + path : str + path of what should be opened. + metadata : mapping + Mapping of string keys to string values. Some filesystems support storing metadata along the file (such as "Content-Type"). """ @@ -1067,8 +1083,10 @@ class FileSystemHandler(ABC): Parameters ---------- - path : path of what should be opened. - metadata : mapping of string keys to string values. + path : str + path of what should be opened. + metadata : mapping + Mapping of string keys to string values. Some filesystems support storing metadata along the file (such as "Content-Type"). """ @@ -1080,7 +1098,8 @@ class FileSystemHandler(ABC): Parameters ---------- - path : path of what should be normalized. + path : str + path of what should be normalized. """ # Callback definitions for CPyFileSystemVtable diff --git a/python/pyarrow/_plasma.pyx b/python/pyarrow/_plasma.pyx index e38c81f8026..7aeb9e83851 100644 --- a/python/pyarrow/_plasma.pyx +++ b/python/pyarrow/_plasma.pyx @@ -357,9 +357,10 @@ cdef class PlasmaClient(_Weakrefable): This exception is raised if the object could not be created because there already is an object with the same ID in the plasma store. - PlasmaStoreFull: This exception is raised if the object could - not be created because the plasma store is unable to evict - enough objects to create room for it. + PlasmaStoreFull + This exception is raised if the object could + not be created because the plasma store is unable to evict + enough objects to create room for it. """ cdef shared_ptr[CBuffer] data with nogil: @@ -498,7 +499,8 @@ cdef class PlasmaClient(_Weakrefable): Returns ------- - The object ID associated to the Python buffer object. + ObjectID + The object ID associated to the Python buffer object. """ cdef ObjectID target_id = (object_id if object_id else ObjectID.from_random()) @@ -530,7 +532,8 @@ cdef class PlasmaClient(_Weakrefable): Returns ------- - The object ID associated to the Python object. + ObjectID + The object ID associated to the Python object. """ cdef ObjectID target_id = (object_id if object_id else ObjectID.from_random()) diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx index d96eb922672..4d919c8b9d6 100644 --- a/python/pyarrow/_s3fs.pyx +++ b/python/pyarrow/_s3fs.pyx @@ -134,7 +134,7 @@ cdef class S3FileSystem(FileSystem): background_writes : boolean, default True Whether file writes will be issued in the background, without blocking. 
-    default_metadata : mapping or KeyValueMetadata, default None
+    default_metadata : mapping or pyarrow.KeyValueMetadata, default None
         Default metadata for open_output_stream. This will be ignored if
         non-empty metadata is passed to open_output_stream.
     proxy_options : dict or str, default None
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 10ead0e6a95..7edba0f1ade 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -121,7 +121,7 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None,
 
     Parameters
     ----------
-    obj : sequence, iterable, ndarray or Series
+    obj : sequence, iterable, ndarray or pandas.Series
         If both type and size are specified may be a single use iterable. If
         not strongly-typed, Arrow type will be inferred for resulting array.
     type : pyarrow.DataType
@@ -711,7 +711,7 @@ cdef class _PandasConvertible(_Weakrefable):
             useful if you have timestamps that don't fit in the normal date
            range of nanosecond timestamps (1678 CE-2262 CE). If False, all
            timestamps are converted to datetime64[ns] dtype.
-        use_threads: bool, default True
+        use_threads : bool, default True
             Whether to parallelize the conversion using multiple threads.
         deduplicate_objects : bool, default False
             Do not create multiple copies Python objects when created, to save
@@ -872,7 +872,8 @@ cdef class Array(_PandasConvertible):
 
         Returns
         -------
-        An array of structs
+        StructArray
+            An array of structs
         """
         return _pc().call_function('value_counts', [self])
 
@@ -2515,7 +2516,8 @@ def concat_arrays(arrays, MemoryPool memory_pool=None):
 
     Raises
     ------
-    ArrowInvalid : if not all of the arrays have the same type.
+    ArrowInvalid
+        If not all of the arrays have the same type.
 
     Parameters
     ----------
diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py
index 11be3b6ffba..1cbf062f948 100644
--- a/python/pyarrow/compute.py
+++ b/python/pyarrow/compute.py
@@ -324,8 +324,8 @@ def cast(arr, target_type, safe=True):
 
     Parameters
     ----------
-    arr : Array or ChunkedArray
-    target_type : DataType or type string alias
+    arr : Array-like
+    target_type : DataType or str
         Type to cast to
     safe : bool, default True
         Check for overflows or other unsafe conversions
@@ -381,8 +381,8 @@ def index(data, value, start=None, end=None, *, memory_pool=None):
 
     Parameters
     ----------
-    data : Array or ChunkedArray
-    value : Scalar
+    data : Array-like
+    value : Scalar-like object
         The value to search for.
     start : int, optional
     end : int, optional
@@ -391,7 +391,8 @@
 
     Returns
     -------
-    index : the index, or -1 if not found
+    index : int
+        the index, or -1 if not found
     """
     if start is not None:
         if end is not None:
diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py
index 9a9f55f37e2..1d4bac8c1e5 100644
--- a/python/pyarrow/dataset.py
+++ b/python/pyarrow/dataset.py
@@ -130,7 +130,7 @@ def partitioning(schema=None, field_names=None, flavor=None,
     flavor : str, default None
         The default is DirectoryPartitioning. Specify ``flavor="hive"`` for
         a HivePartitioning.
-    dictionaries : Dict[str, Array]
+    dictionaries : dict[str, Array]
         If the type of any field of `schema` is a dictionary type, the
         corresponding entry of `dictionaries` must be an array containing
         every value which may be taken by the corresponding column or an
@@ -528,8 +528,8 @@ def dataset(source, schema=None, format=None, filesystem=None,
 
     Parameters
     ----------
-    source : path, list of paths, dataset, list of datasets, (list of) batches\
-or tables, iterable of batches, RecordBatchReader, or URI
+    source : path, list of paths, dataset, list of datasets, (list of) \
+RecordBatch or Table, iterable of RecordBatch, RecordBatchReader, or URI
         Path pointing to a single file:
             Open a FileSystemDataset from a single file.
         Path pointing to a directory:
@@ -731,8 +731,8 @@ def write_dataset(data, base_dir, basename_template=None, format=None,
 
     Parameters
     ----------
-    data : Dataset, Table/RecordBatch, RecordBatchReader, list of
-        Table/RecordBatch, or iterable of RecordBatch
+    data : Dataset, Table/RecordBatch, RecordBatchReader, list of \
+Table/RecordBatch, or iterable of RecordBatch
         The data to write. This can be a Dataset instance or
         in-memory Arrow data. If an iterable is given, the schema must
         also be given.
@@ -760,7 +760,7 @@ def write_dataset(data, base_dir, basename_template=None, format=None,
         default of ``partitioning()`` which is directory partitioning.
     schema : Schema, optional
     filesystem : FileSystem, optional
-    file_options : FileWriteOptions, optional
+    file_options : pyarrow.dataset.FileWriteOptions, optional
         FileFormat specific write options, created using
         the ``FileFormat.make_write_options()`` function.
     use_threads : bool, default True
@@ -789,7 +789,7 @@ def write_dataset(data, base_dir, basename_template=None, format=None,
         multiple row groups. If this value is set, then min_rows_per_group
         should also be set. Otherwise it could end up with very small row
         groups.
-    file_visitor : Function
+    file_visitor : function
         If set, this function will be called with a WrittenFile instance
         for each file created during the call. This object will have both
         a path attribute and a metadata attribute.
diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py
index ae611557ac0..5299a838fc4 100644
--- a/python/pyarrow/fs.py
+++ b/python/pyarrow/fs.py
@@ -259,7 +259,7 @@ class FSSpecHandler(FileSystemHandler):
 
     Parameters
     ----------
-    fs : The FSSpec-compliant filesystem instance.
+    fs : FSSpec-compliant filesystem instance.
 
     Examples
     --------
diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi
index f6c2b42193b..b3262713905 100644
--- a/python/pyarrow/io.pxi
+++ b/python/pyarrow/io.pxi
@@ -285,7 +285,8 @@ cdef class NativeFile(_Weakrefable):
 
         Returns
         -------
-        new_position : the new absolute stream position
+        int
+            The new absolute stream position.
""" cdef int64_t offset handle = self.get_random_access_file() @@ -331,7 +332,8 @@ cdef class NativeFile(_Weakrefable): Returns ------- - nbytes : number of bytes written + int + nbytes: number of bytes written """ self._assert_writable() handle = self.get_output_stream() @@ -447,7 +449,8 @@ cdef class NativeFile(_Weakrefable): Returns -------- - number of bytes written + int + number of bytes written """ cdef: @@ -1072,7 +1075,8 @@ cdef class Buffer(_Weakrefable): Returns ------- - are_equal : True if buffer contents and size are equal + are_equal : bool + True if buffer contents and size are equal """ cdef c_bool result = False with nogil: @@ -1266,7 +1270,7 @@ cdef class CompressedInputStream(NativeFile): Parameters ---------- - stream : string, path, pa.NativeFile, or file-like object + stream : string, path, pyarrow.NativeFile, or file-like object Input stream object to wrap with the compression. compression : str The compression type ("bz2", "brotli", "gzip", "lz4" or "zstd"). @@ -1293,7 +1297,7 @@ cdef class CompressedOutputStream(NativeFile): Parameters ---------- - stream : string, path, pa.NativeFile, or file-like object + stream : string, path, pyarrow.NativeFile, or file-like object Input stream object to wrap with the compression. compression : str The compression type ("bz2", "brotli", "gzip", "lz4" or "zstd"). @@ -1933,7 +1937,7 @@ cdef class Codec(_Weakrefable): Parameters ---------- buf : pyarrow.Buffer, bytes, or memoryview-compatible object - decompressed_size : int64_t, default None + decompressed_size : int, default None If not specified, will be computed if the codec is able to determine the uncompressed buffer size. asbytes : boolean, default False @@ -2014,7 +2018,7 @@ def decompress(object buf, decompressed_size=None, codec='lz4', ---------- buf : pyarrow.Buffer, bytes, or memoryview-compatible object Input object to decompress data from. - decompressed_size : int64_t, default None + decompressed_size : int, default None If not specified, will be computed if the codec is able to determine the uncompressed buffer size. codec : str, default 'lz4' diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index f890dba526b..a6c2b3adf41 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -332,7 +332,8 @@ cdef class MessageReader(_Weakrefable): Parameters ---------- - source : a readable source, like an InputStream + source + A readable source, like an InputStream """ cdef: MessageReader result = MessageReader.__new__(MessageReader) @@ -356,7 +357,8 @@ cdef class MessageReader(_Weakrefable): Raises ------ - StopIteration : at end of stream + StopIteration + At end of stream """ cdef Message result = Message.__new__(Message) @@ -515,7 +517,8 @@ class _ReadPandasMixin: Parameters ---------- - **options : arguments to forward to Table.to_pandas + **options + Arguments to forward to Table.to_pandas. Returns ------- diff --git a/python/pyarrow/orc.py b/python/pyarrow/orc.py index 2243a0a2178..335fc241c0d 100644 --- a/python/pyarrow/orc.py +++ b/python/pyarrow/orc.py @@ -29,7 +29,7 @@ class ORCFile: Parameters ---------- - source : str or pyarrow.io.NativeFile + source : str or pyarrow.NativeFile Readable source. For passing Python file objects or byte buffers, see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader. """ @@ -92,7 +92,7 @@ def read_stripe(self, n, columns=None): Returns ------- - pyarrow.lib.RecordBatch + pyarrow.RecordBatch Content of the stripe as a RecordBatch. 
""" columns = self._select_names(columns) @@ -110,7 +110,7 @@ def read(self, columns=None): Returns ------- - pyarrow.lib.Table + pyarrow.Table Content of the file as a Table. """ columns = self._select_names(columns) @@ -123,7 +123,7 @@ class ORCWriter: Parameters ---------- - where : str or pyarrow.io.NativeFile + where : str or pyarrow.NativeFile Writable target. For passing Python file objects or byte buffers, see pyarrow.io.PythonFileInterface, pyarrow.io.BufferOutputStream or pyarrow.io.FixedSizeBufferWriter. @@ -140,7 +140,7 @@ def write(self, table): Parameters ---------- - table : pyarrow.lib.Table + table : pyarrow.Table The table to be written into the ORC file """ self.writer.write(table) @@ -158,9 +158,9 @@ def write_table(table, where): Parameters ---------- - table : pyarrow.lib.Table + table : pyarrow.Table The table to be written into the ORC file - where : str or pyarrow.io.NativeFile + where : str or pyarrow.NativeFile Writable target. For passing Python file objects or byte buffers, see pyarrow.io.PythonFileInterface, pyarrow.io.BufferOutputStream or pyarrow.io.FixedSizeBufferWriter. diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 3a46f433c54..4a192a31778 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -605,7 +605,7 @@ class ParquetWriter: Parameters ---------- where : path or file-like object -schema : arrow Schema +schema : pyarrow.Schema {} writer_engine_version : unused **options : dict @@ -1039,7 +1039,8 @@ def get_index(self, level, name, key): Record a partition value at a particular level, returning the distinct code for that value at that level. - Example: + Examples + -------- partitions.get_index(1, 'foo', 'a') returns 0 partitions.get_index(1, 'foo', 'b') returns 1 @@ -1281,7 +1282,8 @@ def _open_dataset_file(dataset, path, meta=None): buffer_size : int, default 0 If positive, perform read buffering when deserializing individual column chunks. Otherwise IO calls are unbuffered. -partitioning : Partitioning or str or list of str, default "hive" +partitioning : pyarrow.dataset.Partitioning or str or list of str, \ +default "hive" The partitioning scheme for a partitioned dataset. The default of "hive" assumes directory names with key=value pairs like "/year=2009/month=11". In addition, a scheme like "/2009/11" is also supported, in which case @@ -2044,7 +2046,8 @@ def read_pandas(source, columns=None, **kwargs): 'Read a Table from Parquet format, also reading DataFrame\n' 'index values if known in the file metadata', "\n".join((_read_docstring_common, - """**kwargs : additional options for :func:`read_table`""")), + """**kwargs + additional options for :func:`read_table`""")), """pyarrow.Table Content of the file as a Table of Columns, including DataFrame indexes as columns""", @@ -2328,7 +2331,7 @@ def read_metadata(where, memory_map=False): Parameters ---------- - where : str (filepath) or file-like object + where : str (file path) or file-like object memory_map : bool, default False Create memory map when the source is a file path. @@ -2345,7 +2348,7 @@ def read_schema(where, memory_map=False): Parameters ---------- - where : str (filepath) or file-like object + where : str (file path) or file-like object memory_map : bool, default False Create memory map when the source is a file path. 
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 37ea5ace22d..00f14deb054 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -327,7 +327,7 @@ cdef class ChunkedArray(_PandasConvertible):
 
         Returns
         -------
-        result : List[ChunkedArray]
+        result : list of ChunkedArray
         """
         cdef:
             vector[shared_ptr[CChunkedArray]] flattened
@@ -513,7 +513,7 @@ def chunked_array(arrays, type=None):
 
     Parameters
     ----------
-    arrays : Array, list of Array, or values coercible to arrays
+    arrays : Array, list of Array, or array-like
         Must all be the same data type. Can be empty only if
         type also passed.
     type : DataType or string coercible to DataType
@@ -859,7 +859,7 @@ cdef class RecordBatch(_PandasConvertible):
 
         Returns
         -------
-        list of pa.Array
+        list of pyarrow.Array
         """
         return [self.column(i) for i in range(self.num_columns)]
 
@@ -1105,9 +1105,10 @@ cdef class RecordBatch(_PandasConvertible):
             ``RecordBatch``. The default of None will store the index as a
            column, except for RangeIndex which is stored as metadata only. Use
            ``preserve_index=True`` to force it to be stored as a column.
-        nthreads : int, default None (may use up to system CPU count threads)
+        nthreads : int, default None
             If greater than 1, convert columns to Arrow in parallel using
-            indicated number of threads
+            indicated number of threads. By default, this follows
+            :func:`pyarrow.cpu_count` (may use up to system CPU count threads).
         columns : list, optional
            List of column to be converted. If None, use all columns.
@@ -1304,8 +1305,8 @@ cdef class Table(_PandasConvertible):
     """
     A collection of top-level named, equal length Arrow arrays.
 
-    Warning
-    -------
+    Warnings
+    --------
     Do not call this class's constructor directly, use one of the
     ``from_*`` methods instead.
     """
@@ -1692,9 +1693,10 @@ cdef class Table(_PandasConvertible):
             ``Table``. The default of None will store the index as a
            column, except for RangeIndex which is stored as metadata only. Use
            ``preserve_index=True`` to force it to be stored as a column.
-        nthreads : int, default None (may use up to system CPU count threads)
+        nthreads : int, default None
             If greater than 1, convert columns to Arrow in parallel using
-            indicated number of threads.
+            indicated number of threads. By default, this follows
+            :func:`pyarrow.cpu_count` (may use up to system CPU count threads).
         columns : list, optional
            List of column to be converted. If None, use all columns.
         safe : bool, default True
@@ -1896,7 +1898,7 @@ cdef class Table(_PandasConvertible):
 
         Returns
         -------
-        list of RecordBatch
+        list[RecordBatch]
         """
         cdef:
             unique_ptr[TableBatchReader] reader
@@ -2460,9 +2462,11 @@ def table(data, names=None, schema=None, metadata=None, nthreads=None):
         specified in the schema, when data is a dict or DataFrame).
     metadata : dict or Mapping, default None
         Optional metadata for the schema (if schema not passed).
-    nthreads : int, default None (may use up to system CPU count threads)
+    nthreads : int, default None
         For pandas.DataFrame inputs: if greater than 1, convert columns to
-        Arrow in parallel using indicated number of threads.
+        Arrow in parallel using indicated number of threads. By default,
+        this follows :func:`pyarrow.cpu_count` (may use up to system CPU count
+        threads).
 
     Returns
     -------
@@ -2664,8 +2668,8 @@ list[tuple(str, str, FunctionOptions)]
 
         Table
             Results of the aggregation functions.
 
-        Example
-        -------
+        Examples
+        --------
         >>> t = pa.table([
         ...     pa.array(["a", "a", "b", "b", "c"]),
         ...     pa.array([1, 2, 3, 4, 5]),