From 31e0cb300b9f31c02386a5bdc41fc84826b266e0 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 18 Nov 2016 21:35:20 +0100 Subject: [PATCH] ARROW-382: Extend Python API documentation * Fix numpydoc compilation * Add simple examples to the API * Move away from deprecated Cython-property declaration * Add basic descriptions with return types to functions Change-Id: I5627601a5a338d5134789de608280d50c8dd2f4c --- python/doc/conf.py | 5 +- python/pyarrow/array.pyx | 53 ++++++- python/pyarrow/compat.py | 2 + python/pyarrow/table.pyx | 333 ++++++++++++++++++++++++++++++--------- 4 files changed, 318 insertions(+), 75 deletions(-) diff --git a/python/doc/conf.py b/python/doc/conf.py index 99ac3512ec9..4c324a8086c 100644 --- a/python/doc/conf.py +++ b/python/doc/conf.py @@ -59,9 +59,12 @@ 'sphinx.ext.doctest', 'sphinx.ext.mathjax', 'sphinx.ext.viewcode', - 'numpydoc' + 'sphinx.ext.napoleon' ] +# numpydoc configuration +napoleon_use_rtype = False + # Add any paths that contain templates here, relative to this directory. 
templates_path = ['_templates'] diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx index fbe4e387906..6c862751fc2 100644 --- a/python/pyarrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -54,6 +54,41 @@ cdef class Array: @staticmethod def from_pandas(obj, mask=None): + """ + Create an array from a pandas.Series + + Parameters + ---------- + obj : pandas.Series or numpy.ndarray + vector holding the data + mask : numpy.ndarray, optional + boolean mask if the object is valid or null + + Returns + ------- + pyarrow.Array + + Examples + -------- + + >>> import pandas as pd + >>> import pyarrow as pa + >>> pa.Array.from_pandas(pd.Series([1, 2])) + + [ + 1, + 2 + ] + + + >>> import numpy as np + >>> pa.Array.from_pandas(pd.Series([1, 2]), np.array([0, 1], dtype=bool)) + + [ + 1, + NA + ] + """ return from_pandas_series(obj, mask) property null_count: @@ -228,6 +263,14 @@ cdef object box_arrow_array(const shared_ptr[CArray]& sp_array): def from_pylist(object list_obj, DataType type=None): """ Convert Python list to Arrow array + + Parameters + ---------- + list_obj : array_like + + Returns + ------- + pyarrow.array.Array """ cdef: shared_ptr[CArray] sp_array @@ -246,15 +289,19 @@ def from_pandas_series(object series, object mask=None, timestamps_to_ms=False): Parameters ---------- - series: pandas.Series or numpy.ndarray + series : pandas.Series or numpy.ndarray - mask: pandas.Series or numpy.ndarray + mask : pandas.Series or numpy.ndarray, optional array to mask null entries in the series - timestamps_to_ms: bool + timestamps_to_ms : bool, optional Convert datetime columns to ms resolution. This is needed for compability with other functionality like Parquet I/O which only supports milliseconds. 
+ + Returns + ------- + pyarrow.array.Array """ cdef: shared_ptr[CArray] out diff --git a/python/pyarrow/compat.py b/python/pyarrow/compat.py index 08f0f237967..2dfdb5041d1 100644 --- a/python/pyarrow/compat.py +++ b/python/pyarrow/compat.py @@ -90,3 +90,5 @@ def frombytes(o): integer_types = six.integer_types + (np.integer,) + +__all__ = [] diff --git a/python/pyarrow/table.pyx b/python/pyarrow/table.pyx index 5459f26b80a..a6715b141ce 100644 --- a/python/pyarrow/table.pyx +++ b/python/pyarrow/table.pyx @@ -36,9 +36,13 @@ from pyarrow.compat import frombytes, tobytes cimport cpython cdef class ChunkedArray: - ''' + """ + Array backed via one or more memory chunks. + + Warning + ------- Do not call this class's constructor directly. - ''' + """ def __cinit__(self): self.chunked_array = NULL @@ -59,19 +63,42 @@ cdef class ChunkedArray: def __len__(self): return self.length() - property null_count: + @property + def null_count(self): + """ + Number of null entries - def __get__(self): - self._check_nullptr() - return self.chunked_array.null_count() + Returns + ------- + int + """ + self._check_nullptr() + return self.chunked_array.null_count() - property num_chunks: + @property + def num_chunks(self): + """ + Number of underlying chunks - def __get__(self): - self._check_nullptr() - return self.chunked_array.num_chunks() + Returns + ------- + int + """ + self._check_nullptr() + return self.chunked_array.num_chunks() def chunk(self, i): + """ + Select a chunk by its index + + Parameters + ---------- + i : int + + Returns + ------- + pyarrow.array.Array + """ self._check_nullptr() return box_arrow_array(self.chunked_array.chunk(i)) @@ -82,9 +109,13 @@ cdef class ChunkedArray: cdef class Column: - ''' + """ + Named vector of elements of equal type. + + Warning + ------- Do not call this class's constructor directly. 
- ''' + """ def __cinit__(self): self.column = NULL @@ -95,7 +126,11 @@ cdef class Column: def to_pandas(self): """ - Convert the arrow::Column to a pandas Series + Convert the arrow::Column to a pandas.Series + + Returns + ------- + pandas.Series """ cdef: PyObject* arr @@ -120,34 +155,64 @@ cdef class Column: self._check_nullptr() return self.column.length() - property shape: + @property + def shape(self): + """ + Dimensions of this column - def __get__(self): - self._check_nullptr() - return (self.length(),) + Returns + ------- + (int,) + """ + self._check_nullptr() + return (self.length(),) - property null_count: + @property + def null_count(self): + """ + Number of null entries - def __get__(self): - self._check_nullptr() - return self.column.null_count() + Returns + ------- + int + """ + self._check_nullptr() + return self.column.null_count() - property name: + @property + def name(self): + """ + Label of the column - def __get__(self): - return frombytes(self.column.name()) + Returns + ------- + str + """ + return frombytes(self.column.name()) - property type: + @property + def type(self): + """ + Type information for this column - def __get__(self): - return box_data_type(self.column.type()) + Returns + ------- + pyarrow.schema.DataType + """ + return box_data_type(self.column.type()) - property data: + @property + def data(self): + """ + The underlying data - def __get__(self): - cdef ChunkedArray chunked_array = ChunkedArray() - chunked_array.init(self.column.data()) - return chunked_array + Returns + ------- + pyarrow.table.ChunkedArray + """ + cdef ChunkedArray chunked_array = ChunkedArray() + chunked_array.init(self.column.data()) + return chunked_array cdef _schema_from_arrays(arrays, names, shared_ptr[CSchema]* schema): @@ -186,6 +251,13 @@ cdef _dataframe_to_arrays(df, name, timestamps_to_ms): cdef class RecordBatch: + """ + Batch of rows of columns of equal length + + Warning + ------- + Do not call this class's constructor directly, use one of 
the ``from_*`` methods instead. + """ def __cinit__(self): self.batch = NULL @@ -203,28 +275,48 @@ cdef class RecordBatch: self._check_nullptr() return self.batch.num_rows() - property num_columns: + @property + def num_columns(self): + """ + Number of columns - def __get__(self): - self._check_nullptr() - return self.batch.num_columns() + Returns + ------- + int + """ + self._check_nullptr() + return self.batch.num_columns() - property num_rows: + @property + def num_rows(self): + """ + Number of rows - def __get__(self): - return len(self) + Due to the definition of a RecordBatch, all columns have the same number of rows. - property schema: + Returns + ------- + int + """ + return len(self) - def __get__(self): - cdef Schema schema - self._check_nullptr() - if self._schema is None: - schema = Schema() - schema.init_schema(self.batch.schema()) - self._schema = schema + @property + def schema(self): + """ + Schema of the RecordBatch and its columns - return self._schema + Returns + ------- + pyarrow.schema.Schema + """ + cdef Schema schema + self._check_nullptr() + if self._schema is None: + schema = Schema() + schema.init_schema(self.batch.schema()) + self._schema = schema + + return self._schema def __getitem__(self, i): cdef Array arr = Array() @@ -240,6 +332,10 @@ cdef class RecordBatch: def to_pandas(self): """ Convert the arrow::RecordBatch to a pandas DataFrame + + Returns + ------- + pandas.DataFrame """ cdef: PyObject* np_arr @@ -263,12 +359,34 @@ cdef class RecordBatch: def from_pandas(cls, df): """ Convert pandas.DataFrame to an Arrow RecordBatch + + Parameters + ---------- + df: pandas.DataFrame + + Returns + ------- + pyarrow.table.RecordBatch """ names, arrays = _dataframe_to_arrays(df, None, False) return cls.from_arrays(names, arrays) @staticmethod def from_arrays(names, arrays): + """ + Construct a RecordBatch from multiple pyarrow.Arrays + + Parameters + ---------- + names: list of str + Labels for the columns + arrays: list of pyarrow.Array + 
column-wise data vectors + + Returns + ------- + pyarrow.table.RecordBatch + """ cdef: Array arr RecordBatch result @@ -297,11 +415,13 @@ cdef class RecordBatch: cdef class Table: - ''' + """ A collection of top-level named, equal length Arrow arrays. - Do not call this class's constructor directly. - ''' + Warning + ------- + Do not call this class's constructor directly, use one of the ``from_*`` methods instead. + """ def __cinit__(self): self.table = NULL @@ -330,6 +450,22 @@ cdef class Table: Convert datetime columns to ms resolution. This is needed for compability with other functionality like Parquet I/O which only supports milliseconds. + + Returns + ------- + pyarrow.table.Table + + Examples + -------- + + >>> import pandas as pd + >>> import pyarrow as pa + >>> df = pd.DataFrame({ + ... 'int': [1, 2], + ... 'str': ['a', 'b'] + ... }) + >>> pa.table.from_pandas_dataframe(df) + """ names, arrays = _dataframe_to_arrays(df, name=name, timestamps_to_ms=timestamps_to_ms) @@ -347,8 +483,13 @@ cdef class Table: Names for the table columns arrays: list of pyarrow.array.Array Equal-length arrays that should form the table. 
- name: str - (optional) name for the Table + name: str, optional + name for the Table + + Returns + ------- + pyarrow.table.Table + """ cdef: Array arr @@ -382,6 +523,10 @@ cdef class Table: def to_pandas(self): """ Convert the arrow::Table to a pandas DataFrame + + Returns + ------- + pandas.DataFrame """ cdef: PyObject* arr @@ -402,18 +547,41 @@ cdef class Table: return pd.DataFrame(dict(zip(names, data)), columns=names) - property name: + @property + def name(self): + """ + Label of the table - def __get__(self): - self._check_nullptr() - return frombytes(self.table.name()) + Returns + ------- + str + """ + self._check_nullptr() + return frombytes(self.table.name()) - property schema: + @property + def schema(self): + """ + Schema of the table and its columns - def __get__(self): - raise box_schema(self.table.schema()) + Returns + ------- + pyarrow.schema.Schema + """ + return box_schema(self.table.schema()) def column(self, index): + """ + Select a column by its numeric index. + + Parameters + ---------- + index: int + + Returns + ------- + pyarrow.table.Column + """ self._check_nullptr() cdef Column column = Column() column.init(self.table.column(index)) @@ -423,28 +591,51 @@ cdef class Table: return self.column(i) def itercolumns(self): + """ + Iterator over all columns in their numerical order + """ for i in range(self.num_columns): yield self.column(i) - property num_columns: + @property + def num_columns(self): + """ + Number of columns in this table - def __get__(self): - self._check_nullptr() - return self.table.num_columns() + Returns + ------- + int + """ + self._check_nullptr() + return self.table.num_columns() - property num_rows: + @property + def num_rows(self): + """ + Number of rows in this table. - def __get__(self): - self._check_nullptr() - return self.table.num_rows() + Due to the definition of a table, all columns have the same number of rows. 
+ + Returns + ------- + int + """ + self._check_nullptr() + return self.table.num_rows() def __len__(self): return self.num_rows - property shape: + @property + def shape(self): + """ + Dimensions of the table: (#rows, #columns) - def __get__(self): - return (self.num_rows, self.num_columns) + Returns + ------- + (int, int) + """ + return (self.num_rows, self.num_columns)