From 31e0cb300b9f31c02386a5bdc41fc84826b266e0 Mon Sep 17 00:00:00 2001
From: "Uwe L. Korn" <uwelk@xhochy.com>
Date: Fri, 18 Nov 2016 21:35:20 +0100
Subject: [PATCH] ARROW-382: Extend Python API documentation

 * Fix numpydoc compilation
 * Add simple examples to the API
 * Move away from deprecated Cython-property declaration
 * Add basic descriptions with return types to functions

Change-Id: I5627601a5a338d5134789de608280d50c8dd2f4c
---
 python/doc/conf.py       |   5 +-
 python/pyarrow/array.pyx |  53 ++++++-
 python/pyarrow/compat.py |   2 +
 python/pyarrow/table.pyx | 333 ++++++++++++++++++++++++++++++---------
 4 files changed, 318 insertions(+), 75 deletions(-)

diff --git a/python/doc/conf.py b/python/doc/conf.py
index 99ac3512ec9..4c324a8086c 100644
--- a/python/doc/conf.py
+++ b/python/doc/conf.py
@@ -59,9 +59,12 @@
     'sphinx.ext.doctest',
     'sphinx.ext.mathjax',
     'sphinx.ext.viewcode',
-    'numpydoc'
+    'sphinx.ext.napoleon'
 ]
 
+# napoleon configuration (parses numpydoc-style docstrings)
+napoleon_use_rtype = False
+
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
 
diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx
index fbe4e387906..6c862751fc2 100644
--- a/python/pyarrow/array.pyx
+++ b/python/pyarrow/array.pyx
@@ -54,6 +54,41 @@ cdef class Array:
 
     @staticmethod
     def from_pandas(obj, mask=None):
+        """
+        Create an array from a pandas.Series
+
+        Parameters
+        ----------
+        obj : pandas.Series or numpy.ndarray
+            vector holding the data
+        mask : numpy.ndarray, optional
+            boolean mask, True marks the entry at that position as null
+
+        Returns
+        -------
+        pyarrow.Array
+
+        Examples
+        --------
+
+        >>> import pandas as pd
+        >>> import pyarrow as pa
+        >>> pa.Array.from_pandas(pd.Series([1, 2]))
+        <pyarrow.array.Int64Array object at 0x7f674e4c0e10>
+        [
+          1,
+          2
+        ]
+
+
+        >>> import numpy as np
+        >>> pa.Array.from_pandas(pd.Series([1, 2]), np.array([0, 1], dtype=bool))
+        <pyarrow.array.Int64Array object at 0x7f9019e11208>
+        [
+          1,
+          NA
+        ]
+        """
         return from_pandas_series(obj, mask)
 
     property null_count:
@@ -228,6 +263,14 @@ cdef object box_arrow_array(const shared_ptr[CArray]& sp_array):
 def from_pylist(object list_obj, DataType type=None):
     """
     Convert Python list to Arrow array
+
+    Parameters
+    ----------
+    list_obj : array_like
+
+    Returns
+    -------
+    pyarrow.array.Array
     """
     cdef:
         shared_ptr[CArray] sp_array
@@ -246,15 +289,19 @@ def from_pandas_series(object series, object mask=None, timestamps_to_ms=False):
 
     Parameters
     ----------
-    series: pandas.Series or numpy.ndarray
+    series : pandas.Series or numpy.ndarray
 
-    mask: pandas.Series or numpy.ndarray
+    mask : pandas.Series or numpy.ndarray, optional
         array to mask null entries in the series
 
-    timestamps_to_ms: bool
+    timestamps_to_ms : bool, optional
         Convert datetime columns to ms resolution. This is needed for
         compability with other functionality like Parquet I/O which
         only supports milliseconds.
+
+    Returns
+    -------
+    pyarrow.array.Array
     """
     cdef:
         shared_ptr[CArray] out
diff --git a/python/pyarrow/compat.py b/python/pyarrow/compat.py
index 08f0f237967..2dfdb5041d1 100644
--- a/python/pyarrow/compat.py
+++ b/python/pyarrow/compat.py
@@ -90,3 +90,5 @@ def frombytes(o):
 
 
 integer_types = six.integer_types + (np.integer,)
+
+__all__ = []
diff --git a/python/pyarrow/table.pyx b/python/pyarrow/table.pyx
index 5459f26b80a..a6715b141ce 100644
--- a/python/pyarrow/table.pyx
+++ b/python/pyarrow/table.pyx
@@ -36,9 +36,13 @@ from pyarrow.compat import frombytes, tobytes
 cimport cpython
 
 cdef class ChunkedArray:
-    '''
+    """
+    Array backed by one or more memory chunks.
+
+    Warning
+    -------
     Do not call this class's constructor directly.
-    '''
+    """
 
     def __cinit__(self):
         self.chunked_array = NULL
@@ -59,19 +63,42 @@ cdef class ChunkedArray:
     def __len__(self):
         return self.length()
 
-    property null_count:
+    @property
+    def null_count(self):
+        """
+        Number of null entries
 
-        def __get__(self):
-            self._check_nullptr()
-            return self.chunked_array.null_count()
+        Returns
+        -------
+        int
+        """
+        self._check_nullptr()
+        return self.chunked_array.null_count()
 
-    property num_chunks:
+    @property
+    def num_chunks(self):
+        """
+        Number of underlying chunks
 
-        def __get__(self):
-            self._check_nullptr()
-            return self.chunked_array.num_chunks()
+        Returns
+        -------
+        int
+        """
+        self._check_nullptr()
+        return self.chunked_array.num_chunks()
 
     def chunk(self, i):
+        """
+        Select a chunk by its index
+
+        Parameters
+        ----------
+        i : int
+
+        Returns
+        -------
+        pyarrow.array.Array
+        """
         self._check_nullptr()
         return box_arrow_array(self.chunked_array.chunk(i))
 
@@ -82,9 +109,13 @@ cdef class ChunkedArray:
 
 
 cdef class Column:
-    '''
+    """
+    Named vector of elements of equal type.
+
+    Warning
+    -------
     Do not call this class's constructor directly.
-    '''
+    """
 
     def __cinit__(self):
         self.column = NULL
@@ -95,7 +126,11 @@ cdef class Column:
 
     def to_pandas(self):
         """
-        Convert the arrow::Column to a pandas Series
+        Convert the arrow::Column to a pandas.Series
+
+        Returns
+        -------
+        pandas.Series
         """
         cdef:
             PyObject* arr
@@ -120,34 +155,64 @@ cdef class Column:
         self._check_nullptr()
         return self.column.length()
 
-    property shape:
+    @property
+    def shape(self):
+        """
+        Dimensions of this column
 
-        def __get__(self):
-            self._check_nullptr()
-            return (self.length(),)
+        Returns
+        -------
+        (int,)
+        """
+        self._check_nullptr()
+        return (self.length(),)
 
-    property null_count:
+    @property
+    def null_count(self):
+        """
+        Number of null entries
 
-        def __get__(self):
-            self._check_nullptr()
-            return self.column.null_count()
+        Returns
+        -------
+        int
+        """
+        self._check_nullptr()
+        return self.column.null_count()
 
-    property name:
+    @property
+    def name(self):
+        """
+        Label of the column
 
-        def __get__(self):
-            return frombytes(self.column.name())
+        Returns
+        -------
+        str
+        """
+        return frombytes(self.column.name())
 
-    property type:
+    @property
+    def type(self):
+        """
+        Type information for this column
 
-        def __get__(self):
-            return box_data_type(self.column.type())
+        Returns
+        -------
+        pyarrow.schema.DataType
+        """
+        return box_data_type(self.column.type())
 
-    property data:
+    @property
+    def data(self):
+        """
+        The underlying data
 
-        def __get__(self):
-            cdef ChunkedArray chunked_array = ChunkedArray()
-            chunked_array.init(self.column.data())
-            return chunked_array
+        Returns
+        -------
+        pyarrow.table.ChunkedArray
+        """
+        cdef ChunkedArray chunked_array = ChunkedArray()
+        chunked_array.init(self.column.data())
+        return chunked_array
 
 
 cdef _schema_from_arrays(arrays, names, shared_ptr[CSchema]* schema):
@@ -186,6 +251,13 @@ cdef _dataframe_to_arrays(df, name, timestamps_to_ms):
 
 
 cdef class RecordBatch:
+    """
+    Batch of rows of columns of equal length
+
+    Warning
+    -------
+    Do not call this class's constructor directly, use one of the ``from_*`` methods instead.
+    """
 
     def __cinit__(self):
         self.batch = NULL
@@ -203,28 +275,48 @@ cdef class RecordBatch:
         self._check_nullptr()
         return self.batch.num_rows()
 
-    property num_columns:
+    @property
+    def num_columns(self):
+        """
+        Number of columns
 
-        def __get__(self):
-            self._check_nullptr()
-            return self.batch.num_columns()
+        Returns
+        -------
+        int
+        """
+        self._check_nullptr()
+        return self.batch.num_columns()
 
-    property num_rows:
+    @property
+    def num_rows(self):
+        """
+        Number of rows
 
-        def __get__(self):
-            return len(self)
+        Due to the definition of a RecordBatch, all columns have the same number of rows.
 
-    property schema:
+        Returns
+        -------
+        int
+        """
+        return len(self)
 
-        def __get__(self):
-            cdef Schema schema
-            self._check_nullptr()
-            if self._schema is None:
-                schema = Schema()
-                schema.init_schema(self.batch.schema())
-                self._schema = schema
+    @property
+    def schema(self):
+        """
+        Schema of the RecordBatch and its columns
 
-            return self._schema
+        Returns
+        -------
+        pyarrow.schema.Schema
+        """
+        cdef Schema schema
+        self._check_nullptr()
+        if self._schema is None:
+            schema = Schema()
+            schema.init_schema(self.batch.schema())
+            self._schema = schema
+
+        return self._schema
 
     def __getitem__(self, i):
         cdef Array arr = Array()
@@ -240,6 +332,10 @@ cdef class RecordBatch:
     def to_pandas(self):
         """
         Convert the arrow::RecordBatch to a pandas DataFrame
+
+        Returns
+        -------
+        pandas.DataFrame
         """
         cdef:
             PyObject* np_arr
@@ -263,12 +359,34 @@ cdef class RecordBatch:
     def from_pandas(cls, df):
         """
         Convert pandas.DataFrame to an Arrow RecordBatch
+
+        Parameters
+        ----------
+        df : pandas.DataFrame
+
+        Returns
+        -------
+        pyarrow.table.RecordBatch
         """
         names, arrays = _dataframe_to_arrays(df, None, False)
         return cls.from_arrays(names, arrays)
 
     @staticmethod
     def from_arrays(names, arrays):
+        """
+        Construct a RecordBatch from multiple pyarrow.Arrays
+
+        Parameters
+        ----------
+        names : list of str
+            Labels for the columns
+        arrays : list of pyarrow.Array
+            column-wise data vectors
+
+        Returns
+        -------
+        pyarrow.table.RecordBatch
+        """
         cdef:
             Array arr
             RecordBatch result
@@ -297,11 +415,13 @@ cdef class RecordBatch:
 
 
 cdef class Table:
-    '''
+    """
     A collection of top-level named, equal length Arrow arrays.
 
-    Do not call this class's constructor directly.
-    '''
+    Warning
+    -------
+    Do not call this class's constructor directly, use one of the ``from_*`` methods instead.
+    """
 
     def __cinit__(self):
         self.table = NULL
@@ -330,6 +450,22 @@ cdef class Table:
             Convert datetime columns to ms resolution. This is needed for
             compability with other functionality like Parquet I/O which
             only supports milliseconds.
+
+        Returns
+        -------
+        pyarrow.table.Table
+
+        Examples
+        --------
+
+        >>> import pandas as pd
+        >>> import pyarrow as pa
+        >>> df = pd.DataFrame({
+        ...     'int': [1, 2],
+        ...     'str': ['a', 'b']
+        ... })
+        >>> pa.table.from_pandas_dataframe(df)
+        <pyarrow.table.Table object at 0x7f05d1fb1b40>
         """
         names, arrays = _dataframe_to_arrays(df, name=name,
                                              timestamps_to_ms=timestamps_to_ms)
@@ -347,8 +483,13 @@ cdef class Table:
             Names for the table columns
         arrays: list of pyarrow.array.Array
             Equal-length arrays that should form the table.
-        name: str
-            (optional) name for the Table
+        name: str, optional
+            name for the Table
+
+        Returns
+        -------
+        pyarrow.table.Table
+
         """
         cdef:
             Array arr
@@ -382,6 +523,10 @@ cdef class Table:
     def to_pandas(self):
         """
         Convert the arrow::Table to a pandas DataFrame
+
+        Returns
+        -------
+        pandas.DataFrame
         """
         cdef:
             PyObject* arr
@@ -402,18 +547,41 @@ cdef class Table:
 
         return pd.DataFrame(dict(zip(names, data)), columns=names)
 
-    property name:
+    @property
+    def name(self):
+        """
+        Label of the table
 
-        def __get__(self):
-            self._check_nullptr()
-            return frombytes(self.table.name())
+        Returns
+        -------
+        str
+        """
+        self._check_nullptr()
+        return frombytes(self.table.name())
 
-    property schema:
+    @property
+    def schema(self):
+        """
+        Schema of the table and its columns
 
-        def __get__(self):
-            raise box_schema(self.table.schema())
+        Returns
+        -------
+        pyarrow.schema.Schema
+        """
+        return box_schema(self.table.schema())
 
     def column(self, index):
+        """
+        Select a column by its numeric index.
+
+        Parameters
+        ----------
+        index : int
+
+        Returns
+        -------
+        pyarrow.table.Column
+        """
         self._check_nullptr()
         cdef Column column = Column()
         column.init(self.table.column(index))
@@ -423,28 +591,51 @@ cdef class Table:
         return self.column(i)
 
     def itercolumns(self):
+        """
+        Iterator over all columns in their numerical order
+        """
         for i in range(self.num_columns):
             yield self.column(i)
 
-    property num_columns:
+    @property
+    def num_columns(self):
+        """
+        Number of columns in this table
 
-        def __get__(self):
-            self._check_nullptr()
-            return self.table.num_columns()
+        Returns
+        -------
+        int
+        """
+        self._check_nullptr()
+        return self.table.num_columns()
 
-    property num_rows:
+    @property
+    def num_rows(self):
+        """
+        Number of rows in this table.
 
-        def __get__(self):
-            self._check_nullptr()
-            return self.table.num_rows()
+        Due to the definition of a table, all columns have the same number of rows.
+
+        Returns
+        -------
+        int
+        """
+        self._check_nullptr()
+        return self.table.num_rows()
 
     def __len__(self):
         return self.num_rows
 
-    property shape:
+    @property
+    def shape(self):
+        """
+        Dimensions of the table: (#rows, #columns)
 
-        def __get__(self):
-            return (self.num_rows, self.num_columns)
+        Returns
+        -------
+        (int, int)
+        """
+        return (self.num_rows, self.num_columns)