Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,16 @@ jobs:
fail-fast: false
matrix:
name:
- conda-python-docs
- conda-python-3.8-nopandas
- conda-python-3.6-pandas-0.23
- conda-python-3.7-pandas-latest
include:
- name: conda-python-docs
cache: conda-python-3.9
image: conda-python-docs
title: AMD64 Conda Python 3.9 Sphinx & Numpydoc
python: 3.9
- name: conda-python-3.8-nopandas
cache: conda-python-3.8
image: conda-python
Expand Down
4 changes: 2 additions & 2 deletions dev/archery/archery/lang/python.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,8 @@ class NumpyDoc:
def __init__(self, symbols=None):
if not have_numpydoc:
raise RuntimeError(
'Numpydoc is not available, install the development version '
'with command: pip install numpydoc==1.1.0'
'Numpydoc is not available, install with command: '
'pip install numpydoc==1.1.0'
)
self.symbols = set(symbols or {'pyarrow'})

Expand Down
2 changes: 1 addition & 1 deletion dev/archery/archery/utils/lint.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ def python_numpydoc(symbols=None, allow_rules=None, disallow_rules=None):
'pyarrow.csv',
'pyarrow.dataset',
'pyarrow.feather',
'pyarrow.flight',
# 'pyarrow.flight',
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Temporarily disabled, created a follow-up: https://issues.apache.org/jira/browse/ARROW-14995

'pyarrow.fs',
'pyarrow.gandiva',
'pyarrow.ipc',
Expand Down
1 change: 1 addition & 0 deletions dev/archery/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
'setuptools_scm'],
'crossbow-upload': ['github3.py', jinja_req, 'ruamel.yaml',
'setuptools_scm'],
'numpydoc': ['numpydoc==1.1.0']
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a particular reason for pinning to this exact version?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the first version which provides the class and function we use from numpydoc, which seem a bit internal-ish at the moment.

}
extras['bot'] = extras['crossbow'] + ['pygithub', 'jira']
extras['all'] = list(set(functools.reduce(operator.add, extras.values())))
Expand Down
26 changes: 23 additions & 3 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -101,15 +101,15 @@ x-hierarchy:
- conda-cpp-hiveserver2
- conda-cpp-valgrind
- conda-python:
- conda-python-pandas
- conda-python-pandas:
- conda-python-docs
- conda-python-dask
- conda-python-hdfs
- conda-python-jpype
- conda-python-java-integration
- conda-python-turbodbc
- conda-python-kartothek
- conda-python-spark

- debian-cpp:
- debian-c-glib:
- debian-ruby
Expand Down Expand Up @@ -940,13 +940,33 @@ services:
shm_size: *shm-size
environment:
<<: *ccache
BUILD_DOCS_PYTHON: "ON"
volumes: *conda-volumes
command:
["/arrow/ci/scripts/cpp_build.sh /arrow /build &&
/arrow/ci/scripts/python_build.sh /arrow /build &&
/arrow/ci/scripts/python_test.sh /arrow"]

conda-python-docs:
# Usage:
# archery docker run conda-python-docs
#
# Only a single rule is enabled for now to check undocumented arguments.
# We should extend the list of enabled rules after adding this build to
# the CI pipeline.
image: ${REPO}:${ARCH}-conda-python-${PYTHON}-pandas-${PANDAS}
environment:
<<: *ccache
LC_ALL: "C.UTF-8"
LANG: "C.UTF-8"
BUILD_DOCS_CPP: "ON"
BUILD_DOCS_PYTHON: "ON"
volumes: *conda-volumes
command:
["/arrow/ci/scripts/cpp_build.sh /arrow /build &&
/arrow/ci/scripts/python_build.sh /arrow /build &&
pip install -e /arrow/dev/archery[numpydoc] &&
archery numpydoc --allow-rule PR01"]

conda-python-dask:
# Possible $DASK parameters:
# - `latest`: latest release
Expand Down
18 changes: 18 additions & 0 deletions python/pyarrow/compat.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,31 @@ except ImportError:


def tobytes(o):
    """
    Encode a unicode or bytes string to bytes.

    Parameters
    ----------
    o : str or bytes
        Input string.
    """
    # str values are UTF-8 encoded; anything else is returned untouched.
    return o.encode('utf8') if isinstance(o, str) else o


def frombytes(o, *, safe=False):
"""
Decode the given bytestring to unicode.

Parameters
----------
o : bytes-like
Input object.
    safe : bool, default False
        If true, replace undecodable bytes with a replacement character
        instead of raising an error.
"""
if safe:
return o.decode('utf8', errors='replace')
else:
Expand Down
56 changes: 53 additions & 3 deletions python/pyarrow/gandiva.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ from pyarrow.includes.libgandiva cimport (
CFunctionSignature,
GetRegisteredFunctionSignatures)


cdef class Node(_Weakrefable):
cdef:
shared_ptr[CNode] node
Expand All @@ -103,6 +104,7 @@ cdef class Node(_Weakrefable):
def return_type(self):
return pyarrow_wrap_data_type(self.node.get().return_type())


cdef class Expression(_Weakrefable):
cdef:
shared_ptr[CExpression] expression
Expand All @@ -123,6 +125,7 @@ cdef class Expression(_Weakrefable):
def result(self):
return pyarrow_wrap_field(self.expression.get().result())


cdef class Condition(_Weakrefable):
cdef:
shared_ptr[CCondition] condition
Expand Down Expand Up @@ -151,6 +154,7 @@ cdef class Condition(_Weakrefable):
def result(self):
return pyarrow_wrap_field(self.condition.get().result())


cdef class SelectionVector(_Weakrefable):
cdef:
shared_ptr[CSelectionVector] selection_vector
Expand All @@ -169,6 +173,7 @@ cdef class SelectionVector(_Weakrefable):
cdef shared_ptr[CArray] result = self.selection_vector.get().ToArray()
return pyarrow_wrap_array(result)


cdef class Projector(_Weakrefable):
cdef:
shared_ptr[CProjector] projector
Expand Down Expand Up @@ -206,6 +211,7 @@ cdef class Projector(_Weakrefable):
arrays.append(pyarrow_wrap_array(result))
return arrays


cdef class Filter(_Weakrefable):
cdef:
shared_ptr[CFilter] filter
Expand Down Expand Up @@ -440,26 +446,70 @@ cdef class TreeExprBuilder(_Weakrefable):
condition.node)
return Condition.create(r)


cpdef make_projector(Schema schema, children, MemoryPool pool,
str selection_mode="NONE"):
cdef c_vector[shared_ptr[CExpression]] c_children
cdef Expression child
"""
Construct a projection using expressions.

A projector is built for a specific schema and vector of expressions.
Once the projector is built, it can be used to evaluate many row batches.

Parameters
----------
schema : pyarrow.Schema
Schema for the record batches, and the expressions.
children : list[pyarrow.gandiva.Expression]
List of projectable expression objects.
pool : pyarrow.MemoryPool
Memory pool used to allocate output arrays.
selection_mode : str, default "NONE"
Possible values are NONE, UINT16, UINT32, UINT64.

Returns
-------
Projector instance
"""
cdef:
Expression child
c_vector[shared_ptr[CExpression]] c_children
shared_ptr[CProjector] result

for child in children:
c_children.push_back(child.expression)
cdef shared_ptr[CProjector] result

check_status(
Projector_Make(schema.sp_schema, c_children,
_ensure_selection_mode(selection_mode),
CConfigurationBuilder.DefaultConfiguration(),
&result))
return Projector.create(result, pool)


cpdef make_filter(Schema schema, Condition condition):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These functions are only used from the unittest, so we should not expose them to the public API.

Created a follow-up https://issues.apache.org/jira/browse/ARROW-14996

"""
    Construct a filter based on a condition.

A filter is built for a specific schema and condition. Once the filter is
built, it can be used to evaluate many row batches.

Parameters
----------
schema : pyarrow.Schema
Schema for the record batches, and the condition.
condition : pyarrow.gandiva.Condition
Filter condition.

Returns
-------
Filter instance
"""
cdef shared_ptr[CFilter] result
check_status(
Filter_Make(schema.sp_schema, condition.condition, &result))
return Filter.create(result)


cdef class FunctionSignature(_Weakrefable):
"""
Signature of a Gandiva function including name, parameter types
Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def write(self, table):

Parameters
----------
schema : pyarrow.lib.Table
table : pyarrow.lib.Table
The table to be written into the ORC file
"""
self.writer.write(table)
Expand Down
37 changes: 24 additions & 13 deletions python/pyarrow/plasma.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,20 +83,31 @@ def start_plasma_store(plasma_store_memory,
use_valgrind=False, use_profiler=False,
plasma_directory=None, use_hugepages=False,
external_store=None):
"""Start a plasma store process.
Args:
plasma_store_memory (int): Capacity of the plasma store in bytes.
use_valgrind (bool): True if the plasma store should be started inside
of valgrind. If this is True, use_profiler must be False.
use_profiler (bool): True if the plasma store should be started inside
a profiler. If this is True, use_valgrind must be False.
plasma_directory (str): Directory where plasma memory mapped files
will be stored.
use_hugepages (bool): True if the plasma store should use huge pages.
external_store (str): External store to use for evicted objects.
Return:
"""
Start a plasma store process.

Parameters
----------
plasma_store_memory : int
Capacity of the plasma store in bytes.
use_valgrind : bool
True if the plasma store should be started inside of valgrind. If this
is True, use_profiler must be False.
use_profiler : bool
True if the plasma store should be started inside a profiler. If this
is True, use_valgrind must be False.
plasma_directory : str
Directory where plasma memory mapped files will be stored.
use_hugepages : bool
True if the plasma store should use huge pages.
external_store : str
External store to use for evicted objects.

Returns
-------
result : (str, subprocess.Popen)
A tuple of the name of the plasma store socket and the process ID of
the plasma store process.
the plasma store process.
"""
if use_valgrind and use_profiler:
raise Exception("Cannot use valgrind and profiler at the same time.")
Expand Down
7 changes: 7 additions & 0 deletions python/pyarrow/table.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -2439,6 +2439,13 @@ def _from_pydict(cls, mapping, schema, metadata):
class TableGroupBy:
"""
A grouping of columns in a table on which to perform aggregations.

Parameters
----------
table : pyarrow.Table
Input table to execute the aggregation on.
keys : str or list[str]
Name of the grouped columns.
"""

def __init__(self, table, keys):
Expand Down