diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 81ec8c093a4..3a77e9e5dce 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -51,10 +51,16 @@ jobs:
       fail-fast: false
       matrix:
         name:
+          - conda-python-docs
           - conda-python-3.8-nopandas
           - conda-python-3.6-pandas-0.23
           - conda-python-3.7-pandas-latest
         include:
+          - name: conda-python-docs
+            cache: conda-python-3.9
+            image: conda-python-docs
+            title: AMD64 Conda Python 3.9 Sphinx & Numpydoc
+            python: 3.9
           - name: conda-python-3.8-nopandas
             cache: conda-python-3.8
             image: conda-python
diff --git a/dev/archery/archery/lang/python.py b/dev/archery/archery/lang/python.py
index c6ebbe65004..6ffa9014430 100644
--- a/dev/archery/archery/lang/python.py
+++ b/dev/archery/archery/lang/python.py
@@ -105,8 +105,8 @@ class NumpyDoc:
     def __init__(self, symbols=None):
         if not have_numpydoc:
             raise RuntimeError(
-                'Numpydoc is not available, install the development version '
-                'with command: pip install numpydoc==1.1.0'
+                'Numpydoc is not available, install with command: '
+                'pip install numpydoc==1.1.0'
             )
 
         self.symbols = set(symbols or {'pyarrow'})
diff --git a/dev/archery/archery/utils/lint.py b/dev/archery/archery/utils/lint.py
index 2f97ed9b2e5..a6507b5a394 100644
--- a/dev/archery/archery/utils/lint.py
+++ b/dev/archery/archery/utils/lint.py
@@ -246,7 +246,7 @@ def python_numpydoc(symbols=None, allow_rules=None, disallow_rules=None):
         'pyarrow.csv',
         'pyarrow.dataset',
         'pyarrow.feather',
-        'pyarrow.flight',
+        # 'pyarrow.flight',
         'pyarrow.fs',
         'pyarrow.gandiva',
         'pyarrow.ipc',
diff --git a/dev/archery/setup.py b/dev/archery/setup.py
index 66480737547..3d70e858de6 100755
--- a/dev/archery/setup.py
+++ b/dev/archery/setup.py
@@ -36,6 +36,7 @@
                  'setuptools_scm'],
     'crossbow-upload': ['github3.py', jinja_req, 'ruamel.yaml',
                         'setuptools_scm'],
+    'numpydoc': ['numpydoc==1.1.0']
 }
 extras['bot'] = extras['crossbow'] + ['pygithub', 'jira']
 extras['all'] = list(set(functools.reduce(operator.add, extras.values())))
diff --git a/docker-compose.yml b/docker-compose.yml
index 9bf5bd0841d..559f86ab04a 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -101,7 +101,8 @@ x-hierarchy:
     - conda-cpp-hiveserver2
     - conda-cpp-valgrind
     - conda-python:
-      - conda-python-pandas
+      - conda-python-pandas:
+        - conda-python-docs
       - conda-python-dask
       - conda-python-hdfs
       - conda-python-jpype
@@ -109,7 +110,6 @@
       - conda-python-turbodbc
       - conda-python-kartothek
       - conda-python-spark
-
   - debian-cpp:
     - debian-c-glib:
       - debian-ruby
@@ -940,13 +940,33 @@ services:
     shm_size: *shm-size
     environment:
       <<: *ccache
-      BUILD_DOCS_PYTHON: "ON"
     volumes: *conda-volumes
     command:
      ["/arrow/ci/scripts/cpp_build.sh /arrow /build &&
        /arrow/ci/scripts/python_build.sh /arrow /build &&
        /arrow/ci/scripts/python_test.sh /arrow"]
 
+  conda-python-docs:
+    # Usage:
+    #   archery docker run conda-python-docs
+    #
+    # Only a single rule is enabled for now to check undocumented arguments.
+    # We should extend the list of enabled rules after adding this build to
+    # the CI pipeline.
+    image: ${REPO}:${ARCH}-conda-python-${PYTHON}-pandas-${PANDAS}
+    environment:
+      <<: *ccache
+      LC_ALL: "C.UTF-8"
+      LANG: "C.UTF-8"
+      BUILD_DOCS_CPP: "ON"
+      BUILD_DOCS_PYTHON: "ON"
+    volumes: *conda-volumes
+    command:
+      ["/arrow/ci/scripts/cpp_build.sh /arrow /build &&
+       /arrow/ci/scripts/python_build.sh /arrow /build &&
+       pip install -e /arrow/dev/archery[numpydoc] &&
+       archery numpydoc --allow-rule PR01"]
+
   conda-python-dask:
     # Possible $DASK parameters:
     #  - `latest`: latest release
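For context, the `archery numpydoc --allow-rule PR01` command added above drives numpydoc's validation machinery over pyarrow's public symbols. A minimal sketch of what the PR01 rule reports, using numpydoc's public API (the version pinned above); the target symbol is only an illustrative example:

    # pip install numpydoc==1.1.0
    from numpydoc.validation import validate

    # validate() resolves a fully qualified name, parses its docstring and
    # returns a report dict whose "errors" entry is a list of
    # (code, message) tuples.
    report = validate("pyarrow.orc.ORCWriter.write")

    # PR01 messages name parameters that appear in the signature but are
    # missing from the docstring's Parameters section.
    print([msg for code, msg in report["errors"] if code == "PR01"])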
diff --git a/python/pyarrow/compat.pxi b/python/pyarrow/compat.pxi
index a5db5741b81..b415d2170b2 100644
--- a/python/pyarrow/compat.pxi
+++ b/python/pyarrow/compat.pxi
@@ -52,6 +52,14 @@ except ImportError:
 
 
 def tobytes(o):
+    """
+    Encode a unicode or bytes string to bytes.
+
+    Parameters
+    ----------
+    o : str or bytes
+        Input string.
+    """
     if isinstance(o, str):
         return o.encode('utf8')
     else:
@@ -59,6 +67,16 @@
 
 
 def frombytes(o, *, safe=False):
+    """
+    Decode the given bytestring to unicode.
+
+    Parameters
+    ----------
+    o : bytes-like
+        Input object.
+    safe : bool, default False
+        If true, replace undecodable bytes instead of raising an error.
+    """
     if safe:
         return o.decode('utf8', errors='replace')
     else:
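The `safe` flag maps onto the `errors` argument of `bytes.decode`, which the docstring wording above now reflects; a pure-Python sketch of the same behavior (the real helpers live in compat.pxi and are internal to pyarrow, so this mirrors rather than imports them):

    def frombytes(o, *, safe=False):
        # safe=True substitutes U+FFFD for undecodable bytes;
        # safe=False uses strict decoding and raises UnicodeDecodeError.
        return o.decode('utf8', errors='replace' if safe else 'strict')

    assert frombytes(b'caf\xc3\xa9') == 'café'
    assert frombytes(b'\xff', safe=True) == '\ufffd'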
diff --git a/python/pyarrow/gandiva.pyx b/python/pyarrow/gandiva.pyx
index 12d572b3307..60bc84a8e14 100644
--- a/python/pyarrow/gandiva.pyx
+++ b/python/pyarrow/gandiva.pyx
@@ -78,6 +78,7 @@ from pyarrow.includes.libgandiva cimport (
     CFunctionSignature,
     GetRegisteredFunctionSignatures)
 
+
 cdef class Node(_Weakrefable):
     cdef:
         shared_ptr[CNode] node
@@ -103,6 +104,7 @@ cdef class Node(_Weakrefable):
     def return_type(self):
         return pyarrow_wrap_data_type(self.node.get().return_type())
 
+
 cdef class Expression(_Weakrefable):
     cdef:
         shared_ptr[CExpression] expression
@@ -123,6 +125,7 @@ cdef class Expression(_Weakrefable):
     def result(self):
         return pyarrow_wrap_field(self.expression.get().result())
 
+
 cdef class Condition(_Weakrefable):
     cdef:
         shared_ptr[CCondition] condition
@@ -151,6 +154,7 @@ cdef class Condition(_Weakrefable):
     def result(self):
         return pyarrow_wrap_field(self.condition.get().result())
 
+
 cdef class SelectionVector(_Weakrefable):
     cdef:
         shared_ptr[CSelectionVector] selection_vector
@@ -169,6 +173,7 @@ cdef class SelectionVector(_Weakrefable):
         cdef shared_ptr[CArray] result = self.selection_vector.get().ToArray()
         return pyarrow_wrap_array(result)
 
+
 cdef class Projector(_Weakrefable):
     cdef:
         shared_ptr[CProjector] projector
@@ -206,6 +211,7 @@ cdef class Projector(_Weakrefable):
             arrays.append(pyarrow_wrap_array(result))
         return arrays
 
+
 cdef class Filter(_Weakrefable):
     cdef:
         shared_ptr[CFilter] filter
@@ -440,13 +446,38 @@ cdef class TreeExprBuilder(_Weakrefable):
                                              condition.node)
         return Condition.create(r)
 
+
 cpdef make_projector(Schema schema, children, MemoryPool pool,
                      str selection_mode="NONE"):
-    cdef c_vector[shared_ptr[CExpression]] c_children
-    cdef Expression child
+    """
+    Construct a projection using expressions.
+
+    A projector is built for a specific schema and vector of expressions.
+    Once the projector is built, it can be used to evaluate many row batches.
+
+    Parameters
+    ----------
+    schema : pyarrow.Schema
+        Schema for the record batches, and the expressions.
+    children : list[pyarrow.gandiva.Expression]
+        List of projectable expression objects.
+    pool : pyarrow.MemoryPool
+        Memory pool used to allocate output arrays.
+    selection_mode : str, default "NONE"
+        Possible values are NONE, UINT16, UINT32, UINT64.
+
+    Returns
+    -------
+    Projector instance
+    """
+    cdef:
+        Expression child
+        c_vector[shared_ptr[CExpression]] c_children
+        shared_ptr[CProjector] result
+
     for child in children:
         c_children.push_back(child.expression)
-    cdef shared_ptr[CProjector] result
+
     check_status(
         Projector_Make(schema.sp_schema, c_children,
                        _ensure_selection_mode(selection_mode),
@@ -454,12 +485,31 @@ cpdef make_projector(Schema schema, children, MemoryPool pool,
                        &result))
     return Projector.create(result, pool)
 
+
 cpdef make_filter(Schema schema, Condition condition):
+    """
+    Construct a filter based on a condition.
+
+    A filter is built for a specific schema and condition. Once the filter is
+    built, it can be used to evaluate many row batches.
+
+    Parameters
+    ----------
+    schema : pyarrow.Schema
+        Schema for the record batches, and the condition.
+    condition : pyarrow.gandiva.Condition
+        Filter condition.
+
+    Returns
+    -------
+    Filter instance
+    """
     cdef shared_ptr[CFilter] result
     check_status(
         Filter_Make(schema.sp_schema, condition.condition, &result))
     return Filter.create(result)
 
+
 cdef class FunctionSignature(_Weakrefable):
     """
     Signature of a Gandiva function including name, parameter types
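For reference, how the two documented entry points fit together end to end; a sketch based on the signatures above (the "add" and "greater_than" function names come from Gandiva's function registry, and exact reprs may differ):

    import pyarrow as pa
    import pyarrow.gandiva as gandiva

    schema = pa.schema([('a', pa.int32()), ('b', pa.int32())])
    batch = pa.record_batch([pa.array([1, 5], pa.int32()),
                             pa.array([4, 2], pa.int32())], schema=schema)

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(schema.field('a'))
    node_b = builder.make_field(schema.field('b'))

    # Projector: computes a + b for every row of any batch with this schema.
    sum_node = builder.make_function('add', [node_a, node_b], pa.int32())
    expr = builder.make_expression(sum_node, pa.field('sum', pa.int32()))
    projector = gandiva.make_projector(schema, [expr],
                                       pa.default_memory_pool())
    arrays = projector.evaluate(batch)   # [[5, 7]]

    # Filter: selects positions where a > b (row 1 here, since 5 > 2).
    gt = builder.make_function('greater_than', [node_a, node_b], pa.bool_())
    fltr = gandiva.make_filter(schema, builder.make_condition(gt))
    selection = fltr.evaluate(batch, pa.default_memory_pool())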
""" if use_valgrind and use_profiler: raise Exception("Cannot use valgrind and profiler at the same time.") diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index e6b0b55b432..23a42b6abc1 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -2439,6 +2439,13 @@ def _from_pydict(cls, mapping, schema, metadata): class TableGroupBy: """ A grouping of columns in a table on which to perform aggregations. + + Parameters + ---------- + table : pyarrow.Table + Input table to execute the aggregation on. + keys : str or list[str] + Name of the grouped columns. """ def __init__(self, table, keys):