Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,16 @@ jobs:
fail-fast: false
matrix:
name:
- conda-python-docs
- conda-python-3.8-nopandas
- conda-python-3.6-pandas-0.23
- conda-python-3.7-pandas-latest
include:
- name: conda-python-docs
cache: conda-python-3.9
image: conda-python-docs
title: AMD64 Conda Python 3.9 Sphinx & Numpydoc
python: 3.9
- name: conda-python-3.8-nopandas
cache: conda-python-3.8
image: conda-python
Expand Down
4 changes: 2 additions & 2 deletions dev/archery/archery/lang/python.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,8 @@ class NumpyDoc:
def __init__(self, symbols=None):
if not have_numpydoc:
raise RuntimeError(
'Numpydoc is not available, install the development version '
'with command: pip install numpydoc==1.1.0'
'Numpydoc is not available, install with command: '
'pip install numpydoc==1.1.0'
)
self.symbols = set(symbols or {'pyarrow'})

Expand Down
2 changes: 1 addition & 1 deletion dev/archery/archery/utils/lint.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ def python_numpydoc(symbols=None, allow_rules=None, disallow_rules=None):
'pyarrow.csv',
'pyarrow.dataset',
'pyarrow.feather',
'pyarrow.flight',
# 'pyarrow.flight',
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Temporarily disabled, created a follow-up: https://issues.apache.org/jira/browse/ARROW-14995

'pyarrow.fs',
'pyarrow.gandiva',
'pyarrow.ipc',
Expand Down
1 change: 1 addition & 0 deletions dev/archery/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
'setuptools_scm'],
'crossbow-upload': ['github3.py', jinja_req, 'ruamel.yaml',
'setuptools_scm'],
'numpydoc': ['numpydoc==1.1.0']
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a particular reason for pinning to this exact version?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the first version which provides the class and function we use from numpydoc, which seem a bit internal-ish at the moment.

}
extras['bot'] = extras['crossbow'] + ['pygithub', 'jira']
extras['all'] = list(set(functools.reduce(operator.add, extras.values())))
Expand Down
26 changes: 23 additions & 3 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -101,15 +101,15 @@ x-hierarchy:
- conda-cpp-hiveserver2
- conda-cpp-valgrind
- conda-python:
- conda-python-pandas
- conda-python-pandas:
- conda-python-docs
- conda-python-dask
- conda-python-hdfs
- conda-python-jpype
- conda-python-java-integration
- conda-python-turbodbc
- conda-python-kartothek
- conda-python-spark

- debian-cpp:
- debian-c-glib:
- debian-ruby
Expand Down Expand Up @@ -940,13 +940,33 @@ services:
shm_size: *shm-size
environment:
<<: *ccache
BUILD_DOCS_PYTHON: "ON"
volumes: *conda-volumes
command:
["/arrow/ci/scripts/cpp_build.sh /arrow /build &&
/arrow/ci/scripts/python_build.sh /arrow /build &&
/arrow/ci/scripts/python_test.sh /arrow"]

conda-python-docs:
# Usage:
# archery docker run conda-python-docs
#
# Only a single rule is enabled for now to check undocumented arguments.
# We should extend the list of enabled rules after adding this build to
# the CI pipeline.
image: ${REPO}:${ARCH}-conda-python-${PYTHON}-pandas-${PANDAS}
environment:
<<: *ccache
LC_ALL: "C.UTF-8"
LANG: "C.UTF-8"
BUILD_DOCS_CPP: "ON"
BUILD_DOCS_PYTHON: "ON"
volumes: *conda-volumes
command:
["/arrow/ci/scripts/cpp_build.sh /arrow /build &&
/arrow/ci/scripts/python_build.sh /arrow /build &&
pip install -e /arrow/dev/archery[numpydoc] &&
archery numpydoc --allow-rule PR01"]

conda-python-dask:
# Possible $DASK parameters:
# - `latest`: latest release
Expand Down
18 changes: 18 additions & 0 deletions python/pyarrow/compat.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,31 @@ except ImportError:


def tobytes(o):
    """
    Encode a unicode or bytes string to bytes.

    Parameters
    ----------
    o : str or bytes
        Input string.
    """
    # str values are UTF-8 encoded; anything else is returned untouched.
    return o.encode('utf8') if isinstance(o, str) else o


def frombytes(o, *, safe=False):
"""
Decode the given bytestring to unicode.

Parameters
----------
o : bytes-like
Input object.
    safe : bool, default False
        If true, replace undecodable bytes with a replacement character
        instead of raising an error.
"""
if safe:
return o.decode('utf8', errors='replace')
else:
Expand Down
56 changes: 53 additions & 3 deletions python/pyarrow/gandiva.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ from pyarrow.includes.libgandiva cimport (
CFunctionSignature,
GetRegisteredFunctionSignatures)


cdef class Node(_Weakrefable):
cdef:
shared_ptr[CNode] node
Expand All @@ -103,6 +104,7 @@ cdef class Node(_Weakrefable):
def return_type(self):
return pyarrow_wrap_data_type(self.node.get().return_type())


cdef class Expression(_Weakrefable):
cdef:
shared_ptr[CExpression] expression
Expand All @@ -123,6 +125,7 @@ cdef class Expression(_Weakrefable):
def result(self):
return pyarrow_wrap_field(self.expression.get().result())


cdef class Condition(_Weakrefable):
cdef:
shared_ptr[CCondition] condition
Expand Down Expand Up @@ -151,6 +154,7 @@ cdef class Condition(_Weakrefable):
def result(self):
return pyarrow_wrap_field(self.condition.get().result())


cdef class SelectionVector(_Weakrefable):
cdef:
shared_ptr[CSelectionVector] selection_vector
Expand All @@ -169,6 +173,7 @@ cdef class SelectionVector(_Weakrefable):
cdef shared_ptr[CArray] result = self.selection_vector.get().ToArray()
return pyarrow_wrap_array(result)


cdef class Projector(_Weakrefable):
cdef:
shared_ptr[CProjector] projector
Expand Down Expand Up @@ -206,6 +211,7 @@ cdef class Projector(_Weakrefable):
arrays.append(pyarrow_wrap_array(result))
return arrays


cdef class Filter(_Weakrefable):
cdef:
shared_ptr[CFilter] filter
Expand Down Expand Up @@ -440,26 +446,70 @@ cdef class TreeExprBuilder(_Weakrefable):
condition.node)
return Condition.create(r)


cpdef make_projector(Schema schema, children, MemoryPool pool,
str selection_mode="NONE"):
cdef c_vector[shared_ptr[CExpression]] c_children
cdef Expression child
"""
Construct a projection using expressions.

A projector is built for a specific schema and vector of expressions.
Once the projector is built, it can be used to evaluate many row batches.

Parameters
----------
schema : pyarrow.Schema
Schema for the record batches, and the expressions.
children : list[pyarrow.gandiva.Expression]
List of projectable expression objects.
pool : pyarrow.MemoryPool
Memory pool used to allocate output arrays.
selection_mode : str, default "NONE"
Possible values are NONE, UINT16, UINT32, UINT64.

Returns
-------
Projector instance
"""
cdef:
Expression child
c_vector[shared_ptr[CExpression]] c_children
shared_ptr[CProjector] result

for child in children:
c_children.push_back(child.expression)
cdef shared_ptr[CProjector] result

check_status(
Projector_Make(schema.sp_schema, c_children,
_ensure_selection_mode(selection_mode),
CConfigurationBuilder.DefaultConfiguration(),
&result))
return Projector.create(result, pool)


cpdef make_filter(Schema schema, Condition condition):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These functions are only used from the unittest, so we should not expose them to the public API.

Created a follow-up https://issues.apache.org/jira/browse/ARROW-14996

"""
    Construct a filter based on a condition.

A filter is built for a specific schema and condition. Once the filter is
built, it can be used to evaluate many row batches.

Parameters
----------
schema : pyarrow.Schema
Schema for the record batches, and the condition.
condition : pyarrow.gandiva.Condition
Filter condition.

Returns
-------
Filter instance
"""
cdef shared_ptr[CFilter] result
check_status(
Filter_Make(schema.sp_schema, condition.condition, &result))
return Filter.create(result)


cdef class FunctionSignature(_Weakrefable):
"""
Signature of a Gandiva function including name, parameter types
Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def write(self, table):

Parameters
----------
schema : pyarrow.lib.Table
table : pyarrow.lib.Table
The table to be written into the ORC file
"""
self.writer.write(table)
Expand Down
37 changes: 24 additions & 13 deletions python/pyarrow/plasma.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,20 +83,31 @@ def start_plasma_store(plasma_store_memory,
use_valgrind=False, use_profiler=False,
plasma_directory=None, use_hugepages=False,
external_store=None):
"""Start a plasma store process.
Args:
plasma_store_memory (int): Capacity of the plasma store in bytes.
use_valgrind (bool): True if the plasma store should be started inside
of valgrind. If this is True, use_profiler must be False.
use_profiler (bool): True if the plasma store should be started inside
a profiler. If this is True, use_valgrind must be False.
plasma_directory (str): Directory where plasma memory mapped files
will be stored.
use_hugepages (bool): True if the plasma store should use huge pages.
external_store (str): External store to use for evicted objects.
Return:
"""
Start a plasma store process.

Parameters
----------
plasma_store_memory : int
Capacity of the plasma store in bytes.
use_valgrind : bool
True if the plasma store should be started inside of valgrind. If this
is True, use_profiler must be False.
use_profiler : bool
True if the plasma store should be started inside a profiler. If this
is True, use_valgrind must be False.
plasma_directory : str
Directory where plasma memory mapped files will be stored.
use_hugepages : bool
True if the plasma store should use huge pages.
external_store : str
External store to use for evicted objects.

Returns
-------
result : (str, subprocess.Popen)
A tuple of the name of the plasma store socket and the process ID of
the plasma store process.
the plasma store process.
"""
if use_valgrind and use_profiler:
raise Exception("Cannot use valgrind and profiler at the same time.")
Expand Down
7 changes: 7 additions & 0 deletions python/pyarrow/table.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -2439,6 +2439,13 @@ def _from_pydict(cls, mapping, schema, metadata):
class TableGroupBy:
"""
A grouping of columns in a table on which to perform aggregations.

Parameters
----------
table : pyarrow.Table
Input table to execute the aggregation on.
keys : str or list[str]
Name of the grouped columns.
"""

def __init__(self, table, keys):
Expand Down