def arange(int64_t start, int64_t stop, int64_t step=1, *, memory_pool=None):
    """
    Build an int64 array holding an arithmetic progression.

    Mirrors Python's built-in `range`: the first element is `start`,
    consecutive elements differ by `step`, and the sequence ends before
    `stop` is reached.

    Parameters
    ----------
    start : int
        First value of the sequence (included in the result).
    stop : int
        End of the interval (never included in the result).
    step : int, default 1
        Difference between consecutive values.
    memory_pool : MemoryPool, optional
        Memory pool used for allocations.

    Returns
    -------
    arange : Array

    Raises
    ------
    ArrowInvalid
        If `step` is zero.
    """
    cdef:
        CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
        shared_ptr[CArray] c_array

    # The sequence is generated in C++, so the GIL is released meanwhile.
    with nogil:
        c_array = GetResultValue(Arange(start, stop, step, pool))
    return pyarrow_wrap_array(c_array)


def _normalize_slice(object arrow_obj, slice key):
    """
    Apply a Python slice to an Arrow object.

    A step of 1 (or None) yields a zero-copy view via ``slice``; any other
    step gathers the selected positions with ``take`` and therefore copies.
    """
    cdef:
        int64_t start, stop, step
        Py_ssize_t n = len(arrow_obj)

    # slice.indices() resolves None and negative bounds against the length.
    start, stop, step = key.indices(n)

    if step == 1:
        return arrow_obj.slice(start, max(stop - start, 0))

    # An empty index array makes take() return an empty result.
    return arrow_obj.take(arange(start, stop, step))
b/python/pyarrow/src/arrow/python/util.cc @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/python/util.h" + +#include "arrow/array.h" +#include "arrow/python/common.h" + +namespace arrow ::py { + +Result> Arange(int64_t start, int64_t stop, int64_t step, + MemoryPool* pool) { + int64_t size; + if (step == 0) { + return Status::Invalid("Step must not be zero"); + } + if (step > 0 && stop > start) { + // Ceiling division for positive step + size = (stop - start + step - 1) / step; + } else if (step < 0 && stop < start) { + // Ceiling division for negative step + size = (start - stop - step - 1) / (-step); + } else { + return MakeEmptyArray(int64()); + } + std::shared_ptr data_buffer; + ARROW_ASSIGN_OR_RAISE(data_buffer, AllocateBuffer(size * sizeof(int64_t), pool)); + auto values = reinterpret_cast(data_buffer->mutable_data()); + for (int64_t i = 0; i < size; ++i) { + values[i] = start + i * step; + } + auto data = ArrayData::Make(int64(), size, {nullptr, data_buffer}, 0); + return MakeArray(data); +} + +} // namespace arrow::py diff --git a/python/pyarrow/src/arrow/python/util.h b/python/pyarrow/src/arrow/python/util.h new file mode 100644 index 00000000000..ff2ffcaea9c --- /dev/null 
+++ b/python/pyarrow/src/arrow/python/util.h @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/python/common.h" +#include "arrow/python/visibility.h" + +namespace arrow::py { + +/// \brief Create an array of evenly spaced values within a given interval. +/// This function is similar to Python's `range` function. +/// The resulting array will contain values starting from `start` up to but not +/// including `stop`, with a step size of `step`. If `step` is zero, the function +/// will return an error. +/// The resulting array will have a data type of `int64`. +/// \param[in] start initial value of the sequence. +/// \param[in] stop final value of the sequence (exclusive). +/// \param[in] step step size between consecutive values. +/// \param[in] pool Memory pool for any memory allocations. 
+/// \return Result Array +ARROW_PYTHON_EXPORT +Result> Arange(int64_t start, int64_t stop, int64_t step, + MemoryPool* pool); + +} // namespace arrow::py diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 7dabb8396b2..97425df0f95 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -536,6 +536,32 @@ def test_array_slice_negative_step(): assert result.equals(expected) +def test_arange(): + cases = [ + (5, 103), # Default step + (-2, 128, 3), + (4, 103, 5), + (10, -7, -1), + (100, -20, -3), + (0, 0), # Empty array + (2, 10, -1), # Empty array + (10, 3, 1), # Empty array + ] + for case in cases: + result = pa.arange(*case) + result.validate(full=True) + assert result.equals(pa.array(list(range(*case)), type=pa.int64())) + + # Validate memory_pool keyword argument + result = pa.arange(-1, 101, memory_pool=pa.default_memory_pool()) + result.validate(full=True) + assert result.equals(pa.array(list(range(-1, 101)), type=pa.int64())) + + # Special case for invalid step (arange does not accept step of 0) + with pytest.raises(pa.ArrowInvalid): + pa.arange(0, 10, 0) + + def test_array_diff(): # ARROW-6252 arr1 = pa.array(['foo'], type=pa.utf8())