Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions cpp/src/arrow/python/arrow_to_pandas.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1951,11 +1951,12 @@ Status ConvertTableToPandas(const PandasOptions& options,
FunctionContext ctx;
for (int i = 0; i < table->num_columns(); i++) {
std::shared_ptr<ChunkedArray> col = table->column(i);
if (col->type()->id() == Type::DICTIONARY) {
// No need to dictionary encode again. Came up in ARROW-6434,
// ARROW-6435
continue;
}
if (categorical_columns.count(table->field(i)->name())) {
if (table->field(i)->type()->id() == Type::DICTIONARY) {
// this column is already dictionary encoded
continue;
}
Datum out;
RETURN_NOT_OK(DictionaryEncode(&ctx, Datum(col), &out));
std::shared_ptr<ChunkedArray> array = out.chunked_array();
Expand Down
7 changes: 7 additions & 0 deletions cpp/src/arrow/python/helpers.cc
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,13 @@ Status InvalidValue(PyObject* obj, const std::string& why) {
Py_TYPE(obj)->tp_name, ": ", why);
}

Status InvalidType(PyObject* obj, const std::string& why) {
std::string obj_as_str;
RETURN_NOT_OK(internal::PyObject_StdStringStr(obj, &obj_as_str));
return Status::TypeError("Could not convert ", obj_as_str, " with type ",
Py_TYPE(obj)->tp_name, ": ", why);
}

Status UnboxIntegerAsInt64(PyObject* obj, int64_t* out) {
if (PyLong_Check(obj)) {
int overflow = 0;
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/arrow/python/helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,9 @@ inline Status CastSize(Py_ssize_t size, int64_t* out, const char* error_msg = NU
ARROW_PYTHON_EXPORT
Status InvalidValue(PyObject* obj, const std::string& why);

ARROW_PYTHON_EXPORT
Status InvalidType(PyObject* obj, const std::string& why);

ARROW_PYTHON_EXPORT
Status IntegerScalarToDoubleSafe(PyObject* obj, double* result);
ARROW_PYTHON_EXPORT
Expand Down
256 changes: 139 additions & 117 deletions cpp/src/arrow/python/python_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -607,8 +607,26 @@ class StringConverter
// ----------------------------------------------------------------------
// Convert lists (NumPy arrays containing lists or ndarrays as values)

template <typename TypeClass>
class ListConverter : public TypedConverter<TypeClass, ListConverter<TypeClass>> {
// If the value type does not match the expected NumPy dtype, then fall through
// to a slower PySequence-based path
#define LIST_FAST_CASE(TYPE, NUMPY_TYPE, ArrowType) \
case Type::TYPE: { \
if (PyArray_DESCR(arr)->type_num != NUMPY_TYPE) { \
return value_converter_->AppendMultiple(obj, value_length); \
} \
return AppendNdarrayTypedItem<NUMPY_TYPE, ArrowType>(arr); \
}

// Use internal::VisitSequence, fast for NPY_OBJECT but slower otherwise
#define LIST_SLOW_CASE(TYPE) \
case Type::TYPE: { \
return value_converter_->AppendMultiple(obj, value_length); \
}
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved code


template <typename TypeClass, NullCoding null_coding>
class ListConverter
: public TypedConverter<TypeClass, ListConverter<TypeClass, null_coding>,
null_coding> {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Relevant changes

public:
using BuilderType = typename TypeTraits<TypeClass>::BuilderType;

Expand All @@ -626,18 +644,94 @@ class ListConverter : public TypedConverter<TypeClass, ListConverter<TypeClass>>
}

template <int NUMPY_TYPE, typename Type>
Status AppendNdarrayTypedItem(PyArrayObject* arr);
Status AppendNdarrayItem(PyObject* arr);
Status AppendNdarrayTypedItem(PyArrayObject* arr) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved code. Unchanged

using traits = internal::npy_traits<NUMPY_TYPE>;
using T = typename traits::value_type;
using ValueBuilderType = typename TypeTraits<Type>::BuilderType;

const bool null_sentinels_possible =
// Always treat Numpy's NaT as null
NUMPY_TYPE == NPY_DATETIME ||
// Observing pandas's null sentinels
(from_pandas_ && traits::supports_nulls);

auto child_builder = checked_cast<ValueBuilderType*>(value_converter_->builder());

// TODO(wesm): Vector append when not strided
Ndarray1DIndexer<T> values(arr);
if (null_sentinels_possible) {
for (int64_t i = 0; i < values.size(); ++i) {
if (traits::isnull(values[i])) {
RETURN_NOT_OK(child_builder->AppendNull());
} else {
RETURN_NOT_OK(child_builder->Append(values[i]));
}
}
} else {
for (int64_t i = 0; i < values.size(); ++i) {
RETURN_NOT_OK(child_builder->Append(values[i]));
}
}
return Status::OK();
}

Status AppendNdarrayItem(PyObject* obj) {
PyArrayObject* arr = reinterpret_cast<PyArrayObject*>(obj);

if (PyArray_NDIM(arr) != 1) {
return Status::Invalid("Can only convert 1-dimensional array values");
}

const int64_t value_length = PyArray_SIZE(arr);

switch (value_type_->id()) {
LIST_SLOW_CASE(NA)
LIST_FAST_CASE(UINT8, NPY_UINT8, UInt8Type)
LIST_FAST_CASE(INT8, NPY_INT8, Int8Type)
LIST_FAST_CASE(UINT16, NPY_UINT16, UInt16Type)
LIST_FAST_CASE(INT16, NPY_INT16, Int16Type)
LIST_FAST_CASE(UINT32, NPY_UINT32, UInt32Type)
LIST_FAST_CASE(INT32, NPY_INT32, Int32Type)
LIST_FAST_CASE(UINT64, NPY_UINT64, UInt64Type)
LIST_FAST_CASE(INT64, NPY_INT64, Int64Type)
LIST_SLOW_CASE(DATE32)
LIST_SLOW_CASE(DATE64)
LIST_SLOW_CASE(TIME32)
LIST_SLOW_CASE(TIME64)
LIST_FAST_CASE(TIMESTAMP, NPY_DATETIME, TimestampType)
LIST_FAST_CASE(HALF_FLOAT, NPY_FLOAT16, HalfFloatType)
LIST_FAST_CASE(FLOAT, NPY_FLOAT, FloatType)
LIST_FAST_CASE(DOUBLE, NPY_DOUBLE, DoubleType)
LIST_SLOW_CASE(BINARY)
LIST_SLOW_CASE(FIXED_SIZE_BINARY)
LIST_SLOW_CASE(STRING)
case Type::LIST: {
if (PyArray_DESCR(arr)->type_num != NPY_OBJECT) {
return Status::Invalid(
"Can only convert list types from NumPy object "
"array input");
}
return internal::VisitSequence(obj, [this](PyObject* item, bool*) {
return value_converter_->AppendSingleVirtual(item);
});
}
default: {
return Status::TypeError("Unknown list item type: ", value_type_->ToString());
}
}
}

Status AppendItem(PyObject* obj) {
RETURN_NOT_OK(this->typed_builder_->Append());
if (PyArray_Check(obj)) {
return AppendNdarrayItem(obj);
}
const auto list_size = static_cast<int64_t>(PySequence_Size(obj));
if (ARROW_PREDICT_FALSE(list_size == -1)) {
RETURN_IF_PYERROR();
if (!PySequence_Check(obj)) {
return internal::InvalidType(obj,
"was not a sequence or recognized null"
" for conversion to list type");
}
int64_t list_size = static_cast<int64_t>(PySequence_Size(obj));
return value_converter_->AppendMultiple(obj, list_size);
}

Expand All @@ -658,116 +752,22 @@ class ListConverter : public TypedConverter<TypeClass, ListConverter<TypeClass>>
bool strict_conversions_;
};

template <typename TypeClass>
template <int NUMPY_TYPE, typename Type>
Status ListConverter<TypeClass>::AppendNdarrayTypedItem(PyArrayObject* arr) {
using traits = internal::npy_traits<NUMPY_TYPE>;
using T = typename traits::value_type;
using ValueBuilderType = typename TypeTraits<Type>::BuilderType;

const bool null_sentinels_possible =
// Always treat Numpy's NaT as null
NUMPY_TYPE == NPY_DATETIME ||
// Observing pandas's null sentinels
(from_pandas_ && traits::supports_nulls);

auto child_builder = checked_cast<ValueBuilderType*>(value_converter_->builder());

// TODO(wesm): Vector append when not strided
Ndarray1DIndexer<T> values(arr);
if (null_sentinels_possible) {
for (int64_t i = 0; i < values.size(); ++i) {
if (traits::isnull(values[i])) {
RETURN_NOT_OK(child_builder->AppendNull());
} else {
RETURN_NOT_OK(child_builder->Append(values[i]));
}
}
} else {
for (int64_t i = 0; i < values.size(); ++i) {
RETURN_NOT_OK(child_builder->Append(values[i]));
}
}
return Status::OK();
}

// If the value type does not match the expected NumPy dtype, then fall through
// to a slower PySequence-based path
#define LIST_FAST_CASE(TYPE, NUMPY_TYPE, ArrowType) \
case Type::TYPE: { \
if (PyArray_DESCR(arr)->type_num != NUMPY_TYPE) { \
return value_converter_->AppendMultiple(obj, value_length); \
} \
return AppendNdarrayTypedItem<NUMPY_TYPE, ArrowType>(arr); \
}

// Use internal::VisitSequence, fast for NPY_OBJECT but slower otherwise
#define LIST_SLOW_CASE(TYPE) \
case Type::TYPE: { \
return value_converter_->AppendMultiple(obj, value_length); \
}

template <typename TypeClass>
Status ListConverter<TypeClass>::AppendNdarrayItem(PyObject* obj) {
PyArrayObject* arr = reinterpret_cast<PyArrayObject*>(obj);

if (PyArray_NDIM(arr) != 1) {
return Status::Invalid("Can only convert 1-dimensional array values");
}

const int64_t value_length = PyArray_SIZE(arr);

switch (value_type_->id()) {
LIST_SLOW_CASE(NA)
LIST_FAST_CASE(UINT8, NPY_UINT8, UInt8Type)
LIST_FAST_CASE(INT8, NPY_INT8, Int8Type)
LIST_FAST_CASE(UINT16, NPY_UINT16, UInt16Type)
LIST_FAST_CASE(INT16, NPY_INT16, Int16Type)
LIST_FAST_CASE(UINT32, NPY_UINT32, UInt32Type)
LIST_FAST_CASE(INT32, NPY_INT32, Int32Type)
LIST_FAST_CASE(UINT64, NPY_UINT64, UInt64Type)
LIST_FAST_CASE(INT64, NPY_INT64, Int64Type)
LIST_SLOW_CASE(DATE32)
LIST_SLOW_CASE(DATE64)
LIST_SLOW_CASE(TIME32)
LIST_SLOW_CASE(TIME64)
LIST_FAST_CASE(TIMESTAMP, NPY_DATETIME, TimestampType)
LIST_FAST_CASE(HALF_FLOAT, NPY_FLOAT16, HalfFloatType)
LIST_FAST_CASE(FLOAT, NPY_FLOAT, FloatType)
LIST_FAST_CASE(DOUBLE, NPY_DOUBLE, DoubleType)
LIST_SLOW_CASE(BINARY)
LIST_SLOW_CASE(FIXED_SIZE_BINARY)
LIST_SLOW_CASE(STRING)
case Type::LIST: {
if (PyArray_DESCR(arr)->type_num != NPY_OBJECT) {
return Status::Invalid(
"Can only convert list types from NumPy object "
"array input");
}
return internal::VisitSequence(obj, [this](PyObject* item, bool*) {
return value_converter_->AppendSingleVirtual(item);
});
}
default: {
return Status::TypeError("Unknown list item type: ", value_type_->ToString());
}
}
}

// ----------------------------------------------------------------------
// Convert structs

class StructConverter : public TypedConverter<StructType, StructConverter> {
template <NullCoding null_coding>
class StructConverter
: public TypedConverter<StructType, StructConverter<null_coding>, null_coding> {
public:
explicit StructConverter(bool from_pandas, bool strict_conversions)
: from_pandas_(from_pandas), strict_conversions_(strict_conversions) {}

Status Init(ArrayBuilder* builder) {
builder_ = builder;
typed_builder_ = checked_cast<StructBuilder*>(builder);
this->builder_ = builder;
this->typed_builder_ = checked_cast<StructBuilder*>(builder);
const auto& struct_type = checked_cast<const StructType&>(*builder->type());

num_fields_ = typed_builder_->num_fields();
num_fields_ = this->typed_builder_->num_fields();
DCHECK_EQ(num_fields_, struct_type.num_children());

field_name_list_.reset(PyList_New(num_fields_));
Expand All @@ -781,7 +781,7 @@ class StructConverter : public TypedConverter<StructType, StructConverter> {
std::unique_ptr<SeqConverter> value_converter;
RETURN_NOT_OK(
GetConverter(field_type, from_pandas_, strict_conversions_, &value_converter));
RETURN_NOT_OK(value_converter->Init(typed_builder_->field_builder(i)));
RETURN_NOT_OK(value_converter->Init(this->typed_builder_->field_builder(i)));
value_converters_.push_back(std::move(value_converter));

// Store the field name as a PyObject, for dict matching
Expand All @@ -795,7 +795,7 @@ class StructConverter : public TypedConverter<StructType, StructConverter> {
}

Status AppendItem(PyObject* obj) {
RETURN_NOT_OK(typed_builder_->Append());
RETURN_NOT_OK(this->typed_builder_->Append());
// Note heterogenous sequences are not allowed
if (ARROW_PREDICT_FALSE(source_kind_ == UNKNOWN)) {
if (PyDict_Check(obj)) {
Expand All @@ -809,13 +809,15 @@ class StructConverter : public TypedConverter<StructType, StructConverter> {
} else if (PyTuple_Check(obj) && source_kind_ == TUPLES) {
return AppendTupleItem(obj);
} else {
return Status::TypeError("Expected sequence of dicts or tuples for struct type");
return internal::InvalidType(obj,
"was not a dict, tuple, or recognized null value"
" for conversion to struct type");
}
}

// Append a missing item
Status AppendNull() {
RETURN_NOT_OK(typed_builder_->AppendNull());
RETURN_NOT_OK(this->typed_builder_->AppendNull());
// Need to also insert a missing item on all child builders
// (compare with ListConverter)
for (int i = 0; i < num_fields_; i++) {
Expand Down Expand Up @@ -959,16 +961,36 @@ Status GetConverter(const std::shared_ptr<DataType>& type, bool from_pandas,
bool strict_conversions, std::unique_ptr<SeqConverter>* out) {
switch (type->id()) {
case Type::LIST:
*out = std::unique_ptr<SeqConverter>(
new ListConverter<ListType>(from_pandas, strict_conversions));
if (from_pandas) {
*out = std::unique_ptr<SeqConverter>(
new ListConverter<ListType, NullCoding::PANDAS_SENTINELS>(
from_pandas, strict_conversions));
} else {
*out = std::unique_ptr<SeqConverter>(
new ListConverter<ListType, NullCoding::NONE_ONLY>(from_pandas,
strict_conversions));
}
return Status::OK();
case Type::LARGE_LIST:
*out = std::unique_ptr<SeqConverter>(
new ListConverter<LargeListType>(from_pandas, strict_conversions));
if (from_pandas) {
*out = std::unique_ptr<SeqConverter>(
new ListConverter<LargeListType, NullCoding::PANDAS_SENTINELS>(
from_pandas, strict_conversions));
} else {
*out = std::unique_ptr<SeqConverter>(
new ListConverter<LargeListType, NullCoding::NONE_ONLY>(from_pandas,
strict_conversions));
}
return Status::OK();
case Type::STRUCT:
*out = std::unique_ptr<SeqConverter>(
new StructConverter(from_pandas, strict_conversions));
if (from_pandas) {
*out = std::unique_ptr<SeqConverter>(
new StructConverter<NullCoding::PANDAS_SENTINELS>(from_pandas,
strict_conversions));
} else {
*out = std::unique_ptr<SeqConverter>(
new StructConverter<NullCoding::NONE_ONLY>(from_pandas, strict_conversions));
}
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Relevant changes

return Status::OK();
default:
break;
Expand Down
4 changes: 2 additions & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -595,7 +595,7 @@ services:
# docker-compose build python
# docker-compose build hdfs-integration
# docker-compose run hdfs-integration
image: arrow:hdfs-${HDFS_VERSION:-2.6.5}
image: arrow:hdfs-${HDFS_VERSION:-2.9.2}
links:
- hdfs-namenode:hdfs-namenode
- hdfs-datanode-1:hdfs-datanode-1
Expand All @@ -608,7 +608,7 @@ services:
context: .
dockerfile: integration/hdfs/Dockerfile
args:
HDFS_VERSION: ${HDFS_VERSION:-2.6.5}
HDFS_VERSION: ${HDFS_VERSION:-2.9.2}
volumes: *ubuntu-volumes

# TODO(kszucs): pass dask version explicitly as a build argument
Expand Down
Loading