diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index 2b6586fc23a..beabcebf9c3 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -1951,11 +1951,12 @@ Status ConvertTableToPandas(const PandasOptions& options,
   FunctionContext ctx;
   for (int i = 0; i < table->num_columns(); i++) {
     std::shared_ptr<ChunkedArray> col = table->column(i);
+    if (col->type()->id() == Type::DICTIONARY) {
+      // No need to dictionary encode again. Came up in ARROW-6434,
+      // ARROW-6435
+      continue;
+    }
     if (categorical_columns.count(table->field(i)->name())) {
-      if (table->field(i)->type()->id() == Type::DICTIONARY) {
-        // this column is already dictionary encoded
-        continue;
-      }
       Datum out;
       RETURN_NOT_OK(DictionaryEncode(&ctx, Datum(col), &out));
       std::shared_ptr<ChunkedArray> array = out.chunked_array();
diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc
index 7bbac819709..bb3a2fe3fee 100644
--- a/cpp/src/arrow/python/helpers.cc
+++ b/cpp/src/arrow/python/helpers.cc
@@ -301,6 +301,13 @@ Status InvalidValue(PyObject* obj, const std::string& why) {
                          Py_TYPE(obj)->tp_name, ": ", why);
 }
 
+Status InvalidType(PyObject* obj, const std::string& why) {
+  std::string obj_as_str;
+  RETURN_NOT_OK(internal::PyObject_StdStringStr(obj, &obj_as_str));
+  return Status::TypeError("Could not convert ", obj_as_str, " with type ",
+                           Py_TYPE(obj)->tp_name, ": ", why);
+}
+
 Status UnboxIntegerAsInt64(PyObject* obj, int64_t* out) {
   if (PyLong_Check(obj)) {
     int overflow = 0;
diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/helpers.h
index 8661ee5dda7..3f4f1d21f19 100644
--- a/cpp/src/arrow/python/helpers.h
+++ b/cpp/src/arrow/python/helpers.h
@@ -130,6 +130,9 @@ inline Status CastSize(Py_ssize_t size, int64_t* out, const char* error_msg = NU
 ARROW_PYTHON_EXPORT
 Status InvalidValue(PyObject* obj, const std::string& why);
 
+ARROW_PYTHON_EXPORT
+Status InvalidType(PyObject* obj, const std::string& why);
+
 ARROW_PYTHON_EXPORT
 Status IntegerScalarToDoubleSafe(PyObject* obj, double* result);
 ARROW_PYTHON_EXPORT
diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index 3a358969bdf..96fdf20727e 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -607,8 +607,26 @@ class StringConverter
 
 // ----------------------------------------------------------------------
 // Convert lists (NumPy arrays containing lists or ndarrays as values)
 
-template <typename TypeClass>
-class ListConverter : public TypedConverter<TypeClass, ListConverter<TypeClass>> {
+// If the value type does not match the expected NumPy dtype, then fall through
+// to a slower PySequence-based path
+#define LIST_FAST_CASE(TYPE, NUMPY_TYPE, ArrowType)               \
+  case Type::TYPE: {                                              \
+    if (PyArray_DESCR(arr)->type_num != NUMPY_TYPE) {             \
+      return value_converter_->AppendMultiple(obj, value_length); \
+    }                                                             \
+    return AppendNdarrayTypedItem<NUMPY_TYPE, ArrowType>(arr);    \
+  }
+
+// Use internal::VisitSequence, fast for NPY_OBJECT but slower otherwise
+#define LIST_SLOW_CASE(TYPE)                                      \
+  case Type::TYPE: {                                              \
+    return value_converter_->AppendMultiple(obj, value_length);   \
+  }
+
+template <typename TypeClass, NullCoding null_coding>
+class ListConverter
+    : public TypedConverter<TypeClass, ListConverter<TypeClass, null_coding>,
+                            null_coding> {
  public:
   using BuilderType = typename TypeTraits<TypeClass>::BuilderType;
@@ -626,18 +644,94 @@ class ListConverter : public TypedConverter<TypeClass, ListConverter<TypeClass>>
   }
 
   template <int NUMPY_TYPE, typename Type>
-  Status AppendNdarrayTypedItem(PyArrayObject* arr);
-  Status AppendNdarrayItem(PyObject* arr);
+  Status AppendNdarrayTypedItem(PyArrayObject* arr) {
+    using traits = internal::npy_traits<NUMPY_TYPE>;
+    using T = typename traits::value_type;
+    using ValueBuilderType = typename TypeTraits<Type>::BuilderType;
+
+    const bool null_sentinels_possible =
+        // Always treat Numpy's NaT as null
+        NUMPY_TYPE == NPY_DATETIME ||
+        // Observing pandas's null sentinels
+        (from_pandas_ && traits::supports_nulls);
+
+    auto child_builder = checked_cast<ValueBuilderType*>(value_converter_->builder());
+
+    // TODO(wesm): Vector append when not strided
+    Ndarray1DIndexer<T> values(arr);
+    if (null_sentinels_possible) {
+      for (int64_t i = 0; i < values.size(); ++i) {
+        if (traits::isnull(values[i])) {
+          RETURN_NOT_OK(child_builder->AppendNull());
+        } else {
+          RETURN_NOT_OK(child_builder->Append(values[i]));
+        }
+      }
+    } else {
+      for (int64_t i = 0; i < values.size(); ++i) {
+        RETURN_NOT_OK(child_builder->Append(values[i]));
+      }
+    }
+    return Status::OK();
+  }
+
+  Status AppendNdarrayItem(PyObject* obj) {
+    PyArrayObject* arr = reinterpret_cast<PyArrayObject*>(obj);
+
+    if (PyArray_NDIM(arr) != 1) {
+      return Status::Invalid("Can only convert 1-dimensional array values");
+    }
+
+    const int64_t value_length = PyArray_SIZE(arr);
+
+    switch (value_type_->id()) {
+      LIST_SLOW_CASE(NA)
+      LIST_FAST_CASE(UINT8, NPY_UINT8, UInt8Type)
+      LIST_FAST_CASE(INT8, NPY_INT8, Int8Type)
+      LIST_FAST_CASE(UINT16, NPY_UINT16, UInt16Type)
+      LIST_FAST_CASE(INT16, NPY_INT16, Int16Type)
+      LIST_FAST_CASE(UINT32, NPY_UINT32, UInt32Type)
+      LIST_FAST_CASE(INT32, NPY_INT32, Int32Type)
+      LIST_FAST_CASE(UINT64, NPY_UINT64, UInt64Type)
+      LIST_FAST_CASE(INT64, NPY_INT64, Int64Type)
+      LIST_SLOW_CASE(DATE32)
+      LIST_SLOW_CASE(DATE64)
+      LIST_SLOW_CASE(TIME32)
+      LIST_SLOW_CASE(TIME64)
+      LIST_FAST_CASE(TIMESTAMP, NPY_DATETIME, TimestampType)
+      LIST_FAST_CASE(HALF_FLOAT, NPY_FLOAT16, HalfFloatType)
+      LIST_FAST_CASE(FLOAT, NPY_FLOAT, FloatType)
+      LIST_FAST_CASE(DOUBLE, NPY_DOUBLE, DoubleType)
+      LIST_SLOW_CASE(BINARY)
+      LIST_SLOW_CASE(FIXED_SIZE_BINARY)
+      LIST_SLOW_CASE(STRING)
+      case Type::LIST: {
+        if (PyArray_DESCR(arr)->type_num != NPY_OBJECT) {
+          return Status::Invalid(
+              "Can only convert list types from NumPy object "
+              "array input");
+        }
+        return internal::VisitSequence(obj, [this](PyObject* item, bool*) {
+          return value_converter_->AppendSingleVirtual(item);
+        });
+      }
+      default: {
+        return Status::TypeError("Unknown list item type: ", value_type_->ToString());
+      }
+    }
+  }
 
   Status AppendItem(PyObject* obj) {
     RETURN_NOT_OK(this->typed_builder_->Append());
     if (PyArray_Check(obj)) {
       return AppendNdarrayItem(obj);
     }
-    const auto list_size = static_cast<int64_t>(PySequence_Size(obj));
-    if (ARROW_PREDICT_FALSE(list_size == -1)) {
-      RETURN_IF_PYERROR();
+    if (!PySequence_Check(obj)) {
+      return internal::InvalidType(obj,
+                                   "was not a sequence or recognized null"
+                                   " for conversion to list type");
     }
+    int64_t list_size = static_cast<int64_t>(PySequence_Size(obj));
     return value_converter_->AppendMultiple(obj, list_size);
   }
 
@@ -658,116 +752,22 @@ class ListConverter : public TypedConverter<TypeClass, ListConverter<TypeClass>>
   bool strict_conversions_;
 };
 
-template <typename TypeClass>
-template <int NUMPY_TYPE, typename Type>
-Status ListConverter<TypeClass>::AppendNdarrayTypedItem(PyArrayObject* arr) {
-  using traits = internal::npy_traits<NUMPY_TYPE>;
-  using T = typename traits::value_type;
-  using ValueBuilderType = typename TypeTraits<Type>::BuilderType;
-
-  const bool null_sentinels_possible =
-      // Always treat Numpy's NaT as null
-      NUMPY_TYPE == NPY_DATETIME ||
-      // Observing pandas's null sentinels
-      (from_pandas_ && traits::supports_nulls);
-
-  auto child_builder = checked_cast<ValueBuilderType*>(value_converter_->builder());
-
-  // TODO(wesm): Vector append when not strided
-  Ndarray1DIndexer<T> values(arr);
-  if (null_sentinels_possible) {
-    for (int64_t i = 0; i < values.size(); ++i) {
-      if (traits::isnull(values[i])) {
-        RETURN_NOT_OK(child_builder->AppendNull());
-      } else {
-        RETURN_NOT_OK(child_builder->Append(values[i]));
-      }
-    }
-  } else {
-    for (int64_t i = 0; i < values.size(); ++i) {
-      RETURN_NOT_OK(child_builder->Append(values[i]));
-    }
-  }
-  return Status::OK();
-}
-
-// If the value type does not match the expected NumPy dtype, then fall through
-// to a slower PySequence-based path
-#define LIST_FAST_CASE(TYPE, NUMPY_TYPE, ArrowType)               \
-  case Type::TYPE: {                                              \
-    if (PyArray_DESCR(arr)->type_num != NUMPY_TYPE) {             \
-      return value_converter_->AppendMultiple(obj, value_length); \
-    }                                                             \
-    return AppendNdarrayTypedItem<NUMPY_TYPE, ArrowType>(arr);    \
-  }
-
-// Use internal::VisitSequence, fast for NPY_OBJECT but slower otherwise
-#define LIST_SLOW_CASE(TYPE)                                      \
-  case Type::TYPE: {                                              \
-    return value_converter_->AppendMultiple(obj, value_length);   \
-  }
-
-template <typename TypeClass>
-Status ListConverter<TypeClass>::AppendNdarrayItem(PyObject* obj) {
-  PyArrayObject* arr = reinterpret_cast<PyArrayObject*>(obj);
-
-  if (PyArray_NDIM(arr) != 1) {
-    return Status::Invalid("Can only convert 1-dimensional array values");
-  }
-
-  const int64_t value_length = PyArray_SIZE(arr);
-
-  switch (value_type_->id()) {
-    LIST_SLOW_CASE(NA)
-    LIST_FAST_CASE(UINT8, NPY_UINT8, UInt8Type)
-    LIST_FAST_CASE(INT8, NPY_INT8, Int8Type)
-    LIST_FAST_CASE(UINT16, NPY_UINT16, UInt16Type)
-    LIST_FAST_CASE(INT16, NPY_INT16, Int16Type)
-    LIST_FAST_CASE(UINT32, NPY_UINT32, UInt32Type)
-    LIST_FAST_CASE(INT32, NPY_INT32, Int32Type)
-    LIST_FAST_CASE(UINT64, NPY_UINT64, UInt64Type)
-    LIST_FAST_CASE(INT64, NPY_INT64, Int64Type)
-    LIST_SLOW_CASE(DATE32)
-    LIST_SLOW_CASE(DATE64)
-    LIST_SLOW_CASE(TIME32)
-    LIST_SLOW_CASE(TIME64)
-    LIST_FAST_CASE(TIMESTAMP, NPY_DATETIME, TimestampType)
-    LIST_FAST_CASE(HALF_FLOAT, NPY_FLOAT16, HalfFloatType)
-    LIST_FAST_CASE(FLOAT, NPY_FLOAT, FloatType)
-    LIST_FAST_CASE(DOUBLE, NPY_DOUBLE, DoubleType)
-    LIST_SLOW_CASE(BINARY)
-    LIST_SLOW_CASE(FIXED_SIZE_BINARY)
-    LIST_SLOW_CASE(STRING)
-    case Type::LIST: {
-      if (PyArray_DESCR(arr)->type_num != NPY_OBJECT) {
-        return Status::Invalid(
-            "Can only convert list types from NumPy object "
-            "array input");
-      }
-      return internal::VisitSequence(obj, [this](PyObject* item, bool*) {
-        return value_converter_->AppendSingleVirtual(item);
-      });
-    }
-    default: {
-      return Status::TypeError("Unknown list item type: ", value_type_->ToString());
-    }
-  }
-}
-
 // ----------------------------------------------------------------------
 // Convert structs
 
-class StructConverter : public TypedConverter<StructType, StructConverter> {
+template <NullCoding null_coding>
+class StructConverter
+    : public TypedConverter<StructType, StructConverter<null_coding>, null_coding> {
  public:
   explicit StructConverter(bool from_pandas, bool strict_conversions)
       : from_pandas_(from_pandas), strict_conversions_(strict_conversions) {}
 
   Status Init(ArrayBuilder* builder) {
-    builder_ = builder;
-    typed_builder_ = checked_cast<StructBuilder*>(builder);
+    this->builder_ = builder;
+    this->typed_builder_ = checked_cast<StructBuilder*>(builder);
     const auto& struct_type = checked_cast<const StructType&>(*builder->type());
 
-    num_fields_ = typed_builder_->num_fields();
+    num_fields_ = this->typed_builder_->num_fields();
     DCHECK_EQ(num_fields_, struct_type.num_children());
 
     field_name_list_.reset(PyList_New(num_fields_));
@@ -781,7 +781,7 @@ class StructConverter : public TypedConverter<StructType, StructConverter> {
     std::unique_ptr<SeqConverter> value_converter;
     RETURN_NOT_OK(
         GetConverter(field_type, from_pandas_, strict_conversions_, &value_converter));
-    RETURN_NOT_OK(value_converter->Init(typed_builder_->field_builder(i)));
+    RETURN_NOT_OK(value_converter->Init(this->typed_builder_->field_builder(i)));
     value_converters_.push_back(std::move(value_converter));
 
     // Store the field name as a PyObject, for dict matching
@@ -795,7 +795,7 @@ class StructConverter : public TypedConverter<StructType, StructConverter> {
   }
 
   Status AppendItem(PyObject* obj) {
-    RETURN_NOT_OK(typed_builder_->Append());
+    RETURN_NOT_OK(this->typed_builder_->Append());
     // Note heterogenous sequences are not allowed
     if (ARROW_PREDICT_FALSE(source_kind_ == UNKNOWN)) {
       if (PyDict_Check(obj)) {
@@ -809,13 +809,15 @@ class StructConverter : public TypedConverter<StructType, StructConverter> {
     } else if (PyTuple_Check(obj) && source_kind_ == TUPLES) {
       return AppendTupleItem(obj);
     } else {
-      return Status::TypeError("Expected sequence of dicts or tuples for struct type");
+      return internal::InvalidType(obj,
+                                   "was not a dict, tuple, or recognized null value"
+                                   " for conversion to struct type");
     }
   }
 
   // Append a missing item
   Status AppendNull() {
-    RETURN_NOT_OK(typed_builder_->AppendNull());
+    RETURN_NOT_OK(this->typed_builder_->AppendNull());
     // Need to also insert a missing item on all child builders
     // (compare with ListConverter)
     for (int i = 0; i < num_fields_; i++) {
@@ -959,16 +961,36 @@ Status GetConverter(const std::shared_ptr<DataType>& type, bool from_pandas,
                     bool strict_conversions, std::unique_ptr<SeqConverter>* out) {
   switch (type->id()) {
     case Type::LIST:
-      *out = std::unique_ptr<SeqConverter>(
-          new ListConverter<ListType>(from_pandas, strict_conversions));
+      if (from_pandas) {
+        *out = std::unique_ptr<SeqConverter>(
+            new ListConverter<ListType, NullCoding::PANDAS_SENTINELS>(
+                from_pandas, strict_conversions));
+      } else {
+        *out = std::unique_ptr<SeqConverter>(
+            new ListConverter<ListType, NullCoding::NONE_ONLY>(from_pandas,
+                                                               strict_conversions));
+      }
       return Status::OK();
     case Type::LARGE_LIST:
-      *out = std::unique_ptr<SeqConverter>(
-          new ListConverter<LargeListType>(from_pandas, strict_conversions));
+      if (from_pandas) {
+        *out = std::unique_ptr<SeqConverter>(
+            new ListConverter<LargeListType, NullCoding::PANDAS_SENTINELS>(
+                from_pandas, strict_conversions));
+      } else {
+        *out = std::unique_ptr<SeqConverter>(
+            new ListConverter<LargeListType, NullCoding::NONE_ONLY>(from_pandas,
+                                                                    strict_conversions));
+      }
      return Status::OK();
    case Type::STRUCT:
-      *out = std::unique_ptr<SeqConverter>(
-          new StructConverter(from_pandas, strict_conversions));
+      if (from_pandas) {
+        *out = std::unique_ptr<SeqConverter>(
+            new StructConverter<NullCoding::PANDAS_SENTINELS>(from_pandas,
+                                                              strict_conversions));
+      } else {
+        *out = std::unique_ptr<SeqConverter>(
+            new StructConverter<NullCoding::NONE_ONLY>(from_pandas, strict_conversions));
+      }
       return Status::OK();
     default:
       break;
diff --git a/docker-compose.yml b/docker-compose.yml
index 69986fe12b1..4f3f4128af5 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -595,7 +595,7 @@ services:
     # docker-compose build python
     # docker-compose build hdfs-integration
    # docker-compose run hdfs-integration
-    image: arrow:hdfs-${HDFS_VERSION:-2.6.5}
+    image: arrow:hdfs-${HDFS_VERSION:-2.9.2}
     links:
       - hdfs-namenode:hdfs-namenode
       - hdfs-datanode-1:hdfs-datanode-1
@@ -608,7 +608,7 @@ services:
       context: .
       dockerfile: integration/hdfs/Dockerfile
       args:
-        HDFS_VERSION: ${HDFS_VERSION:-2.6.5}
+        HDFS_VERSION: ${HDFS_VERSION:-2.9.2}
     volumes: *ubuntu-volumes
 
 # TODO(kszucs): pass dask version explicitly as a build argument
diff --git a/integration/hdfs/Dockerfile b/integration/hdfs/Dockerfile
index 8744b795f9c..f173b39e0ba 100644
--- a/integration/hdfs/Dockerfile
+++ b/integration/hdfs/Dockerfile
@@ -18,19 +18,20 @@
 FROM arrow:python-3.6
 
 # installing libhdfs (JNI)
-ARG HADOOP_VERSION=2.6.5
+ARG HADOOP_VERSION=2.9.2
 ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \
     HADOOP_HOME=/usr/local/hadoop \
     HADOOP_OPTS=-Djava.library.path=/usr/local/hadoop/lib/native \
     PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
 RUN apt-get update -y && \
     apt-get install -y --no-install-recommends openjdk-8-jdk && \
-    wget -q -O hadoop-$HADOOP_VERSION.tar.gz "https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename=hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz" && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN wget -q -O hadoop-$HADOOP_VERSION.tar.gz "https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename=hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz" && \
     tar -zxf /hadoop-$HADOOP_VERSION.tar.gz && \
     rm /hadoop-$HADOOP_VERSION.tar.gz && \
-    mv /hadoop-$HADOOP_VERSION /usr/local/hadoop \
-    && apt-get clean \
-    && rm -rf /var/lib/apt/lists/*
+    mv /hadoop-$HADOOP_VERSION /usr/local/hadoop
 
 COPY integration/hdfs/hdfs-site.xml $HADOOP_HOME/etc/hadoop/
 # installing libhdfs3, it needs to be pinned, see ARROW-1465 and ARROW-1445
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 011615ac9ab..decbf888309 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -1490,6 +1490,13 @@ def test_to_pandas_categorical_zero_length(self):
         # This would segfault under 0.11.0
         table.to_pandas(categories=['col'])
 
+    def test_to_pandas_categories_already_dictionary(self):
+        # Showed up in ARROW-6434, ARROW-6435
+        array = pa.array(['foo', 'foo', 'foo', 'bar']).dictionary_encode()
+        table = pa.Table.from_arrays(arrays=[array], names=['col'])
+        result = table.to_pandas(categories=['col'])
+        assert table.to_pandas().equals(result)
+
     def test_table_str_to_categorical_without_na(self):
         values = ['a', 'a', 'b', 'b', 'c']
         df = pd.DataFrame({'strings': values})
@@ -1708,6 +1715,20 @@ def test_column_of_decimal_list(self):
                                      [decimal.Decimal('3.3')]]})
         tm.assert_frame_equal(df, expected_df)
 
+    def test_nested_types_from_ndarray_null_entries(self):
+        # Root cause of ARROW-6435
+        s = pd.Series(np.array([np.nan, np.nan], dtype=object))
+
+        for ty in [pa.list_(pa.int64()),
+                   pa.large_list(pa.int64()),
+                   pa.struct([pa.field('f0', 'int32')])]:
+            result = pa.array(s, type=ty)
+            expected = pa.array([None, None], type=ty)
+            assert result.equals(expected)
+
+        with pytest.raises(TypeError):
+            pa.array(s.values, type=ty)
+
     def test_column_of_lists(self):
         df, schema = dataframe_with_lists()
         _check_pandas_roundtrip(df, schema=schema, expected_schema=schema)
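
For reference, a minimal sketch (not part of the patch) of the behavior the new tests pin down, assuming a pyarrow build with this change applied:

    import numpy as np
    import pandas as pd
    import pyarrow as pa

    # A column that is already dictionary-encoded is passed through by
    # to_pandas(categories=...) instead of being re-encoded (ARROW-6434/6435).
    array = pa.array(['foo', 'foo', 'bar']).dictionary_encode()
    table = pa.Table.from_arrays([array], names=['col'])
    assert table.to_pandas(categories=['col']).equals(table.to_pandas())

    # All-null object ndarrays now convert to null list/struct entries under
    # pandas null semantics (Series input); per the new test, a raw ndarray
    # without pandas sentinels (s.values) raises TypeError instead of crashing.
    s = pd.Series(np.array([np.nan, np.nan], dtype=object))
    assert pa.array(s, type=pa.list_(pa.int64())).null_count == 2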