diff --git a/c_glib/arrow-glib/Makefile.am b/c_glib/arrow-glib/Makefile.am
index a2965955714..998103c0550 100644
--- a/c_glib/arrow-glib/Makefile.am
+++ b/c_glib/arrow-glib/Makefile.am
@@ -55,7 +55,6 @@ libarrow_glib_la_headers = \
 	buffer.h \
 	chunked-array.h \
 	codec.h \
-	column.h \
 	composite-array.h \
 	composite-data-type.h \
 	data-type.h \
@@ -107,7 +106,6 @@ libarrow_glib_la_sources = \
 	buffer.cpp \
 	chunked-array.cpp \
 	codec.cpp \
-	column.cpp \
 	composite-array.cpp \
 	composite-data-type.cpp \
 	decimal128.cpp \
@@ -153,7 +151,6 @@ libarrow_glib_la_cpp_headers = \
 	buffer.hpp \
 	chunked-array.hpp \
 	codec.hpp \
-	column.hpp \
 	data-type.hpp \
 	decimal128.hpp \
 	error.hpp \
@@ -187,9 +184,13 @@ libarrow_glib_la_cpp_headers += \
 	orc-file-reader.hpp
 endif
 
+libarrow_glib_la_cpp_internal_headers = \
+	internal-index.hpp
+
 libarrow_glib_la_SOURCES = \
 	$(libarrow_glib_la_sources) \
-	$(libarrow_glib_la_cpp_headers)
+	$(libarrow_glib_la_cpp_headers) \
+	$(libarrow_glib_la_cpp_internal_headers)
 
 BUILT_SOURCES = \
 	$(libarrow_glib_la_genearted_headers) \
diff --git a/c_glib/arrow-glib/arrow-glib.h b/c_glib/arrow-glib/arrow-glib.h
index 2a4de13f2da..3f44c665edf 100644
--- a/c_glib/arrow-glib/arrow-glib.h
+++ b/c_glib/arrow-glib/arrow-glib.h
@@ -26,7 +26,6 @@
 #include <arrow-glib/buffer.h>
 #include <arrow-glib/chunked-array.h>
 #include <arrow-glib/codec.h>
-#include <arrow-glib/column.h>
 #include <arrow-glib/composite-array.h>
 #include <arrow-glib/composite-data-type.h>
 #include <arrow-glib/data-type.h>
diff --git a/c_glib/arrow-glib/arrow-glib.hpp b/c_glib/arrow-glib/arrow-glib.hpp
index ac8563cb543..d755b2ba6b1 100644
--- a/c_glib/arrow-glib/arrow-glib.hpp
+++ b/c_glib/arrow-glib/arrow-glib.hpp
@@ -26,7 +26,6 @@
 #include <arrow-glib/buffer.hpp>
 #include <arrow-glib/chunked-array.hpp>
 #include <arrow-glib/codec.hpp>
-#include <arrow-glib/column.hpp>
 #include <arrow-glib/data-type.hpp>
 #include <arrow-glib/decimal128.hpp>
 #include <arrow-glib/error.hpp>
diff --git a/c_glib/arrow-glib/chunked-array.cpp b/c_glib/arrow-glib/chunked-array.cpp
index 6d9598bc106..20437c2c6e7 100644
--- a/c_glib/arrow-glib/chunked-array.cpp
+++ b/c_glib/arrow-glib/chunked-array.cpp
@@ -206,9 +206,25 @@ garrow_chunked_array_get_value_type(GArrowChunkedArray *chunked_array)
  * @chunked_array: A #GArrowChunkedArray.
  *
  * Returns: The total number of rows in the chunked array.
+ *
+ * Deprecated: 1.0.0: Use garrow_chunked_array_get_n_rows() instead.
  */
 guint64
 garrow_chunked_array_get_length(GArrowChunkedArray *chunked_array)
+{
+  return garrow_chunked_array_get_n_rows(chunked_array);
+}
+
+/**
+ * garrow_chunked_array_get_n_rows:
+ * @chunked_array: A #GArrowChunkedArray.
+ *
+ * Returns: The total number of rows in the chunked array.
+ *
+ * Since: 1.0.0
+ */
+guint64
+garrow_chunked_array_get_n_rows(GArrowChunkedArray *chunked_array)
 {
   const auto arrow_chunked_array = garrow_chunked_array_get_raw(chunked_array);
   return arrow_chunked_array->length();
diff --git a/c_glib/arrow-glib/chunked-array.h b/c_glib/arrow-glib/chunked-array.h
index 882f8f2d3f5..8c67eead2cd 100644
--- a/c_glib/arrow-glib/chunked-array.h
+++ b/c_glib/arrow-glib/chunked-array.h
@@ -44,7 +44,10 @@ garrow_chunked_array_get_value_data_type(GArrowChunkedArray *chunked_array);
 GArrowType
 garrow_chunked_array_get_value_type(GArrowChunkedArray *chunked_array);
 
+GARROW_DEPRECATED_IN_1_0_FOR(garrow_chunked_array_get_n_rows)
 guint64 garrow_chunked_array_get_length  (GArrowChunkedArray *chunked_array);
+GARROW_AVAILABLE_IN_1_0
+guint64 garrow_chunked_array_get_n_rows  (GArrowChunkedArray *chunked_array);
 guint64 garrow_chunked_array_get_n_nulls (GArrowChunkedArray *chunked_array);
 guint   garrow_chunked_array_get_n_chunks(GArrowChunkedArray *chunked_array);
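The old name keeps working through the forwarding stub above, so callers can migrate at their own pace. A minimal sketch of the new call, assuming `chunked_array` is a GArrowChunkedArray built elsewhere (only the garrow_chunked_array_* calls come from this patch):

    /* garrow_chunked_array_get_n_rows() replaces
     * garrow_chunked_array_get_length(); the deprecated name now just
     * forwards to it. */
    #include <arrow-glib/arrow-glib.h>

    static void
    print_row_count(GArrowChunkedArray *chunked_array)
    {
      guint64 n_rows = garrow_chunked_array_get_n_rows(chunked_array);
      g_print("%" G_GUINT64_FORMAT " rows\n", n_rows);
    }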
diff --git a/c_glib/arrow-glib/column.cpp b/c_glib/arrow-glib/column.cpp
deleted file mode 100644
index 68694b3d679..00000000000
--- a/c_glib/arrow-glib/column.cpp
+++ /dev/null
@@ -1,415 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#ifdef HAVE_CONFIG_H
-#  include <config.h>
-#endif
-
-#include <arrow-glib/array.hpp>
-#include <arrow-glib/chunked-array.hpp>
-#include <arrow-glib/column.hpp>
-#include <arrow-glib/data-type.hpp>
-#include <arrow-glib/error.hpp>
-#include <arrow-glib/field.hpp>
-
-#include <sstream>
-
-G_BEGIN_DECLS
-
-/**
- * SECTION: column
- * @short_description: Column class
- *
- * #GArrowColumn is a class for column. Column has a #GArrowField and
- * zero or more values. Values are #GArrowChunkedArray.
- */
-
-typedef struct GArrowColumnPrivate_ {
-  std::shared_ptr<arrow::Column> column;
-  GArrowField *field;
-  GArrowArray *array;
-  GArrowChunkedArray *chunked_array;
-} GArrowColumnPrivate;
-
-enum {
-  PROP_0,
-  PROP_COLUMN,
-  PROP_FIELD,
-  PROP_ARRAY,
-  PROP_CHUNKED_ARRAY
-};
-
-G_DEFINE_TYPE_WITH_PRIVATE(GArrowColumn,
-                           garrow_column,
-                           G_TYPE_OBJECT)
-
-#define GARROW_COLUMN_GET_PRIVATE(object)       \
-  static_cast<GArrowColumnPrivate *>(           \
-    garrow_column_get_instance_private(         \
-      GARROW_COLUMN(object)))
-
-static void
-garrow_column_dispose(GObject *object)
-{
-  auto priv = GARROW_COLUMN_GET_PRIVATE(object);
-
-  if (priv->field) {
-    g_object_unref(priv->field);
-    priv->field = nullptr;
-  }
-
-  if (priv->array) {
-    g_object_unref(priv->array);
-    priv->array = nullptr;
-  }
-
-  if (priv->chunked_array) {
-    g_object_unref(priv->chunked_array);
-    priv->chunked_array = nullptr;
-  }
-
-  G_OBJECT_CLASS(garrow_column_parent_class)->dispose(object);
-}
-
-static void
-garrow_column_finalize(GObject *object)
-{
-  auto priv = GARROW_COLUMN_GET_PRIVATE(object);
-
-  priv->column = nullptr;
-
-  G_OBJECT_CLASS(garrow_column_parent_class)->finalize(object);
-}
-
-static void
-garrow_column_set_property(GObject *object,
-                           guint prop_id,
-                           const GValue *value,
-                           GParamSpec *pspec)
-{
-  auto priv = GARROW_COLUMN_GET_PRIVATE(object);
-
-  switch (prop_id) {
-  case PROP_COLUMN:
-    priv->column =
-      *static_cast<std::shared_ptr<arrow::Column> *>(g_value_get_pointer(value));
-    break;
-  case PROP_FIELD:
-    priv->field = static_cast<GArrowField *>(g_value_dup_object(value));
-    break;
-  case PROP_ARRAY:
-    priv->array = static_cast<GArrowArray *>(g_value_dup_object(value));
-    break;
-  case PROP_CHUNKED_ARRAY:
-    priv->chunked_array =
-      static_cast<GArrowChunkedArray *>(g_value_dup_object(value));
-    break;
-  default:
-    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
-    break;
-  }
-}
-
-static void
-garrow_column_get_property(GObject *object,
-                           guint prop_id,
-                           GValue *value,
-                           GParamSpec *pspec)
-{
-  auto priv = GARROW_COLUMN_GET_PRIVATE(object);
-
-  switch (prop_id) {
-  case PROP_FIELD:
-    g_value_set_object(value, priv->field);
-    break;
-  case PROP_ARRAY:
-    g_value_set_object(value, priv->array);
-    break;
-  case PROP_CHUNKED_ARRAY:
-    g_value_set_object(value, priv->chunked_array);
-    break;
-  default:
-    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
-    break;
-  }
-}
-
-static void
-garrow_column_init(GArrowColumn *object)
-{
-}
-
-static void
-garrow_column_class_init(GArrowColumnClass *klass)
-{
-  auto gobject_class = G_OBJECT_CLASS(klass);
-
-  gobject_class->dispose      = garrow_column_dispose;
-  gobject_class->finalize     = garrow_column_finalize;
-  gobject_class->set_property = garrow_column_set_property;
-  gobject_class->get_property = garrow_column_get_property;
-
-  GParamSpec *spec;
-  spec = g_param_spec_pointer("column",
-                              "Column",
-                              "The raw std::shared_ptr<arrow::Column> *",
-                              static_cast<GParamFlags>(G_PARAM_WRITABLE |
-                                                       G_PARAM_CONSTRUCT_ONLY));
-  g_object_class_install_property(gobject_class, PROP_COLUMN, spec);
-
-  spec = g_param_spec_object("field",
-                             "Field",
-                             "The field of the column",
-                             GARROW_TYPE_FIELD,
-                             static_cast<GParamFlags>(G_PARAM_READWRITE |
-                                                      G_PARAM_CONSTRUCT_ONLY));
-  g_object_class_install_property(gobject_class, PROP_FIELD, spec);
-
-  spec = g_param_spec_object("array",
-                             "Array",
-                             "The array of the column",
-                             GARROW_TYPE_ARRAY,
-                             static_cast<GParamFlags>(G_PARAM_READWRITE |
-                                                      G_PARAM_CONSTRUCT_ONLY));
-  g_object_class_install_property(gobject_class, PROP_ARRAY, spec);
-
-  spec = g_param_spec_object("chunked-array",
-                             "Chunked array",
-                             "The chunked array of the column",
-                             GARROW_TYPE_CHUNKED_ARRAY,
-                             static_cast<GParamFlags>(G_PARAM_READWRITE |
-                                                      G_PARAM_CONSTRUCT_ONLY));
-  g_object_class_install_property(gobject_class, PROP_CHUNKED_ARRAY, spec);
-}
-
-/**
- * garrow_column_new_array:
- * @field: The metadata of the column.
- * @array: The data of the column.
- *
- * Returns: A newly created #GArrowColumn.
- */
-GArrowColumn *
-garrow_column_new_array(GArrowField *field,
-                        GArrowArray *array)
-{
-  auto arrow_column =
-    std::make_shared<arrow::Column>(garrow_field_get_raw(field),
-                                    garrow_array_get_raw(array));
-  auto column = GARROW_COLUMN(g_object_new(GARROW_TYPE_COLUMN,
-                                           "column", &arrow_column,
-                                           "field", field,
-                                           "array", array,
-                                           NULL));
-  return column;
-}
-
-/**
- * garrow_column_new_chunked_array:
- * @field: The metadata of the column.
- * @chunked_array: The data of the column.
- *
- * Returns: A newly created #GArrowColumn.
- */
-GArrowColumn *
-garrow_column_new_chunked_array(GArrowField *field,
-                                GArrowChunkedArray *chunked_array)
-{
-  auto arrow_column =
-    std::make_shared<arrow::Column>(garrow_field_get_raw(field),
-                                    garrow_chunked_array_get_raw(chunked_array));
-  auto column = GARROW_COLUMN(g_object_new(GARROW_TYPE_COLUMN,
-                                           "column", &arrow_column,
-                                           "field", field,
-                                           "chunked-array", chunked_array,
-                                           NULL));
-  return column;
-}
-
-/**
- * garrow_column_slice:
- * @column: A #GArrowColumn.
- * @offset: The offset of sub #GArrowColumn.
- * @length: The length of sub #GArrowColumn.
- *
- * Returns: (transfer full): The sub #GArrowColumn. It covers only from
- *   `offset` to `offset + length` range. The sub #GArrowColumn shares
- *   values with the base #GArrowColumn.
- */
-GArrowColumn *
-garrow_column_slice(GArrowColumn *column,
-                    guint64 offset,
-                    guint64 length)
-{
-  const auto arrow_column = garrow_column_get_raw(column);
-  auto arrow_sub_column = arrow_column->Slice(offset, length);
-  return garrow_column_new_raw(&arrow_sub_column);
-}
-
-/**
- * garrow_column_equal:
- * @column: A #GArrowColumn.
- * @other_column: A #GArrowColumn to be compared.
- *
- * Returns: %TRUE if both of them have the same data, %FALSE
- *   otherwise.
- *
- * Since: 0.4.0
- */
-gboolean
-garrow_column_equal(GArrowColumn *column, GArrowColumn *other_column)
-{
-  const auto arrow_column = garrow_column_get_raw(column);
-  const auto arrow_other_column = garrow_column_get_raw(other_column);
-  return arrow_column->Equals(arrow_other_column);
-}
-
-/**
- * garrow_column_get_length:
- * @column: A #GArrowColumn.
- *
- * Returns: The number of data of the column.
- */
-guint64
-garrow_column_get_length(GArrowColumn *column)
-{
-  const auto arrow_column = garrow_column_get_raw(column);
-  return arrow_column->length();
-}
-
-/**
- * garrow_column_get_n_nulls:
- * @column: A #GArrowColumn.
- *
- * Returns: The number of nulls of the column.
- */
-guint64
-garrow_column_get_n_nulls(GArrowColumn *column)
-{
-  const auto arrow_column = garrow_column_get_raw(column);
-  return arrow_column->null_count();
-}
-
-/**
- * garrow_column_get_field:
- * @column: A #GArrowColumn.
- *
- * Returns: (transfer full): The metadata of the column.
- */
-GArrowField *
-garrow_column_get_field(GArrowColumn *column)
-{
-  auto priv = GARROW_COLUMN_GET_PRIVATE(column);
-  if (priv->field) {
-    g_object_ref(priv->field);
-    return priv->field;
-  } else {
-    const auto arrow_column = garrow_column_get_raw(column);
-    auto arrow_field = arrow_column->field();
-    auto data_type = garrow_column_get_data_type(column);
-    auto field = garrow_field_new_raw(&arrow_field, data_type);
-    g_object_unref(data_type);
-    return field;
-  }
-}
-
-/**
- * garrow_column_get_name:
- * @column: A #GArrowColumn.
- *
- * Returns: The name of the column.
- */
-const gchar *
-garrow_column_get_name(GArrowColumn *column)
-{
-  const auto arrow_column = garrow_column_get_raw(column);
-  return arrow_column->name().c_str();
-}
-
-/**
- * garrow_column_get_data_type:
- * @column: A #GArrowColumn.
- *
- * Returns: (transfer full): The data type of the column.
- */
-GArrowDataType *
-garrow_column_get_data_type(GArrowColumn *column)
-{
-  const auto arrow_column = garrow_column_get_raw(column);
-  auto arrow_data_type = arrow_column->type();
-  return garrow_data_type_new_raw(&arrow_data_type);
-}
-
-/**
- * garrow_column_get_data:
- * @column: A #GArrowColumn.
- *
- * Returns: (transfer full): The data of the column.
- */
-GArrowChunkedArray *
-garrow_column_get_data(GArrowColumn *column)
-{
-  const auto arrow_column = garrow_column_get_raw(column);
-  auto arrow_chunked_array = arrow_column->data();
-  return garrow_chunked_array_new_raw(&arrow_chunked_array);
-}
-
-/**
- * garrow_column_to_string:
- * @column: A #GArrowColumn.
- * @error: (nullable): Return location for a #GError or %NULL.
- *
- * Returns: (nullable) (transfer full):
- *   The formatted column content or %NULL on error.
- *
- * The returned string should be freed with g_free() when no
- * longer needed.
- *
- * Since: 0.12.0
- */
-gchar *
-garrow_column_to_string(GArrowColumn *column, GError **error)
-{
-  const auto arrow_column = garrow_column_get_raw(column);
-  std::stringstream sink;
-  auto status = arrow::PrettyPrint(*arrow_column, 0, &sink);
-  if (garrow_error_check(error, status, "[column][to-string]")) {
-    return g_strdup(sink.str().c_str());
-  } else {
-    return NULL;
-  }
-}
-
-G_END_DECLS
-
-GArrowColumn *
-garrow_column_new_raw(std::shared_ptr<arrow::Column> *arrow_column)
-{
-  auto column = GARROW_COLUMN(g_object_new(GARROW_TYPE_COLUMN,
-                                           "column", arrow_column,
-                                           NULL));
-  return column;
-}
-
-std::shared_ptr<arrow::Column>
-garrow_column_get_raw(GArrowColumn *column)
-{
-  auto priv = GARROW_COLUMN_GET_PRIVATE(column);
-  return priv->column;
-}
diff --git a/c_glib/arrow-glib/column.h b/c_glib/arrow-glib/column.h
deleted file mode 100644
index 274595858dd..00000000000
--- a/c_glib/arrow-glib/column.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#pragma once
-
-#include <arrow-glib/array.h>
-#include <arrow-glib/chunked-array.h>
-#include <arrow-glib/field.h>
-
-G_BEGIN_DECLS
-
-#define GARROW_TYPE_COLUMN (garrow_column_get_type())
-G_DECLARE_DERIVABLE_TYPE(GArrowColumn,
-                         garrow_column,
-                         GARROW,
-                         COLUMN,
-                         GObject)
-struct _GArrowColumnClass
-{
-  GObjectClass parent_class;
-};
-
-GArrowColumn *garrow_column_new_array(GArrowField *field,
-                                      GArrowArray *array);
-GArrowColumn *garrow_column_new_chunked_array(GArrowField *field,
-                                              GArrowChunkedArray *chunked_array);
-GArrowColumn *garrow_column_slice(GArrowColumn *column,
-                                  guint64 offset,
-                                  guint64 length);
-
-gboolean garrow_column_equal (GArrowColumn *column,
-                              GArrowColumn *other_column);
-
-guint64             garrow_column_get_length    (GArrowColumn *column);
-guint64             garrow_column_get_n_nulls   (GArrowColumn *column);
-GArrowField        *garrow_column_get_field     (GArrowColumn *column);
-const gchar        *garrow_column_get_name      (GArrowColumn *column);
-GArrowDataType     *garrow_column_get_data_type (GArrowColumn *column);
-GArrowChunkedArray *garrow_column_get_data      (GArrowColumn *column);
-gchar              *garrow_column_to_string     (GArrowColumn *column,
-                                                 GError **error);
-
-G_END_DECLS
diff --git a/c_glib/arrow-glib/error.h b/c_glib/arrow-glib/error.h
index 2fac5ad0d3e..d600663592f 100644
--- a/c_glib/arrow-glib/error.h
+++ b/c_glib/arrow-glib/error.h
@@ -39,7 +39,7 @@ G_BEGIN_DECLS
  * in Gandiva.
  * @GARROW_ERROR_EXPRESSION_VALIDATION: Validation errors in expression given for code generation.
  * @GARROW_ERROR_EXECUTION: Execution error while evaluating the expression against a record batch.
- * @GARROW_ALREADY_EXISTS: Item already exists error.
+ * @GARROW_ERROR_ALREADY_EXISTS: Item already exists error.
  *
  * The error codes are used by all arrow-glib functions.
  *
diff --git a/c_glib/arrow-glib/column.hpp b/c_glib/arrow-glib/internal-index.hpp
similarity index 78%
rename from c_glib/arrow-glib/column.hpp
rename to c_glib/arrow-glib/internal-index.hpp
index 4ebb742bb50..e3d709fc093 100644
--- a/c_glib/arrow-glib/column.hpp
+++ b/c_glib/arrow-glib/internal-index.hpp
@@ -19,9 +19,19 @@
 
 #pragma once
 
-#include <memory>
+#include <glib.h>
 
-#include <arrow-glib/column.h>
-
-GArrowColumn *garrow_column_new_raw(std::shared_ptr<arrow::Column> *arrow_column);
-std::shared_ptr<arrow::Column> garrow_column_get_raw(GArrowColumn *column);
+static inline bool
+garrow_internal_index_adjust(gint &i, const gint max)
+{
+  if (i < 0) {
+    i += max;
+    if (i < 0) {
+      return false;
+    }
+  }
+  if (i >= max) {
+    return false;
+  }
+  return true;
+}
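The shared helper above gives every index-taking getter the same negative-index rule: an index i in [-max, max) is mapped into [0, max), anything else is rejected. A standalone copy for illustration (the real helper is the C++ one in internal-index.hpp; the function name below is hypothetical):

    #include <glib.h>

    static gboolean
    index_adjust(gint *i, gint max)
    {
      if (*i < 0) {
        *i += max;   /* -1 becomes max - 1, i.e. the last column */
      }
      return (*i >= 0) && (*i < max);
    }

    /* index_adjust(&i, 4) maps -1 -> 3 and -4 -> 0, and rejects -5 and 4. */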
diff --git a/c_glib/arrow-glib/meson.build b/c_glib/arrow-glib/meson.build
index 4e8e1cc111f..fd426328f8c 100644
--- a/c_glib/arrow-glib/meson.build
+++ b/c_glib/arrow-glib/meson.build
@@ -24,7 +24,6 @@ sources = files(
   'buffer.cpp',
   'chunked-array.cpp',
   'codec.cpp',
-  'column.cpp',
   'composite-array.cpp',
   'composite-data-type.cpp',
   'decimal128.cpp',
@@ -73,7 +72,6 @@ c_headers = files(
   'buffer.h',
   'chunked-array.h',
   'codec.h',
-  'column.h',
   'composite-array.h',
   'composite-data-type.h',
   'data-type.h',
@@ -126,7 +124,6 @@ cpp_headers = files(
   'buffer.hpp',
   'chunked-array.hpp',
   'codec.hpp',
-  'column.hpp',
   'data-type.hpp',
   'decimal128.hpp',
   'error.hpp',
@@ -165,6 +162,10 @@ if have_arrow_orc
   )
 endif
 
+cpp_internal_headers = files(
+  'internal-index.hpp',
+)
+
 version_h_conf = configuration_data()
 version_h_conf.set('GARROW_VERSION_MAJOR', version_major)
 version_h_conf.set('GARROW_VERSION_MINOR', version_minor)
diff --git a/c_glib/arrow-glib/reader.cpp b/c_glib/arrow-glib/reader.cpp
index 88af8c7b37f..7783362104d 100644
--- a/c_glib/arrow-glib/reader.cpp
+++ b/c_glib/arrow-glib/reader.cpp
@@ -21,18 +21,18 @@
 #  include <config.h>
 #endif
 
-#include
+#include
+#include
 #include
 #include
 #include
-#include
-#include
-#include
-
 #include
-
+#include
 #include
 #include
+#include
+#include
+#include
 
 G_BEGIN_DECLS
 
@@ -734,9 +734,11 @@ garrow_feather_file_reader_get_n_columns(GArrowFeatherFileReader *reader)
 /**
  * garrow_feather_file_reader_get_column_name:
  * @reader: A #GArrowFeatherFileReader.
- * @i: The index of the target column.
+ * @i: The index of the target column. If it's negative, index is
+ * counted backward from the end of the columns. `-1` means the last
+ * column.
  *
- * Returns: (transfer full): The i-th column name in the file.
+ * Returns: (nullable) (transfer full): The i-th column name in the file.
  *
  * It should be freed with g_free() when no longer needed.
  *
@@ -747,69 +749,52 @@ garrow_feather_file_reader_get_column_name(GArrowFeatherFileReader *reader,
                                            gint i)
 {
   auto arrow_reader = garrow_feather_file_reader_get_raw(reader);
-  auto column_name = arrow_reader->GetColumnName(i);
+  if (!garrow_internal_index_adjust(i, arrow_reader->num_columns())) {
+    return NULL;
+  }
+  const auto &column_name = arrow_reader->GetColumnName(i);
   return g_strndup(column_name.data(), column_name.size());
 }
 
 /**
- * garrow_feather_file_reader_get_column:
+ * garrow_feather_file_reader_get_column_data:
  * @reader: A #GArrowFeatherFileReader.
- * @i: The index of the target column.
+ * @i: The index of the target column. If it's negative, index is
+ * counted backward from the end of the columns. `-1` means the last
+ * column.
  * @error: (nullable): Return location for a #GError or %NULL.
  *
  * Returns: (nullable) (transfer full):
- *   The i-th column in the file or %NULL on error.
+ *   The i-th column's data in the file or %NULL on error.
 *
- * Since: 0.4.0
+ * Since: 1.0.0
 */
-GArrowColumn *
-garrow_feather_file_reader_get_column(GArrowFeatherFileReader *reader,
-                                      gint i,
-                                      GError **error)
+GArrowChunkedArray *
+garrow_feather_file_reader_get_column_data(GArrowFeatherFileReader *reader,
+                                           gint i,
+                                           GError **error)
 {
+  const auto tag = "[feather-file-reader][get-column-data]";
   auto arrow_reader = garrow_feather_file_reader_get_raw(reader);
-  std::shared_ptr<arrow::Column> arrow_column;
-  auto status = arrow_reader->GetColumn(i, &arrow_column);
-  if (garrow_error_check(error, status, "[feather-file-reader][get-column]")) {
-    return garrow_column_new_raw(&arrow_column);
-  } else {
+  const auto n_columns = arrow_reader->num_columns();
+  if (!garrow_internal_index_adjust(i, n_columns)) {
+    garrow_error_check(error,
+                       arrow::Status::IndexError("Out of index: "
+                                                 "<0..", n_columns, ">: "
+                                                 "<", i, ">"),
+                       tag);
     return NULL;
   }
-}
 
-/**
- * garrow_feather_file_reader_get_columns:
- * @reader: A #GArrowFeatherFileReader.
- * @error: (nullable): Return location for a #GError or %NULL.
- *
- * Returns: (element-type GArrowColumn) (transfer full):
- *   The columns in the file.
- *
- * Since: 0.4.0
- */
-GList *
-garrow_feather_file_reader_get_columns(GArrowFeatherFileReader *reader,
                                        GError **error)
-{
-  GList *columns = NULL;
-  auto arrow_reader = garrow_feather_file_reader_get_raw(reader);
-  auto n_columns = arrow_reader->num_columns();
-  for (gint i = 0; i < n_columns; ++i) {
-    std::shared_ptr<arrow::Column> arrow_column;
-    auto status = arrow_reader->GetColumn(i, &arrow_column);
-    if (!garrow_error_check(error,
-                            status,
-                            "[feather-file-reader][get-columns]")) {
-      g_list_foreach(columns, (GFunc)g_object_unref, NULL);
-      g_list_free(columns);
-      return NULL;
-    }
-    columns = g_list_prepend(columns,
-                             garrow_column_new_raw(&arrow_column));
+  std::shared_ptr<arrow::ChunkedArray> arrow_chunked_array;
+  auto status = arrow_reader->GetColumn(i, &arrow_chunked_array);
+  if (garrow_error_check(error, status, tag)) {
+    return garrow_chunked_array_new_raw(&arrow_chunked_array);
+  } else {
+    return NULL;
   }
-  return g_list_reverse(columns);
 }
 
 /**
diff --git a/c_glib/arrow-glib/reader.h b/c_glib/arrow-glib/reader.h
index c2cce2dcc25..ff83e247890 100644
--- a/c_glib/arrow-glib/reader.h
+++ b/c_glib/arrow-glib/reader.h
@@ -222,13 +222,11 @@ gint64 garrow_feather_file_reader_get_n_columns(
 gchar *garrow_feather_file_reader_get_column_name(
   GArrowFeatherFileReader *reader,
   gint i);
-GArrowColumn *garrow_feather_file_reader_get_column(
-  GArrowFeatherFileReader *reader,
-  gint i,
-  GError **error);
-GList *garrow_feather_file_reader_get_columns(
-  GArrowFeatherFileReader *reader,
-  GError **error);
+GARROW_AVAILABLE_IN_1_0
+GArrowChunkedArray *
+garrow_feather_file_reader_get_column_data(GArrowFeatherFileReader *reader,
+                                           gint i,
+                                           GError **error);
 GArrowTable *
 garrow_feather_file_reader_read(GArrowFeatherFileReader *reader,
                                 GError **error);
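With garrow_feather_file_reader_get_columns() removed, callers enumerate columns themselves. A sketch under the assumption that `reader` is an already constructed GArrowFeatherFileReader (error handling abbreviated):

    #include <arrow-glib/arrow-glib.h>

    static void
    dump_feather_columns(GArrowFeatherFileReader *reader)
    {
      gint64 n_columns = garrow_feather_file_reader_get_n_columns(reader);
      for (gint64 i = 0; i < n_columns; i++) {
        GError *error = NULL;
        gchar *name =
          garrow_feather_file_reader_get_column_name(reader, (gint)i);
        GArrowChunkedArray *data =
          garrow_feather_file_reader_get_column_data(reader, (gint)i, &error);
        if (!data) {
          g_print("failed: %s\n", error ? error->message : "out of index");
          g_clear_error(&error);
          g_free(name);
          continue;
        }
        g_print("%s: %" G_GUINT64_FORMAT " rows\n",
                name,
                garrow_chunked_array_get_n_rows(data));
        g_free(name);
        g_object_unref(data); /* (transfer full) return value */
      }
    }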
diff --git a/c_glib/arrow-glib/record-batch.cpp b/c_glib/arrow-glib/record-batch.cpp
index 04d442b409a..e566514e547 100644
--- a/c_glib/arrow-glib/record-batch.cpp
+++ b/c_glib/arrow-glib/record-batch.cpp
@@ -23,29 +23,13 @@
 
 #include
 #include
+#include
+#include
 #include
 #include
-#include
 #include
 
-static inline bool
-garrow_record_batch_adjust_index(const std::shared_ptr<arrow::RecordBatch> arrow_record_batch,
-                                 gint &i)
-{
-  auto n_columns = arrow_record_batch->num_columns();
-  if (i < 0) {
-    i += n_columns;
-    if (i < 0) {
-      return false;
-    }
-  }
-  if (i >= n_columns) {
-    return false;
-  }
-  return true;
-}
-
 G_BEGIN_DECLS
 
 /**
@@ -215,7 +199,7 @@ garrow_record_batch_get_schema(GArrowRecordBatch *record_batch)
 }
 
 /**
- * garrow_record_batch_get_column:
+ * garrow_record_batch_get_column_data:
  * @record_batch: A #GArrowRecordBatch.
  * @i: The index of the target column. If it's negative, index is
  * counted backward from the end of the columns. `-1` means the last
 * column.
 *
 * Returns: (transfer full) (nullable): The i-th column in the record batch
 * on success, %NULL on out of index.
+ *
+ * Since: 1.0.0
 */
 GArrowArray *
-garrow_record_batch_get_column(GArrowRecordBatch *record_batch,
-                               gint i)
+garrow_record_batch_get_column_data(GArrowRecordBatch *record_batch,
+                                    gint i)
 {
-  const auto arrow_record_batch = garrow_record_batch_get_raw(record_batch);
-  if (!garrow_record_batch_adjust_index(arrow_record_batch, i)) {
+  const auto &arrow_record_batch = garrow_record_batch_get_raw(record_batch);
+  if (!garrow_internal_index_adjust(i, arrow_record_batch->num_columns())) {
     return NULL;
   }
   auto arrow_column = arrow_record_batch->column(i);
   return garrow_array_new_raw(&arrow_column);
 }
 
-/**
- * garrow_record_batch_get_columns:
- * @record_batch: A #GArrowRecordBatch.
- *
- * Returns: (element-type GArrowArray) (transfer full):
- *   The columns in the record batch.
- */
-GList *
-garrow_record_batch_get_columns(GArrowRecordBatch *record_batch)
-{
-  const auto arrow_record_batch = garrow_record_batch_get_raw(record_batch);
-
-  GList *columns = NULL;
-  for (int i = 0; i < arrow_record_batch->num_columns(); ++i) {
-    auto arrow_column = arrow_record_batch->column(i);
-    GArrowArray *column = garrow_array_new_raw(&arrow_column);
-    columns = g_list_prepend(columns, column);
-  }
-
-  return g_list_reverse(columns);
-}
-
 /**
  * garrow_record_batch_get_column_name:
  * @record_batch: A #GArrowRecordBatch.
@@ -272,8 +236,8 @@ const gchar *
 garrow_record_batch_get_column_name(GArrowRecordBatch *record_batch,
                                     gint i)
 {
-  const auto arrow_record_batch = garrow_record_batch_get_raw(record_batch);
-  if (!garrow_record_batch_adjust_index(arrow_record_batch, i)) {
+  const auto &arrow_record_batch = garrow_record_batch_get_raw(record_batch);
+  if (!garrow_internal_index_adjust(i, arrow_record_batch->num_columns())) {
     return NULL;
   }
   return arrow_record_batch->column_name(i).c_str();
diff --git a/c_glib/arrow-glib/record-batch.h b/c_glib/arrow-glib/record-batch.h
index b01abf78904..b40a2aecfbe 100644
--- a/c_glib/arrow-glib/record-batch.h
+++ b/c_glib/arrow-glib/record-batch.h
@@ -44,9 +44,9 @@ gboolean garrow_record_batch_equal(GArrowRecordBatch *record_batch,
                                    GArrowRecordBatch *other_record_batch);
 
 GArrowSchema *garrow_record_batch_get_schema     (GArrowRecordBatch *record_batch);
-GArrowArray  *garrow_record_batch_get_column     (GArrowRecordBatch *record_batch,
+GARROW_AVAILABLE_IN_1_0
+GArrowArray  *garrow_record_batch_get_column_data(GArrowRecordBatch *record_batch,
                                                   gint i);
-GList        *garrow_record_batch_get_columns    (GArrowRecordBatch *record_batch);
 const gchar  *garrow_record_batch_get_column_name(GArrowRecordBatch *record_batch,
                                                   gint i);
 guint         garrow_record_batch_get_n_columns  (GArrowRecordBatch *record_batch);
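garrow_record_batch_get_columns() is gone as well; the explicit loop below is the replacement pattern. A sketch assuming `record_batch` was obtained elsewhere; note each _get_column_data() return is a new reference:

    #include <arrow-glib/arrow-glib.h>

    static void
    each_column(GArrowRecordBatch *record_batch)
    {
      guint n_columns = garrow_record_batch_get_n_columns(record_batch);
      for (guint i = 0; i < n_columns; i++) {
        GArrowArray *array =
          garrow_record_batch_get_column_data(record_batch, (gint)i);
        if (!array) {
          continue; /* NULL only on out-of-range index */
        }
        g_print("%s\n",
                garrow_record_batch_get_column_name(record_batch, (gint)i));
        g_object_unref(array); /* (transfer full) return value */
      }
    }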
diff --git a/c_glib/arrow-glib/schema.cpp b/c_glib/arrow-glib/schema.cpp
index 1bbe82f9a3c..5730dee8ce7 100644
--- a/c_glib/arrow-glib/schema.cpp
+++ b/c_glib/arrow-glib/schema.cpp
@@ -198,6 +198,23 @@ garrow_schema_get_field_by_name(GArrowSchema *schema,
   }
 }
 
+/**
+ * garrow_schema_get_field_index:
+ * @schema: A #GArrowSchema.
+ * @name: The name of the field to be found.
+ *
+ * Returns: The index of the found field, -1 on not found.
+ *
+ * Since: 1.0.0
+ */
+gint
+garrow_schema_get_field_index(GArrowSchema *schema,
+                              const gchar *name)
+{
+  const auto &arrow_schema = garrow_schema_get_raw(schema);
+  return arrow_schema->GetFieldIndex(std::string(name));
+}
+
 /**
  * garrow_schema_n_fields:
  * @schema: A #GArrowSchema.
diff --git a/c_glib/arrow-glib/schema.h b/c_glib/arrow-glib/schema.h
index d5e27bbad98..745d266b21f 100644
--- a/c_glib/arrow-glib/schema.h
+++ b/c_glib/arrow-glib/schema.h
@@ -42,6 +42,9 @@ GArrowField *garrow_schema_get_field        (GArrowSchema *schema,
                                              guint i);
 GArrowField *garrow_schema_get_field_by_name(GArrowSchema *schema,
                                              const gchar *name);
+GARROW_AVAILABLE_IN_1_0
+gint garrow_schema_get_field_index          (GArrowSchema *schema,
+                                             const gchar *name);
 guint garrow_schema_n_fields                (GArrowSchema *schema);
 GList *garrow_schema_get_fields             (GArrowSchema *schema);
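The new index lookup composes naturally with the index-based data getters introduced below. A sketch, assuming `table` exists and that garrow_table_get_schema() returns a new reference:

    #include <arrow-glib/arrow-glib.h>

    static GArrowChunkedArray *
    column_by_name(GArrowTable *table, const gchar *name)
    {
      GArrowSchema *schema = garrow_table_get_schema(table);
      gint i = garrow_schema_get_field_index(schema, name);
      g_object_unref(schema);
      if (i == -1) {
        return NULL; /* no field with that name */
      }
      return garrow_table_get_column_data(table, i);
    }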
diff --git a/c_glib/arrow-glib/table.cpp b/c_glib/arrow-glib/table.cpp
index a29d18bc402..511a2354d0d 100644
--- a/c_glib/arrow-glib/table.cpp
+++ b/c_glib/arrow-glib/table.cpp
@@ -22,8 +22,10 @@
 #endif
 
 #include <arrow-glib/array.hpp>
-#include <arrow-glib/column.hpp>
+#include <arrow-glib/chunked-array.hpp>
 #include <arrow-glib/error.hpp>
+#include <arrow-glib/field.hpp>
+#include <arrow-glib/internal-index.hpp>
 #include <arrow-glib/record-batch.hpp>
 #include <arrow-glib/schema.hpp>
 #include <arrow-glib/table.hpp>
@@ -37,7 +39,7 @@ G_BEGIN_DECLS
  * @short_description: Table class
  *
  * #GArrowTable is a class for table. Table has zero or more
- * #GArrowColumns and zero or more records.
+ * #GArrowChunkedArrays and zero or more records.
  */
 
 typedef struct GArrowTablePrivate_ {
@@ -129,36 +131,12 @@ garrow_table_class_init(GArrowTableClass *klass)
   g_object_class_install_property(gobject_class, PROP_TABLE, spec);
 }
 
-/**
- * garrow_table_new:
- * @schema: The schema of the table.
- * @columns: (element-type GArrowColumn): The columns of the table.
- *
- * Returns: A newly created #GArrowTable.
- *
- * Deprecated: 0.12.0: Use garrow_table_new_values() instead.
- */
-GArrowTable *
-garrow_table_new(GArrowSchema *schema,
-                 GList *columns)
-{
-  auto arrow_schema = garrow_schema_get_raw(schema);
-  std::vector<std::shared_ptr<arrow::Column>> arrow_columns;
-  for (GList *node = columns; node; node = node->next) {
-    auto column = GARROW_COLUMN(node->data);
-    arrow_columns.push_back(garrow_column_get_raw(column));
-  }
-
-  auto arrow_table = arrow::Table::Make(arrow_schema, arrow_columns);
-  return garrow_table_new_raw(&arrow_table);
-}
-
 /**
  * garrow_table_new_values: (skip)
  * @schema: The schema of the table.
- * @values: The values of the table. All values must be instance of the
- * same class. Available classes are #GArrowColumn, #GArrowArray and
- * #GArrowRecordBatch.
+ * @values: The values of the table. All values must be instance of
+ * the same class. Available classes are #GArrowChunkedArray,
+ * #GArrowArray and #GArrowRecordBatch.
 * @error: (nullable): Return location for a #GError or %NULL.
 *
 * Returns: (nullable): A newly created #GArrowTable or %NULL on error.
@@ -172,13 +150,13 @@ garrow_table_new_values(GArrowSchema *schema,
 {
   const auto context = "[table][new][values]";
   auto arrow_schema = garrow_schema_get_raw(schema);
-  std::vector<std::shared_ptr<arrow::Column>> arrow_columns;
+  std::vector<std::shared_ptr<arrow::ChunkedArray>> arrow_chunked_arrays;
   std::vector<std::shared_ptr<arrow::Array>> arrow_arrays;
   std::vector<std::shared_ptr<arrow::RecordBatch>> arrow_record_batches;
   for (GList *node = values; node; node = node->next) {
-    if (GARROW_IS_COLUMN(node->data)) {
-      auto column = GARROW_COLUMN(node->data);
-      arrow_columns.push_back(garrow_column_get_raw(column));
+    if (GARROW_IS_CHUNKED_ARRAY(node->data)) {
+      auto chunked_array = GARROW_CHUNKED_ARRAY(node->data);
+      arrow_chunked_arrays.push_back(garrow_chunked_array_get_raw(chunked_array));
     } else if (GARROW_IS_ARRAY(node->data)) {
       auto array = GARROW_ARRAY(node->data);
       arrow_arrays.push_back(garrow_array_get_raw(array));
@@ -192,13 +170,13 @@ garrow_table_new_values(GArrowSchema *schema,
                   "%s: %s",
                   context,
                   "value must be one of "
-                  "GArrowColumn, GArrowArray and GArrowRecordBatch");
+                  "GArrowChunkedArray, GArrowArray and GArrowRecordBatch");
       return NULL;
     }
   }
 
   size_t n_types = 0;
-  if (!arrow_columns.empty()) {
+  if (!arrow_chunked_arrays.empty()) {
     ++n_types;
   }
   if (!arrow_arrays.empty()) {
@@ -214,12 +192,12 @@ garrow_table_new_values(GArrowSchema *schema,
                 "%s: %s",
                 context,
                 "all values must be the same objects of "
-                "GArrowColumn, GArrowArray or GArrowRecordBatch");
+                "GArrowChunkedArray, GArrowArray or GArrowRecordBatch");
     return NULL;
   }
 
-  if (!arrow_columns.empty()) {
-    auto arrow_table = arrow::Table::Make(arrow_schema, arrow_columns);
+  if (!arrow_chunked_arrays.empty()) {
+    auto arrow_table = arrow::Table::Make(arrow_schema, arrow_chunked_arrays);
     auto status = arrow_table->Validate();
     if (garrow_error_check(error, status, context)) {
       return garrow_table_new_raw(&arrow_table);
@@ -248,31 +226,33 @@ garrow_table_new_values(GArrowSchema *schema,
 }
 
 /**
- * garrow_table_new_columns:
+ * garrow_table_new_chunked_arrays:
  * @schema: The schema of the table.
- * @columns: (array length=n_columns): The columns of the table.
- * @n_columns: The number of columns.
+ * @chunked_arrays: (array length=n_chunked_arrays): The chunked arrays of
+ *   the table.
+ * @n_chunked_arrays: The number of chunked arrays.
 * @error: (nullable): Return location for a #GError or %NULL.
 *
 * Returns: (nullable): A newly created #GArrowTable or %NULL on error.
 *
- * Since: 0.12.0
+ * Since: 1.0.0
 */
 GArrowTable *
-garrow_table_new_columns(GArrowSchema *schema,
-                         GArrowColumn **columns,
-                         gsize n_columns,
-                         GError **error)
+garrow_table_new_chunked_arrays(GArrowSchema *schema,
+                                GArrowChunkedArray **chunked_arrays,
+                                gsize n_chunked_arrays,
+                                GError **error)
 {
   auto arrow_schema = garrow_schema_get_raw(schema);
-  std::vector<std::shared_ptr<arrow::Column>> arrow_columns;
-  for (gsize i = 0; i < n_columns; ++i) {
-    arrow_columns.push_back(garrow_column_get_raw(columns[i]));
+  std::vector<std::shared_ptr<arrow::ChunkedArray>> arrow_chunked_arrays;
+  for (gsize i = 0; i < n_chunked_arrays; ++i) {
+    auto arrow_chunked_array = garrow_chunked_array_get_raw(chunked_arrays[i]);
+    arrow_chunked_arrays.push_back(arrow_chunked_array);
   }
 
-  auto arrow_table = arrow::Table::Make(arrow_schema, arrow_columns);
+  auto arrow_table = arrow::Table::Make(arrow_schema, arrow_chunked_arrays);
   auto status = arrow_table->Validate();
-  if (garrow_error_check(error, status, "[table][new][columns]")) {
+  if (garrow_error_check(error, status, "[table][new][chunked-arrays]")) {
     return garrow_table_new_raw(&arrow_table);
   } else {
     return NULL;
@@ -380,19 +360,26 @@ garrow_table_get_schema(GArrowTable *table)
 }
 
 /**
- * garrow_table_get_column:
+ * garrow_table_get_column_data:
  * @table: A #GArrowTable.
- * @i: The index of the target column.
+ * @i: The index of the target column. If it's negative, index is
+ * counted backward from the end of the columns. `-1` means the last
+ * column.
 *
- * Returns: (transfer full): The i-th column in the table.
+ * Returns: (nullable) (transfer full): The i-th column's data in the table.
+ *
+ * Since: 1.0.0
 */
-GArrowColumn *
-garrow_table_get_column(GArrowTable *table,
-                        guint i)
+GArrowChunkedArray *
+garrow_table_get_column_data(GArrowTable *table,
+                             gint i)
 {
-  const auto arrow_table = garrow_table_get_raw(table);
+  const auto &arrow_table = garrow_table_get_raw(table);
+  if (!garrow_internal_index_adjust(i, arrow_table->num_columns())) {
+    return NULL;
+  }
   auto arrow_column = arrow_table->column(i);
-  return garrow_column_new_raw(&arrow_column);
+  return garrow_chunked_array_new_raw(&arrow_column);
 }
 
/**
@@ -425,24 +412,30 @@ garrow_table_get_n_rows(GArrowTable *table)
 * garrow_table_add_column:
 * @table: A #GArrowTable.
 * @i: The index of the new column.
- * @column: The column to be added.
+ * @field: The field for the column to be added.
+ * @chunked_array: The column data to be added.
 * @error: (nullable): Return location for a #GError or %NULL.
 *
 * Returns: (nullable) (transfer full): The newly allocated
 * #GArrowTable that has a new column or %NULL on error.
 *
- * Since: 0.3.0
+ * Since: 1.0.0
 */
 GArrowTable *
 garrow_table_add_column(GArrowTable *table,
                         guint i,
-                        GArrowColumn *column,
+                        GArrowField *field,
+                        GArrowChunkedArray *chunked_array,
                         GError **error)
 {
   const auto arrow_table = garrow_table_get_raw(table);
-  const auto arrow_column = garrow_column_get_raw(column);
+  const auto arrow_field = garrow_field_get_raw(field);
+  const auto arrow_chunked_array = garrow_chunked_array_get_raw(chunked_array);
   std::shared_ptr<arrow::Table> arrow_new_table;
-  auto status = arrow_table->AddColumn(i, arrow_column, &arrow_new_table);
+  auto status = arrow_table->AddColumn(i,
+                                       arrow_field,
+                                       arrow_chunked_array,
+                                       &arrow_new_table);
   if (garrow_error_check(error, status, "[table][add-column]")) {
     return garrow_table_new_raw(&arrow_new_table);
   } else {
@@ -480,25 +473,31 @@ garrow_table_remove_column(GArrowTable *table,
 * garrow_table_replace_column:
 * @table: A #GArrowTable.
 * @i: The index of the column to be replaced.
- * @column: The newly added #GArrowColumn.
+ * @field: The field for the new column.
+ * @chunked_array: The newly added column data.
 * @error: (nullable): Return location for a #GError or %NULL.
 *
 * Returns: (nullable) (transfer full): The newly allocated
 * #GArrowTable that has @column as the @i-th column or %NULL on
 * error.
 *
- * Since: 0.10.0
+ * Since: 1.0.0
 */
 GArrowTable *
 garrow_table_replace_column(GArrowTable *table,
                             guint i,
-                            GArrowColumn *column,
+                            GArrowField *field,
+                            GArrowChunkedArray *chunked_array,
                             GError **error)
 {
   const auto arrow_table = garrow_table_get_raw(table);
-  const auto arrow_column = garrow_column_get_raw(column);
+  const auto arrow_field = garrow_field_get_raw(field);
+  const auto arrow_chunked_array = garrow_chunked_array_get_raw(chunked_array);
   std::shared_ptr<arrow::Table> arrow_new_table;
-  auto status = arrow_table->SetColumn(i, arrow_column, &arrow_new_table);
+  auto status = arrow_table->SetColumn(i,
+                                       arrow_field,
+                                       arrow_chunked_array,
+                                       &arrow_new_table);
   if (garrow_error_check(error, status, "[table][replace-column]")) {
     return garrow_table_new_raw(&arrow_new_table);
   } else {
diff --git a/c_glib/arrow-glib/table.h b/c_glib/arrow-glib/table.h
index f802637c737..f24414b6578 100644
--- a/c_glib/arrow-glib/table.h
+++ b/c_glib/arrow-glib/table.h
@@ -19,7 +19,7 @@
 
 #pragma once
 
-#include <arrow-glib/column.h>
+#include <arrow-glib/chunked-array.h>
 #include
 #include
 #include
@@ -37,23 +37,17 @@ struct _GArrowTableClass
 {
   GObjectClass parent_class;
 };
 
-#ifndef GARROW_DISABLE_DEPRECATED
-GARROW_DEPRECATED_IN_0_12_FOR(garrow_table_new_values)
-GArrowTable *
-garrow_table_new(GArrowSchema *schema,
-                 GList *columns);
-#endif
 GARROW_AVAILABLE_IN_0_12
 GArrowTable *
 garrow_table_new_values(GArrowSchema *schema,
                         GList *values,
                         GError **error);
-GARROW_AVAILABLE_IN_0_12
+GARROW_AVAILABLE_IN_1_0
 GArrowTable *
-garrow_table_new_columns(GArrowSchema *schema,
-                         GArrowColumn **columns,
-                         gsize n_columns,
-                         GError **error);
+garrow_table_new_chunked_arrays(GArrowSchema *schema,
+                                GArrowChunkedArray **chunked_arrays,
+                                gsize n_chunked_arrays,
+                                GError **error);
 GARROW_AVAILABLE_IN_0_12
 GArrowTable *
 garrow_table_new_arrays(GArrowSchema *schema,
@@ -71,21 +65,28 @@ gboolean garrow_table_equal (GArrowTable *table,
                              GArrowTable *other_table);
 
 GArrowSchema *garrow_table_get_schema (GArrowTable *table);
-GArrowColumn *garrow_table_get_column (GArrowTable *table,
-                                       guint i);
+GARROW_AVAILABLE_IN_1_0
+GArrowChunkedArray *
+garrow_table_get_column_data(GArrowTable *table,
+                             gint i);
+
 guint garrow_table_get_n_columns (GArrowTable *table);
 guint64 garrow_table_get_n_rows (GArrowTable *table);
 
+GARROW_AVAILABLE_IN_1_0
 GArrowTable *garrow_table_add_column (GArrowTable *table,
                                       guint i,
-                                      GArrowColumn *column,
+                                      GArrowField *field,
+                                      GArrowChunkedArray *chunked_array,
                                       GError **error);
 GArrowTable *garrow_table_remove_column (GArrowTable *table,
                                          guint i,
                                          GError **error);
+GARROW_AVAILABLE_IN_1_0
 GArrowTable *garrow_table_replace_column(GArrowTable *table,
                                          guint i,
-                                         GArrowColumn *column,
+                                         GArrowField *field,
+                                         GArrowChunkedArray *chunked_array,
                                          GError **error);
 gchar *garrow_table_to_string (GArrowTable *table,
                                GError **error);
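The table API now takes a (field, chunked array) pair wherever it previously took a #GArrowColumn. A sketch of the new construction and replacement flow, assuming `schema`, `chunked_arrays`, `field` and `new_data` are built elsewhere:

    #include <arrow-glib/arrow-glib.h>

    static GArrowTable *
    rebuild(GArrowSchema *schema,
            GArrowChunkedArray **chunked_arrays,
            gsize n_chunked_arrays,
            GArrowField *field,
            GArrowChunkedArray *new_data,
            GError **error)
    {
      GArrowTable *table =
        garrow_table_new_chunked_arrays(schema,
                                        chunked_arrays,
                                        n_chunked_arrays,
                                        error);
      if (!table) {
        return NULL;
      }
      /* Columns are passed as a field plus its data; the old
       * GArrowColumn wrapper object no longer exists. */
      GArrowTable *new_table =
        garrow_table_replace_column(table, 0, field, new_data, error);
      g_object_unref(table);
      return new_table;
    }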
diff --git a/c_glib/arrow-glib/version.h.in b/c_glib/arrow-glib/version.h.in
index 43a89dcff1d..dc925dd9252 100644
--- a/c_glib/arrow-glib/version.h.in
+++ b/c_glib/arrow-glib/version.h.in
@@ -110,6 +110,15 @@
 #  define GARROW_UNAVAILABLE(major, minor) G_UNAVAILABLE(major, minor)
 #endif
 
+/**
+ * GARROW_VERSION_1_0:
+ *
+ * You can use this macro value for compile time API version check.
+ *
+ * Since: 1.0.0
+ */
+#define GARROW_VERSION_1_0 G_ENCODE_VERSION(1, 0)
+
 /**
  * GARROW_VERSION_0_14:
 *
@@ -193,6 +202,20 @@
 
 #define GARROW_AVAILABLE_IN_ALL
 
+#if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_1_0
+#  define GARROW_DEPRECATED_IN_1_0 GARROW_DEPRECATED
+#  define GARROW_DEPRECATED_IN_1_0_FOR(function) GARROW_DEPRECATED_FOR(function)
+#else
+#  define GARROW_DEPRECATED_IN_1_0
+#  define GARROW_DEPRECATED_IN_1_0_FOR(function)
+#endif
+
+#if GARROW_VERSION_MAX_ALLOWED < GARROW_VERSION_1_0
+#  define GARROW_AVAILABLE_IN_1_0 GARROW_UNAVAILABLE(1, 0)
+#else
+#  define GARROW_AVAILABLE_IN_1_0
+#endif
+
 #if GARROW_VERSION_MIN_REQUIRED >= GARROW_VERSION_0_14
 #  define GARROW_DEPRECATED_IN_0_14 GARROW_DEPRECATED
 #  define GARROW_DEPRECATED_IN_0_14_FOR(function) GARROW_DEPRECATED_FOR(function)
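How the new version macros are meant to be consumed, as a sketch that assumes these bounds follow the usual GLib convention of being defined before the first include:

    /* Opt in to the 1.0.0 API surface at compile time. */
    #define GARROW_VERSION_MIN_REQUIRED GARROW_VERSION_1_0
    #define GARROW_VERSION_MAX_ALLOWED  GARROW_VERSION_1_0
    #include <arrow-glib/arrow-glib.h>

    /* With these bounds, garrow_chunked_array_get_length() triggers a
     * deprecation warning, while garrow_chunked_array_get_n_rows()
     * compiles without an availability warning. */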
diff --git a/c_glib/doc/arrow-glib/arrow-glib-docs.xml b/c_glib/doc/arrow-glib/arrow-glib-docs.xml
index 7429a22aaba..47e96a4ec19 100644
--- a/c_glib/doc/arrow-glib/arrow-glib-docs.xml
+++ b/c_glib/doc/arrow-glib/arrow-glib-docs.xml
@@ -71,7 +71,6 @@
     Table
-
@@ -160,6 +159,10 @@
     Index of deprecated API
+
+    Index of new symbols in 1.0.0
+
+
     Index of new symbols in 0.14.0
diff --git a/c_glib/doc/parquet-glib/parquet-glib-docs.xml b/c_glib/doc/parquet-glib/parquet-glib-docs.xml
index 4485a6765cb..d58e92eacd1 100644
--- a/c_glib/doc/parquet-glib/parquet-glib-docs.xml
+++ b/c_glib/doc/parquet-glib/parquet-glib-docs.xml
@@ -57,6 +57,10 @@
     Index of deprecated API
+
+    Index of new symbols in 1.0.0
+
+
     Index of new symbols in 0.12.0
diff --git a/c_glib/example/lua/read-batch.lua b/c_glib/example/lua/read-batch.lua
index 8dc2fd82b39..a4c86763f08 100644
--- a/c_glib/example/lua/read-batch.lua
+++ b/c_glib/example/lua/read-batch.lua
@@ -28,14 +28,14 @@ for i = 0, reader:get_n_record_batches() - 1 do
   print(string.rep("=", 40))
   print("record-batch["..i.."]:")
   for j = 0, record_batch:get_n_columns() - 1 do
-    local column = record_batch:get_column(j)
     local column_name = record_batch:get_column_name(j)
+    local column_data = record_batch:get_column_data(j)
     io.write("  "..column_name..": [")
     for k = 0, record_batch:get_n_rows() - 1 do
       if k > 0 then
         io.write(", ")
       end
-      io.write(column:get_value(k))
+      io.write(column_data:get_value(k))
     end
     print("]")
   end
diff --git a/c_glib/example/lua/read-stream.lua b/c_glib/example/lua/read-stream.lua
index e67acf506ff..7bf1083e225 100644
--- a/c_glib/example/lua/read-stream.lua
+++ b/c_glib/example/lua/read-stream.lua
@@ -33,14 +33,14 @@ while true do
   print(string.rep("=", 40))
   print("record-batch["..i.."]:")
   for j = 0, record_batch:get_n_columns() - 1 do
-    local column = record_batch:get_column(j)
     local column_name = record_batch:get_column_name(j)
+    local column_data = record_batch:get_column_data(j)
     io.write("  "..column_name..": [")
     for k = 0, record_batch:get_n_rows() - 1 do
       if k > 0 then
         io.write(", ")
       end
-      io.write(column:get_value(k))
+      io.write(column_data:get_value(k))
     end
     print("]")
   end
diff --git a/c_glib/example/read-batch.c b/c_glib/example/read-batch.c
index 4382816e048..273dc70ffa5 100644
--- a/c_glib/example/read-batch.c
+++ b/c_glib/example/read-batch.c
@@ -78,8 +78,9 @@ print_record_batch(GArrowRecordBatch *record_batch)
     g_print("columns[%u](%s): ",
             nth_column,
             garrow_record_batch_get_column_name(record_batch, nth_column));
-    array = garrow_record_batch_get_column(record_batch, nth_column);
+    array = garrow_record_batch_get_column_data(record_batch, nth_column);
     print_array(array);
+    g_object_unref(array);
   }
 }
diff --git a/c_glib/example/read-stream.c b/c_glib/example/read-stream.c
index 92411bcc780..133418faa90 100644
--- a/c_glib/example/read-stream.c
+++ b/c_glib/example/read-stream.c
@@ -78,8 +78,9 @@ print_record_batch(GArrowRecordBatch *record_batch)
     g_print("columns[%u](%s): ",
             nth_column,
             garrow_record_batch_get_column_name(record_batch, nth_column));
-    array = garrow_record_batch_get_column(record_batch, nth_column);
+    array = garrow_record_batch_get_column_data(record_batch, nth_column);
     print_array(array);
+    g_object_unref(array);
   }
 }
diff --git a/c_glib/parquet-glib/arrow-file-reader.cpp b/c_glib/parquet-glib/arrow-file-reader.cpp
index 5c16e827fc1..217bd190d51 100644
--- a/c_glib/parquet-glib/arrow-file-reader.cpp
+++ b/c_glib/parquet-glib/arrow-file-reader.cpp
@@ -22,6 +22,7 @@
 #endif
 
 #include
+#include <arrow-glib/internal-index.hpp>
 
 #include
 
@@ -252,7 +253,7 @@ gparquet_arrow_file_reader_get_schema(GParquetArrowFileReader *reader,
 /**
  * gparquet_arrow_file_reader_select_schema:
  * @reader: A #GParquetArrowFileReader.
  * @column_indexes: (array length=n_column_indexes):
- *   The array of column indexes to be selected
+ *   The array of column indexes to be selected.
  * @n_column_indexes: The length of `column_indexes`.
 * @error: (nullable): Return location for a #GError or %NULL.
 *
@@ -285,42 +286,44 @@ gparquet_arrow_file_reader_select_schema(GParquetArrowFileReader *reader,
 }
 
 /**
- * gparquet_arrow_file_reader_read_column:
+ * gparquet_arrow_file_reader_read_column_data:
  * @reader: A #GParquetArrowFileReader.
- * @column_index: Index integer of the column to be read.
+ * @i: The index of the column to be read. If it's negative, index is
+ * counted backward from the end of the columns. `-1` means the last
+ * column.
 * @error: (nullable): Return location for a #GError or %NULL.
 *
- * Returns: (transfer full) (nullable): A read #GArrowColumn.
+ * Returns: (transfer full) (nullable): A read #GArrowChunkedArray.
 *
- * Since: 0.12.0
+ * Since: 1.0.0
 */
-GArrowColumn *
-gparquet_arrow_file_reader_read_column(GParquetArrowFileReader *reader,
-                                       gint column_index,
-                                       GError **error)
+GArrowChunkedArray *
+gparquet_arrow_file_reader_read_column_data(GParquetArrowFileReader *reader,
+                                            gint i,
+                                            GError **error)
 {
+  const auto tag = "[parquet][arrow][file-reader][read-column-data]";
   auto parquet_arrow_file_reader = gparquet_arrow_file_reader_get_raw(reader);
-  std::vector<int> indices = {column_index};
-  std::shared_ptr<arrow::Schema> arrow_schema;
-  auto status = parquet_arrow_file_reader->GetSchema(indices, &arrow_schema);
-  if (!garrow_error_check(error,
-                          status,
-                          "[parquet][arrow][file-reader][read-column][get-schema]")) {
+  const auto n_columns =
+    parquet_arrow_file_reader->parquet_reader()->metadata()->num_columns();
+  if (!garrow_internal_index_adjust(i, n_columns)) {
+    garrow_error_check(error,
+                       arrow::Status::IndexError("Out of index: "
+                                                 "<0..", n_columns, ">: "
+                                                 "<", i, ">"),
+                       tag);
     return NULL;
   }
 
   std::shared_ptr<arrow::ChunkedArray> arrow_chunked_array;
-  status = parquet_arrow_file_reader->ReadColumn(column_index, &arrow_chunked_array);
-  if (!garrow_error_check(error,
-                          status,
-                          "[parquet][arrow][file-reader][read-column]")) {
+  auto status =
+    parquet_arrow_file_reader->ReadColumn(i, &arrow_chunked_array);
+  if (!garrow_error_check(error, status, tag)) {
     return NULL;
   }
 
-  auto arrow_field = arrow_schema->field(0);
-  auto arrow_column = std::make_shared<arrow::Column>(arrow_field, arrow_chunked_array);
-  return garrow_column_new_raw(&arrow_column);
+  return garrow_chunked_array_new_raw(&arrow_chunked_array);
 }
 
 /**
diff --git a/c_glib/parquet-glib/arrow-file-reader.h b/c_glib/parquet-glib/arrow-file-reader.h
index c251dcd0371..a0d1a8eca88 100644
--- a/c_glib/parquet-glib/arrow-file-reader.h
+++ b/c_glib/parquet-glib/arrow-file-reader.h
@@ -54,10 +54,11 @@ gparquet_arrow_file_reader_select_schema(GParquetArrowFileReader *reader,
                                          gsize n_column_indexes,
                                          GError **error);
 
-GArrowColumn *
-gparquet_arrow_file_reader_read_column(GParquetArrowFileReader *reader,
-                                       gint column_index,
-                                       GError **error);
+GARROW_AVAILABLE_IN_1_0
+GArrowChunkedArray *
+gparquet_arrow_file_reader_read_column_data(GParquetArrowFileReader *reader,
+                                            gint i,
+                                            GError **error);
 
 gint
 gparquet_arrow_file_reader_get_n_row_groups(GParquetArrowFileReader *reader);
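Reading a parquet column now yields the chunked array directly, with the same negative-index convention. A sketch, assuming `reader` was created with gparquet_arrow_file_reader_new_path() or similar:

    #include <parquet-glib/parquet-glib.h>

    static void
    read_last_column(GParquetArrowFileReader *reader)
    {
      GError *error = NULL;
      /* -1 selects the last column. */
      GArrowChunkedArray *data =
        gparquet_arrow_file_reader_read_column_data(reader, -1, &error);
      if (!data) {
        g_print("read failed: %s\n", error->message);
        g_error_free(error);
        return;
      }
      g_print("last column: %u chunks\n",
              garrow_chunked_array_get_n_chunks(data));
      g_object_unref(data);
    }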
diff --git a/c_glib/test/helper/buildable.rb b/c_glib/test/helper/buildable.rb
index 788cffe6b90..f5412a932c1 100644
--- a/c_glib/test/helper/buildable.rb
+++ b/c_glib/test/helper/buildable.rb
@@ -157,15 +157,15 @@ def append_to_builder(builder, value)
     end
   end
 
-  def build_table(arrays)
-    fields = arrays.collect do |name, array|
-      Arrow::Field.new(name, array.value_data_type)
+  def build_table(columns)
+    fields = []
+    arrays = []
+    columns.each do |name, array|
+      fields << Arrow::Field.new(name, array.value_data_type)
+      arrays << array
     end
     schema = Arrow::Schema.new(fields)
-    columns = arrays.collect.with_index do |(_name, array), i|
-      Arrow::Column.new(fields[i], array)
-    end
-    Arrow::Table.new(schema, columns)
+    Arrow::Table.new(schema, arrays)
   end
 
   def build_record_batch(arrays)
diff --git a/c_glib/test/parquet/test-arrow-file-reader.rb b/c_glib/test/parquet/test-arrow-file-reader.rb
index 96574542a4d..7ff17c2ba11 100644
--- a/c_glib/test/parquet/test-arrow-file-reader.rb
+++ b/c_glib/test/parquet/test-arrow-file-reader.rb
@@ -53,24 +53,13 @@ def test_select_schema
   end
 
   def test_read_column
-    a = @reader.read_column(0)
     assert_equal([
-                   "a: string",
-                   Arrow::ChunkedArray.new([@a_array]).to_s,
+                   Arrow::ChunkedArray.new([@a_array]),
+                   Arrow::ChunkedArray.new([@b_array]),
                  ],
                  [
-                   a.field.to_s,
-                   a.data.to_s,
-                 ])
-
-    b = @reader.read_column(1)
-    assert_equal([
-                   "b: int32",
-                   Arrow::ChunkedArray.new([@b_array]).to_s,
-                 ],
-                 [
-                   b.field.to_s,
-                   b.data.to_s,
+                   @reader.read_column_data(0),
+                   @reader.read_column_data(-1),
                  ])
   end
 end
diff --git a/c_glib/test/test-chunked-array.rb b/c_glib/test/test-chunked-array.rb
index 05ea66b561d..82b46968a0d 100644
--- a/c_glib/test/test-chunked-array.rb
+++ b/c_glib/test/test-chunked-array.rb
@@ -49,13 +49,13 @@ def test_value_type
                  Arrow::ChunkedArray.new(chunks).value_type)
   end
 
-  def test_length
+  def test_n_rows
     chunks = [
       build_boolean_array([true, false]),
       build_boolean_array([true]),
     ]
     chunked_array = Arrow::ChunkedArray.new(chunks)
-    assert_equal(3, chunked_array.length)
+    assert_equal(3, chunked_array.n_rows)
   end
 
   def test_n_nulls
diff --git a/c_glib/test/test-column.rb b/c_glib/test/test-column.rb
deleted file mode 100644
index 01127de6e02..00000000000
--- a/c_glib/test/test-column.rb
+++ /dev/null
@@ -1,115 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-class TestColumn < Test::Unit::TestCase
-  include Helper::Buildable
-
-  sub_test_case(".new") do
-    def test_array
-      field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new)
-      array = build_boolean_array([true])
-      column = Arrow::Column.new(field, array)
-      assert_equal(1, column.length)
-    end
-
-    def test_chunked_array
-      field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new)
-      chunks = [
-        build_boolean_array([true]),
-        build_boolean_array([false, true]),
-      ]
-      chunked_array = Arrow::ChunkedArray.new(chunks)
-      column = Arrow::Column.new(field, chunked_array)
-      assert_equal(3, column.length)
-    end
-  end
-
-  def test_equal
-    field1 = Arrow::Field.new("enabled", Arrow::BooleanDataType.new)
-    array1 = build_boolean_array([true, false])
-    field2 = Arrow::Field.new("enabled", Arrow::BooleanDataType.new)
-    chunks = [
-      build_boolean_array([true]),
-      build_boolean_array([false]),
-    ]
-    array2 = Arrow::ChunkedArray.new(chunks)
-    assert_equal(Arrow::Column.new(field1, array1),
-                 Arrow::Column.new(field2, array2))
-  end
-
-  def test_length
-    field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new)
-    array = build_boolean_array([true, false])
-    column = Arrow::Column.new(field, array)
-    assert_equal(2, column.length)
-  end
-
-  def test_n_nulls
-    field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new)
-    array = build_boolean_array([true, nil, nil])
-    column = Arrow::Column.new(field, array)
-    assert_equal(2, column.n_nulls)
-  end
-
-  def test_field
-    field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new)
-    array = build_boolean_array([true])
-    column = Arrow::Column.new(field, array)
-    assert_equal("enabled", column.field.name)
-  end
-
-  def test_name
-    field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new)
-    array = build_boolean_array([true])
-    column = Arrow::Column.new(field, array)
-    assert_equal("enabled", column.name)
-  end
-
-  def test_data_type
-    field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new)
-    array = build_boolean_array([true])
-    column = Arrow::Column.new(field, array)
-    assert_equal("bool", column.data_type.to_s)
-  end
-
-  def test_data
-    field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new)
-    chunks = [
-      build_boolean_array([true]),
-      build_boolean_array([false, true]),
-    ]
-    chunked_array = Arrow::ChunkedArray.new(chunks)
-    column = Arrow::Column.new(field, chunked_array)
-    assert_equal(3, column.data.length)
-  end
-
-  def test_slice
-    field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new)
-    chunks1 = [
-      build_boolean_array([true, false, true]),
-      build_boolean_array([false, true]),
-    ]
-    chunks2 = [
-      build_boolean_array([false, true]),
-      build_boolean_array([false]),
-    ]
-    chunked_array = Arrow::ChunkedArray.new(chunks1)
-    column = Arrow::Column.new(field, chunked_array)
-    sub_column = column.slice(1, 3)
-    assert_equal(chunks2, sub_column.data.chunks)
-  end
-end
diff --git a/c_glib/test/test-feather-file-reader.rb b/c_glib/test/test-feather-file-reader.rb
index 901b94d2151..48a4fc75488 100644
--- a/c_glib/test/test-feather-file-reader.rb
+++ b/c_glib/test/test-feather-file-reader.rb
@@ -113,45 +113,31 @@ def setup_file(data)
       "is_critical" => build_boolean_array([]),
     }
     setup_file(:columns => columns) do |reader|
+      actual_column_names = reader.n_columns.times.collect do |i|
+        reader.get_column_name(i)
+      end
       assert_equal([
                      "message",
                      "is_critical",
                    ],
-                   [
-                     reader.get_column_name(0),
-                     reader.get_column_name(1),
-                   ])
-    end
-  end
-
-  test("#get_column") do
-    columns = {
-      "message" => build_string_array([]),
-      "is_critical" => build_boolean_array([]),
-    }
-    setup_file(:columns => columns) do |reader|
-      assert_equal([
-                     "message",
-                     "is_critical",
-                   ],
-                   [
-                     reader.get_column(0).name,
-                     reader.get_column(1).name,
-                   ])
+                   actual_column_names)
     end
   end
 
-  test("#columns") do
+  test("#get_column_data") do
     columns = {
-      "message" => build_string_array([]),
-      "is_critical" => build_boolean_array([]),
+      "message" => build_string_array(["Hello"]),
+      "is_critical" => build_boolean_array([false]),
     }
     setup_file(:columns => columns) do |reader|
+      actual_columns = reader.n_columns.times.collect do |i|
+        reader.get_column_data(i).get_chunk(0)
+      end
       assert_equal([
-                     "message",
-                     "is_critical",
+                     columns["message"],
+                     columns["is_critical"],
                    ],
-                   reader.columns.collect(&:name))
+                   actual_columns)
     end
   end
diff --git a/c_glib/test/test-feather-file-writer.rb b/c_glib/test/test-feather-file-writer.rb
index 91dd1120939..247d937e93e 100644
--- a/c_glib/test/test-feather-file-writer.rb
+++ b/c_glib/test/test-feather-file-writer.rb
@@ -40,27 +40,31 @@ def test_append
     input = Arrow::MemoryMappedInputStream.new(tempfile.path)
     begin
       reader = Arrow::FeatherFileReader.new(input)
-      assert_equal([true, "Log"],
-                   [reader.has_description?, reader.description])
-      column_values = {}
-      reader.columns.each do |column|
-        values = []
-        column.data.chunks.each do |array|
-          array.length.times do |j|
-            if array.respond_to?(:get_string)
-              values << array.get_string(j)
-            else
-              values << array.get_value(j)
-            end
-          end
-        end
-        column_values[column.name] = values
+      columns = reader.n_columns.times.collect do |i|
+        [
+          reader.get_column_name(i),
+          reader.get_column_data(i).get_chunk(0),
+        ]
       end
-      assert_equal({
-                     "message" => ["Crash", "Error", "Shutdown"],
-                     "is_critical" => [true, true, false],
-                   },
-                   column_values)
+      assert_equal([
+                     true,
+                     "Log",
+                     [
+                       [
+                         "message",
+                         build_string_array(["Crash", "Error", "Shutdown"]),
+                       ],
+                       [
+                         "is_critical",
+                         build_boolean_array([true, true, false]),
+                       ],
+                     ],
+                   ],
+                   [
+                     reader.has_description?,
+                     reader.description,
+                     columns,
+                   ])
     ensure
       input.close
     end
diff --git a/c_glib/test/test-file-writer.rb b/c_glib/test/test-file-writer.rb
index 67aed85f73b..5f9c3c4e19a 100644
--- a/c_glib/test/test-file-writer.rb
+++ b/c_glib/test/test-file-writer.rb
@@ -60,12 +60,11 @@ def test_write_table
     array = build_boolean_array([true, false, true])
     field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new)
     schema = Arrow::Schema.new([field])
-    column = Arrow::Column.new(field, array)
 
     begin
       file_writer = Arrow::RecordBatchFileWriter.new(output, schema)
       begin
-        table = Arrow::Table.new(schema, [column])
+        table = Arrow::Table.new(schema, [array])
         file_writer.write_table(table)
       ensure
         file_writer.close
diff --git a/c_glib/test/test-record-batch.rb b/c_glib/test/test-record-batch.rb
index 23078d784ff..c9ac75000bc 100644
--- a/c_glib/test/test-record-batch.rb
+++ b/c_glib/test/test-record-batch.rb
@@ -87,31 +87,26 @@ def test_schema
                  @record_batch.schema.fields.collect(&:name))
   end
 
-  sub_test_case("#column") do
+  sub_test_case("#column_data") do
     def test_positive
       assert_equal(build_boolean_array(@valid_values),
-                   @record_batch.get_column(1))
+                   @record_batch.get_column_data(1))
     end
 
     def test_negative
       assert_equal(build_boolean_array(@visible_values),
-                   @record_batch.get_column(-2))
+                   @record_batch.get_column_data(-2))
     end
 
     def test_positive_out_of_index
-      assert_nil(@record_batch.get_column(2))
+      assert_nil(@record_batch.get_column_data(2))
    end
 
    def test_negative_out_of_index
-      assert_nil(@record_batch.get_column(-3))
+      assert_nil(@record_batch.get_column_data(-3))
    end
  end
 
-  def test_columns
-    assert_equal([5, 5],
-                 @record_batch.columns.collect(&:length))
-  end
-
   def test_n_columns
     assert_equal(2, @record_batch.n_columns)
   end
@@ -123,7 +118,7 @@ def test_n_rows
   def test_slice
     sub_record_batch = @record_batch.slice(3, 2)
     sub_visible_values = sub_record_batch.n_rows.times.collect do |i|
-      sub_record_batch.get_column(0).get_value(i)
+      sub_record_batch.get_column_data(0).get_value(i)
     end
     assert_equal([false, true],
                  sub_visible_values)
diff --git a/c_glib/test/test-schema.rb b/c_glib/test/test-schema.rb
index 4710cfb149d..6ff5514afdc 100644
--- a/c_glib/test/test-schema.rb
+++ b/c_glib/test/test-schema.rb
@@ -47,6 +47,20 @@ def test_not_found
     end
   end
 
+  sub_test_case("#get_field_index") do
+    def test_found
+      field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new)
+      schema = Arrow::Schema.new([field])
+      assert_equal(0, schema.get_field_index("enabled"))
+    end
+
+    def test_not_found
+      field = Arrow::Field.new("enabled", Arrow::BooleanDataType.new)
+      schema = Arrow::Schema.new([field])
+      assert_equal(-1, schema.get_field_index("nonexistent"))
+    end
+  end
+
   def test_n_fields
     fields = [
       Arrow::Field.new("enabled", Arrow::BooleanDataType.new),
diff --git a/c_glib/test/test-table.rb b/c_glib/test/test-table.rb
index 54ba7392dae..9e46a4ba78a 100644
--- a/c_glib/test/test-table.rb
+++ b/c_glib/test/test-table.rb
@@ -30,26 +30,28 @@ def setup
 
   def dump_table(table)
     table.n_columns.times.collect do |i|
-      column = table.get_column(i)
+      field = table.schema.get_field(i)
+      chunked_array = table.get_column_data(i)
       values = []
-      column.data.chunks.each do |chunk|
+      chunked_array.chunks.each do |chunk|
         chunk.length.times do |j|
           values << chunk.get_value(j)
         end
       end
       [
-        column.name,
+        field.name,
         values,
       ]
     end
   end
 
-  def test_columns
-    columns = [
-      Arrow::Column.new(@fields[0], build_boolean_array([true])),
-      Arrow::Column.new(@fields[1], build_boolean_array([false])),
+  def test_arrays
+    require_gi_bindings(3, 3, 1)
+    arrays = [
+      build_boolean_array([true]),
+      build_boolean_array([false]),
     ]
-    table = Arrow::Table.new(@schema, columns)
+    table = Arrow::Table.new(@schema, arrays)
     assert_equal([
                    ["visible", [true]],
                    ["valid", [false]],
                  ],
                  dump_table(table))
   end
 
-  def test_arrays
+  def test_chunked_arrays
     require_gi_bindings(3, 3, 1)
     arrays = [
-      build_boolean_array([true]),
-      build_boolean_array([false]),
+      Arrow::ChunkedArray.new([build_boolean_array([true]),
+                               build_boolean_array([false])]),
+      Arrow::ChunkedArray.new([build_boolean_array([false]),
+                               build_boolean_array([true])]),
     ]
     table = Arrow::Table.new(@schema, arrays)
     assert_equal([
-                   ["visible", [true]],
-                   ["valid", [false]],
+                   ["visible", [true, false]],
+                   ["valid", [false, true]],
                  ],
                  dump_table(table))
   end
@@ -101,8 +105,8 @@ def setup
     ]
     schema = Arrow::Schema.new(fields)
     columns = [
-      Arrow::Column.new(fields[0], build_boolean_array([true])),
-      Arrow::Column.new(fields[1], build_boolean_array([false])),
+      build_boolean_array([true]),
+      build_boolean_array([false]),
     ]
     @table = Arrow::Table.new(schema, columns)
   end
@@ -114,8 +118,8 @@ def test_equal
     fields = [
       Arrow::Field.new("visible", Arrow::BooleanDataType.new),
      Arrow::Field.new("valid", Arrow::BooleanDataType.new),
    ]
    schema = Arrow::Schema.new(fields)
    columns = [
-      Arrow::Column.new(fields[0], build_boolean_array([true])),
-      Arrow::Column.new(fields[1], build_boolean_array([false])),
+      build_boolean_array([true]),
+      build_boolean_array([false]),
    ]
    other_table = Arrow::Table.new(schema, columns)
    assert_equal(@table, other_table)
  end
@@ -126,8 +130,15 @@ def
test_schema @table.schema.fields.collect(&:name)) end - def test_column - assert_equal("valid", @table.get_column(1).name) + def test_column_data + assert_equal([ + Arrow::ChunkedArray.new([build_boolean_array([true])]), + Arrow::ChunkedArray.new([build_boolean_array([false])]), + ], + [ + @table.get_column_data(0), + @table.get_column_data(-1), + ]) end def test_n_columns @@ -140,8 +151,8 @@ def test_n_rows def test_add_column field = Arrow::Field.new("added", Arrow::BooleanDataType.new) - column = Arrow::Column.new(field, build_boolean_array([true])) - new_table = @table.add_column(1, column) + chunked_array = Arrow::ChunkedArray.new([build_boolean_array([true])]) + new_table = @table.add_column(1, field, chunked_array) assert_equal(["visible", "added", "valid"], new_table.schema.fields.collect(&:name)) end @@ -154,8 +165,8 @@ def test_remove_column def test_replace_column field = Arrow::Field.new("added", Arrow::BooleanDataType.new) - column = Arrow::Column.new(field, build_boolean_array([true])) - new_table = @table.replace_column(0, column) + chunked_array = Arrow::ChunkedArray.new([build_boolean_array([true])]) + new_table = @table.replace_column(0, field, chunked_array) assert_equal(["added", "valid"], new_table.schema.fields.collect(&:name)) end diff --git a/cpp/examples/arrow/row-wise-conversion-example.cc b/cpp/examples/arrow/row-wise-conversion-example.cc index db8c28753db..c6e45d0d41e 100644 --- a/cpp/examples/arrow/row-wise-conversion-example.cc +++ b/cpp/examples/arrow/row-wise-conversion-example.cc @@ -139,11 +139,11 @@ arrow::Status ColumnarTableToVector(const std::shared_ptr& table, // border would be inside a byte. auto ids = - std::static_pointer_cast(table->column(0)->data()->chunk(0)); + std::static_pointer_cast(table->column(0)->chunk(0)); auto costs = - std::static_pointer_cast(table->column(1)->data()->chunk(0)); + std::static_pointer_cast(table->column(1)->chunk(0)); auto cost_components = - std::static_pointer_cast(table->column(2)->data()->chunk(0)); + std::static_pointer_cast(table->column(2)->chunk(0)); auto cost_components_values = std::static_pointer_cast(cost_components->values()); // To enable zero-copy slices, the native values pointer might need to account diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 68ab60c31b3..10f067e187b 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -370,7 +370,6 @@ add_arrow_test(tensor-test) add_arrow_test(sparse_tensor-test) add_arrow_benchmark(builder-benchmark) -add_arrow_benchmark(column-benchmark) add_subdirectory(array) add_subdirectory(csv) diff --git a/cpp/src/arrow/column-benchmark.cc b/cpp/src/arrow/column-benchmark.cc deleted file mode 100644 index bb2c63179ab..00000000000 --- a/cpp/src/arrow/column-benchmark.cc +++ /dev/null @@ -1,57 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "benchmark/benchmark.h" - -#include "arrow/array.h" -#include "arrow/memory_pool.h" -#include "arrow/table.h" -#include "arrow/testing/gtest_util.h" - -namespace arrow { -namespace { -template -Status MakePrimitive(int64_t length, int64_t null_count, std::shared_ptr* out) { - std::shared_ptr data, null_bitmap; - - RETURN_NOT_OK(AllocateBuffer(length * sizeof(typename ArrayType::value_type), &data)); - RETURN_NOT_OK(AllocateBuffer(BitUtil::BytesForBits(length), &null_bitmap)); - - *out = std::make_shared(length, data, null_bitmap, null_count); - return Status::OK(); -} -} // anonymous namespace - -static void BuildInt32ColumnByChunk( - benchmark::State& state) { // NOLINT non-const reference - ArrayVector arrays; - for (int chunk_n = 0; chunk_n < state.range(0); ++chunk_n) { - std::shared_ptr array; - ABORT_NOT_OK(MakePrimitive(100, 10, &array)); - arrays.push_back(array); - } - const auto INT32 = std::make_shared(); - const auto field = std::make_shared("c0", INT32); - std::unique_ptr column; - while (state.KeepRunning()) { - column.reset(new Column(field, arrays)); - } -} - -BENCHMARK(BuildInt32ColumnByChunk)->Range(5, 50000); - -} // namespace arrow diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc index 4731ed11a06..ec4d179fb9c 100644 --- a/cpp/src/arrow/csv/reader.cc +++ b/cpp/src/arrow/csv/reader.cc @@ -206,13 +206,13 @@ class BaseTableReader : public csv::TableReader { DCHECK_EQ(column_builders_.size(), static_cast(num_cols_)); std::vector> fields; - std::vector> columns; + std::vector> columns; for (int32_t i = 0; i < num_cols_; ++i) { std::shared_ptr array; RETURN_NOT_OK(column_builders_[i]->Finish(&array)); - columns.push_back(std::make_shared(column_names_[i], array)); - fields.push_back(columns.back()->field()); + fields.push_back(::arrow::field(column_names_[i], array->type())); + columns.emplace_back(std::move(array)); } *out = Table::Make(schema(fields), columns); return Status::OK(); diff --git a/cpp/src/arrow/ipc/feather-test.cc b/cpp/src/arrow/ipc/feather-test.cc index 001e36ac0df..af68a5dc929 100644 --- a/cpp/src/arrow/ipc/feather-test.cc +++ b/cpp/src/arrow/ipc/feather-test.cc @@ -425,14 +425,14 @@ TEST_F(TestTableWriter, PrimitiveRoundTrip) { ASSERT_OK(writer_->Append("f1", *batch->column(1))); Finish(); - std::shared_ptr col; + std::shared_ptr col; ASSERT_OK(reader_->GetColumn(0, &col)); - ASSERT_TRUE(col->data()->chunk(0)->Equals(batch->column(0))); - ASSERT_EQ("f0", col->name()); + ASSERT_TRUE(col->chunk(0)->Equals(batch->column(0))); + ASSERT_EQ("f0", reader_->GetColumnName(0)); ASSERT_OK(reader_->GetColumn(1, &col)); - ASSERT_TRUE(col->data()->chunk(0)->Equals(batch->column(1))); - ASSERT_EQ("f1", col->name()); + ASSERT_TRUE(col->chunk(0)->Equals(batch->column(1))); + ASSERT_EQ("f1", reader_->GetColumnName(1)); } TEST_F(TestTableWriter, CategoryRoundtrip) { @@ -502,14 +502,14 @@ TEST_F(TestTableWriter, PrimitiveNullRoundTrip) { } Finish(); - std::shared_ptr col; + std::shared_ptr col; for (int i = 0; i < batch->num_columns(); ++i) { ASSERT_OK(reader_->GetColumn(i, &col)); - ASSERT_EQ(batch->column_name(i), col->name()); + ASSERT_EQ(batch->column_name(i), reader_->GetColumnName(i)); StringArray str_values(batch->column(i)->length(), nullptr, nullptr, batch->column(i)->null_bitmap(), batch->column(i)->null_count()); - CheckArrays(str_values, *col->data()->chunk(0)); + CheckArrays(str_values, *col->chunk(0)); } } @@ -527,14 
+527,14 @@ class TestTableWriterSlice : public TestTableWriter, ASSERT_OK(writer_->Append("f1", *batch->column(1))); Finish(); - std::shared_ptr col; + std::shared_ptr col; ASSERT_OK(reader_->GetColumn(0, &col)); - ASSERT_TRUE(col->data()->chunk(0)->Equals(batch->column(0))); - ASSERT_EQ("f0", col->name()); + ASSERT_TRUE(col->chunk(0)->Equals(batch->column(0))); + ASSERT_EQ("f0", reader_->GetColumnName(0)); ASSERT_OK(reader_->GetColumn(1, &col)); - ASSERT_TRUE(col->data()->chunk(0)->Equals(batch->column(1))); - ASSERT_EQ("f1", col->name()); + ASSERT_TRUE(col->chunk(0)->Equals(batch->column(1))); + ASSERT_EQ("f1", reader_->GetColumnName(1)); } }; diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc index 5965d361631..7cd64c8d78a 100644 --- a/cpp/src/arrow/ipc/feather.cc +++ b/cpp/src/arrow/ipc/feather.cc @@ -434,7 +434,7 @@ class TableReader::TableReaderImpl { return col_meta->name()->str(); } - Status GetColumn(int i, std::shared_ptr* out) { + Status GetColumn(int i, std::shared_ptr* out) { const fbs::Column* col_meta = metadata_->column(i); // auto user_meta = column->user_metadata(); @@ -443,18 +443,18 @@ class TableReader::TableReaderImpl { std::shared_ptr values; RETURN_NOT_OK(LoadValues(col_meta->values(), col_meta->metadata_type(), col_meta->metadata(), &values)); - out->reset(new Column(col_meta->name()->str(), values)); + *out = std::make_shared(values); return Status::OK(); } Status Read(std::shared_ptr* out) { std::vector> fields; - std::vector> columns; + std::vector> columns; for (int i = 0; i < num_columns(); ++i) { - std::shared_ptr column; + std::shared_ptr column; RETURN_NOT_OK(GetColumn(i, &column)); columns.push_back(column); - fields.push_back(column->field()); + fields.push_back(::arrow::field(GetColumnName(i), column->type())); } *out = Table::Make(schema(fields), columns); return Status::OK(); @@ -462,7 +462,7 @@ class TableReader::TableReaderImpl { Status Read(const std::vector& indices, std::shared_ptr
* out) { std::vector> fields; - std::vector> columns; + std::vector> columns; for (int i = 0; i < num_columns(); ++i) { bool found = false; for (auto j : indices) { @@ -474,10 +474,10 @@ class TableReader::TableReaderImpl { if (!found) { continue; } - std::shared_ptr column; + std::shared_ptr column; RETURN_NOT_OK(GetColumn(i, &column)); columns.push_back(column); - fields.push_back(column->field()); + fields.push_back(::arrow::field(GetColumnName(i), column->type())); } *out = Table::Make(schema(fields), columns); return Status::OK(); @@ -485,7 +485,7 @@ class TableReader::TableReaderImpl { Status Read(const std::vector& names, std::shared_ptr
<Table>* out) {
     std::vector<std::shared_ptr<Field>> fields;
-    std::vector<std::shared_ptr<Column>> columns;
+    std::vector<std::shared_ptr<ChunkedArray>> columns;
     for (int i = 0; i < num_columns(); ++i) {
       auto name = GetColumnName(i);
       bool found = false;
       for (auto j : names) {
@@ -498,10 +498,10 @@
       if (!found) {
         continue;
       }
-      std::shared_ptr<Column> column;
+      std::shared_ptr<ChunkedArray> column;
       RETURN_NOT_OK(GetColumn(i, &column));
       columns.push_back(column);
-      fields.push_back(column->field());
+      fields.push_back(::arrow::field(name, column->type()));
     }
     *out = Table::Make(schema(fields), columns);
     return Status::OK();
@@ -539,7 +539,7 @@
 int64_t TableReader::num_columns() const { return impl_->num_columns(); }

 std::string TableReader::GetColumnName(int i) const { return impl_->GetColumnName(i); }

-Status TableReader::GetColumn(int i, std::shared_ptr<Column>* out) {
+Status TableReader::GetColumn(int i, std::shared_ptr<ChunkedArray>* out) {
   return impl_->GetColumn(i, out);
 }

@@ -813,9 +813,8 @@ class TableWriter::TableWriterImpl : public ArrayVisitor {
   Status Write(const Table& table) {
     for (int i = 0; i < table.num_columns(); ++i) {
       auto column = table.column(i);
-      current_column_ = metadata_.AddColumn(column->name());
-      auto chunked_array = column->data();
-      for (const auto chunk : chunked_array->chunks()) {
+      current_column_ = metadata_.AddColumn(table.field(i)->name());
+      for (const auto chunk : column->chunks()) {
         RETURN_NOT_OK(chunk->Accept(this));
       }
       RETURN_NOT_OK(current_column_->Finish());
diff --git a/cpp/src/arrow/ipc/feather.h b/cpp/src/arrow/ipc/feather.h
index b6bd4ff5e5b..c4b5f6b2718 100644
--- a/cpp/src/arrow/ipc/feather.h
+++ b/cpp/src/arrow/ipc/feather.h
@@ -31,7 +31,7 @@ namespace arrow {

 class Array;
-class Column;
+class ChunkedArray;
 class Status;
 class Table;

@@ -84,14 +84,14 @@ class ARROW_EXPORT TableReader {

   std::string GetColumnName(int i) const;

-  /// \brief Read a column from the file as an arrow::Column.
+  /// \brief Read a column from the file as an arrow::ChunkedArray.
   ///
   /// \param[in] i the column index to read
   /// \param[out] out the returned column
   /// \return Status
   ///
   /// This function is zero-copy if the file source supports zero-copy reads
-  Status GetColumn(int i, std::shared_ptr<Column>* out);
+  Status GetColumn(int i, std::shared_ptr<ChunkedArray>* out);

   /// \brief Read all columns from the file as an arrow::Table.
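// A usage sketch for the new signature (illustration only, not part of this
// patch; `reader` is an assumed, already-opened feather::TableReader): the
// caller now receives the chunked data directly and fetches the name
// separately, e.g.
//
//   std::shared_ptr<ChunkedArray> data;
//   RETURN_NOT_OK(reader->GetColumn(0, &data));
//   std::string name = reader->GetColumnName(0);
//   // a Field, when still needed, can be rebuilt via:
//   // auto field = ::arrow::field(name, data->type());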
/// diff --git a/cpp/src/arrow/json/reader-test.cc b/cpp/src/arrow/json/reader-test.cc index f538ce743de..b6b21ce6868 100644 --- a/cpp/src/arrow/json/reader-test.cc +++ b/cpp/src/arrow/json/reader-test.cc @@ -62,18 +62,13 @@ class ReaderTest : public ::testing::TestWithParam { SetUpReader(); } - std::shared_ptr ColumnFromJSON(const std::shared_ptr& field, - const std::string& data) { - return std::make_shared(field, ArrayFromJSON(field->type(), data)); - } - - std::shared_ptr ColumnFromJSON(const std::shared_ptr& field, - const std::vector& data) { + std::shared_ptr ChunkedFromJSON(const std::shared_ptr& field, + const std::vector& data) { ArrayVector chunks(data.size()); for (size_t i = 0; i < chunks.size(); ++i) { chunks[i] = ArrayFromJSON(field->type(), data[i]); } - return std::make_shared(field, std::move(chunks)); + return std::make_shared(std::move(chunks)); } ParseOptions parse_options_ = ParseOptions::Defaults(); @@ -99,11 +94,16 @@ TEST_P(ReaderTest, Basics) { SetUpReader(src); ASSERT_OK(reader_->Read(&table_)); - auto expected_table = Table::Make({ - ColumnFromJSON(field("hello", float64()), "[3.5, 3.25, 3.125, 0.0]"), - ColumnFromJSON(field("world", boolean()), "[false, null, null, true]"), - ColumnFromJSON(field("yo", utf8()), "[\"thing\", null, \"\xe5\xbf\x8d\", null]"), - }); + auto schema = ::arrow::schema( + {field("hello", float64()), field("world", boolean()), field("yo", utf8())}); + + auto expected_table = Table::Make( + schema, { + ArrayFromJSON(schema->field(0)->type(), "[3.5, 3.25, 3.125, 0.0]"), + ArrayFromJSON(schema->field(1)->type(), "[false, null, null, true]"), + ArrayFromJSON(schema->field(2)->type(), + "[\"thing\", null, \"\xe5\xbf\x8d\", null]"), + }); AssertTablesEqual(*expected_table, *table_); } @@ -113,14 +113,18 @@ TEST_P(ReaderTest, Nested) { SetUpReader(src); ASSERT_OK(reader_->Read(&table_)); - auto expected_table = Table::Make({ - ColumnFromJSON(field("hello", float64()), "[3.5, 3.25, 3.125, 0.0]"), - ColumnFromJSON(field("world", boolean()), "[false, null, null, true]"), - ColumnFromJSON(field("yo", utf8()), "[\"thing\", null, \"\xe5\xbf\x8d\", null]"), - ColumnFromJSON(field("arr", list(int64())), R"([[1, 2, 3], [2], [], null])"), - ColumnFromJSON(field("nuf", struct_({field("ps", int64())})), - R"([{"ps":null}, null, {"ps":78}, {"ps":90}])"), - }); + auto schema = ::arrow::schema({field("hello", float64()), field("world", boolean()), + field("yo", utf8()), field("arr", list(int64())), + field("nuf", struct_({field("ps", int64())}))}); + + auto a0 = ArrayFromJSON(schema->field(0)->type(), "[3.5, 3.25, 3.125, 0.0]"); + auto a1 = ArrayFromJSON(schema->field(1)->type(), "[false, null, null, true]"); + auto a2 = ArrayFromJSON(schema->field(2)->type(), + "[\"thing\", null, \"\xe5\xbf\x8d\", null]"); + auto a3 = ArrayFromJSON(schema->field(3)->type(), "[[1, 2, 3], [2], [], null]"); + auto a4 = ArrayFromJSON(schema->field(4)->type(), + R"([{"ps":null}, null, {"ps":78}, {"ps":90}])"); + auto expected_table = Table::Make(schema, {a0, a1, a2, a3, a4}); AssertTablesEqual(*expected_table, *table_); } @@ -133,17 +137,25 @@ TEST_P(ReaderTest, PartialSchema) { SetUpReader(src); ASSERT_OK(reader_->Read(&table_)); - auto expected_table = Table::Make({ - // NB: explicitly declared fields will appear first - ColumnFromJSON( - field("nuf", struct_({field("absent", date32()), field("ps", int64())})), - R"([{"absent":null,"ps":null}, null, {"absent":null,"ps":78}, {"absent":null,"ps":90}])"), - ColumnFromJSON(field("arr", list(float32())), R"([[1, 2, 3], [2], [], 
null])"), - // ...followed by undeclared fields - ColumnFromJSON(field("hello", float64()), "[3.5, 3.25, 3.125, 0.0]"), - ColumnFromJSON(field("world", boolean()), "[false, null, null, true]"), - ColumnFromJSON(field("yo", utf8()), "[\"thing\", null, \"\xe5\xbf\x8d\", null]"), - }); + auto schema = ::arrow::schema( + {field("nuf", struct_({field("absent", date32()), field("ps", int64())})), + field("arr", list(float32())), field("hello", float64()), + field("world", boolean()), field("yo", utf8())}); + + auto expected_table = Table::Make( + schema, + { + // NB: explicitly declared fields will appear first + ArrayFromJSON( + schema->field(0)->type(), + R"([{"absent":null,"ps":null}, null, {"absent":null,"ps":78}, {"absent":null,"ps":90}])"), + ArrayFromJSON(schema->field(1)->type(), R"([[1, 2, 3], [2], [], null])"), + // ...followed by undeclared fields + ArrayFromJSON(schema->field(2)->type(), "[3.5, 3.25, 3.125, 0.0]"), + ArrayFromJSON(schema->field(3)->type(), "[false, null, null, true]"), + ArrayFromJSON(schema->field(4)->type(), + "[\"thing\", null, \"\xe5\xbf\x8d\", null]"), + }); AssertTablesEqual(*expected_table, *table_); } @@ -156,14 +168,16 @@ TEST_P(ReaderTest, TypeInference) { )"); ASSERT_OK(reader_->Read(&table_)); - auto expected_table = - Table::Make({ColumnFromJSON(field("ts", timestamp(TimeUnit::SECOND)), - R"([null, "1970-01-01", "2018-11-13 17:11:10"])"), - ColumnFromJSON(field("f", float64()), R"([null, 3, 3.125])")}); + auto schema = + ::arrow::schema({field("ts", timestamp(TimeUnit::SECOND)), field("f", float64())}); + auto expected_table = Table::Make( + schema, {ArrayFromJSON(schema->field(0)->type(), + R"([null, "1970-01-01", "2018-11-13 17:11:10"])"), + ArrayFromJSON(schema->field(1)->type(), R"([null, 3, 3.125])")}); AssertTablesEqual(*expected_table, *table_); } -TEST_P(ReaderTest, MutlipleChunks) { +TEST_P(ReaderTest, MultipleChunks) { parse_options_.unexpected_field_behavior = UnexpectedFieldBehavior::InferType; auto src = scalars_only_src(); @@ -172,15 +186,18 @@ TEST_P(ReaderTest, MutlipleChunks) { SetUpReader(src); ASSERT_OK(reader_->Read(&table_)); + auto schema = ::arrow::schema( + {field("hello", float64()), field("world", boolean()), field("yo", utf8())}); + // there is an empty chunk because the last block of the file is " " - auto expected_table = Table::Make({ - ColumnFromJSON(field("hello", float64()), - {"[3.5]", "[3.25]", "[3.125, 0.0]", "[]"}), - ColumnFromJSON(field("world", boolean()), - {"[false]", "[null]", "[null, true]", "[]"}), - ColumnFromJSON(field("yo", utf8()), - {"[\"thing\"]", "[null]", "[\"\xe5\xbf\x8d\", null]", "[]"}), - }); + auto expected_table = Table::Make( + schema, + { + ChunkedFromJSON(schema->field(0), {"[3.5]", "[3.25]", "[3.125, 0.0]", "[]"}), + ChunkedFromJSON(schema->field(1), {"[false]", "[null]", "[null, true]", "[]"}), + ChunkedFromJSON(schema->field(2), + {"[\"thing\"]", "[null]", "[\"\xe5\xbf\x8d\", null]", "[]"}), + }); AssertTablesEqual(*expected_table, *table_); } @@ -225,7 +242,7 @@ TEST(ReaderTest, MultipleChunksParallel) { ASSERT_EQ(serial->column(0)->type()->id(), Type::INT64); int expected = 0; - for (auto chunk : serial->column(0)->data()->chunks()) { + for (auto chunk : serial->column(0)->chunks()) { for (int64_t i = 0; i < chunk->length(); ++i) { ASSERT_EQ(checked_cast(chunk.get())->GetView(i), expected) << " at index " << i; diff --git a/cpp/src/arrow/pretty_print-test.cc b/cpp/src/arrow/pretty_print-test.cc index 7600ab41f54..c77a92b7fa6 100644 --- a/cpp/src/arrow/pretty_print-test.cc +++ 
b/cpp/src/arrow/pretty_print-test.cc @@ -567,51 +567,10 @@ TEST_F(TestPrettyPrint, ChunkedArrayPrimitiveType) { CheckStream(chunked_array_2, {0}, expected_2); } -TEST_F(TestPrettyPrint, ColumnPrimitiveType) { - std::shared_ptr int_field = field("column", int32()); - auto array = ArrayFromJSON(int_field->type(), "[0, 1, null, 3, null]"); - Column column(int_field, ArrayVector({array})); - - static const char* expected = R"expected(column: int32 -[ - [ - 0, - 1, - null, - 3, - null - ] -])expected"; - CheckStream(column, {0}, expected); - - Column column_2(int_field, {array, array}); - - static const char* expected_2 = R"expected(column: int32 -[ - [ - 0, - 1, - null, - 3, - null - ], - [ - 0, - 1, - null, - 3, - null - ] -])expected"; - - CheckStream(column_2, {0}, expected_2); -} - TEST_F(TestPrettyPrint, TablePrimitive) { std::shared_ptr int_field = field("column", int32()); auto array = ArrayFromJSON(int_field->type(), "[0, 1, null, 3, null]"); - std::shared_ptr column = - std::make_shared(int_field, ArrayVector({array})); + auto column = std::make_shared(ArrayVector({array})); std::shared_ptr table_schema = schema({int_field}); std::shared_ptr
table = Table::Make(table_schema, {column}); diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index cb67b0dcf95..6caef1714bf 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -510,16 +510,6 @@ Status PrettyPrint(const ChunkedArray& chunked_arr, const PrettyPrintOptions& op return Status::OK(); } -Status PrettyPrint(const Column& column, const PrettyPrintOptions& options, - std::ostream* sink) { - for (int i = 0; i < options.indent; ++i) { - (*sink) << " "; - } - (*sink) << column.field()->ToString() << "\n"; - - return PrettyPrint(*column.data(), options, sink); -} - Status PrettyPrint(const ChunkedArray& chunked_arr, const PrettyPrintOptions& options, std::string* result) { std::ostringstream sink; @@ -552,7 +542,7 @@ Status PrettyPrint(const Table& table, const PrettyPrintOptions& options, (*sink) << " "; } (*sink) << table.schema()->field(i)->name() << ":\n"; - RETURN_NOT_OK(PrettyPrint(*table.column(i)->data(), column_options, sink)); + RETURN_NOT_OK(PrettyPrint(*table.column(i), column_options, sink)); (*sink) << "\n"; } (*sink) << std::flush; diff --git a/cpp/src/arrow/pretty_print.h b/cpp/src/arrow/pretty_print.h index 9c2708f16ee..5740341a67d 100644 --- a/cpp/src/arrow/pretty_print.h +++ b/cpp/src/arrow/pretty_print.h @@ -26,7 +26,6 @@ namespace arrow { class Array; -class Column; class ChunkedArray; class RecordBatch; class Schema; @@ -91,11 +90,6 @@ ARROW_EXPORT Status PrettyPrint(const ChunkedArray& chunked_arr, const PrettyPrintOptions& options, std::string* result); -/// \brief Print human-readable representation of Column -ARROW_EXPORT -Status PrettyPrint(const Column& column, const PrettyPrintOptions& options, - std::ostream* sink); - ARROW_EXPORT Status PrettyPrint(const Schema& schema, const PrettyPrintOptions& options, std::ostream* sink); diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 59bdb17c896..f4f35acba93 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -216,7 +216,7 @@ class PandasBlock { virtual ~PandasBlock() {} virtual Status Allocate() = 0; - virtual Status Write(const std::shared_ptr& col, int64_t abs_placement, + virtual Status Write(const std::shared_ptr& data, int64_t abs_placement, int64_t rel_placement) = 0; PyObject* block_arr() const { return block_arr_.obj(); } @@ -547,25 +547,21 @@ inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& da } template -inline Status ConvertListsLike(const PandasOptions& options, - const std::shared_ptr& col, +inline Status ConvertListsLike(const PandasOptions& options, const ChunkedArray& data, PyObject** out_values) { - const ChunkedArray& data = *col->data().get(); - const auto& list_type = checked_cast(*col->type()); - // Get column of underlying value arrays std::vector> value_arrays; for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = checked_cast(*data.chunk(c)); value_arrays.emplace_back(arr.values()); } - auto flat_column = std::make_shared(list_type.value_field(), value_arrays); + auto flat_column = std::make_shared(value_arrays); // TODO(ARROW-489): Currently we don't have a Python reference for single columns. // Storing a reference to the whole Array would be to expensive. 
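// (Sketch of the pattern used above, with assumed chunks chunk_a/chunk_b;
//  the flattened column is now a bare ChunkedArray:
//    ArrayVector values = {chunk_a, chunk_b};
//    auto flat = std::make_shared<ChunkedArray>(values);
//  the removed Column constructor additionally required
//  list_type.value_field().)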
OwnedRefNoGIL owned_numpy_array; - RETURN_NOT_OK( - ConvertColumnToPandas(options, flat_column, nullptr, owned_numpy_array.ref())); + RETURN_NOT_OK(ConvertChunkedArrayToPandas(options, flat_column, nullptr, + owned_numpy_array.ref())); PyObject* numpy_array = owned_numpy_array.obj(); @@ -709,9 +705,9 @@ static Status ConvertDecimals(const PandasOptions& options, const ChunkedArray& return Status::OK(); } -#define CONVERTLISTSLIKE_CASE(ArrowType, ArrowEnum) \ - case Type::ArrowEnum: \ - RETURN_NOT_OK((ConvertListsLike(options_, col, out_buffer))); \ +#define CONVERTLISTSLIKE_CASE(ArrowType, ArrowEnum) \ + case Type::ArrowEnum: \ + RETURN_NOT_OK((ConvertListsLike(options_, *data, out_buffer))); \ break; class ObjectBlock : public PandasBlock { @@ -719,53 +715,51 @@ class ObjectBlock : public PandasBlock { using PandasBlock::PandasBlock; Status Allocate() override { return AllocateNDArray(NPY_OBJECT); } - Status Write(const std::shared_ptr& col, int64_t abs_placement, + Status Write(const std::shared_ptr& data, int64_t abs_placement, int64_t rel_placement) override { - Type::type type = col->type()->id(); + Type::type type = data->type()->id(); PyObject** out_buffer = reinterpret_cast(block_data_) + rel_placement * num_rows_; - const ChunkedArray& data = *col->data().get(); - if (type == Type::BOOL) { - RETURN_NOT_OK(ConvertBooleanWithNulls(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertBooleanWithNulls(options_, *data, out_buffer)); } else if (type == Type::UINT8) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, *data, out_buffer)); } else if (type == Type::INT8) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, *data, out_buffer)); } else if (type == Type::UINT16) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, *data, out_buffer)); } else if (type == Type::INT16) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, *data, out_buffer)); } else if (type == Type::UINT32) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, *data, out_buffer)); } else if (type == Type::INT32) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, *data, out_buffer)); } else if (type == Type::UINT64) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, *data, out_buffer)); } else if (type == Type::INT64) { - RETURN_NOT_OK(ConvertIntegerObjects(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertIntegerObjects(options_, *data, out_buffer)); } else if (type == Type::BINARY) { - RETURN_NOT_OK(ConvertBinaryLike(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertBinaryLike(options_, *data, out_buffer)); } else if (type == Type::STRING) { - RETURN_NOT_OK(ConvertBinaryLike(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertBinaryLike(options_, *data, out_buffer)); } else if (type == Type::FIXED_SIZE_BINARY) { - RETURN_NOT_OK(ConvertBinaryLike(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertBinaryLike(options_, *data, out_buffer)); } else if (type == Type::DATE32) { - RETURN_NOT_OK(ConvertDates(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertDates(options_, *data, out_buffer)); } else if (type == Type::DATE64) { - 
RETURN_NOT_OK(ConvertDates(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertDates(options_, *data, out_buffer)); } else if (type == Type::TIME32) { - RETURN_NOT_OK(ConvertTimes(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertTimes(options_, *data, out_buffer)); } else if (type == Type::TIME64) { - RETURN_NOT_OK(ConvertTimes(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertTimes(options_, *data, out_buffer)); } else if (type == Type::DECIMAL) { - RETURN_NOT_OK(ConvertDecimals(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertDecimals(options_, *data, out_buffer)); } else if (type == Type::NA) { - RETURN_NOT_OK(ConvertNulls(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertNulls(options_, *data, out_buffer)); } else if (type == Type::LIST) { - auto list_type = std::static_pointer_cast(col->type()); + auto list_type = std::static_pointer_cast(data->type()); switch (list_type->value_type()->id()) { CONVERTLISTSLIKE_CASE(BooleanType, BOOL) CONVERTLISTSLIKE_CASE(UInt8Type, UINT8) @@ -795,10 +789,10 @@ class ObjectBlock : public PandasBlock { } } } else if (type == Type::STRUCT) { - RETURN_NOT_OK(ConvertStruct(options_, data, out_buffer)); + RETURN_NOT_OK(ConvertStruct(options_, *data, out_buffer)); } else { return Status::NotImplemented("Unsupported type for object array output: ", - col->type()->ToString()); + data->type()->ToString()); } placement_data_[rel_placement] = abs_placement; @@ -814,22 +808,20 @@ class IntBlock : public PandasBlock { return AllocateNDArray(internal::arrow_traits::npy_type); } - Status Write(const std::shared_ptr& col, int64_t abs_placement, + Status Write(const std::shared_ptr& data, int64_t abs_placement, int64_t rel_placement) override { - Type::type type = col->type()->id(); + Type::type type = data->type()->id(); C_TYPE* out_buffer = reinterpret_cast(block_data_) + rel_placement * num_rows_; - const ChunkedArray& data = *col->data().get(); - if (type != ARROW_TYPE) { return Status::NotImplemented("Cannot write Arrow data of type ", - col->type()->ToString(), " to a Pandas int", + data->type()->ToString(), " to a Pandas int", sizeof(C_TYPE), " block"); } - ConvertIntegerNoNullsSameType(options_, data, out_buffer); + ConvertIntegerNoNullsSameType(options_, *data, out_buffer); placement_data_[rel_placement] = abs_placement; return Status::OK(); } @@ -849,20 +841,20 @@ class Float16Block : public PandasBlock { using PandasBlock::PandasBlock; Status Allocate() override { return AllocateNDArray(NPY_FLOAT16); } - Status Write(const std::shared_ptr& col, int64_t abs_placement, + Status Write(const std::shared_ptr& data, int64_t abs_placement, int64_t rel_placement) override { - Type::type type = col->type()->id(); + Type::type type = data->type()->id(); if (type != Type::HALF_FLOAT) { return Status::NotImplemented("Cannot write Arrow data of type ", - col->type()->ToString(), + data->type()->ToString(), " to a Pandas float16 block"); } npy_half* out_buffer = reinterpret_cast(block_data_) + rel_placement * num_rows_; - ConvertNumericNullable(*col->data().get(), NPY_HALF_NAN, out_buffer); + ConvertNumericNullable(*data, NPY_HALF_NAN, out_buffer); placement_data_[rel_placement] = abs_placement; return Status::OK(); } @@ -873,19 +865,19 @@ class Float32Block : public PandasBlock { using PandasBlock::PandasBlock; Status Allocate() override { return AllocateNDArray(NPY_FLOAT32); } - Status Write(const std::shared_ptr& col, int64_t abs_placement, + Status Write(const std::shared_ptr& data, int64_t abs_placement, int64_t rel_placement) override { - Type::type 
type = col->type()->id(); + Type::type type = data->type()->id(); if (type != Type::FLOAT) { return Status::NotImplemented("Cannot write Arrow data of type ", - col->type()->ToString(), + data->type()->ToString(), " to a Pandas float32 block"); } float* out_buffer = reinterpret_cast(block_data_) + rel_placement * num_rows_; - ConvertNumericNullable(*col->data().get(), NAN, out_buffer); + ConvertNumericNullable(*data, NAN, out_buffer); placement_data_[rel_placement] = abs_placement; return Status::OK(); } @@ -896,17 +888,15 @@ class Float64Block : public PandasBlock { using PandasBlock::PandasBlock; Status Allocate() override { return AllocateNDArray(NPY_FLOAT64); } - Status Write(const std::shared_ptr& col, int64_t abs_placement, + Status Write(const std::shared_ptr& data, int64_t abs_placement, int64_t rel_placement) override { - Type::type type = col->type()->id(); + Type::type type = data->type()->id(); double* out_buffer = reinterpret_cast(block_data_) + rel_placement * num_rows_; - const ChunkedArray& data = *col->data().get(); - -#define INTEGER_CASE(IN_TYPE) \ - ConvertIntegerWithNulls(options_, data, out_buffer); \ +#define INTEGER_CASE(IN_TYPE) \ + ConvertIntegerWithNulls(options_, *data, out_buffer); \ break; switch (type) { @@ -927,14 +917,14 @@ class Float64Block : public PandasBlock { case Type::INT64: INTEGER_CASE(int64_t); case Type::FLOAT: - ConvertNumericNullableCast(data, NAN, out_buffer); + ConvertNumericNullableCast(*data, NAN, out_buffer); break; case Type::DOUBLE: - ConvertNumericNullable(data, NAN, out_buffer); + ConvertNumericNullable(*data, NAN, out_buffer); break; default: return Status::NotImplemented("Cannot write Arrow data of type ", - col->type()->ToString(), + data->type()->ToString(), " to a Pandas float64 block"); } @@ -950,20 +940,18 @@ class BoolBlock : public PandasBlock { using PandasBlock::PandasBlock; Status Allocate() override { return AllocateNDArray(NPY_BOOL); } - Status Write(const std::shared_ptr& col, int64_t abs_placement, + Status Write(const std::shared_ptr& data, int64_t abs_placement, int64_t rel_placement) override { - Type::type type = col->type()->id(); - - if (type != Type::BOOL) { + if (data->type()->id() != Type::BOOL) { return Status::NotImplemented("Cannot write Arrow data of type ", - col->type()->ToString(), + data->type()->ToString(), " to a Pandas boolean block"); } uint8_t* out_buffer = reinterpret_cast(block_data_) + rel_placement * num_rows_; - ConvertBooleanNoNulls(options_, *col->data(), out_buffer); + ConvertBooleanNoNulls(options_, *data, out_buffer); placement_data_[rel_placement] = abs_placement; return Status::OK(); } @@ -984,39 +972,37 @@ class DatetimeBlock : public PandasBlock { Status Allocate() override { return AllocateDatetime(2); } - Status Write(const std::shared_ptr& col, int64_t abs_placement, + Status Write(const std::shared_ptr& data, int64_t abs_placement, int64_t rel_placement) override { - Type::type type = col->type()->id(); + Type::type type = data->type()->id(); int64_t* out_buffer = reinterpret_cast(block_data_) + rel_placement * num_rows_; - const ChunkedArray& data = *col->data(); - if (type == Type::DATE32) { // Convert from days since epoch to datetime64[ns] - ConvertDatetimeNanos(data, out_buffer); + ConvertDatetimeNanos(*data, out_buffer); } else if (type == Type::DATE64) { // Date64Type is millisecond timestamp stored as int64_t // TODO(wesm): Do we want to make sure to zero out the milliseconds? 
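// (For reference: DATE32 counts days and DATE64 milliseconds since the UNIX
//  epoch, so the conversions below scale values up to datetime64[ns]; the
//  TIMESTAMP branches that follow scale SECOND/MILLI/MICRO inputs
//  analogously, and NANO inputs are copied through unchanged.)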
- ConvertDatetimeNanos(data, out_buffer); + ConvertDatetimeNanos(*data, out_buffer); } else if (type == Type::TIMESTAMP) { - const auto& ts_type = checked_cast(*col->type()); + const auto& ts_type = checked_cast(*data->type()); if (ts_type.unit() == TimeUnit::NANO) { - ConvertNumericNullable(data, kPandasTimestampNull, out_buffer); + ConvertNumericNullable(*data, kPandasTimestampNull, out_buffer); } else if (ts_type.unit() == TimeUnit::MICRO) { - ConvertDatetimeNanos(data, out_buffer); + ConvertDatetimeNanos(*data, out_buffer); } else if (ts_type.unit() == TimeUnit::MILLI) { - ConvertDatetimeNanos(data, out_buffer); + ConvertDatetimeNanos(*data, out_buffer); } else if (ts_type.unit() == TimeUnit::SECOND) { - ConvertDatetimeNanos(data, out_buffer); + ConvertDatetimeNanos(*data, out_buffer); } else { return Status::NotImplemented("Unsupported time unit"); } } else { return Status::NotImplemented("Cannot write Arrow data of type ", - col->type()->ToString(), + data->type()->ToString(), " to a Pandas datetime block."); } @@ -1070,16 +1056,14 @@ class CategoricalBlock : public PandasBlock { } template - Status WriteIndices(const std::shared_ptr& col) { + Status WriteIndices(const std::shared_ptr& data) { using ArrayType = typename TypeTraits::ArrayType; using TRAITS = internal::arrow_traits; using T = typename TRAITS::T; constexpr int npy_type = TRAITS::npy_type; - const ChunkedArray& data = *col->data().get(); - // Sniff the first chunk - const std::shared_ptr arr_first = data.chunk(0); + const std::shared_ptr arr_first = data->chunk(0); const auto& dict_arr_first = checked_cast(*arr_first); const auto indices_first = std::static_pointer_cast(dict_arr_first.indices()); @@ -1095,7 +1079,7 @@ class CategoricalBlock : public PandasBlock { return Status::OK(); }; - if (!needs_copy_ && data.num_chunks() == 1 && indices_first->null_count() == 0) { + if (!needs_copy_ && data->num_chunks() == 1 && indices_first->null_count() == 0) { RETURN_NOT_OK(CheckIndices(*indices_first, dict_arr_first.dictionary()->length())); RETURN_NOT_OK(AllocateNDArrayFromIndices(npy_type, indices_first)); } else { @@ -1106,7 +1090,7 @@ class CategoricalBlock : public PandasBlock { "allowed"); } - return Status::Invalid("Needed to copy ", data.num_chunks(), " chunks with ", + return Status::Invalid("Needed to copy ", data->num_chunks(), " chunks with ", indices_first->null_count(), " indices nulls, but zero_copy_only was True"); } @@ -1115,8 +1099,8 @@ class CategoricalBlock : public PandasBlock { // No relative placement offset because a single column T* out_values = reinterpret_cast(block_data_); - for (int c = 0; c < data.num_chunks(); c++) { - const std::shared_ptr arr = data.chunk(c); + for (int c = 0; c < data->num_chunks(); c++) { + const std::shared_ptr arr = data->chunk(c); const auto& dict_arr = checked_cast(*arr); const auto& indices = checked_cast(*dict_arr.indices()); @@ -1133,50 +1117,48 @@ class CategoricalBlock : public PandasBlock { return Status::OK(); } - Status Write(const std::shared_ptr& col, int64_t abs_placement, + Status Write(const std::shared_ptr& data, int64_t abs_placement, int64_t rel_placement) override { - std::shared_ptr converted_col; + std::shared_ptr converted_data; if (options_.strings_to_categorical && - (col->type()->id() == Type::STRING || col->type()->id() == Type::BINARY)) { + (data->type()->id() == Type::STRING || data->type()->id() == Type::BINARY)) { needs_copy_ = true; compute::FunctionContext ctx(pool_); Datum out; - RETURN_NOT_OK(compute::DictionaryEncode(&ctx, 
Datum(col->data()), &out)); + RETURN_NOT_OK(compute::DictionaryEncode(&ctx, data, &out)); DCHECK_EQ(out.kind(), Datum::CHUNKED_ARRAY); - converted_col = - std::make_shared(field(col->name(), out.type()), out.chunked_array()); + converted_data = out.chunked_array(); } else { // check if all dictionaries are equal - const ChunkedArray& data = *col->data().get(); - const std::shared_ptr arr_first = data.chunk(0); + const std::shared_ptr arr_first = data->chunk(0); const auto& dict_arr_first = checked_cast(*arr_first); - for (int c = 1; c < data.num_chunks(); c++) { - const std::shared_ptr arr = data.chunk(c); + for (int c = 1; c < data->num_chunks(); c++) { + const std::shared_ptr arr = data->chunk(c); const auto& dict_arr = checked_cast(*arr); if (!(dict_arr_first.dictionary()->Equals(dict_arr.dictionary()))) { return Status::NotImplemented("Variable dictionary type not supported"); } } - converted_col = col; + converted_data = data; } - const auto& dict_type = checked_cast(*converted_col->type()); + const auto& dict_type = checked_cast(*converted_data->type()); switch (dict_type.index_type()->id()) { case Type::INT8: - RETURN_NOT_OK(WriteIndices(converted_col)); + RETURN_NOT_OK(WriteIndices(converted_data)); break; case Type::INT16: - RETURN_NOT_OK(WriteIndices(converted_col)); + RETURN_NOT_OK(WriteIndices(converted_data)); break; case Type::INT32: - RETURN_NOT_OK(WriteIndices(converted_col)); + RETURN_NOT_OK(WriteIndices(converted_data)); break; case Type::INT64: - RETURN_NOT_OK(WriteIndices(converted_col)); + RETURN_NOT_OK(WriteIndices(converted_data)); break; default: { return Status::NotImplemented("Categorical index type not supported: ", @@ -1185,7 +1167,7 @@ class CategoricalBlock : public PandasBlock { } // TODO(wesm): variable dictionaries - auto arr = converted_col->data()->chunk(0); + auto arr = converted_data->chunk(0); const auto& dict_arr = checked_cast(*arr); placement_data_[rel_placement] = abs_placement; @@ -1308,18 +1290,18 @@ Status MakeBlock(const PandasOptions& options, PandasBlock::type type, int64_t n using BlockMap = std::unordered_map>; -static Status GetPandasBlockType(const Column& col, const PandasOptions& options, +static Status GetPandasBlockType(const ChunkedArray& data, const PandasOptions& options, PandasBlock::type* output_type) { #define INTEGER_CASE(NAME) \ *output_type = \ - col.null_count() > 0 \ + data.null_count() > 0 \ ? options.integer_object_nulls ? PandasBlock::OBJECT : PandasBlock::DOUBLE \ : PandasBlock::NAME; \ break; - switch (col.type()->id()) { + switch (data.type()->id()) { case Type::BOOL: - *output_type = col.null_count() > 0 ? PandasBlock::OBJECT : PandasBlock::BOOL; + *output_type = data.null_count() > 0 ? PandasBlock::OBJECT : PandasBlock::BOOL; break; case Type::UINT8: INTEGER_CASE(UINT8); @@ -1365,7 +1347,7 @@ static Status GetPandasBlockType(const Column& col, const PandasOptions& options *output_type = options.date_as_object ? 
PandasBlock::OBJECT : PandasBlock::DATETIME; break; case Type::TIMESTAMP: { - const auto& ts_type = checked_cast(*col.type()); + const auto& ts_type = checked_cast(*data.type()); if (ts_type.timezone() != "") { *output_type = PandasBlock::DATETIME_WITH_TZ; } else { @@ -1373,7 +1355,7 @@ static Status GetPandasBlockType(const Column& col, const PandasOptions& options } } break; case Type::LIST: { - auto list_type = std::static_pointer_cast(col.type()); + auto list_type = std::static_pointer_cast(data.type()); if (!ListTypeSupported(*list_type->value_type())) { return Status::NotImplemented("Not implemented type for list in DataFrameBlock: ", list_type->value_type()->ToString()); @@ -1386,7 +1368,7 @@ static Status GetPandasBlockType(const Column& col, const PandasOptions& options default: return Status::NotImplemented( "No known equivalent Pandas block for Arrow data of type ", - col.type()->ToString(), " is known."); + data.type()->ToString(), " is known."); } return Status::OK(); } @@ -1418,7 +1400,7 @@ class DataFrameBlockCreator { Status CreateBlocks() { for (int i = 0; i < table_->num_columns(); ++i) { - std::shared_ptr col = table_->column(i); + std::shared_ptr col = table_->column(i); PandasBlock::type output_type = PandasBlock::OBJECT; RETURN_NOT_OK(GetPandasBlockType(*col, options_, &output_type)); @@ -1558,14 +1540,14 @@ class DataFrameBlockCreator { class ArrowDeserializer { public: - ArrowDeserializer(const PandasOptions& options, const std::shared_ptr& col, - PyObject* py_ref) - : col_(col), data_(*col->data().get()), options_(options), py_ref_(py_ref) {} + ArrowDeserializer(const PandasOptions& options, + const std::shared_ptr& data, PyObject* py_ref) + : data_(data), options_(options), py_ref_(py_ref) {} Status AllocateOutput(int type) { PyAcquireGIL lock; - result_ = NewArray1DFromType(col_->type().get(), type, col_->length(), nullptr); + result_ = NewArray1DFromType(data_->type().get(), type, data_->length(), nullptr); RETURN_IF_PYERROR(); arr_ = reinterpret_cast(result_); return Status::OK(); @@ -1584,7 +1566,7 @@ class ArrowDeserializer { PyAcquireGIL lock; // Zero-Copy. We can pass the data pointer directly to NumPy. 
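// (Note: this zero-copy path is only reached for a ChunkedArray holding a
//  single chunk with no nulls; the callers below guard on
//  num_chunks() == 1 && null_count() == 0 and, when zero_copy_only is set
//  but a copy would be needed, return Status::Invalid instead.)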
- result_ = NewArray1DFromType(col_->type().get(), npy_type, col_->length(), data); + result_ = NewArray1DFromType(data_->type().get(), npy_type, data_->length(), data); arr_ = reinterpret_cast(result_); if (arr_ == nullptr) { @@ -1677,16 +1659,16 @@ class ArrowDeserializer { typedef typename traits::T T; int npy_type = traits::npy_type; - if (data_.num_chunks() == 1 && data_.null_count() == 0) { - return ConvertValuesZeroCopy(options_, npy_type, data_.chunk(0)); + if (data_->num_chunks() == 1 && data_->null_count() == 0) { + return ConvertValuesZeroCopy(options_, npy_type, data_->chunk(0)); } else if (options_.zero_copy_only) { - return Status::Invalid("Needed to copy ", data_.num_chunks(), " chunks with ", - data_.null_count(), " nulls, but zero_copy_only was True"); + return Status::Invalid("Needed to copy ", data_->num_chunks(), " chunks with ", + data_->null_count(), " nulls, but zero_copy_only was True"); } RETURN_NOT_OK(AllocateOutput(npy_type)); auto out_values = reinterpret_cast(PyArray_DATA(arr_)); - ConvertNumericNullable(data_, traits::na_value, out_values); + ConvertNumericNullable(*data_, traits::na_value, out_values); return Status::OK(); } @@ -1710,8 +1692,8 @@ class ArrowDeserializer { constexpr T na_value = traits::na_value; constexpr int64_t kShift = traits::npy_shift; - for (int c = 0; c < data_.num_chunks(); c++) { - const auto& arr = *data_.chunk(c); + for (int c = 0; c < data_->num_chunks(); c++) { + const auto& arr = *data_->chunk(c); const c_type* in_values = GetPrimitiveValues(arr); for (int64_t i = 0; i < arr.length(); ++i) { @@ -1743,8 +1725,8 @@ class ArrowDeserializer { constexpr T na_value = traits::na_value; constexpr int64_t kShift = traits::npy_shift; - for (int c = 0; c < data_.num_chunks(); c++) { - const auto& arr = *data_.chunk(c); + for (int c = 0; c < data_->num_chunks(); c++) { + const auto& arr = *data_->chunk(c); const c_type* in_values = GetPrimitiveValues(arr); for (int64_t i = 0; i < arr.length(); ++i) { @@ -1769,25 +1751,25 @@ class ArrowDeserializer { typedef typename traits::T T; - if (data_.num_chunks() == 1 && data_.null_count() == 0) { - return ConvertValuesZeroCopy(options_, traits::npy_type, data_.chunk(0)); + if (data_->num_chunks() == 1 && data_->null_count() == 0) { + return ConvertValuesZeroCopy(options_, traits::npy_type, data_->chunk(0)); } else if (options_.zero_copy_only) { - return Status::Invalid("Needed to copy ", data_.num_chunks(), " chunks with ", - data_.null_count(), " nulls, but zero_copy_only was True"); + return Status::Invalid("Needed to copy ", data_->num_chunks(), " chunks with ", + data_->null_count(), " nulls, but zero_copy_only was True"); } - if (data_.null_count() > 0) { + if (data_->null_count() > 0) { if (options_.integer_object_nulls) { return VisitObjects(ConvertIntegerObjects); } else { RETURN_NOT_OK(AllocateOutput(NPY_FLOAT64)); auto out_values = reinterpret_cast(PyArray_DATA(arr_)); - ConvertIntegerWithNulls(options_, data_, out_values); + ConvertIntegerWithNulls(options_, *data_, out_values); } } else { RETURN_NOT_OK(AllocateOutput(traits::npy_type)); auto out_values = reinterpret_cast(PyArray_DATA(arr_)); - ConvertIntegerNoNullsSameType(options_, data_, out_values); + ConvertIntegerNoNullsSameType(options_, *data_, out_values); } return Status::OK(); @@ -1800,7 +1782,7 @@ class ArrowDeserializer { } RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); auto out_values = reinterpret_cast(PyArray_DATA(arr_)); - return func(options_, data_, out_values); + return func(options_, *data_, out_values); } // Strings and 
binary @@ -1829,12 +1811,12 @@ class ArrowDeserializer { Status Visit(const BooleanType& type) { if (options_.zero_copy_only) { return Status::Invalid("BooleanType needs copies, but zero_copy_only was True"); - } else if (data_.null_count() > 0) { + } else if (data_->null_count() > 0) { return VisitObjects(ConvertBooleanWithNulls); } else { RETURN_NOT_OK(AllocateOutput(internal::arrow_traits::npy_type)); auto out_values = reinterpret_cast(PyArray_DATA(arr_)); - ConvertBooleanNoNulls(options_, data_, out_values); + ConvertBooleanNoNulls(options_, *data_, out_values); } return Status::OK(); } @@ -1845,11 +1827,11 @@ class ArrowDeserializer { } #define CONVERTVALUES_LISTSLIKE_CASE(ArrowType, ArrowEnum) \ case Type::ArrowEnum: \ - return ConvertListsLike(options_, col_, out_values); + return ConvertListsLike(options_, *data_, out_values); RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); auto out_values = reinterpret_cast(PyArray_DATA(arr_)); - auto list_type = std::static_pointer_cast(col_->type()); + auto list_type = std::static_pointer_cast(data_->type()); switch (list_type->value_type()->id()) { CONVERTVALUES_LISTSLIKE_CASE(UInt8Type, UINT8) CONVERTVALUES_LISTSLIKE_CASE(Int8Type, INT8) @@ -1879,8 +1861,8 @@ class ArrowDeserializer { } Status Visit(const DictionaryType& type) { - auto block = std::make_shared(options_, nullptr, col_->length()); - RETURN_NOT_OK(block->Write(col_, 0, 0)); + auto block = std::make_shared(options_, nullptr, data_->length()); + RETURN_NOT_OK(block->Write(data_, 0, 0)); PyAcquireGIL lock; result_ = PyDict_New(); @@ -1903,14 +1885,13 @@ class ArrowDeserializer { Status Visit(const DataType& type) { return Status::NotImplemented(type.name()); } Status Convert(PyObject** out) { - RETURN_NOT_OK(VisitTypeInline(*col_->type(), this)); + RETURN_NOT_OK(VisitTypeInline(*data_->type(), this)); *out = result_; return Status::OK(); } private: - std::shared_ptr col_; - const ChunkedArray& data_; + std::shared_ptr data_; PandasOptions options_; PyObject* py_ref_; PyArrayObject* arr_; @@ -1920,25 +1901,14 @@ class ArrowDeserializer { Status ConvertArrayToPandas(const PandasOptions& options, const std::shared_ptr& arr, PyObject* py_ref, PyObject** out) { - static std::string dummy_name = "dummy"; - auto field = std::make_shared(dummy_name, arr->type()); - auto col = std::make_shared(field, arr); - return ConvertColumnToPandas(options, col, py_ref, out); + auto carr = std::make_shared(arr); + return ConvertChunkedArrayToPandas(options, carr, py_ref, out); } Status ConvertChunkedArrayToPandas(const PandasOptions& options, const std::shared_ptr& ca, PyObject* py_ref, PyObject** out) { - static std::string dummy_name = "dummy"; - auto field = std::make_shared(dummy_name, ca->type()); - auto col = std::make_shared(field, ca); - return ConvertColumnToPandas(options, col, py_ref, out); -} - -Status ConvertColumnToPandas(const PandasOptions& options, - const std::shared_ptr& col, PyObject* py_ref, - PyObject** out) { - ArrowDeserializer converter(options, col, py_ref); + ArrowDeserializer converter(options, ca, py_ref); return converter.Convert(out); } @@ -1957,16 +1927,14 @@ Status ConvertTableToPandas(const PandasOptions& options, if (!categorical_columns.empty()) { FunctionContext ctx; for (int i = 0; i < table->num_columns(); i++) { - const Column& col = *table->column(i); - if (categorical_columns.count(col.name())) { + std::shared_ptr col = table->column(i); + if (categorical_columns.count(table->field(i)->name())) { Datum out; - RETURN_NOT_OK(DictionaryEncode(&ctx, Datum(col.data()), 
&out)); + RETURN_NOT_OK(DictionaryEncode(&ctx, Datum(col), &out)); std::shared_ptr array = out.chunked_array(); - auto field = std::make_shared( - col.name(), array->type(), col.field()->nullable(), col.field()->metadata()); - auto column = std::make_shared(field, array); + auto field = table->field(i)->WithType(array->type()); RETURN_NOT_OK(current_table->RemoveColumn(i, ¤t_table)); - RETURN_NOT_OK(current_table->AddColumn(i, column, ¤t_table)); + RETURN_NOT_OK(current_table->AddColumn(i, field, array, ¤t_table)); } } } diff --git a/cpp/src/arrow/python/pyarrow.cc b/cpp/src/arrow/python/pyarrow.cc index e037318bce2..38fd56d80fc 100644 --- a/cpp/src/arrow/python/pyarrow.cc +++ b/cpp/src/arrow/python/pyarrow.cc @@ -161,21 +161,6 @@ PyObject* wrap_sparse_tensor_coo(const std::shared_ptr& sparse_ return ::pyarrow_wrap_sparse_tensor_coo(sparse_tensor); } -bool is_column(PyObject* column) { return ::pyarrow_is_column(column) != 0; } - -Status unwrap_column(PyObject* column, std::shared_ptr* out) { - *out = ::pyarrow_unwrap_column(column); - if (*out) { - return Status::OK(); - } else { - return Status::Invalid("Could not unwrap Column from the passed Python object."); - } -} - -PyObject* wrap_column(const std::shared_ptr& column) { - return ::pyarrow_wrap_column(column); -} - bool is_table(PyObject* table) { return ::pyarrow_is_table(table) != 0; } Status unwrap_table(PyObject* table, std::shared_ptr
<Table>* out) {

diff --git a/cpp/src/arrow/python/pyarrow.h b/cpp/src/arrow/python/pyarrow.h
index b4834f79f78..aad7a4a5dd9 100644
--- a/cpp/src/arrow/python/pyarrow.h
+++ b/cpp/src/arrow/python/pyarrow.h
@@ -30,7 +30,6 @@ namespace arrow {
 class Array;
 class Buffer;
-class Column;
 class DataType;
 class Field;
 class RecordBatch;
@@ -81,10 +80,6 @@ unwrap_sparse_tensor_csr(PyObject* sparse_tensor, std::shared_ptr& sparse_tensor);

-ARROW_PYTHON_EXPORT bool is_column(PyObject* column);
-ARROW_PYTHON_EXPORT Status unwrap_column(PyObject* column, std::shared_ptr<Column>* out);
-ARROW_PYTHON_EXPORT PyObject* wrap_column(const std::shared_ptr<Column>& column);
-
 ARROW_PYTHON_EXPORT bool is_table(PyObject* table);
 ARROW_PYTHON_EXPORT Status unwrap_table(PyObject* table, std::shared_ptr<Table>* out);
 ARROW_PYTHON_EXPORT PyObject* wrap_table(const std::shared_ptr<Table>
& table); diff --git a/cpp/src/arrow/python/pyarrow_api.h b/cpp/src/arrow/python/pyarrow_api.h index 2d8f71c8c5a..76e72812361 100644 --- a/cpp/src/arrow/python/pyarrow_api.h +++ b/cpp/src/arrow/python/pyarrow_api.h @@ -36,8 +36,6 @@ static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_batch)(std::shared_ptr #define pyarrow_wrap_batch __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_batch static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_buffer)(std::shared_ptr< arrow::Buffer> const &) = 0; #define pyarrow_wrap_buffer __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_buffer -static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_column)(std::shared_ptr< arrow::Column> const &) = 0; -#define pyarrow_wrap_column __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_column static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_data_type)(std::shared_ptr< arrow::DataType> const &) = 0; #define pyarrow_wrap_data_type __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_data_type static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_field)(std::shared_ptr< arrow::Field> const &) = 0; @@ -60,8 +58,6 @@ static std::shared_ptr< arrow::RecordBatch> (*__pyx_api_f_7pyarrow_3lib_pyarrow #define pyarrow_unwrap_batch __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_batch static std::shared_ptr< arrow::Buffer> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_buffer)(PyObject *) = 0; #define pyarrow_unwrap_buffer __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_buffer -static std::shared_ptr< arrow::Column> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_column)(PyObject *) = 0; -#define pyarrow_unwrap_column __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_column static std::shared_ptr< arrow::DataType> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_data_type)(PyObject *) = 0; #define pyarrow_unwrap_data_type __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_data_type static std::shared_ptr< arrow::Field> (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_field)(PyObject *) = 0; @@ -96,8 +92,6 @@ static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_tensor_csr)(PyObject *) #define pyarrow_is_sparse_tensor_csr __pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_tensor_csr static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_tensor_coo)(PyObject *) = 0; #define pyarrow_is_sparse_tensor_coo __pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_tensor_coo -static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_column)(PyObject *) = 0; -#define pyarrow_is_column __pyx_api_f_7pyarrow_3lib_pyarrow_is_column static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_table)(PyObject *) = 0; #define pyarrow_is_table __pyx_api_f_7pyarrow_3lib_pyarrow_is_table static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_batch)(PyObject *) = 0; @@ -172,7 +166,6 @@ static int import_pyarrow__lib(void) { if (__Pyx_ImportFunction(module, "pyarrow_wrap_chunked_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_chunked_array, "PyObject *(std::shared_ptr< arrow::ChunkedArray> const &)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_wrap_batch", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_batch, "PyObject *(std::shared_ptr< arrow::RecordBatch> const &)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_wrap_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_buffer, "PyObject *(std::shared_ptr< arrow::Buffer> const &)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_wrap_column", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_column, "PyObject *(std::shared_ptr< arrow::Column> const &)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_wrap_data_type", 
(void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_data_type, "PyObject *(std::shared_ptr< arrow::DataType> const &)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_wrap_field", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_field, "PyObject *(std::shared_ptr< arrow::Field> const &)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_wrap_resizable_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_resizable_buffer, "PyObject *(std::shared_ptr< arrow::ResizableBuffer> const &)") < 0) goto bad; @@ -184,7 +177,6 @@ static int import_pyarrow__lib(void) { if (__Pyx_ImportFunction(module, "pyarrow_unwrap_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_array, "std::shared_ptr< arrow::Array> (PyObject *)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_unwrap_batch", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_batch, "std::shared_ptr< arrow::RecordBatch> (PyObject *)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_unwrap_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_buffer, "std::shared_ptr< arrow::Buffer> (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_unwrap_column", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_column, "std::shared_ptr< arrow::Column> (PyObject *)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_unwrap_data_type", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_data_type, "std::shared_ptr< arrow::DataType> (PyObject *)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_unwrap_field", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_field, "std::shared_ptr< arrow::Field> (PyObject *)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_unwrap_schema", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_schema, "std::shared_ptr< arrow::Schema> (PyObject *)") < 0) goto bad; @@ -202,7 +194,6 @@ static int import_pyarrow__lib(void) { if (__Pyx_ImportFunction(module, "pyarrow_is_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor, "int (PyObject *)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_is_sparse_tensor_csr", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_tensor_csr, "int (PyObject *)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_is_sparse_tensor_coo", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_tensor_coo, "int (PyObject *)") < 0) goto bad; - if (__Pyx_ImportFunction(module, "pyarrow_is_column", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_column, "int (PyObject *)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_is_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_table, "int (PyObject *)") < 0) goto bad; if (__Pyx_ImportFunction(module, "pyarrow_is_batch", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_batch, "int (PyObject *)") < 0) goto bad; Py_DECREF(module); module = 0; diff --git a/cpp/src/arrow/stl.h b/cpp/src/arrow/stl.h index d641e39955b..37f3ce3fb3c 100644 --- a/cpp/src/arrow/stl.h +++ b/cpp/src/arrow/stl.h @@ -254,11 +254,11 @@ struct EnsureColumnTypes { if (!table.schema()->field(N - 1)->type()->Equals(*expected_type)) { compute::Datum casted; - ARROW_RETURN_NOT_OK(compute::Cast(ctx, compute::Datum(table.column(N - 1)->data()), + ARROW_RETURN_NOT_OK(compute::Cast(ctx, compute::Datum(table.column(N - 1)), expected_type, cast_options, &casted)); - std::shared_ptr new_column = std::make_shared( - table.schema()->field(N - 
1)->WithType(expected_type), casted.chunked_array()); - ARROW_RETURN_NOT_OK(table.SetColumn(N - 1, new_column, table_owner)); + auto new_field = table.schema()->field(N - 1)->WithType(expected_type); + ARROW_RETURN_NOT_OK( + table.SetColumn(N - 1, new_field, casted.chunked_array(), table_owner)); *result = **table_owner; } @@ -286,7 +286,7 @@ struct TupleSetter { typename TypeTraits::ArrowType>::ArrayType; auto iter = rows->begin(); - const ChunkedArray& chunked_array = *table.column(N - 1)->data(); + const ChunkedArray& chunked_array = *table.column(N - 1); for (int i = 0; i < chunked_array.num_chunks(); i++) { const ArrayType& array = ::arrow::internal::checked_cast(*chunked_array.chunk(i)); diff --git a/cpp/src/arrow/table-test.cc b/cpp/src/arrow/table-test.cc index b0a870ee27d..a89cdc039f9 100644 --- a/cpp/src/arrow/table-test.cc +++ b/cpp/src/arrow/table-test.cc @@ -33,11 +33,6 @@ namespace arrow { -std::shared_ptr column(const std::shared_ptr& field, - const std::vector>& arrays) { - return std::make_shared(field, arrays); -} - class TestChunkedArray : public TestBase { protected: virtual void Construct() { @@ -161,103 +156,6 @@ TEST_F(TestChunkedArray, Validate) { ASSERT_RAISES(Invalid, one_->Validate()); } -class TestColumn : public TestChunkedArray { - protected: - void Construct() override { - TestChunkedArray::Construct(); - - one_col_ = std::make_shared(one_field_, one_); - another_col_ = std::make_shared(another_field_, another_); - } - - std::shared_ptr data_; - std::unique_ptr column_; - - std::shared_ptr one_field_; - std::shared_ptr another_field_; - - std::shared_ptr one_col_; - std::shared_ptr another_col_; -}; - -TEST_F(TestColumn, BasicAPI) { - ArrayVector arrays; - arrays.push_back(MakeRandomArray(100)); - arrays.push_back(MakeRandomArray(100, 10)); - arrays.push_back(MakeRandomArray(100, 20)); - - auto f0 = field("c0", int32()); - column_.reset(new Column(f0, arrays)); - - ASSERT_EQ("c0", column_->name()); - ASSERT_TRUE(column_->type()->Equals(int32())); - ASSERT_EQ(300, column_->length()); - ASSERT_EQ(30, column_->null_count()); - ASSERT_EQ(3, column_->data()->num_chunks()); -} - -TEST_F(TestColumn, ChunksInhomogeneous) { - ArrayVector arrays; - arrays.push_back(MakeRandomArray(100)); - arrays.push_back(MakeRandomArray(100, 10)); - - auto f0 = field("c0", int32()); - column_.reset(new Column(f0, arrays)); - - ASSERT_OK(column_->ValidateData()); - - arrays.push_back(MakeRandomArray(100, 10)); - column_.reset(new Column(f0, arrays)); - ASSERT_RAISES(Invalid, column_->ValidateData()); -} - -TEST_F(TestColumn, SliceEquals) { - arrays_one_.push_back(MakeRandomArray(100)); - arrays_one_.push_back(MakeRandomArray(50)); - arrays_one_.push_back(MakeRandomArray(50)); - one_field_ = field("column", int32()); - Construct(); - - std::shared_ptr slice = one_col_->Slice(125, 50); - ASSERT_EQ(slice->length(), 50); - ASSERT_TRUE(slice->Equals(one_col_->Slice(125, 50))); - - std::shared_ptr slice2 = one_col_->Slice(75)->Slice(25)->Slice(25, 50); - ASSERT_EQ(slice2->length(), 50); - ASSERT_TRUE(slice2->Equals(slice)); -} - -TEST_F(TestColumn, Equals) { - std::vector null_bitmap(100, true); - std::vector data(100, 1); - std::shared_ptr array; - ArrayFromVector(null_bitmap, data, &array); - arrays_one_.push_back(array); - arrays_another_.push_back(array); - - one_field_ = field("column", int32()); - another_field_ = field("column", int32()); - - Construct(); - ASSERT_TRUE(one_col_->Equals(one_col_)); - ASSERT_FALSE(one_col_->Equals(nullptr)); - 
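Both arrow/stl.h call sites follow the same pattern: what used to be `table.column(N - 1)->data()` is now the column itself. A minimal sketch of that access pattern, using a hypothetical CountNulls helper:

```cpp
#include <cstdint>
#include <arrow/table.h>

// Count nulls in one column by walking its chunks directly; no
// intermediate Column object is involved anymore.
int64_t CountNulls(const arrow::Table& table, int column_index) {
  const arrow::ChunkedArray& chunked = *table.column(column_index);
  int64_t nulls = 0;
  for (int i = 0; i < chunked.num_chunks(); ++i) {
    nulls += chunked.chunk(i)->null_count();
  }
  return nulls;
}
```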
ASSERT_TRUE(one_col_->Equals(another_col_)); - ASSERT_TRUE(one_col_->Equals(*another_col_.get())); - - // Field is different - another_field_ = field("two", int32()); - Construct(); - ASSERT_FALSE(one_col_->Equals(another_col_)); - ASSERT_FALSE(one_col_->Equals(*another_col_.get())); - - // ChunkedArray is different - another_field_ = field("column", int32()); - arrays_another_.push_back(array); - Construct(); - ASSERT_FALSE(one_col_->Equals(another_col_)); - ASSERT_FALSE(one_col_->Equals(*another_col_.get())); -} - class TestTable : public TestBase { public: void MakeExample1(int length) { @@ -271,9 +169,9 @@ class TestTable : public TestBase { arrays_ = {MakeRandomArray(length), MakeRandomArray(length), MakeRandomArray(length)}; - columns_ = {std::make_shared(schema_->field(0), arrays_[0]), - std::make_shared(schema_->field(1), arrays_[1]), - std::make_shared(schema_->field(2), arrays_[2])}; + columns_ = {std::make_shared(arrays_[0]), + std::make_shared(arrays_[1]), + std::make_shared(arrays_[2])}; } protected: @@ -281,7 +179,7 @@ class TestTable : public TestBase { std::shared_ptr schema_; std::vector> arrays_; - std::vector> columns_; + std::vector> columns_; }; TEST_F(TestTable, EmptySchema) { @@ -323,7 +221,6 @@ TEST_F(TestTable, Metadata) { ASSERT_TRUE(table_->schema()->Equals(*schema_)); auto col = table_->column(0); - ASSERT_EQ(schema_->field(0)->name(), col->name()); ASSERT_EQ(schema_->field(0)->type(), col->type()); } @@ -341,11 +238,9 @@ TEST_F(TestTable, InvalidColumns) { table_ = Table::Make(schema_, columns_, length); ASSERT_RAISES(Invalid, table_->Validate()); - columns_ = { - std::make_shared(schema_->field(0), MakeRandomArray(length)), - std::make_shared(schema_->field(1), MakeRandomArray(length)), - std::make_shared(schema_->field(2), - MakeRandomArray(length - 1))}; + columns_ = {std::make_shared(MakeRandomArray(length)), + std::make_shared(MakeRandomArray(length)), + std::make_shared(MakeRandomArray(length - 1))}; table_ = Table::Make(schema_, columns_, length); ASSERT_RAISES(Invalid, table_->Validate()); @@ -367,13 +262,10 @@ TEST_F(TestTable, Equals) { auto other = Table::Make(other_schema, columns_); ASSERT_FALSE(table_->Equals(*other)); // Differing columns - std::vector> other_columns = { - std::make_shared(schema_->field(0), - MakeRandomArray(length, 10)), - std::make_shared(schema_->field(1), - MakeRandomArray(length, 10)), - std::make_shared(schema_->field(2), - MakeRandomArray(length, 10))}; + std::vector> other_columns = { + std::make_shared(MakeRandomArray(length, 10)), + std::make_shared(MakeRandomArray(length, 10)), + std::make_shared(MakeRandomArray(length, 10))}; other = Table::Make(schema_, other_columns); ASSERT_FALSE(table_->Equals(*other)); @@ -391,10 +283,10 @@ TEST_F(TestTable, FromRecordBatches) { expected = Table::Make(schema_, columns_); ASSERT_TRUE(result->Equals(*expected)); - std::vector> other_columns; + std::vector> other_columns; for (int i = 0; i < schema_->num_fields(); ++i) { std::vector> col_arrays = {arrays_[i], arrays_[i]}; - other_columns.push_back(std::make_shared(schema_->field(i), col_arrays)); + other_columns.push_back(std::make_shared(col_arrays)); } ASSERT_OK(Table::FromRecordBatches({batch1, batch1}, &result)); @@ -446,7 +338,7 @@ TEST_F(TestTable, CombineChunks) { std::shared_ptr
<Table> table; ASSERT_OK(Table::FromRecordBatches({batch1, batch2}, &table)); for (int i = 0; i < table->num_columns(); ++i) { - ASSERT_EQ(2, table->column(i)->data()->num_chunks()); + ASSERT_EQ(2, table->column(i)->num_chunks()); } std::shared_ptr<Table>
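The assertions above capture the contract of Table::CombineChunks after the rewrite: every surviving column reports a single chunk. A hedged usage sketch of the same signature exercised by the test:

```cpp
#include <arrow/memory_pool.h>
#include <arrow/status.h>
#include <arrow/table.h>

// Concatenate each multi-chunk column into one chunk. On success every
// column of *out satisfies num_chunks() == 1.
arrow::Status Compact(const arrow::Table& table, std::shared_ptr<arrow::Table>* out) {
  return table.CombineChunks(arrow::default_memory_pool(), out);
}
```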
compacted; @@ -454,7 +346,7 @@ EXPECT_TRUE(compacted->Equals(*table)); for (int i = 0; i < compacted->num_columns(); ++i) { - EXPECT_EQ(1, compacted->column(i)->data()->num_chunks()); + EXPECT_EQ(1, compacted->column(i)->num_chunks()); } } @@ -517,7 +409,8 @@ TEST_F(TestTable, RemoveColumn) { ASSERT_OK(table.RemoveColumn(0, &result)); auto ex_schema = ::arrow::schema({schema_->field(1), schema_->field(2)}); - std::vector<std::shared_ptr<Column>> ex_columns = {table.column(1), table.column(2)}; + std::vector<std::shared_ptr<ChunkedArray>> ex_columns = {table.column(1), + table.column(2)}; auto expected = Table::Make(ex_schema, ex_columns); ASSERT_TRUE(result->Equals(*expected)); @@ -544,14 +437,13 @@ TEST_F(TestTable, SetColumn) { const Table& table = *table_sp; std::shared_ptr<Table>
result; - ASSERT_OK(table.SetColumn(0, table.column(1), &result)); + ASSERT_OK(table.SetColumn(0, schema_->field(1), table.column(1), &result)); auto ex_schema = ::arrow::schema({schema_->field(1), schema_->field(1), schema_->field(2)}); - std::vector<std::shared_ptr<Column>> ex_columns = {table.column(1), table.column(1), - table.column(2)}; - auto expected = Table::Make(ex_schema, ex_columns); + auto expected = + Table::Make(ex_schema, {table.column(1), table.column(1), table.column(2)}); ASSERT_TRUE(result->Equals(*expected)); } @@ -576,7 +468,7 @@ TEST_F(TestTable, RemoveColumnEmpty) { auto schema = ::arrow::schema({f0}); auto a0 = MakeRandomArray<Int32Array>(length); - auto table = Table::Make(schema, {std::make_shared<Column>(f0, a0)}); + auto table = Table::Make(schema, {std::make_shared<ChunkedArray>(a0)}); std::shared_ptr<Table>
empty; ASSERT_OK(table->RemoveColumn(0, &empty)); @@ -584,7 +476,7 @@ ASSERT_EQ(table->num_rows(), empty->num_rows()); std::shared_ptr<Table>
added; - ASSERT_OK(empty->AddColumn(0, table->column(0), &added)); + ASSERT_OK(empty->AddColumn(0, f0, table->column(0), &added)); ASSERT_EQ(table->num_rows(), added->num_rows()); } @@ -595,23 +487,25 @@ TEST_F(TestTable, AddColumn) { auto table_sp = Table::Make(schema_, columns_); const Table& table = *table_sp; + auto f0 = schema_->field(0); + std::shared_ptr<Table>
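Since a ChunkedArray no longer carries a name, AddColumn (and SetColumn) now take the Field explicitly, as the updated tests show. A minimal sketch of the new call shape, using a hypothetical helper that appends data under a fresh name:

```cpp
#include <memory>
#include <string>
#include <arrow/table.h>
#include <arrow/type.h>

// Append `data` as the new last column named `name`. The field's type
// must match data->type(), otherwise Status::Invalid is returned.
arrow::Status AppendColumn(const arrow::Table& table, const std::string& name,
                           std::shared_ptr<arrow::ChunkedArray> data,
                           std::shared_ptr<arrow::Table>* out) {
  auto field = arrow::field(name, data->type());
  // Index == num_columns() is valid and means "append at the end".
  return table.AddColumn(table.num_columns(), field, std::move(data), out);
}
```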
result; // Some negative tests with invalid index - Status status = table.AddColumn(10, columns_[0], &result); + Status status = table.AddColumn(10, f0, columns_[0], &result); ASSERT_TRUE(status.IsInvalid()); - status = table.AddColumn(4, columns_[0], &result); + status = table.AddColumn(4, f0, columns_[0], &result); ASSERT_TRUE(status.IsInvalid()); - status = table.AddColumn(-1, columns_[0], &result); + status = table.AddColumn(-1, f0, columns_[0], &result); ASSERT_TRUE(status.IsInvalid()); // Add column with wrong length - auto longer_col = std::make_shared(schema_->field(0), - MakeRandomArray(length + 1)); - status = table.AddColumn(0, longer_col, &result); + auto longer_col = + std::make_shared(MakeRandomArray(length + 1)); + status = table.AddColumn(0, f0, longer_col, &result); ASSERT_TRUE(status.IsInvalid()); // Add column 0 in different places - ASSERT_OK(table.AddColumn(0, columns_[0], &result)); + ASSERT_OK(table.AddColumn(0, f0, columns_[0], &result)); auto ex_schema = ::arrow::schema( {schema_->field(0), schema_->field(0), schema_->field(1), schema_->field(2)}); @@ -619,7 +513,7 @@ TEST_F(TestTable, AddColumn) { ex_schema, {table.column(0), table.column(0), table.column(1), table.column(2)}); ASSERT_TRUE(result->Equals(*expected)); - ASSERT_OK(table.AddColumn(1, columns_[0], &result)); + ASSERT_OK(table.AddColumn(1, f0, columns_[0], &result)); ex_schema = ::arrow::schema( {schema_->field(0), schema_->field(0), schema_->field(1), schema_->field(2)}); @@ -627,14 +521,14 @@ TEST_F(TestTable, AddColumn) { ex_schema, {table.column(0), table.column(0), table.column(1), table.column(2)}); ASSERT_TRUE(result->Equals(*expected)); - ASSERT_OK(table.AddColumn(2, columns_[0], &result)); + ASSERT_OK(table.AddColumn(2, f0, columns_[0], &result)); ex_schema = ::arrow::schema( {schema_->field(0), schema_->field(1), schema_->field(0), schema_->field(2)}); expected = Table::Make( ex_schema, {table.column(0), table.column(1), table.column(0), table.column(2)}); ASSERT_TRUE(result->Equals(*expected)); - ASSERT_OK(table.AddColumn(3, columns_[0], &result)); + ASSERT_OK(table.AddColumn(3, f0, columns_[0], &result)); ex_schema = ::arrow::schema( {schema_->field(0), schema_->field(1), schema_->field(2), schema_->field(0)}); expected = Table::Make( @@ -844,11 +738,14 @@ TEST_F(TestTableBatchReader, ReadNext) { auto sch1 = arrow::schema({field("f1", int32()), field("f2", int32())}); - std::vector> columns; + std::vector> columns; std::shared_ptr batch; - columns = {column(sch1->field(0), {a1, a4, a2}), column(sch1->field(1), {a2, a2})}; + std::vector> arrays_1 = {a1, a4, a2}; + std::vector> arrays_2 = {a2, a2}; + columns = {std::make_shared(arrays_1), + std::make_shared(arrays_2)}; auto t1 = Table::Make(sch1, columns); TableBatchReader i1(*t1); @@ -865,7 +762,10 @@ TEST_F(TestTableBatchReader, ReadNext) { ASSERT_OK(i1.ReadNext(&batch)); ASSERT_EQ(nullptr, batch); - columns = {column(sch1->field(0), {a1}), column(sch1->field(1), {a4})}; + arrays_1 = {a1}; + arrays_2 = {a4}; + columns = {std::make_shared(arrays_1), + std::make_shared(arrays_2)}; auto t2 = Table::Make(sch1, columns); TableBatchReader i2(*t2); @@ -887,7 +787,9 @@ TEST_F(TestTableBatchReader, Chunksize) { auto a3 = MakeRandomArray(10); auto sch1 = arrow::schema({field("f1", int32())}); - auto t1 = Table::Make(sch1, {column(sch1->field(0), {a1, a2, a3})}); + + std::vector> arrays = {a1, a2, a3}; + auto t1 = Table::Make(sch1, {std::make_shared(arrays)}); TableBatchReader i1(*t1); diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index 
5c58adcd740..907cc8c2241 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -36,27 +36,8 @@ namespace arrow { using internal::checked_cast; -namespace { - -// If a column contains multiple chunks, concatenates those chunks into one and -// makes a new column out of it. Otherwise makes `compacted` point to the same -// column. -Status CompactColumn(const std::shared_ptr& column, MemoryPool* pool, - std::shared_ptr* compacted) { - if (column->data()->num_chunks() <= 1) { - *compacted = column; - return Status::OK(); - } - std::shared_ptr merged_data_array; - RETURN_NOT_OK(Concatenate(column->data()->chunks(), pool, &merged_data_array)); - *compacted = std::make_shared(column->field(), merged_data_array); - return Status::OK(); -} - -} // namespace - // ---------------------------------------------------------------------- -// ChunkedArray and Column methods +// ChunkedArray methods ChunkedArray::ChunkedArray(const ArrayVector& chunks) : chunks_(chunks) { length_ = 0; @@ -211,75 +192,6 @@ Status ChunkedArray::Validate() const { return Status::OK(); } -// ---------------------------------------------------------------------- - -Column::Column(const std::shared_ptr& field, const ArrayVector& chunks) - : field_(field) { - data_ = std::make_shared(chunks, field->type()); -} - -Column::Column(const std::shared_ptr& field, const std::shared_ptr& data) - : field_(field) { - if (!data) { - data_ = std::make_shared(ArrayVector({}), field->type()); - } else { - data_ = std::make_shared(ArrayVector({data}), field->type()); - } -} - -Column::Column(const std::string& name, const std::shared_ptr& data) - : Column(::arrow::field(name, data->type()), data) {} - -Column::Column(const std::string& name, const std::shared_ptr& data) - : Column(::arrow::field(name, data->type()), data) {} - -Column::Column(const std::shared_ptr& field, - const std::shared_ptr& data) - : field_(field), data_(data) {} - -Status Column::Flatten(MemoryPool* pool, - std::vector>* out) const { - std::vector> flattened; - std::vector> flattened_fields = field_->Flatten(); - std::vector> flattened_data; - RETURN_NOT_OK(data_->Flatten(pool, &flattened_data)); - DCHECK_EQ(flattened_fields.size(), flattened_data.size()); - for (size_t i = 0; i < flattened_fields.size(); ++i) { - flattened.push_back(std::make_shared(flattened_fields[i], flattened_data[i])); - } - *out = flattened; - return Status::OK(); -} - -bool Column::Equals(const Column& other) const { - if (!field_->Equals(other.field())) { - return false; - } - return data_->Equals(other.data()); -} - -bool Column::Equals(const std::shared_ptr& other) const { - if (this == other.get()) { - return true; - } - if (!other) { - return false; - } - - return Equals(*other.get()); -} - -Status Column::ValidateData() { - for (int i = 0; i < data_->num_chunks(); ++i) { - std::shared_ptr type = data_->chunk(i)->type(); - if (!this->type()->Equals(type)) { - return Status::Invalid("In chunk ", i, " expected type ", this->type()->ToString(), - " but saw ", type->ToString()); - } - } - return Status::OK(); -} - // ---------------------------------------------------------------------- // Table methods @@ -288,7 +200,8 @@ Status Column::ValidateData() { class SimpleTable : public Table { public: SimpleTable(const std::shared_ptr& schema, - const std::vector>& columns, int64_t num_rows = -1) + const std::vector>& columns, + int64_t num_rows = -1) : columns_(columns) { schema_ = schema; if (num_rows < 0) { @@ -317,12 +230,11 @@ class SimpleTable : public Table { 
columns_.resize(columns.size()); for (size_t i = 0; i < columns.size(); ++i) { - columns_[i] = - std::make_shared(schema->field(static_cast(i)), columns[i]); + columns_[i] = std::make_shared(columns[i]); } } - std::shared_ptr column(int i) const override { return columns_[i]; } + std::shared_ptr column(int i) const override { return columns_[i]; } std::shared_ptr
<Table> Slice(int64_t offset, int64_t length) const override { auto sliced = columns_; @@ -343,7 +255,8 @@ return Status::OK(); } - Status AddColumn(int i, const std::shared_ptr<Column>& col, + Status AddColumn(int i, std::shared_ptr<Field> field_arg, + std::shared_ptr<ChunkedArray> col, std::shared_ptr<Table>
* out) const override { DCHECK(col != nullptr); @@ -353,14 +266,20 @@ " but got length ", col->length()); } + if (!field_arg->type()->Equals(col->type())) { + return Status::Invalid("Field type did not match data type"); + } + std::shared_ptr<Schema> new_schema; - RETURN_NOT_OK(schema_->AddField(i, col->field(), &new_schema)); + RETURN_NOT_OK(schema_->AddField(i, field_arg, &new_schema)); - *out = Table::Make(new_schema, internal::AddVectorElement(columns_, i, col)); + *out = + Table::Make(new_schema, internal::AddVectorElement(columns_, i, std::move(col))); return Status::OK(); } - Status SetColumn(int i, const std::shared_ptr<Column>& col, + Status SetColumn(int i, std::shared_ptr<Field> field_arg, + std::shared_ptr<ChunkedArray> col, std::shared_ptr<Table>
* out) const override { DCHECK(col != nullptr); @@ -370,10 +289,14 @@ " but got length ", col->length()); } - std::shared_ptr<Schema> new_schema; - RETURN_NOT_OK(schema_->SetField(i, col->field(), &new_schema)); + if (!field_arg->type()->Equals(col->type())) { + return Status::Invalid("Field type did not match data type"); + } - *out = Table::Make(new_schema, internal::ReplaceVectorElement(columns_, i, col)); + std::shared_ptr<Schema> new_schema; + RETURN_NOT_OK(schema_->SetField(i, field_arg, &new_schema)); + *out = Table::Make(new_schema, + internal::ReplaceVectorElement(columns_, i, std::move(col))); return Status::OK(); } @@ -385,13 +308,15 @@ Status Flatten(MemoryPool* pool, std::shared_ptr<Table>
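The new field_arg/col type check is the invariant the Column constructor used to enforce implicitly. A hedged sketch of the failure mode, assuming a table whose column 0 is int32:

```cpp
#include <arrow/table.h>
#include <arrow/type.h>

// Attempting to relabel an int32 column as utf8 now fails up front.
arrow::Status DemoTypeCheck(const std::shared_ptr<arrow::Table>& table) {
  auto bad_field = arrow::field("renamed", arrow::utf8());
  std::shared_ptr<arrow::Table> out;
  // Returns Status::Invalid("Field type did not match data type").
  return table->SetColumn(0, bad_field, table->column(0), &out);
}
```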
* out) const override { std::vector> flattened_fields; - std::vector> flattened_columns; - for (const auto& column : columns_) { - std::vector> new_columns; - RETURN_NOT_OK(column->Flatten(pool, &new_columns)); - for (const auto& new_col : new_columns) { - flattened_fields.push_back(new_col->field()); - flattened_columns.push_back(new_col); + std::vector> flattened_columns; + for (int i = 0; i < num_columns(); ++i) { + std::vector> new_columns; + std::vector> new_fields = field(i)->Flatten(); + RETURN_NOT_OK(column(i)->Flatten(pool, &new_columns)); + DCHECK_EQ(new_columns.size(), new_fields.size()); + for (size_t j = 0; j < new_columns.size(); ++j) { + flattened_fields.push_back(new_fields[j]); + flattened_columns.push_back(new_columns[j]); } } auto flattened_schema = @@ -406,48 +331,41 @@ class SimpleTable : public Table { return Status::Invalid("Number of columns did not match schema"); } for (int i = 0; i < num_columns(); ++i) { - const Column* col = columns_[i].get(); + const ChunkedArray* col = columns_[i].get(); if (col == nullptr) { return Status::Invalid("Column ", i, " was null"); } - if (!col->field()->Equals(*schema_->field(i))) { - return Status::Invalid("Column field ", i, " named ", col->name(), - " is inconsistent with schema"); + if (!col->type()->Equals(*schema_->field(i)->type())) { + return Status::Invalid("Column data for field ", i, " with type ", + col->type()->ToString(), " is inconsistent with schema ", + schema_->field(i)->type()->ToString()); } } // Make sure columns are all the same length for (int i = 0; i < num_columns(); ++i) { - const Column* col = columns_[i].get(); + const ChunkedArray* col = columns_[i].get(); if (col->length() != num_rows_) { - return Status::Invalid("Column ", i, " named ", col->name(), " expected length ", - num_rows_, " but got length ", col->length()); + return Status::Invalid("Column ", i, " named ", field(i)->name(), + " expected length ", num_rows_, " but got length ", + col->length()); } } return Status::OK(); } private: - std::vector> columns_; + std::vector> columns_; }; Table::Table() : num_rows_(0) {} -std::shared_ptr
<Table> Table::Make(const std::shared_ptr<Schema>& schema, - const std::vector<std::shared_ptr<Column>>& columns, - int64_t num_rows) { +std::shared_ptr<Table>
Table::Make( + const std::shared_ptr<Schema>& schema, + const std::vector<std::shared_ptr<ChunkedArray>>& columns, int64_t num_rows) { return std::make_shared<SimpleTable>(schema, columns, num_rows); } -std::shared_ptr<Table>
Table::Make(const std::vector<std::shared_ptr<Column>>& columns, - int64_t num_rows) { - std::vector<std::shared_ptr<Field>> fields(columns.size()); - std::transform(columns.begin(), columns.end(), fields.begin(), - [](const std::shared_ptr<Column>& column) { return column->field(); }); - return std::make_shared<SimpleTable>(::arrow::schema(std::move(fields)), columns, - num_rows); -} - std::shared_ptr<Table>
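With the Column-vector overload gone, the schema is always supplied explicitly alongside the chunked data. A minimal construction sketch under that model, assuming a non-empty chunk vector of one common type:

```cpp
#include <memory>
#include <string>
#include <arrow/array.h>
#include <arrow/table.h>
#include <arrow/type.h>

// Build a one-column table from pre-built chunks; the field name now
// lives only in the schema, not on the column itself.
std::shared_ptr<arrow::Table> MakeOneColumnTable(const std::string& name,
                                                 const arrow::ArrayVector& chunks) {
  // ChunkedArray infers its type from the chunks, so they must be non-empty.
  auto column = std::make_shared<arrow::ChunkedArray>(chunks);
  auto schema = arrow::schema({arrow::field(name, column->type())});
  return arrow::Table::Make(schema, {column});
}
```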
Table::Make(const std::shared_ptr& schema, const std::vector>& arrays, int64_t num_rows) { @@ -468,14 +386,14 @@ Status Table::FromRecordBatches(const std::shared_ptr& schema, } } - std::vector> columns(ncolumns); + std::vector> columns(ncolumns); std::vector> column_arrays(nbatches); for (int i = 0; i < ncolumns; ++i) { for (int j = 0; j < nbatches; ++j) { column_arrays[j] = batches[j]->column(i); } - columns[i] = std::make_shared(schema->field(i), column_arrays); + columns[i] = std::make_shared(column_arrays, schema->field(i)->type()); } *table = Table::Make(schema, columns); @@ -501,14 +419,14 @@ Status Table::FromChunkedStructArray(const std::shared_ptr& array, int num_chunks = array->num_chunks(); const auto& struct_chunks = array->chunks(); - std::vector> columns(num_columns); + std::vector> columns(num_columns); for (int i = 0; i < num_columns; ++i) { ArrayVector chunks(num_chunks); std::transform(struct_chunks.begin(), struct_chunks.end(), chunks.begin(), [i](const std::shared_ptr& struct_chunk) { return static_cast(*struct_chunk).field(i); }); - columns[i] = std::make_shared(type->child(i), chunks); + columns[i] = std::make_shared(chunks); } *table = Table::Make(::arrow::schema(type->children()), columns, array->length()); @@ -518,7 +436,7 @@ Status Table::FromChunkedStructArray(const std::shared_ptr& array, std::vector Table::ColumnNames() const { std::vector names(num_columns()); for (int i = 0; i < num_columns(); ++i) { - names[i] = column(i)->name(); + names[i] = field(i)->name(); } return names; } @@ -529,11 +447,11 @@ Status Table::RenameColumns(const std::vector& names, return Status::Invalid("tried to rename a table of ", num_columns(), " columns but only ", names.size(), " names were provided"); } - std::vector> columns(num_columns()); + std::vector> columns(num_columns()); std::vector> fields(num_columns()); for (int i = 0; i < num_columns(); ++i) { - fields[i] = column(i)->field()->WithName(names[i]); - columns[i] = std::make_shared(fields[i], column(i)->data()); + columns[i] = column(i); + fields[i] = field(i)->WithName(names[i]); } *out = Table::Make(::arrow::schema(std::move(fields)), std::move(columns), num_rows()); return Status::OK(); @@ -558,17 +476,16 @@ Status ConcatenateTables(const std::vector>& tables, } } - std::vector> columns(ncolumns); + std::vector> columns(ncolumns); for (int i = 0; i < ncolumns; ++i) { std::vector> column_arrays; for (int j = 0; j < ntables; ++j) { - const std::vector>& chunks = - tables[j]->column(i)->data()->chunks(); + const std::vector>& chunks = tables[j]->column(i)->chunks(); for (const auto& chunk : chunks) { column_arrays.push_back(chunk); } } - columns[i] = std::make_shared(schema->field(i), column_arrays); + columns[i] = std::make_shared(column_arrays); } *table = Table::Make(schema, columns); return Status::OK(); @@ -595,9 +512,16 @@ bool Table::Equals(const Table& other) const { Status Table::CombineChunks(MemoryPool* pool, std::shared_ptr
* out) const { const int ncolumns = num_columns(); - std::vector> compacted_columns(ncolumns); + std::vector> compacted_columns(ncolumns); for (int i = 0; i < ncolumns; ++i) { - RETURN_NOT_OK(CompactColumn(column(i), pool, &compacted_columns[i])); + auto col = column(i); + if (col->num_chunks() <= 1) { + compacted_columns[i] = col; + } else { + std::shared_ptr compacted; + RETURN_NOT_OK(Concatenate(col->chunks(), pool, &compacted)); + compacted_columns[i] = std::make_shared(compacted); + } } *out = Table::Make(schema(), compacted_columns); return Status::OK(); @@ -616,7 +540,7 @@ class TableBatchReader::TableBatchReaderImpl { absolute_row_position_(0), max_chunksize_(std::numeric_limits::max()) { for (int i = 0; i < table.num_columns(); ++i) { - column_data_[i] = table.column(i)->data().get(); + column_data_[i] = table.column(i).get(); } } diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index 2e7dcee904c..6a3bdc5ca31 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -111,95 +111,6 @@ class ARROW_EXPORT ChunkedArray { ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray); }; -/// \class Column -/// \brief An immutable column data structure consisting of a field (type -/// metadata) and a chunked data array -class ARROW_EXPORT Column { - public: - /// \brief Construct a column from a vector of arrays - /// - /// The array chunks' datatype must match the field's datatype. - Column(const std::shared_ptr& field, const ArrayVector& chunks); - /// \brief Construct a column from a chunked array - /// - /// The chunked array's datatype must match the field's datatype. - Column(const std::shared_ptr& field, const std::shared_ptr& data); - /// \brief Construct a column from a single array - /// - /// The array's datatype must match the field's datatype. - Column(const std::shared_ptr& field, const std::shared_ptr& data); - - /// \brief Construct a column from a name and an array - /// - /// A field with the given name and the array's datatype is automatically created. - Column(const std::string& name, const std::shared_ptr& data); - /// \brief Construct a column from a name and a chunked array - /// - /// A field with the given name and the array's datatype is automatically created. - Column(const std::string& name, const std::shared_ptr& data); - - int64_t length() const { return data_->length(); } - - int64_t null_count() const { return data_->null_count(); } - - std::shared_ptr field() const { return field_; } - - /// \brief The column name - /// \return the column's name in the passed metadata - const std::string& name() const { return field_->name(); } - - /// \brief The column type - /// \return the column's type according to the metadata - std::shared_ptr type() const { return field_->type(); } - - /// \brief The column data as a chunked array - /// \return the column's data as a chunked logical array - std::shared_ptr data() const { return data_; } - - /// \brief Construct a zero-copy slice of the column with the indicated - /// offset and length - /// - /// \param[in] offset the position of the first element in the constructed - /// slice - /// \param[in] length the length of the slice. 
If there are not enough - /// elements in the column, the length will be adjusted accordingly - /// - /// \return a new object wrapped in std::shared_ptr - std::shared_ptr Slice(int64_t offset, int64_t length) const { - return std::make_shared(field_, data_->Slice(offset, length)); - } - - /// \brief Slice from offset until end of the column - std::shared_ptr Slice(int64_t offset) const { - return std::make_shared(field_, data_->Slice(offset)); - } - - /// \brief Flatten this column as a vector of columns - /// - /// \param[in] pool The pool for buffer allocations, if any - /// \param[out] out The resulting vector of arrays - Status Flatten(MemoryPool* pool, std::vector>* out) const; - - /// \brief Determine if two columns are equal. - /// - /// Two columns can be equal only if they have equal datatypes. - /// However, they may be equal even if they have different chunkings. - bool Equals(const Column& other) const; - /// \brief Determine if the two columns are equal. - bool Equals(const std::shared_ptr& other) const; - - /// \brief Verify that the column's array data is consistent with the passed - /// field's metadata - Status ValidateData(); - - protected: - std::shared_ptr field_; - std::shared_ptr data_; - - private: - ARROW_DISALLOW_COPY_AND_ASSIGN(Column); -}; - /// \class Table /// \brief Logical table as sequence of chunked arrays class ARROW_EXPORT Table { @@ -209,18 +120,11 @@ class ARROW_EXPORT Table { /// \brief Construct a Table from schema and columns /// If columns is zero-length, the table's number of rows is zero /// \param schema The table schema (column types) - /// \param columns The table's columns + /// \param columns The table's columns as chunked arrays /// \param num_rows number of rows in table, -1 (default) to infer from columns - static std::shared_ptr
<Table> Make(const std::shared_ptr<Schema>& schema, - const std::vector<std::shared_ptr<Column>>& columns, - int64_t num_rows = -1); - - /// \brief Construct a Table from columns, schema is assembled from column fields - /// If columns is zero-length, the table's number of rows is zero - /// \param columns The table's columns - /// \param num_rows number of rows in table, -1 (default) to infer from columns - static std::shared_ptr<Table>
Make(const std::vector<std::shared_ptr<Column>>& columns, - int64_t num_rows = -1); + static std::shared_ptr<Table>
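For code migrating off the removed class, the old accessors map onto the new Table::field/Table::column pair. A hedged cheat-sheet, with a small helper as illustration:

```cpp
#include <string>
#include <arrow/table.h>

// Former Column accessors and their closest replacements:
//   col->name()    ->  table->field(i)->name()
//   col->type()    ->  table->field(i)->type()   (== table->column(i)->type())
//   col->data()    ->  table->column(i)           (it is the ChunkedArray)
//   col->length()  ->  table->column(i)->length()
std::string ColumnName(const arrow::Table& table, int i) {
  return table.field(i)->name();  // names now live on the schema only
}
```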
Make( + const std::shared_ptr& schema, + const std::vector>& columns, int64_t num_rows = -1); /// \brief Construct a Table from schema and arrays /// \param schema The table schema (column types) @@ -265,7 +169,10 @@ class ARROW_EXPORT Table { std::shared_ptr schema() const { return schema_; } /// Return a column by index - virtual std::shared_ptr column(int i) const = 0; + virtual std::shared_ptr column(int i) const = 0; + + /// Return a column's field by index + std::shared_ptr field(int i) const { return schema_->field(i); } /// \brief Construct a zero-copy slice of the table with the /// indicated offset and length @@ -284,7 +191,7 @@ class ARROW_EXPORT Table { /// \brief Return a column by name /// \param[in] name field name /// \return an Array or null if no field was found - std::shared_ptr GetColumnByName(const std::string& name) const { + std::shared_ptr GetColumnByName(const std::string& name) const { auto i = schema_->GetFieldIndex(name); return i == -1 ? NULLPTR : column(i); } @@ -293,11 +200,13 @@ class ARROW_EXPORT Table { virtual Status RemoveColumn(int i, std::shared_ptr
<Table>* out) const = 0; /// \brief Add column to the table, producing a new Table - virtual Status AddColumn(int i, const std::shared_ptr<Column>& column, + virtual Status AddColumn(int i, std::shared_ptr<Field> field_arg, + std::shared_ptr<ChunkedArray> column, std::shared_ptr<Table>
* out) const = 0; /// \brief Replace a column in the table, producing a new Table - virtual Status SetColumn(int i, const std::shared_ptr<Column>& column, + virtual Status SetColumn(int i, std::shared_ptr<Field> field_arg, + std::shared_ptr<ChunkedArray> column, std::shared_ptr<Table>
* out) const = 0; /// \brief Return names of all columns diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index ee66b2e30d8..70870a74c45 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -45,8 +45,7 @@ namespace arrow { -static void PrintColumn(const Column& col, std::stringstream* ss) { - const ChunkedArray& carr = *col.data(); +static void PrintChunkedArray(const ChunkedArray& carr, std::stringstream* ss) { for (int i = 0; i < carr.num_chunks(); ++i) { auto c1 = carr.chunk(i); *ss << "Chunk " << i << std::endl; @@ -147,17 +146,17 @@ void AssertTablesEqual(const Table& expected, const Table& actual, if (same_chunk_layout) { for (int i = 0; i < actual.num_columns(); ++i) { - AssertChunkedEqual(*expected.column(i)->data(), *actual.column(i)->data()); + AssertChunkedEqual(*expected.column(i), *actual.column(i)); } } else { std::stringstream ss; if (!actual.Equals(expected)) { for (int i = 0; i < expected.num_columns(); ++i) { ss << "Actual column " << i << std::endl; - PrintColumn(*actual.column(i), &ss); + PrintChunkedArray(*actual.column(i), &ss); ss << "Expected column " << i << std::endl; - PrintColumn(*expected.column(i), &ss); + PrintChunkedArray(*expected.column(i), &ss); } FAIL() << ss.str(); } diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 914aeb011ca..e7c4bb3db20 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -120,7 +120,6 @@ typedef ::testing::Types field = this->child(i); - s << field->name() << ": " << field->type()->ToString(); + s << field->ToString(); } s << ">"; return s.str(); diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index ea32b49d168..c42d66152da 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -36,7 +36,6 @@ class Field; class Tensor; class ChunkedArray; -class Column; class RecordBatch; class Table; diff --git a/cpp/src/arrow/util/stl.h b/cpp/src/arrow/util/stl.h index 48898140bf1..f1b1e182c14 100644 --- a/cpp/src/arrow/util/stl.h +++ b/cpp/src/arrow/util/stl.h @@ -59,14 +59,14 @@ inline std::vector DeleteVectorElement(const std::vector& values, size_t i template inline std::vector AddVectorElement(const std::vector& values, size_t index, - const T& new_element) { + T new_element) { DCHECK_LE(index, values.size()); std::vector out; out.reserve(values.size() + 1); for (size_t i = 0; i < index; ++i) { out.push_back(values[i]); } - out.push_back(new_element); + out.emplace_back(std::move(new_element)); for (size_t i = index; i < values.size(); ++i) { out.push_back(values[i]); } @@ -75,14 +75,14 @@ inline std::vector AddVectorElement(const std::vector& values, size_t inde template inline std::vector ReplaceVectorElement(const std::vector& values, size_t index, - const T& new_element) { + T new_element) { DCHECK_LE(index, values.size()); std::vector out; out.reserve(values.size()); for (size_t i = 0; i < index; ++i) { out.push_back(values[i]); } - out.push_back(new_element); + out.emplace_back(std::move(new_element)); for (size_t i = index + 1; i < values.size(); ++i) { out.push_back(values[i]); } diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index 2cc8b0d05f4..c5d638d9788 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -50,7 +50,6 @@ using arrow::Array; using arrow::ArrayVisitor; using arrow::Buffer; using 
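The by-value parameters adopted in util/stl.h let AddColumn/SetColumn move the shared_ptr into the new vector rather than copy it. A small, generic sketch of the pattern (not the Arrow helper itself):

```cpp
#include <utility>
#include <vector>

// Taking the new element by value plus std::move supports both copies
// (lvalue argument) and true moves (rvalue argument) with one overload,
// and the element lands in the vector via exactly one move.
template <typename T>
std::vector<T> WithReplacedElement(std::vector<T> values, size_t index, T element) {
  values[index] = std::move(element);
  return values;
}
```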
arrow::ChunkedArray; -using arrow::Column; using arrow::DataType; using arrow::default_memory_pool; using arrow::ListArray; @@ -74,8 +73,6 @@ using parquet::schema::GroupNode; using parquet::schema::NodePtr; using parquet::schema::PrimitiveNode; -using ColumnVector = std::vector>; - namespace parquet { namespace arrow { @@ -581,7 +578,7 @@ class TestParquetIO : public ::testing::Test { ASSERT_EQ(1, out->num_columns()); ASSERT_EQ(values->length(), out->num_rows()); - std::shared_ptr chunked_array = out->column(0)->data(); + std::shared_ptr chunked_array = out->column(0); ASSERT_EQ(1, chunked_array->num_chunks()); auto result = chunked_array->chunk(0); @@ -661,7 +658,7 @@ TYPED_TEST(TestParquetIO, SingleColumnTableRequiredWrite) { ASSERT_EQ(1, out->num_columns()); ASSERT_EQ(100, out->num_rows()); - std::shared_ptr chunked_array = out->column(0)->data(); + std::shared_ptr chunked_array = out->column(0); ASSERT_EQ(1, chunked_array->num_chunks()); AssertArraysEqual(*values, *chunked_array->chunk(0)); @@ -841,7 +838,7 @@ TYPED_TEST(TestParquetIO, SingleColumnTableRequiredChunkedWriteArrowIO) { ASSERT_EQ(1, out->num_columns()); ASSERT_EQ(values->length(), out->num_rows()); - std::shared_ptr chunked_array = out->column(0)->data(); + std::shared_ptr chunked_array = out->column(0); ASSERT_EQ(1, chunked_array->num_chunks()); AssertArraysEqual(*values, *chunked_array->chunk(0)); @@ -939,9 +936,7 @@ TYPED_TEST(TestParquetIO, CheckIterativeColumnRead) { } auto chunked = std::make_shared<::arrow::ChunkedArray>(batches); - auto chunked_col = - std::make_shared<::arrow::Column>(table->schema()->field(0), chunked); - auto chunked_table = ::arrow::Table::Make(table->schema(), {chunked_col}); + auto chunked_table = ::arrow::Table::Make(table->schema(), {chunked}); ASSERT_TRUE(table->Equals(*chunked_table)); } @@ -1099,7 +1094,7 @@ TEST_F(TestStringParquetIO, EmptyStringColumnRequiredWrite) { ASSERT_EQ(1, out->num_columns()); ASSERT_EQ(100, out->num_rows()); - std::shared_ptr chunked_array = out->column(0)->data(); + std::shared_ptr chunked_array = out->column(0); ASSERT_EQ(1, chunked_array->num_chunks()); AssertArraysEqual(*values, *chunked_array->chunk(0)); @@ -1124,7 +1119,7 @@ TEST_F(TestNullParquetIO, NullColumn) { ASSERT_EQ(1, out->num_columns()); ASSERT_EQ(num_rows, out->num_rows()); - std::shared_ptr chunked_array = out->column(0)->data(); + std::shared_ptr chunked_array = out->column(0); ASSERT_EQ(1, chunked_array->num_chunks()); AssertArraysEqual(*values, *chunked_array->chunk(0)); } @@ -1154,7 +1149,7 @@ TEST_F(TestNullParquetIO, NullListColumn) { ASSERT_EQ(1, out->num_columns()); ASSERT_EQ(offsets.size() - 1, out->num_rows()); - std::shared_ptr chunked_array = out->column(0)->data(); + std::shared_ptr chunked_array = out->column(0); ASSERT_EQ(1, chunked_array->num_chunks()); AssertArraysEqual(*list_array, *chunked_array->chunk(0)); } @@ -1181,7 +1176,7 @@ TEST_F(TestNullParquetIO, NullDictionaryColumn) { ASSERT_EQ(1, out->num_columns()); ASSERT_EQ(100, out->num_rows()); - std::shared_ptr chunked_array = out->column(0)->data(); + std::shared_ptr chunked_array = out->column(0); ASSERT_EQ(1, chunked_array->num_chunks()); std::shared_ptr expected_values = @@ -1243,7 +1238,7 @@ class TestPrimitiveParquetIO : public TestParquetIO { ASSERT_EQ(1, out->num_columns()); ASSERT_EQ(SMALL_SIZE, out->num_rows()); - std::shared_ptr chunked_array = out->column(0)->data(); + std::shared_ptr chunked_array = out->column(0); ASSERT_EQ(1, chunked_array->num_chunks()); ExpectArrayT(values.data(), 
chunked_array->chunk(0).get()); } @@ -1325,16 +1320,7 @@ void MakeDateTimeTypesTable(std::shared_ptr<Table>
* out, bool expected = false) ArrayFromVector<::arrow::Time64Type, int64_t>(f5->type(), is_valid, t64_us_values, &a5); ArrayFromVector<::arrow::Time64Type, int64_t>(f6->type(), is_valid, t64_ns_values, &a6); - std::vector> columns = { - std::make_shared("f0", a0), - std::make_shared("f1", a1), - std::make_shared("f2", a2), - std::make_shared("f3", (expected ? a3_x : a3)), - std::make_shared("f4", a4), - std::make_shared("f5", a5), - std::make_shared("f6", a6)}; - - *out = Table::Make(schema, columns); + *out = Table::Make(schema, {a0, a1, a2, expected ? a3_x : a3, a4, a5, a6}); } TEST(TestArrowReadWrite, DateTimeTypes) { @@ -1380,19 +1366,13 @@ TEST(TestArrowReadWrite, UseDeprecatedInt96) { // Each input is typed with a unique TimeUnit auto input_schema = schema( {field("f_s", t_s), field("f_ms", t_ms), field("f_us", t_us), field("f_ns", t_ns)}); - auto input = Table::Make( - input_schema, - {std::make_shared("f_s", a_s), std::make_shared("f_ms", a_ms), - std::make_shared("f_us", a_us), std::make_shared("f_ns", a_ns)}); + auto input = Table::Make(input_schema, {a_s, a_ms, a_us, a_ns}); // When reading parquet files, all int96 schema fields are converted to // timestamp nanoseconds auto ex_schema = schema({field("f_s", t_ns), field("f_ms", t_ns), field("f_us", t_ns), field("f_ns", t_ns)}); - auto ex_result = Table::Make( - ex_schema, - {std::make_shared("f_s", a_ns), std::make_shared("f_ms", a_ns), - std::make_shared("f_us", a_ns), std::make_shared("f_ns", a_ns)}); + auto ex_result = Table::Make(ex_schema, {a_ns, a_ns, a_ns, a_ns}); std::shared_ptr
result; ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip( @@ -1446,18 +1426,12 @@ TEST(TestArrowReadWrite, CoerceTimestamps) { // Input table, all data as is auto s1 = ::arrow::schema( {field("f_s", t_s), field("f_ms", t_ms), field("f_us", t_us), field("f_ns", t_ns)}); - auto input = Table::Make( - s1, - {std::make_shared("f_s", a_s), std::make_shared("f_ms", a_ms), - std::make_shared("f_us", a_us), std::make_shared("f_ns", a_ns)}); + auto input = Table::Make(s1, {a_s, a_ms, a_us, a_ns}); // Result when coercing to milliseconds auto s2 = ::arrow::schema({field("f_s", t_ms), field("f_ms", t_ms), field("f_us", t_ms), field("f_ns", t_ms)}); - auto ex_milli_result = Table::Make( - s2, - {std::make_shared("f_s", a_ms), std::make_shared("f_ms", a_ms), - std::make_shared("f_us", a_ms), std::make_shared("f_ns", a_ms)}); + auto ex_milli_result = Table::Make(s2, {a_ms, a_ms, a_ms, a_ms}); std::shared_ptr
<Table> milli_result; ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip( input, false /* use_threads */, input->num_rows(), {}, &milli_result, @@ -1469,10 +1443,7 @@ // Result when coercing to microseconds auto s3 = ::arrow::schema({field("f_s", t_us), field("f_ms", t_us), field("f_us", t_us), field("f_ns", t_us)}); - auto ex_micro_result = Table::Make( - s3, - {std::make_shared<Column>("f_s", a_us), std::make_shared<Column>("f_ms", a_us), - std::make_shared<Column>("f_us", a_us), std::make_shared<Column>("f_ns", a_us)}); + auto ex_micro_result = Table::Make(s3, {a_us, a_us, a_us, a_us}); std::shared_ptr<Table>
micro_result; ASSERT_NO_FATAL_FAILURE(DoSimpleRoundtrip( input, false /* use_threads */, input->num_rows(), {}, &micro_result, @@ -1514,15 +1485,10 @@ TEST(TestArrowReadWrite, CoerceTimestampsLosePrecision) { auto s3 = ::arrow::schema({field("f_us", t_us)}); auto s4 = ::arrow::schema({field("f_ns", t_ns)}); - auto c1 = std::make_shared<Column>("f_s", a_s); - auto c2 = std::make_shared<Column>("f_ms", a_ms); - auto c3 = std::make_shared<Column>("f_us", a_us); - auto c4 = std::make_shared<Column>("f_ns", a_ns); - - auto t1 = Table::Make(s1, {c1}); - auto t2 = Table::Make(s2, {c2}); - auto t3 = Table::Make(s3, {c3}); - auto t4 = Table::Make(s4, {c4}); + auto t1 = Table::Make(s1, {a_s}); + auto t2 = Table::Make(s2, {a_ms}); + auto t3 = Table::Make(s3, {a_us}); + auto t4 = Table::Make(s4, {a_ns}); auto sink = CreateOutputStream(); @@ -1594,12 +1560,9 @@ TEST(TestArrowReadWrite, ImplicitSecondToMillisecondTimestampCoercion) { auto si = schema({field("timestamp", t_s)}); auto sx = schema({field("timestamp", t_ms)}); - auto ci = std::make_shared<Column>("timestamp", a_s); - auto cx = std::make_shared<Column>("timestamp", a_ms); - - auto ti = Table::Make(si, {ci}); // input - auto tx = Table::Make(sx, {cx}); // expected output - std::shared_ptr<Table>
to; // actual output + auto ti = Table::Make(si, {a_s}); // input + auto tx = Table::Make(sx, {a_ms}); // expected output + std::shared_ptr<Table>
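All of these test simplifications lean on the Table::Make overload taking plain Arrays, which wraps each one as a single-chunk column. A minimal sketch mirroring the MakeSimpleTable helper further down:

```cpp
#include <memory>
#include <arrow/array.h>
#include <arrow/table.h>
#include <arrow/type.h>

// One Array in, one single-chunk column out; the schema supplies the name.
std::shared_ptr<arrow::Table> OneColumnTable(const std::shared_ptr<arrow::Array>& values) {
  auto schema = arrow::schema({arrow::field("col", values->type())});
  return arrow::Table::Make(schema, {values});
}
```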
to; // actual output // default properties (without explicit coercion instructions) used ... ASSERT_NO_FATAL_FAILURE( @@ -1635,14 +1598,9 @@ TEST(TestArrowReadWrite, ParquetVersionTimestampDifferences) { ArrayFromVector<::arrow::TimestampType, int64_t>(t_us, d_us, &a_us); ArrayFromVector<::arrow::TimestampType, int64_t>(t_ns, d_ns, &a_ns); - auto c_s = std::make_shared("ts:s", a_s); - auto c_ms = std::make_shared("ts:ms", a_ms); - auto c_us = std::make_shared("ts:us", a_us); - auto c_ns = std::make_shared("ts:ns", a_ns); - auto input_schema = schema({field("ts:s", t_s), field("ts:ms", t_ms), field("ts:us", t_us), field("ts:ns", t_ns)}); - auto input_table = Table::Make(input_schema, {c_s, c_ms, c_us, c_ns}); + auto input_table = Table::Make(input_schema, {a_s, a_ms, a_us, a_ns}); auto parquet_version_1_properties = ::parquet::default_writer_properties(); auto parquet_version_2_properties = ::parquet::WriterProperties::Builder() @@ -1654,7 +1612,7 @@ TEST(TestArrowReadWrite, ParquetVersionTimestampDifferences) { // and nanoseconds should be coerced to microseconds auto expected_schema = schema({field("ts:s", t_ms), field("ts:ms", t_ms), field("ts:us", t_us), field("ts:ns", t_us)}); - auto expected_table = Table::Make(expected_schema, {c_ms, c_ms, c_us, c_us}); + auto expected_table = Table::Make(expected_schema, {a_ms, a_ms, a_us, a_us}); ASSERT_NO_FATAL_FAILURE(CheckConfiguredRoundtrip(input_table, expected_table, parquet_version_1_properties)); } @@ -1663,7 +1621,7 @@ TEST(TestArrowReadWrite, ParquetVersionTimestampDifferences) { // and nanoseconds should be retained auto expected_schema = schema({field("ts:s", t_ms), field("ts:ms", t_ms), field("ts:us", t_us), field("ts:ns", t_ns)}); - auto expected_table = Table::Make(expected_schema, {c_ms, c_ms, c_us, c_ns}); + auto expected_table = Table::Make(expected_schema, {a_ms, a_ms, a_us, a_ns}); ASSERT_NO_FATAL_FAILURE(CheckConfiguredRoundtrip(input_table, expected_table, parquet_version_2_properties)); } @@ -1693,14 +1651,14 @@ TEST(TestArrowReadWrite, ParquetVersionTimestampDifferences) { // Using Parquet version 1.0, coercing to milliseconds or microseconds is allowed auto expected_schema = schema({field("ts:s", t_ms), field("ts:ms", t_ms), field("ts:us", t_ms), field("ts:ns", t_ms)}); - auto expected_table = Table::Make(expected_schema, {c_ms, c_ms, c_ms, c_ms}); + auto expected_table = Table::Make(expected_schema, {a_ms, a_ms, a_ms, a_ms}); ASSERT_NO_FATAL_FAILURE(CheckConfiguredRoundtrip(input_table, expected_table, parquet_version_1_properties, arrow_coerce_to_millis_properties)); expected_schema = schema({field("ts:s", t_us), field("ts:ms", t_us), field("ts:us", t_us), field("ts:ns", t_us)}); - expected_table = Table::Make(expected_schema, {c_us, c_us, c_us, c_us}); + expected_table = Table::Make(expected_schema, {a_us, a_us, a_us, a_us}); ASSERT_NO_FATAL_FAILURE(CheckConfiguredRoundtrip(input_table, expected_table, parquet_version_1_properties, arrow_coerce_to_micros_properties)); @@ -1709,14 +1667,14 @@ TEST(TestArrowReadWrite, ParquetVersionTimestampDifferences) { // Using Parquet version 2.0, coercing to milliseconds or microseconds is allowed auto expected_schema = schema({field("ts:s", t_ms), field("ts:ms", t_ms), field("ts:us", t_ms), field("ts:ns", t_ms)}); - auto expected_table = Table::Make(expected_schema, {c_ms, c_ms, c_ms, c_ms}); + auto expected_table = Table::Make(expected_schema, {a_ms, a_ms, a_ms, a_ms}); ASSERT_NO_FATAL_FAILURE(CheckConfiguredRoundtrip(input_table, expected_table, parquet_version_2_properties, 
arrow_coerce_to_millis_properties)); expected_schema = schema({field("ts:s", t_us), field("ts:ms", t_us), field("ts:us", t_us), field("ts:ns", t_us)}); - expected_table = Table::Make(expected_schema, {c_us, c_us, c_us, c_us}); + expected_table = Table::Make(expected_schema, {a_us, a_us, a_us, a_us}); ASSERT_NO_FATAL_FAILURE(CheckConfiguredRoundtrip(input_table, expected_table, parquet_version_2_properties, arrow_coerce_to_micros_properties)); @@ -1734,7 +1692,7 @@ TEST(TestArrowReadWrite, ParquetVersionTimestampDifferences) { // Using Parquet version 2.0, coercing to (int64) nanoseconds is allowed auto expected_schema = schema({field("ts:s", t_ns), field("ts:ms", t_ns), field("ts:us", t_ns), field("ts:ns", t_ns)}); - auto expected_table = Table::Make(expected_schema, {c_ns, c_ns, c_ns, c_ns}); + auto expected_table = Table::Make(expected_schema, {a_ns, a_ns, a_ns, a_ns}); ASSERT_NO_FATAL_FAILURE(CheckConfiguredRoundtrip(input_table, expected_table, parquet_version_2_properties, arrow_coerce_to_nanos_properties)); @@ -1747,7 +1705,7 @@ TEST(TestArrowReadWrite, ParquetVersionTimestampDifferences) { // storage is used auto expected_schema = schema({field("ts:s", t_ns), field("ts:ms", t_ns), field("ts:us", t_ns), field("ts:ns", t_ns)}); - auto expected_table = Table::Make(expected_schema, {c_ns, c_ns, c_ns, c_ns}); + auto expected_table = Table::Make(expected_schema, {a_ns, a_ns, a_ns, a_ns}); ASSERT_NO_FATAL_FAILURE(CheckConfiguredRoundtrip(input_table, expected_table, parquet_version_1_properties, arrow_enable_int96_properties)); @@ -1781,11 +1739,7 @@ TEST(TestArrowReadWrite, ConvertedDateTimeTypes) { ArrayFromVector<::arrow::Time32Type, int32_t>(f1->type(), is_valid, a1_values, &a1); ArrayFromVector<::arrow::Time32Type, int32_t>(f1->type(), a1_values, &a1_nonnull); - std::vector> columns = { - std::make_shared("f0", a0), std::make_shared("f1", a1), - std::make_shared("f2", a0_nonnull), - std::make_shared("f3", a1_nonnull)}; - auto table = Table::Make(schema, columns); + auto table = Table::Make(schema, {a0, a1, a0_nonnull, a1_nonnull}); // Expected schema and values auto e0 = field("f0", ::arrow::date32()); @@ -1802,11 +1756,7 @@ TEST(TestArrowReadWrite, ConvertedDateTimeTypes) { ArrayFromVector<::arrow::Time32Type, int32_t>(e1->type(), is_valid, x1_values, &x1); ArrayFromVector<::arrow::Time32Type, int32_t>(e1->type(), x1_values, &x1_nonnull); - std::vector> ex_columns = { - std::make_shared("f0", x0), std::make_shared("f1", x1), - std::make_shared("f2", x0_nonnull), - std::make_shared("f3", x1_nonnull)}; - auto ex_table = Table::Make(ex_schema, ex_columns); + auto ex_table = Table::Make(ex_schema, {x0, x1, x0_nonnull, x1_nonnull}); std::shared_ptr
<Table> result; ASSERT_NO_FATAL_FAILURE( @@ -1819,8 +1769,7 @@ void MakeDoubleTable(int num_columns, int num_rows, int nchunks, std::shared_ptr<Table>
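MakeDoubleTable now assembles its chunked columns by hand instead of via MakeColumn; the pattern generalizes to any repeated-chunk column. A hedged sketch:

```cpp
#include <memory>
#include <arrow/array.h>
#include <arrow/table.h>

// Repeat one array nchunks times as the chunks of a single column.
std::shared_ptr<arrow::ChunkedArray> RepeatChunks(
    const std::shared_ptr<arrow::Array>& values, int nchunks) {
  arrow::ArrayVector chunks(static_cast<size_t>(nchunks), values);
  return std::make_shared<arrow::ChunkedArray>(chunks);
}
```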
* out) { - std::shared_ptr<::arrow::Column> column; - std::vector> columns(num_columns); + std::vector> columns(num_columns); std::vector> fields(num_columns); for (int i = 0; i < num_columns; ++i) { @@ -1834,10 +1783,8 @@ void MakeDoubleTable(int num_columns, int num_rows, int nchunks, for (int j = 0; j < nchunks; ++j) { arrays.push_back(values); } - column = MakeColumn(ss.str(), arrays, true); - - columns[i] = column; - fields[i] = column->field(); + columns[i] = std::make_shared(arrays); + fields[i] = ::arrow::field(ss.str(), values->type()); } auto schema = std::make_shared<::arrow::Schema>(fields); *out = Table::Make(schema, columns); @@ -2000,11 +1947,11 @@ TEST(TestArrowReadWrite, ReadColumnSubset) { ASSERT_NO_FATAL_FAILURE( DoSimpleRoundtrip(table, use_threads, table->num_rows(), column_subset, &result)); - std::vector> ex_columns; + std::vector> ex_columns; std::vector> ex_fields; for (int i : column_subset) { ex_columns.push_back(table->column(i)); - ex_fields.push_back(table->column(i)->field()); + ex_fields.push_back(table->field(i)); } auto ex_schema = ::arrow::schema(ex_fields); @@ -2057,11 +2004,7 @@ TEST(TestArrowReadWrite, ListLargeRecords) { pieces.push_back(chunked_piece->chunk(0)); } auto chunked = std::make_shared<::arrow::ChunkedArray>(pieces); - - auto chunked_col = - std::make_shared<::arrow::Column>(table->schema()->field(0), chunked); - std::vector> columns = {chunked_col}; - auto chunked_table = Table::Make(table->schema(), columns); + auto chunked_table = Table::Make(table->schema(), {chunked}); ASSERT_TRUE(table->Equals(*chunked_table)); } @@ -2146,8 +2089,7 @@ TEST(TestArrowReadWrite, TableWithChunkedColumns) { auto field = ::arrow::field("fname", type); auto schema = ::arrow::schema({field}); - auto col = std::make_shared<::arrow::Column>(field, arrays); - auto table = Table::Make(schema, {col}); + auto table = Table::Make(schema, {std::make_shared(arrays)}); ASSERT_NO_FATAL_FAILURE(CheckSimpleRoundtrip(table, 2)); ASSERT_NO_FATAL_FAILURE(CheckSimpleRoundtrip(table, 3)); @@ -2171,8 +2113,7 @@ TEST(TestArrowReadWrite, TableWithDuplicateColumns) { ArrayFromVector<::arrow::Int8Type, int8_t>(a0_values, &a0); ArrayFromVector<::arrow::Int16Type, int16_t>(a1_values, &a1); - auto table = Table::Make(schema, {std::make_shared(f0->name(), a0), - std::make_shared(f1->name(), a1)}); + auto table = Table::Make(schema, {a0, a1}); ASSERT_NO_FATAL_FAILURE(CheckSimpleRoundtrip(table, table->num_rows())); } @@ -2207,9 +2148,8 @@ TEST(TestArrowReadWrite, DictionaryColumnChunkedWrite) { std::make_shared<::arrow::DictionaryArray>(dict_type, f0_values, dict_values), std::make_shared<::arrow::DictionaryArray>(dict_type, f1_values, dict_values)}; - std::vector> columns; - auto column = MakeColumn("dictionary", dict_arrays, true); - columns.emplace_back(column); + std::vector> columns; + columns.emplace_back(std::make_shared(dict_arrays)); auto table = Table::Make(schema, columns); @@ -2230,7 +2170,7 @@ TEST(TestArrowReadWrite, DictionaryColumnChunkedWrite) { // The column name gets changed on output to the name of the // field, and it also turns into a nullable column - columns.emplace_back(MakeColumn("dictionary", expected_array, true)); + columns.emplace_back(std::make_shared(expected_array)); schema = ::arrow::schema({::arrow::field("dictionary", ::arrow::utf8())}); @@ -2320,11 +2260,9 @@ class TestNestedSchemaRead : public ::testing::TestWithParam { void ValidateTableArrayTypes(const Table& table) { for (int i = 0; i < table.num_columns(); i++) { const 
std::shared_ptr<::arrow::Field> schema_field = table.schema()->field(i); - const std::shared_ptr column = table.column(i); - // Compare with the column field - ASSERT_TRUE(schema_field->Equals(column->field())); + const std::shared_ptr column = table.column(i); // Compare with the array type - ASSERT_TRUE(schema_field->type()->Equals(column->data()->chunk(0)->type())); + ASSERT_TRUE(schema_field->type()->Equals(column->chunk(0)->type())); } } @@ -2519,13 +2457,13 @@ TEST_F(TestNestedSchemaRead, ReadIntoTableFull) { ASSERT_NO_FATAL_FAILURE(ValidateTableArrayTypes(*table)); auto struct_field_array = - std::static_pointer_cast<::arrow::StructArray>(table->column(0)->data()->chunk(0)); + std::static_pointer_cast<::arrow::StructArray>(table->column(0)->chunk(0)); auto leaf1_array = std::static_pointer_cast<::arrow::Int32Array>(struct_field_array->field(0)); auto leaf2_array = std::static_pointer_cast<::arrow::Int32Array>(struct_field_array->field(1)); auto leaf3_array = - std::static_pointer_cast<::arrow::Int32Array>(table->column(1)->data()->chunk(0)); + std::static_pointer_cast<::arrow::Int32Array>(table->column(1)->chunk(0)); // validate struct and leaf arrays @@ -2599,8 +2537,8 @@ TEST_P(TestNestedSchemaRead, DeepNestedSchemaRead) { const int num_trees = 3; const int depth = 3; #else - const int num_trees = 5; - const int depth = 5; + const int num_trees = 2; + const int depth = 2; #endif const int num_children = 3; int num_rows = SMALL_SIZE * (depth + 2); @@ -2613,7 +2551,7 @@ TEST_P(TestNestedSchemaRead, DeepNestedSchemaRead) { DeepParquetTestVisitor visitor(GetParam(), values_array_); for (int i = 0; i < table->num_columns(); i++) { - auto tree = table->column(i)->data()->chunk(0); + auto tree = table->column(i)->chunk(0); ASSERT_OK_NO_THROW(visitor.Validate(tree)); } } @@ -2670,7 +2608,8 @@ TEST(TestArrowReaderAdHoc, DISABLED_LargeStringColumn) { } std::shared_ptr array; ASSERT_OK(builder.Finish(&array)); - auto table = Table::Make({std::make_shared("x", array)}); + auto table = + Table::Make(::arrow::schema({::arrow::field("x", array->type())}), {array}); std::shared_ptr schm; ASSERT_OK_NO_THROW( ToParquetSchema(table->schema().get(), *default_writer_properties(), &schm)); @@ -2740,10 +2679,9 @@ TEST_P(TestArrowReaderAdHocSparkAndHvr, ReadDecimals) { auto value_column = table->column(0); ASSERT_EQ(expected_length, value_column->length()); - auto raw_array = value_column->data(); - ASSERT_EQ(1, raw_array->num_chunks()); + ASSERT_EQ(1, value_column->num_chunks()); - auto chunk = raw_array->chunk(0); + auto chunk = value_column->chunk(0); std::shared_ptr expected_array; diff --git a/cpp/src/parquet/arrow/reader-writer-benchmark.cc b/cpp/src/parquet/arrow/reader-writer-benchmark.cc index d035e1ce8d6..239d707e231 100644 --- a/cpp/src/parquet/arrow/reader-writer-benchmark.cc +++ b/cpp/src/parquet/arrow/reader-writer-benchmark.cc @@ -113,8 +113,7 @@ std::shared_ptr<::arrow::Table> TableFromVector( auto field = ::arrow::field("column", type, nullable); auto schema = ::arrow::schema({field}); - auto column = std::make_shared<::arrow::Column>(field, array); - return ::arrow::Table::Make(schema, {column}); + return ::arrow::Table::Make(schema, {array}); } template <> @@ -136,8 +135,7 @@ std::shared_ptr<::arrow::Table> TableFromVector(const std::vector( std::vector>({field})); - auto column = std::make_shared<::arrow::Column>(field, array); - return ::arrow::Table::Make(schema, {column}); + return ::arrow::Table::Make(schema, {array}); } template diff --git a/cpp/src/parquet/arrow/reader.cc 
b/cpp/src/parquet/arrow/reader.cc index f757b5ff847..e789b949cea 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -56,7 +56,6 @@ using arrow::Array; using arrow::BooleanArray; using arrow::ChunkedArray; -using arrow::Column; using arrow::Field; using arrow::Int32Array; using arrow::ListArray; @@ -191,12 +190,10 @@ class RowGroupRecordBatchReader : public ::arrow::RecordBatchReader { // TODO (hatemhelal): Consider refactoring this to share logic with ReadTable as this // does not currently honor the use_threads option. - std::vector> columns(column_indices_.size()); + std::vector> columns(column_indices_.size()); for (size_t i = 0; i < column_indices_.size(); ++i) { - std::shared_ptr array; - RETURN_NOT_OK(column_readers_[i]->NextBatch(batch_size_, &array)); - columns[i] = std::make_shared(schema_->field(static_cast(i)), array); + RETURN_NOT_OK(column_readers_[i]->NextBatch(batch_size_, &columns[i])); } // Create an intermediate table and use TableBatchReader as an adaptor to a @@ -278,7 +275,7 @@ class FileReader::Impl { std::vector GetDictionaryIndices(const std::vector& indices); std::shared_ptr<::arrow::Schema> FixSchema( const ::arrow::Schema& old_schema, const std::vector& dict_indices, - std::vector>& columns); + const std::vector>& columns); int64_t batch_size() const { return reader_properties_.batch_size(); } @@ -548,15 +545,14 @@ Status FileReader::Impl::ReadRowGroup(int row_group_index, return Status::Invalid("Invalid column index"); } int num_fields = static_cast(field_indices.size()); - std::vector> columns(num_fields); + std::vector> columns(num_fields); // TODO(wesm): Refactor to share more code with ReadTable - auto ReadColumnFunc = [&indices, &field_indices, &row_group_index, &schema, &columns, + auto ReadColumnFunc = [&indices, &field_indices, &row_group_index, &columns, this](int i) { - std::shared_ptr array; - RETURN_NOT_OK(ReadColumnChunk(field_indices[i], indices, row_group_index, &array)); - columns[i] = std::make_shared(schema->field(i), array); + RETURN_NOT_OK( + ReadColumnChunk(field_indices[i], indices, row_group_index, &columns[i])); return Status::OK(); }; @@ -606,13 +602,10 @@ Status FileReader::Impl::ReadTable(const std::vector& indices, } int num_fields = static_cast(field_indices.size()); - std::vector> columns(num_fields); + std::vector> columns(num_fields); - auto ReadColumnFunc = [&indices, &field_indices, &schema, &columns, this](int i) { - std::shared_ptr array; - RETURN_NOT_OK(ReadSchemaField(field_indices[i], indices, &array)); - columns[i] = std::make_shared(schema->field(i), array); - return Status::OK(); + auto ReadColumnFunc = [&indices, &field_indices, &columns, this](int i) { + return ReadSchemaField(field_indices[i], indices, &columns[i]); }; if (reader_properties_.use_threads()) { @@ -697,18 +690,13 @@ std::vector FileReader::Impl::GetDictionaryIndices(const std::vector& std::shared_ptr<::arrow::Schema> FileReader::Impl::FixSchema( const ::arrow::Schema& old_schema, const std::vector& dict_indices, - std::vector>& columns) { + const std::vector>& columns) { // Fix the schema with the actual DictionaryType that was read auto fields = old_schema.fields(); for (int idx : dict_indices) { - auto name = columns[idx]->name(); - auto dict_array = columns[idx]->data(); - auto dict_field = std::make_shared<::arrow::Field>(name, dict_array->type()); - fields[idx] = dict_field; - columns[idx] = std::make_shared(dict_field, dict_array); + fields[idx] = old_schema.field(idx)->WithType(columns[idx]->type()); } - return 
std::make_shared<::arrow::Schema>(fields, old_schema.metadata()); } @@ -1740,8 +1728,9 @@ void StructImpl::InitField( for (size_t i = 0; i < children.size(); i++) { fields[i] = children[i]->field(); } + auto type = ::arrow::struct_(fields); - field_ = ::arrow::field(node->name(), type); + field_ = ::arrow::field(node->name(), type, node->is_optional()); } Status StructImpl::GetRepLevels(const int16_t** data, size_t* length) { diff --git a/cpp/src/parquet/arrow/test-util.h b/cpp/src/parquet/arrow/test-util.h index b99e28f5e03..c50dfd6fc29 100644 --- a/cpp/src/parquet/arrow/test-util.h +++ b/cpp/src/parquet/arrow/test-util.h @@ -429,26 +429,11 @@ Status MakeEmptyListsArray(int64_t size, std::shared_ptr* out_array) { return Status::OK(); } -static inline std::shared_ptr<::arrow::Column> MakeColumn( - const std::string& name, const std::shared_ptr& array, bool nullable) { - auto field = ::arrow::field(name, array->type(), nullable); - return std::make_shared<::arrow::Column>(field, array); -} - -static inline std::shared_ptr<::arrow::Column> MakeColumn( - const std::string& name, const std::vector>& arrays, - bool nullable) { - auto field = ::arrow::field(name, arrays[0]->type(), nullable); - return std::make_shared<::arrow::Column>(field, arrays); -} - std::shared_ptr<::arrow::Table> MakeSimpleTable(const std::shared_ptr& values, bool nullable) { - std::shared_ptr<::arrow::Column> column = MakeColumn("col", values, nullable); - std::vector> columns({column}); - std::vector> fields({column->field()}); - auto schema = std::make_shared<::arrow::Schema>(fields); - return ::arrow::Table::Make(schema, columns); + auto carr = std::make_shared<::arrow::ChunkedArray>(values); + auto schema = ::arrow::schema({::arrow::field("col", values->type(), nullable)}); + return ::arrow::Table::Make(schema, {carr}); } template diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index 91811203f92..9c487b97e2e 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -1195,8 +1195,7 @@ Status FileWriter::WriteTable(const Table& table, int64_t chunk_size) { auto WriteRowGroup = [&](int64_t offset, int64_t size) { RETURN_NOT_OK(NewRowGroup(size)); for (int i = 0; i < table.num_columns(); i++) { - auto chunked_data = table.column(i)->data(); - RETURN_NOT_OK(WriteColumnChunk(chunked_data, offset, size)); + RETURN_NOT_OK(WriteColumnChunk(table.column(i), offset, size)); } return Status::OK(); }; diff --git a/docs/source/cpp/api/table.rst b/docs/source/cpp/api/table.rst index e8b4f8e066e..53e2d72e672 100644 --- a/docs/source/cpp/api/table.rst +++ b/docs/source/cpp/api/table.rst @@ -19,23 +19,6 @@ Two-dimensional Datasets ======================== -Columns -======= - -.. doxygenclass:: arrow::Column - :project: arrow_cpp - :members: - -Tables -====== - -.. doxygenclass:: arrow::Table - :project: arrow_cpp - :members: - -.. doxygenfunction:: arrow::ConcatenateTables - :project: arrow_cpp - Record Batches ============== @@ -50,3 +33,13 @@ Record Batches .. doxygenclass:: arrow::TableBatchReader :project: arrow_cpp :members: + +Tables +====== + +.. doxygenclass:: arrow::Table + :project: arrow_cpp + :members: + +.. 
doxygenfunction:: arrow::ConcatenateTables + :project: arrow_cpp diff --git a/docs/source/cpp/overview.rst b/docs/source/cpp/overview.rst index 490efc1b7a2..53fc998eae6 100644 --- a/docs/source/cpp/overview.rst +++ b/docs/source/cpp/overview.rst @@ -51,10 +51,8 @@ The two-dimensional layer **Schemas** describe a logical collection of several pieces of data, each with a distinct name and type, and optional metadata. -**Columns** are like chunked arrays, but with optional metadata. - -**Tables** are collections of columns in accordance to a schema. They are -the most capable dataset-providing abstraction in Arrow. +**Tables** are collections of chunked array in accordance to a schema. They +are the most capable dataset-providing abstraction in Arrow. **Record batches** are collections of contiguous arrays, described by a schema. They allow incremental construction or serialization of tables. diff --git a/docs/source/cpp/tables.rst b/docs/source/cpp/tables.rst index d42f0c6c4f5..e929c6eecd8 100644 --- a/docs/source/cpp/tables.rst +++ b/docs/source/cpp/tables.rst @@ -56,20 +56,13 @@ function overloads:: field_b = arrow::field("B", arrow::utf8()); schema = arrow::schema({field_a, field_b}); -Columns -======= - -A :class:`arrow::Column` is a chunked array tied together with a field. -The field describes the column's name (for lookup in a larger dataset) -and its metadata. - Tables ====== -A :class:`arrow::Table` is a two-dimensional dataset of a number of columns, -together with a schema. The columns' names and types must match the schema. -Also, each column must have the same logical length in number of elements -(although each column can be chunked in a different way). +A :class:`arrow::Table` is a two-dimensional dataset with chunked arrays for +columns, together with a schema providing field names. Also, each chunked +column must have the same logical length in number of elements (although each +column can be chunked in a different way). Record Batches ============== diff --git a/docs/source/python/api/tables.rst b/docs/source/python/api/tables.rst index 9d350a402ed..501230f8fa6 100644 --- a/docs/source/python/api/tables.rst +++ b/docs/source/python/api/tables.rst @@ -29,7 +29,6 @@ Factory Functions :toctree: ../generated/ table - column chunked_array concat_tables @@ -40,7 +39,6 @@ Classes :toctree: ../generated/ ChunkedArray - Column RecordBatch Table diff --git a/docs/source/python/data.rst b/docs/source/python/data.rst index 3260f6d6377..cb1eb6b1a7d 100644 --- a/docs/source/python/data.rst +++ b/docs/source/python/data.rst @@ -394,16 +394,15 @@ one or more copies of the batch using ``Table.from_batches``: table table.num_rows -The table's columns are instances of :class:`~.Column`, which is a container -for one or more arrays of the same type. +The table's columns are instances of :class:`~.ChunkedArray`, which is a +container for one or more arrays of the same type. .. ipython:: python c = table[0] c - c.data - c.data.num_chunks - c.data.chunk(0) + c.num_chunks + c.chunk(0) As you'll see in the :ref:`pandas section `, we can convert these objects to contiguous NumPy arrays for use in pandas: @@ -421,7 +420,7 @@ Multiple tables can also be concatenated together to form a single table using table_all = pa.concat_tables(tables) table_all.num_rows c = table_all[0] - c.data.num_chunks + c.num_chunks This is similar to ``Table.from_batches``, but uses tables as input instead of record batches. 
Record batches can be made into tables, but not the other way diff --git a/docs/source/python/extending.rst b/docs/source/python/extending.rst index f15b1bedbac..4ee20c77aee 100644 --- a/docs/source/python/extending.rst +++ b/docs/source/python/extending.rst @@ -81,11 +81,6 @@ C++ objects. Return whether *obj* wraps an Arrow C++ :class:`Buffer` pointer; in other words, whether *obj* is a :py:class:`pyarrow.Buffer` instance. -.. function:: bool is_column(PyObject* obj) - - Return whether *obj* wraps an Arrow C++ :class:`Column` pointer; - in other words, whether *obj* is a :py:class:`pyarrow.Column` instance. - .. function:: bool is_data_type(PyObject* obj) Return whether *obj* wraps an Arrow C++ :class:`DataType` pointer; @@ -139,10 +134,6 @@ occurred. If successful, *out* is guaranteed to be non-NULL. Unwrap the Arrow C++ :class:`Buffer` pointer from *obj* and put it in *out*. -.. function:: Status unwrap_column(PyObject* obj, std::shared_ptr* out) - - Unwrap the Arrow C++ :class:`Column` pointer from *obj* and put it in *out*. - .. function:: Status unwrap_data_type(PyObject* obj, std::shared_ptr* out) Unwrap the Arrow C++ :class:`DataType` pointer from *obj* and put it in *out*. @@ -187,10 +178,6 @@ On error, NULL is returned and a Python exception is set. Wrap the Arrow C++ *buffer* in a :py:class:`pyarrow.Buffer` instance. -.. function:: PyObject* wrap_column(const std::shared_ptr& column) - - Wrap the Arrow C++ *column* in a :py:class:`pyarrow.Column` instance. - .. function:: PyObject* wrap_data_type(const std::shared_ptr& data_type) Wrap the Arrow C++ *data_type* in a :py:class:`pyarrow.DataType` instance. @@ -259,10 +246,6 @@ an exception) if the input is not of the right type. Unwrap the Arrow C++ :cpp:class:`Buffer` pointer from *obj*. -.. function:: pyarrow_unwrap_column(obj) -> shared_ptr[CColumn] - - Unwrap the Arrow C++ :cpp:class:`Column` pointer from *obj*. - .. function:: pyarrow_unwrap_data_type(obj) -> shared_ptr[CDataType] Unwrap the Arrow C++ :cpp:class:`CDataType` pointer from *obj*. @@ -306,10 +289,6 @@ pyarray object of the corresponding type. An exception is raised on error. Wrap the Arrow C++ *buffer* in a Python :class:`pyarrow.Buffer` instance. -.. function:: pyarrow_wrap_column(sp_array: const shared_ptr[CColumn]& column) -> object - - Wrap the Arrow C++ *column* in a Python :class:`pyarrow.Column` instance. - .. function:: pyarrow_wrap_data_type(sp_array: const shared_ptr[CDataType]& data_type) -> object Wrap the Arrow C++ *data_type* in a Python :class:`pyarrow.DataType` instance. diff --git a/matlab/src/feather_reader.cc b/matlab/src/feather_reader.cc index 1c1b21cd66b..484c300e0e4 100644 --- a/matlab/src/feather_reader.cc +++ b/matlab/src/feather_reader.cc @@ -37,46 +37,38 @@ namespace matlab { namespace internal { // Read the name of variable i from the Feather file as a mxArray*. 
-mxArray* ReadVariableName(const std::shared_ptr& column) { - return matlab::util::ConvertUTF8StringToUTF16CharMatrix(column->name()); +mxArray* ReadVariableName(const std::string& column_name) { + return matlab::util::ConvertUTF8StringToUTF16CharMatrix(column_name); } template -mxArray* ReadNumericVariableData(const std::shared_ptr& column) { +mxArray* ReadNumericVariableData(const std::shared_ptr& column) { using MatlabType = typename MatlabTraits::MatlabType; using ArrowArrayType = typename TypeTraits::ArrayType; - std::shared_ptr chunked_array = column->data(); - const int32_t num_chunks = chunked_array->num_chunks(); - const mxClassID matlab_class_id = MatlabTraits::matlab_class_id; // Allocate a numeric mxArray* with the correct mxClassID based on the type of the - // arrow::Column. + // arrow::Array. mxArray* variable_data = mxCreateNumericMatrix(column->length(), 1, matlab_class_id, mxREAL); - int64_t mx_array_offset = 0; - // Iterate over each arrow::Array in the arrow::ChunkedArray. - for (int32_t i = 0; i < num_chunks; ++i) { - std::shared_ptr array = chunked_array->chunk(i); - const int64_t chunk_length = array->length(); - std::shared_ptr integer_array = std::static_pointer_cast(array); - - // Get a raw pointer to the Arrow array data. - const MatlabType* source = integer_array->raw_values(); - - // Get a mutable pointer to the MATLAB array data and std::copy the - // Arrow array data into it. - MatlabType* destination = MatlabTraits::GetData(variable_data); - std::copy(source, source + chunk_length, destination + mx_array_offset); - mx_array_offset += chunk_length; - } + std::shared_ptr integer_array = + std::static_pointer_cast(column); + + // Get a raw pointer to the Arrow array data. + const MatlabType* source = integer_array->raw_values(); + + // Get a mutable pointer to the MATLAB array data and std::copy the + // Arrow array data into it. + MatlabType* destination = MatlabTraits::GetData(variable_data); + std::copy(source, source + column->length(), destination); return variable_data; } // Read the data of variable i from the Feather file as a mxArray*. -mxArray* ReadVariableData(const std::shared_ptr& column) { +mxArray* ReadVariableData(const std::shared_ptr& column, + const std::string& column_name) { std::shared_ptr type = column->type(); switch (type->id()) { @@ -103,7 +95,7 @@ mxArray* ReadVariableData(const std::shared_ptr& column) { default: { mexErrMsgIdAndTxt("MATLAB:arrow:UnsupportedArrowType", "Unsupported arrow::Type '%s' for variable '%s'", - type->name().c_str(), column->name().c_str()); + type->name().c_str(), column_name.c_str()); break; } } @@ -125,22 +117,22 @@ void BitUnpackBuffer(const std::shared_ptr& source, int64_t length, arrow::internal::VisitBitsUnrolled(source_data, start_offset, length, visitFcn); } -// Populates the validity bitmap from an arrow::Array or an arrow::Column, +// Populates the validity bitmap from an arrow::Array. // writes to a zero-initialized destination buffer. // Implements a fast path for the fully-valid and fully-invalid cases. // Returns true if the destination buffer was successfully populated. -template -bool TryBitUnpackFastPath(const std::shared_ptr& array, mxLogical* destination) { +bool TryBitUnpackFastPath(const std::shared_ptr& array, + mxLogical* destination) { const int64_t null_count = array->null_count(); const int64_t length = array->length(); if (null_count == length) { - // The source array/column is filled with invalid values. 
Since mxCreateLogicalMatrix + // The source array is filled with invalid values. Since mxCreateLogicalMatrix // zero-initializes the destination buffer, we can return without changing anything // in the destination buffer. return true; } else if (null_count == 0) { - // The source array/column contains only valid values. Fill the destination buffer + // The source array contains only valid values. Fill the destination buffer // with 'true'. std::fill(destination, destination + length, true); return true; @@ -152,7 +144,7 @@ bool TryBitUnpackFastPath(const std::shared_ptr& array, mxLogical* de // Read the validity (null) bitmap of variable i from the Feather // file as an mxArray*. -mxArray* ReadVariableValidityBitmap(const std::shared_ptr& column) { +mxArray* ReadVariableValidityBitmap(const std::shared_ptr& column) { // Allocate an mxLogical array to store the validity (null) bitmap values. // Note: All Arrow arrays can have an associated validity (null) bitmap. // The Apache Arrow specification defines 0 (false) to represent an @@ -161,38 +153,17 @@ mxArray* ReadVariableValidityBitmap(const std::shared_ptr& column) { mxArray* validity_bitmap = mxCreateLogicalMatrix(column->length(), 1); mxLogical* validity_bitmap_unpacked = mxGetLogicals(validity_bitmap); - // The Apache Arrow specification allows validity (null) bitmaps - // to be unallocated if there are no null values. In this case, - // we simply return a logical array filled with the value true. - if (TryBitUnpackFastPath(column, validity_bitmap_unpacked)) { - // Return early since the validity bitmap was already filled. - return validity_bitmap; - } - - std::shared_ptr chunked_array = column->data(); - const int32_t num_chunks = chunked_array->num_chunks(); - - int64_t mx_array_offset = 0; - // Iterate over each arrow::Array in the arrow::ChunkedArray. - for (int32_t chunk_index = 0; chunk_index < num_chunks; ++chunk_index) { - std::shared_ptr array = chunked_array->chunk(chunk_index); - const int64_t array_length = array->length(); - - if (!TryBitUnpackFastPath(array, validity_bitmap_unpacked + mx_array_offset)) { - // Couldn't fill the full validity bitmap at once. Call an optimized loop-unrolled - // implementation instead that goes byte-by-byte and populates the validity bitmap. - BitUnpackBuffer(array->null_bitmap(), array_length, - validity_bitmap_unpacked + mx_array_offset); - } - - mx_array_offset += array_length; + if (!TryBitUnpackFastPath(column, validity_bitmap_unpacked)) { + // Couldn't fill the full validity bitmap at once. Call an optimized loop-unrolled + // implementation instead that goes byte-by-byte and populates the validity bitmap. + BitUnpackBuffer(column->null_bitmap(), column->length(), validity_bitmap_unpacked); } return validity_bitmap; } -// Read the type name of an Arrow column as an mxChar array. -mxArray* ReadVariableType(const std::shared_ptr& column) { +// Read the type name of an arrow::Array as an mxChar array. +mxArray* ReadVariableType(const std::shared_ptr& column) { return util::ConvertUTF8StringToUTF16CharMatrix(column->type()->name()); } @@ -204,18 +175,18 @@ static constexpr uint64_t MAX_MATLAB_SIZE = static_cast(0x01) << 48; Status FeatherReader::Open(const std::string& filename, std::shared_ptr* feather_reader) { *feather_reader = std::shared_ptr(new FeatherReader()); - + // Open file with given filename as a ReadableFile. std::shared_ptr readable_file(nullptr); - + RETURN_NOT_OK(io::ReadableFile::Open(filename, &readable_file)); - + // TableReader expects a RandomAccessFile. 
std::shared_ptr random_access_file(readable_file); // Open the Feather file for reading with a TableReader. - RETURN_NOT_OK(ipc::feather::TableReader::Open( - random_access_file, &(*feather_reader)->table_reader_)); + RETURN_NOT_OK(ipc::feather::TableReader::Open(random_access_file, + &(*feather_reader)->table_reader_)); // Read the table metadata from the Feather file. (*feather_reader)->num_rows_ = (*feather_reader)->table_reader_->num_rows(); @@ -273,14 +244,20 @@ mxArray* FeatherReader::ReadVariables() const { // Read all the table variables in the Feather file into memory. for (int64_t i = 0; i < num_variables_; ++i) { - std::shared_ptr column(nullptr); + std::shared_ptr column; util::HandleStatus(table_reader_->GetColumn(i, &column)); + if (column->num_chunks() != 1) { + mexErrMsgIdAndTxt("MATLAB:arrow:FeatherReader::ReadVariables", + "Chunked columns not yet supported"); + } + std::shared_ptr chunk = column->chunk(0); + const std::string column_name = table_reader_->GetColumnName(i); // set the struct fields data - mxSetField(variables, i, "Name", internal::ReadVariableName(column)); - mxSetField(variables, i, "Type", internal::ReadVariableType(column)); - mxSetField(variables, i, "Data", internal::ReadVariableData(column)); - mxSetField(variables, i, "Valid", internal::ReadVariableValidityBitmap(column)); + mxSetField(variables, i, "Name", internal::ReadVariableName(column_name)); + mxSetField(variables, i, "Type", internal::ReadVariableType(chunk)); + mxSetField(variables, i, "Data", internal::ReadVariableData(chunk, column_name)); + mxSetField(variables, i, "Valid", internal::ReadVariableValidityBitmap(chunk)); } return variables; diff --git a/matlab/src/util/handle_status.cc b/matlab/src/util/handle_status.cc index 992f2c31d37..f1c3b7f2598 100644 --- a/matlab/src/util/handle_status.cc +++ b/matlab/src/util/handle_status.cc @@ -79,31 +79,6 @@ void HandleStatus(const Status& status) { status.ToString().c_str()); break; } - case StatusCode::PythonError: { - mexErrMsgIdAndTxt("MATLAB:arrow:status:PythonError", arrow_error_message, - status.ToString().c_str()); - break; - } - case StatusCode::PlasmaObjectExists: { - mexErrMsgIdAndTxt("MATLAB:arrow:status:PlasmaObjectExists", arrow_error_message, - status.ToString().c_str()); - break; - } - case StatusCode::PlasmaObjectNonexistent: { - mexErrMsgIdAndTxt("MATLAB:arrow:status:PlasmaObjectNonexistent", - arrow_error_message, status.ToString().c_str()); - break; - } - case StatusCode::PlasmaStoreFull: { - mexErrMsgIdAndTxt("MATLAB:arrow:status:PlasmaStoreFull", arrow_error_message, - status.ToString().c_str()); - break; - } - case StatusCode::PlasmaObjectAlreadySealed: { - mexErrMsgIdAndTxt("MATLAB:arrow:status:PlasmaObjectAlreadySealed", - arrow_error_message, status.ToString().c_str()); - break; - } default: { mexErrMsgIdAndTxt("MATLAB:arrow:status:UnknownStatus", arrow_error_message, "Unknown status"); diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 9ef809bfcbf..bc49e1733ca 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -65,7 +65,7 @@ def parse_git(root, **kwargs): Schema, schema, Array, Tensor, - array, chunked_array, column, table, + array, chunked_array, table, SparseTensorCSR, SparseTensorCOO, infer_type, from_numpy_dtype, NullArray, @@ -111,7 +111,7 @@ def parse_git(root, **kwargs): create_memory_map, have_libhdfs, have_libhdfs3, MockOutputStream, input_stream, output_stream) -from pyarrow.lib import (ChunkedArray, Column, RecordBatch, Table, +from pyarrow.lib import 
(ChunkedArray, RecordBatch, Table, concat_arrays, concat_tables) # Exceptions diff --git a/python/pyarrow/compat.py b/python/pyarrow/compat.py index e37307c744c..e9572d83080 100644 --- a/python/pyarrow/compat.py +++ b/python/pyarrow/compat.py @@ -74,6 +74,9 @@ def tobytes(o): else: return o + def u_utf8(s): + return s.decode('utf-8') + def frombytes(o): return o @@ -112,6 +115,11 @@ def tobytes(o): else: return o + def u_utf8(s): + if isinstance(s, bytes): + return frombytes(s) + return s + def frombytes(o): return o.decode('utf8') diff --git a/python/pyarrow/feather.pxi b/python/pyarrow/feather.pxi index 20b12c1e35d..6fd13bc04b4 100644 --- a/python/pyarrow/feather.pxi +++ b/python/pyarrow/feather.pxi @@ -98,12 +98,12 @@ cdef class FeatherReader: if i < 0 or i >= self.num_columns: raise IndexError(i) - cdef shared_ptr[CColumn] sp_column + cdef shared_ptr[CChunkedArray] sp_chunked_array with nogil: check_status(self.reader.get() - .GetColumn(i, &sp_column)) + .GetColumn(i, &sp_chunked_array)) - return pyarrow_wrap_column(sp_column) + return pyarrow_wrap_chunked_array(sp_chunked_array) def _read(self): cdef shared_ptr[CTable] sp_table diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index e8df30489a2..282572e6964 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -490,29 +490,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CChunkedArray] Slice(int64_t offset, int64_t length) const shared_ptr[CChunkedArray] Slice(int64_t offset) const - CStatus Validate() const - - cdef cppclass CColumn" arrow::Column": - CColumn(const shared_ptr[CField]& field, - const shared_ptr[CArray]& data) - - CColumn(const shared_ptr[CField]& field, - const vector[shared_ptr[CArray]]& chunks) - - CColumn(const shared_ptr[CField]& field, - const shared_ptr[CChunkedArray]& data) - - c_bool Equals(const CColumn& other) - - CStatus Flatten(CMemoryPool* pool, vector[shared_ptr[CColumn]]* out) - - shared_ptr[CField] field() + CStatus Flatten(CMemoryPool* pool, + vector[shared_ptr[CChunkedArray]]* out) - int64_t length() - int64_t null_count() - const c_string& name() - shared_ptr[CDataType] type() - shared_ptr[CChunkedArray] data() + CStatus Validate() const cdef cppclass CRecordBatch" arrow::RecordBatch": @staticmethod @@ -539,12 +520,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CTable" arrow::Table": CTable(const shared_ptr[CSchema]& schema, - const vector[shared_ptr[CColumn]]& columns) + const vector[shared_ptr[CChunkedArray]]& columns) @staticmethod shared_ptr[CTable] Make( const shared_ptr[CSchema]& schema, - const vector[shared_ptr[CColumn]]& columns) + const vector[shared_ptr[CChunkedArray]]& columns) @staticmethod CStatus FromRecordBatches( @@ -558,12 +539,15 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: c_bool Equals(const CTable& other) shared_ptr[CSchema] schema() - shared_ptr[CColumn] column(int i) + shared_ptr[CChunkedArray] column(int i) + shared_ptr[CField] field(int i) - CStatus AddColumn(int i, const shared_ptr[CColumn]& column, + CStatus AddColumn(int i, shared_ptr[CField] field, + shared_ptr[CChunkedArray] column, shared_ptr[CTable]* out) CStatus RemoveColumn(int i, shared_ptr[CTable]* out) - CStatus SetColumn(int i, const shared_ptr[CColumn]& column, + CStatus SetColumn(int i, shared_ptr[CField] field, + shared_ptr[CChunkedArray] column, shared_ptr[CTable]* out) vector[c_string] ColumnNames() @@ -1055,7 +1039,7 @@ cdef extern from "arrow/ipc/api.h" 
namespace "arrow::ipc" nogil: shared_ptr[CSchema] schema() - CStatus GetColumn(int i, shared_ptr[CColumn]* out) + CStatus GetColumn(int i, shared_ptr[CChunkedArray]* out) c_string GetColumnName(int i) CStatus Read(shared_ptr[CTable]* out) @@ -1280,10 +1264,6 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: const shared_ptr[CChunkedArray]& arr, object py_ref, PyObject** out) - CStatus ConvertColumnToPandas(const PandasOptions& options, - const shared_ptr[CColumn]& arr, - object py_ref, PyObject** out) - CStatus ConvertTableToPandas( const PandasOptions& options, const unordered_set[c_string]& categorical_columns, diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index c3cce23cc21..09314630f2e 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -372,14 +372,6 @@ cdef class ChunkedArray(_PandasConvertible): cdef getitem(self, int64_t i) -cdef class Column(_PandasConvertible): - cdef: - shared_ptr[CColumn] sp_column - CColumn* column - - cdef void init(self, const shared_ptr[CColumn]& column) - - cdef class Table(_PandasConvertible): cdef: shared_ptr[CTable] sp_table @@ -469,7 +461,6 @@ cdef public object pyarrow_wrap_chunked_array( # XXX pyarrow.h calls it `wrap_record_batch` cdef public object pyarrow_wrap_batch(const shared_ptr[CRecordBatch]& cbatch) cdef public object pyarrow_wrap_buffer(const shared_ptr[CBuffer]& buf) -cdef public object pyarrow_wrap_column(const shared_ptr[CColumn]& ccolumn) cdef public object pyarrow_wrap_data_type(const shared_ptr[CDataType]& type) cdef public object pyarrow_wrap_field(const shared_ptr[CField]& field) cdef public object pyarrow_wrap_resizable_buffer( @@ -485,7 +476,6 @@ cdef public object pyarrow_wrap_sparse_tensor_csr( cdef public shared_ptr[CArray] pyarrow_unwrap_array(object array) cdef public shared_ptr[CRecordBatch] pyarrow_unwrap_batch(object batch) cdef public shared_ptr[CBuffer] pyarrow_unwrap_buffer(object buffer) -cdef public shared_ptr[CColumn] pyarrow_unwrap_column(object column) cdef public shared_ptr[CDataType] pyarrow_unwrap_data_type(object data_type) cdef public shared_ptr[CField] pyarrow_unwrap_field(object field) cdef public shared_ptr[CSchema] pyarrow_unwrap_schema(object schema) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 439b6fe16d9..40598b642dc 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -29,7 +29,8 @@ import pyarrow as pa from pyarrow.lib import _pandas_api -from pyarrow.compat import (builtin_pickle, PY2, zip_longest, Sequence) # noqa +from pyarrow.compat import (builtin_pickle, # noqa + PY2, zip_longest, Sequence, u_utf8) _logical_type_map = {} @@ -668,7 +669,7 @@ def _check_data_column_metadata_consistency(all_columns): def _deserialize_column_index(block_table, all_columns, column_indexes): - column_strings = [x.name for x in block_table.itercolumns()] + column_strings = [u_utf8(x) for x in block_table.column_names] if all_columns: columns_name_dict = { c.get('field_name', _column_name_to_strings(c['name'])): c['name'] @@ -770,21 +771,21 @@ def _extract_index_level(table, result_table, field_name, # The serialized index column was removed by the user return table, None, None + pd = _pandas_api.pd + col = table.column(i) - col_pandas = col.to_pandas() - values = col_pandas.values + values = col.to_pandas() + if hasattr(values, 'flags') and not values.flags.writeable: # ARROW-1054: in pandas 0.19.2, factorize will reject # non-writeable arrays when calling MultiIndex.from_arrays values = 
values.copy() - pd = _pandas_api.pd - - if _pandas_api.is_datetimetz(col_pandas.dtype): + if isinstance(col.type, pa.lib.TimestampType): index_level = (pd.Series(values).dt.tz_localize('utc') - .dt.tz_convert(col_pandas.dtype.tz)) + .dt.tz_convert(col.type.tz)) else: - index_level = pd.Series(values, dtype=col_pandas.dtype) + index_level = pd.Series(values, dtype=values.dtype) result_table = result_table.remove_column( result_table.schema.get_field_index(field_name) ) @@ -899,6 +900,7 @@ def _reconstruct_columns_from_metadata(columns, column_indexes): new_levels = [] encoder = operator.methodcaller('encode', 'UTF-8') + for level, pandas_dtype in levels_dtypes: dtype = _pandas_type_to_numpy_type(pandas_dtype) @@ -944,6 +946,7 @@ def _flatten_single_level_multiindex(index): def _add_any_metadata(table, pandas_metadata): modified_columns = {} + modified_fields = {} schema = table.schema @@ -971,20 +974,23 @@ def _add_any_metadata(table, pandas_metadata): converted = col.to_pandas() tz = col_meta['metadata']['timezone'] tz_aware_type = pa.timestamp('ns', tz=tz) - with_metadata = pa.Array.from_pandas(converted.values, + with_metadata = pa.Array.from_pandas(converted, type=tz_aware_type) - field = pa.field(schema[idx].name, tz_aware_type) - modified_columns[idx] = pa.Column.from_array(field, - with_metadata) + modified_fields[idx] = pa.field(schema[idx].name, + tz_aware_type) + modified_columns[idx] = with_metadata if len(modified_columns) > 0: columns = [] + fields = [] for i in range(len(table.schema)): if i in modified_columns: columns.append(modified_columns[i]) + fields.append(modified_fields[i]) else: columns.append(table[i]) - return pa.Table.from_arrays(columns) + fields.append(table.schema[i]) + return pa.Table.from_arrays(columns, schema=pa.schema(fields)) else: return table diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 2d780afe79b..c3199db95d3 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -605,9 +605,8 @@ def read(self, columns=None, use_threads=True, partitions=None, # manifest, so ['a', 'b', 'c'] as in our example above. dictionary = partitions.levels[i].dictionary - arr = lib.DictionaryArray.from_arrays(indices, dictionary) - col = lib.Column.from_array(name, arr) - table = table.append_column(col) + arr = pa.DictionaryArray.from_arrays(indices, dictionary) + table = table.append_column(name, arr) return table diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 05c07748f17..f6ef2c955cd 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -16,7 +16,7 @@ # under the License. 
from libcpp.memory cimport shared_ptr -from pyarrow.includes.libarrow cimport (CArray, CColumn, CDataType, CField, +from pyarrow.includes.libarrow cimport (CArray, CDataType, CField, CRecordBatch, CSchema, CTable, CTensor, CSparseTensorCSR, CSparseTensorCOO) @@ -296,25 +296,6 @@ cdef api object pyarrow_wrap_sparse_tensor_csr( return sparse_tensor -cdef api bint pyarrow_is_column(object column): - return isinstance(column, Column) - - -cdef api shared_ptr[CColumn] pyarrow_unwrap_column(object column): - cdef Column col - if pyarrow_is_column(column): - col = (column) - return col.sp_column - - return shared_ptr[CColumn]() - - -cdef api object pyarrow_wrap_column(const shared_ptr[CColumn]& ccolumn): - cdef Column column = Column.__new__(Column) - column.init(ccolumn) - return column - - cdef api bint pyarrow_is_table(object table): return isinstance(table, Table) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 0a76ddbc6e5..c0782fe26c2 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -38,6 +38,14 @@ cdef class ChunkedArray(_PandasConvertible): def __reduce__(self): return chunked_array, (self.chunks, self.type) + @property + def data(self): + import warnings + warnings.warn("Calling .data on ChunkedArray is provided for " + "compatibility after Column was removed, simply drop " + "this attribute", FutureWarning) + return self + @property def type(self): return pyarrow_wrap_data_type(self.sp_chunked_array.get().type()) @@ -153,6 +161,33 @@ cdef class ChunkedArray(_PandasConvertible): return self.to_pandas() return self.to_pandas().astype(dtype) + def cast(self, object target_type, bint safe=True): + """ + Cast values to another data type + + Parameters + ---------- + target_type : DataType + Type to cast to + safe : boolean, default True + Check for overflows or other unsafe conversions + + Returns + ------- + casted : ChunkedArray + """ + cdef: + CCastOptions options = CCastOptions(safe) + DataType type = ensure_type(target_type) + shared_ptr[CArray] result + CDatum out + + with nogil: + check_status(Cast(_context(), CDatum(self.sp_chunked_array), + type.sp_type, options, &out)) + + return pyarrow_wrap_chunked_array(out.chunked_array()) + def dictionary_encode(self): """ Compute dictionary-encoded representation of array @@ -171,6 +206,29 @@ cdef class ChunkedArray(_PandasConvertible): return wrap_datum(out) + def flatten(self, MemoryPool memory_pool=None): + """ + Flatten this ChunkedArray. If it has a struct type, the column is + flattened into one array per struct field. + + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Returns + ------- + result : List[ChunkedArray] + """ + cdef: + vector[shared_ptr[CChunkedArray]] flattened + CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) + + with nogil: + check_status(self.chunked_array.Flatten(pool, &flattened)) + + return [pyarrow_wrap_chunked_array(col) for col in flattened] + def unique(self): """ Compute distinct elements in array @@ -267,7 +325,7 @@ def chunked_array(arrays, type=None): Parameters ---------- - arrays : list of Array or values coercible to arrays + arrays : Array, list of Array, or values coercible to arrays Must all be the same data type. 
Can be empty only if type also passed type : DataType or string coercible to DataType @@ -282,6 +340,9 @@ def chunked_array(arrays, type=None): shared_ptr[CChunkedArray] sp_chunked_array shared_ptr[CDataType] sp_data_type + if isinstance(arrays, Array): + arrays = [arrays] + for x in arrays: if isinstance(x, Array): arr = x @@ -307,287 +368,10 @@ def chunked_array(arrays, type=None): return pyarrow_wrap_chunked_array(sp_chunked_array) -def column(object field_or_name, arr): - """ - Create Column object from field/string and array-like data - - Parameters - ---------- - field_or_name : string or Field - arr : Array, list of Arrays, or ChunkedArray - - Returns - ------- - column : Column - """ - cdef: - Field boxed_field - Array _arr - ChunkedArray _carr - shared_ptr[CColumn] sp_column - - if isinstance(arr, list): - arr = chunked_array(arr) - elif not isinstance(arr, (Array, ChunkedArray)): - arr = array(arr) - - if isinstance(field_or_name, Field): - boxed_field = field_or_name - if arr.type != boxed_field.type: - raise ValueError('Passed field type does not match array') - else: - boxed_field = field(field_or_name, arr.type) - - if isinstance(arr, Array): - _arr = arr - sp_column.reset(new CColumn(boxed_field.sp_field, _arr.sp_array)) - elif isinstance(arr, ChunkedArray): - _carr = arr - sp_column.reset(new CColumn(boxed_field.sp_field, - _carr.sp_chunked_array)) - else: - raise ValueError("Unsupported type for column(...): {}" - .format(type(arr))) - - return pyarrow_wrap_column(sp_column) - - -cdef class Column(_PandasConvertible): - """ - Named vector of elements of equal type. - - Warning - ------- - Do not call this class's constructor directly. - """ - - def __cinit__(self): - self.column = NULL - - def __init__(self): - raise TypeError("Do not call Column's constructor directly, use one " - "of the `Column.from_*` functions instead.") - - cdef void init(self, const shared_ptr[CColumn]& column): - self.sp_column = column - self.column = column.get() - - def __reduce__(self): - return column, (self.field, self.data) - - def __repr__(self): - from pyarrow.compat import StringIO - result = StringIO() - result.write('' - .format(self.name, self.type)) - result.write('\n{}'.format(str(self.data))) - - return result.getvalue() - - def __getitem__(self, key): - return self.data[key] - - @staticmethod - def from_array(*args): - return column(*args) - - def cast(self, object target_type, bint safe=True): - """ - Cast column values to another data type - - Parameters - ---------- - target_type : DataType - Type to cast to - safe : boolean, default True - Check for overflows or other unsafe conversions - - Returns - ------- - casted : Column - """ - cdef: - CCastOptions options = CCastOptions(safe) - DataType type = ensure_type(target_type) - shared_ptr[CArray] result - CDatum out - - with nogil: - check_status(Cast(_context(), CDatum(self.column.data()), - type.sp_type, options, &out)) - - casted_data = pyarrow_wrap_chunked_array(out.chunked_array()) - return column(self.name, casted_data) - - def dictionary_encode(self): - """ - Compute dictionary-encoded representation of array - - Returns - ------- - pyarrow.Column - Same chunking as the input, all chunks share a common dictionary. - """ - ca = self.data.dictionary_encode() - return column(self.name, ca) - - def unique(self): - """ - Compute distinct elements in array - - Returns - ------- - pyarrow.Array - """ - return self.data.unique() - - def flatten(self, MemoryPool memory_pool=None): - """ - Flatten this Column. 
If it has a struct type, the column is - flattened into one column per struct field. - - Parameters - ---------- - memory_pool : MemoryPool, default None - For memory allocations, if required, otherwise use default pool - - Returns - ------- - result : List[Column] - """ - cdef: - vector[shared_ptr[CColumn]] flattened - CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) - - with nogil: - check_status(self.column.Flatten(pool, &flattened)) - - return [pyarrow_wrap_column(col) for col in flattened] - - def _to_pandas(self, options, **kwargs): - values = self.data._to_pandas(options) - result = pandas_api.make_series(values, name=self.name) - - if isinstance(self.type, TimestampType): - tz = self.type.tz - if tz is not None: - tz = string_to_tzinfo(tz) - result = (result.dt.tz_localize('utc') - .dt.tz_convert(tz)) - - return result - - def __array__(self, dtype=None): - return self.data.__array__(dtype=dtype) - - def __eq__(self, other): - try: - return self.equals(other) - except TypeError: - return NotImplemented - - def equals(self, Column other): - """ - Check if contents of two columns are equal - - Parameters - ---------- - other : pyarrow.Column - - Returns - ------- - are_equal : boolean - """ - cdef: - CColumn* this_col = self.column - CColumn* other_col = other.column - c_bool result - - if other is None: - return False - - with nogil: - result = this_col.Equals(deref(other_col)) - - return result - - def to_pylist(self): - """ - Convert to a list of native Python objects. - """ - return self.data.to_pylist() - - def __len__(self): - return self.length() - - def length(self): - return self.column.length() - - @property - def field(self): - return pyarrow_wrap_field(self.column.field()) - - @property - def shape(self): - """ - Dimensions of this columns - - Returns - ------- - (int,) - """ - return (self.length(),) - - @property - def null_count(self): - """ - Number of null entires - - Returns - ------- - int - """ - return self.column.null_count() - - @property - def name(self): - """ - Label of the column - - Returns - ------- - str - """ - return bytes(self.column.name()).decode('utf8') - - @property - def type(self): - """ - Type information for this column - - Returns - ------- - pyarrow.DataType - """ - return pyarrow_wrap_data_type(self.column.type()) - - @property - def data(self): - """ - The underlying data - - Returns - ------- - pyarrow.ChunkedArray - """ - return pyarrow_wrap_chunked_array(self.column.data()) - - cdef _schema_from_arrays(arrays, names, metadata, shared_ptr[CSchema]* schema): cdef: Py_ssize_t K = len(arrays) c_string c_name - CColumn* c_column shared_ptr[CDataType] c_type shared_ptr[CKeyValueMetadata] c_meta vector[shared_ptr[CField]] c_fields @@ -603,29 +387,24 @@ cdef _schema_from_arrays(arrays, names, metadata, shared_ptr[CSchema]* schema): c_fields.resize(K) - if isinstance(arrays[0], Column): - for i in range(K): - c_column = (arrays[i]).column - c_fields[i] = c_column.field() - else: - if names is None: - raise ValueError('Must pass names when constructing ' - 'from Array objects') - if len(names) != K: - raise ValueError('Length of names ({}) does not match ' - 'length of arrays ({})'.format(len(names), K)) - for i in range(K): - val = arrays[i] - if isinstance(val, (Array, ChunkedArray)): - c_type = ( val.type).sp_type - else: - raise TypeError(type(val)) + if names is None: + raise ValueError('Must pass names or schema to Table.from_arrays') - if names[i] is None: - c_name = tobytes(u'None') - else: - c_name = tobytes(names[i]) - 
c_fields[i].reset(new CField(c_name, c_type, True)) + if len(names) != K: + raise ValueError('Length of names ({}) does not match ' + 'length of arrays ({})'.format(len(names), K)) + for i in range(K): + val = arrays[i] + if isinstance(val, (Array, ChunkedArray)): + c_type = ( val.type).sp_type + else: + raise TypeError(type(val)) + + if names[i] is None: + c_name = tobytes(u'None') + else: + c_name = tobytes(names[i]) + c_fields[i].reset(new CField(c_name, c_type, True)) schema.reset(new CSchema(c_fields, c_meta)) @@ -732,7 +511,7 @@ cdef class RecordBatch(_PandasConvertible): Returns ------- - list of pa.Column + list of pa.ChunkedArray """ return [self.column(i) for i in range(self.num_columns)] @@ -975,7 +754,7 @@ cdef class Table(_PandasConvertible): def __reduce__(self): # Reduce the columns as ChunkedArrays to avoid serializing schema # data twice - columns = [col.data for col in self.columns] + columns = [col for col in self.columns] return _reconstruct_table, (columns, self.schema) def replace_schema_metadata(self, metadata=None): @@ -1101,7 +880,7 @@ cdef class Table(_PandasConvertible): casted : Table """ cdef: - Column column, casted + ChunkedArray column, casted Field field list newcols = [] @@ -1184,17 +963,16 @@ cdef class Table(_PandasConvertible): @staticmethod def from_arrays(arrays, names=None, schema=None, metadata=None): """ - Construct a Table from Arrow arrays or columns + Construct a Table from Arrow arrays Parameters ---------- - arrays : list of pyarrow.Array or pyarrow.Column + arrays : list of pyarrow.Array or pyarrow.ChunkedArray Equal-length arrays that should form the table. names : list of str, optional - Names for the table columns. If Columns passed, will be - inferred. If Arrays passed, this argument is required + Names for the table columns. If not passed, schema must be passed schema : Schema, default None - If not passed, will be inferred from the arrays + Schema for the created table. If not passed, names must be passed metadata : dict or Mapping, default None Optional metadata for the schema (if inferred). 
@@ -1204,7 +982,7 @@ cdef class Table(_PandasConvertible): """ cdef: - vector[shared_ptr[CColumn]] columns + vector[shared_ptr[CChunkedArray]] columns Schema cy_schema shared_ptr[CSchema] c_schema int i, K = len(arrays) @@ -1228,26 +1006,12 @@ cdef class Table(_PandasConvertible): for i in range(K): if isinstance(arrays[i], Array): columns.push_back( - make_shared[CColumn]( - c_schema.get().field(i), + make_shared[CChunkedArray]( ( arrays[i]).sp_array ) ) elif isinstance(arrays[i], ChunkedArray): - columns.push_back( - make_shared[CColumn]( - c_schema.get().field(i), - ( arrays[i]).sp_chunked_array - ) - ) - elif isinstance(arrays[i], Column): - # Make sure schema field and column are consistent - columns.push_back( - make_shared[CColumn]( - c_schema.get().field(i), - ( arrays[i]).sp_column.get().data() - ) - ) + columns.push_back(( arrays[i]).sp_chunked_array) else: raise TypeError(type(arrays[i])) @@ -1272,18 +1036,27 @@ cdef class Table(_PandasConvertible): pyarrow.Table """ - names = [] arrays = [] - for k, v in mapping.items(): - names.append(k) - if not isinstance(v, (Array, ChunkedArray)): - v = array(v) - arrays.append(v) - if schema is None: - return Table.from_arrays(arrays, names, metadata=metadata) - else: + if schema is not None: + for field in schema: + try: + v = mapping[field.name] + except KeyError as e: + try: + v = mapping[tobytes(field.name)] + except KeyError as e2: + raise e + arrays.append(array(v, type=field.type)) # Will raise if metadata is not None return Table.from_arrays(arrays, schema=schema, metadata=metadata) + else: + names = [] + for k, v in mapping.items(): + names.append(k) + if not isinstance(v, (Array, ChunkedArray)): + v = array(v) + arrays.append(v) + return Table.from_arrays(arrays, names, metadata=metadata) @staticmethod def from_batches(batches, Schema schema=None): @@ -1381,11 +1154,11 @@ cdef class Table(_PandasConvertible): size_t i size_t num_columns = self.table.num_columns() list entries = [] - Column column + ChunkedArray column for i in range(num_columns): column = self.column(i) - entries.append((column.name, column.to_pylist())) + entries.append((self.field(i).name, column.to_pylist())) return ordered_dict(entries) @@ -1400,6 +1173,32 @@ cdef class Table(_PandasConvertible): """ return pyarrow_wrap_schema(self.table.schema()) + def field(self, i): + """ + Select a schema field by its numeric index. + + Parameters + ---------- + i : int or string + + Returns + ------- + pyarrow.Field + """ + cdef: + int num_columns = self.num_columns + int index + + if not -num_columns <= i < num_columns: + raise IndexError( + 'Table column index {:d} is out of range'.format(i) + ) + + index = i if i >= 0 else num_columns + i + assert index >= 0 + + return pyarrow_wrap_field(self.table.field(index)) + def column(self, i): """ Select a column by its column name, or numeric index. 
@@ -1410,7 +1209,7 @@ cdef class Table(_PandasConvertible): Returns ------- - pyarrow.Column + pyarrow.ChunkedArray """ if isinstance(i, six.string_types): field_index = self.schema.get_field_index(i) @@ -1433,7 +1232,7 @@ cdef class Table(_PandasConvertible): Returns ------- - pyarrow.Column + pyarrow.ChunkedArray """ cdef: int num_columns = self.num_columns @@ -1447,7 +1246,7 @@ cdef class Table(_PandasConvertible): index = i if i >= 0 else num_columns + i assert index >= 0 - return pyarrow_wrap_column(self.table.column(index)) + return pyarrow_wrap_chunked_array(self.table.column(index)) def __getitem__(self, key): cdef int index = _normalize_index(key, self.num_columns) @@ -1467,7 +1266,7 @@ cdef class Table(_PandasConvertible): Returns ------- - list of pa.Column + list of pa.ChunkedArray """ return [self._column(i) for i in range(self.num_columns)] @@ -1510,22 +1309,37 @@ cdef class Table(_PandasConvertible): """ return (self.num_rows, self.num_columns) - def add_column(self, int i, Column column): + def add_column(self, int i, field_, column): """ Add column to Table at position. Returns new table """ - cdef shared_ptr[CTable] c_table + cdef: + shared_ptr[CTable] c_table + Field c_field + ChunkedArray c_arr + + if isinstance(column, ChunkedArray): + c_arr = column + else: + c_arr = chunked_array(column) + + if isinstance(field_, Field): + c_field = field_ + else: + c_field = field(field_, c_arr.type) with nogil: - check_status(self.table.AddColumn(i, column.sp_column, &c_table)) + check_status(self.table.AddColumn(i, c_field.sp_field, + c_arr.sp_chunked_array, + &c_table)) return pyarrow_wrap_table(c_table) - def append_column(self, Column column): + def append_column(self, field_, column): """ Append column at end of columns. Returns new table """ - return self.add_column(self.num_columns, column) + return self.add_column(self.num_columns, field_, column) def remove_column(self, int i): """ @@ -1538,14 +1352,29 @@ cdef class Table(_PandasConvertible): return pyarrow_wrap_table(c_table) - def set_column(self, int i, Column column): + def set_column(self, int i, field_, column): """ Replace column in Table at position. 
Returns new table """ - cdef shared_ptr[CTable] c_table + cdef: + shared_ptr[CTable] c_table + Field c_field + ChunkedArray c_arr + + if isinstance(column, ChunkedArray): + c_arr = column + else: + c_arr = chunked_array(column) + + if isinstance(field_, Field): + c_field = field_ + else: + c_field = field(field_, c_arr.type) with nogil: - check_status(self.table.SetColumn(i, column.sp_column, &c_table)) + check_status(self.table.SetColumn(i, c_field.sp_field, + c_arr.sp_chunked_array, + &c_table)) return pyarrow_wrap_table(c_table) diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index 585f1d21179..ce797489498 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -40,6 +40,7 @@ groups = [ 'cython', 'hypothesis', + 'fastparquet', 'gandiva', 'hdfs', 'large_memory', @@ -55,6 +56,7 @@ defaults = { 'cython': False, + 'fastparquet': False, 'hypothesis': False, 'gandiva': False, 'hdfs': False, @@ -74,6 +76,12 @@ except ImportError: pass +try: + import fastparquet # noqa + defaults['fastparquet'] = True +except ImportError: + pass + try: import pyarrow.gandiva # noqa defaults['gandiva'] = True diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py index 514c5ad2b62..a2828643cc3 100644 --- a/python/pyarrow/tests/strategies.py +++ b/python/pyarrow/tests/strategies.py @@ -219,13 +219,6 @@ def chunked_arrays(draw, type, min_chunks=0, max_chunks=None, chunk_size=None): return pa.chunked_array(draw(chunks), type=type) -def columns(type, min_chunks=0, max_chunks=None, chunk_size=None): - chunked_array = chunked_arrays(type, chunk_size=chunk_size, - min_chunks=min_chunks, - max_chunks=max_chunks) - return st.builds(pa.column, st.text(), chunked_array) - - @st.composite def record_batches(draw, type, rows=None, max_fields=None): if isinstance(rows, st.SearchStrategy): @@ -258,6 +251,5 @@ def tables(draw, type, rows=None, max_fields=None): all_arrays = arrays(all_types) all_chunked_arrays = chunked_arrays(all_types) -all_columns = columns(all_types) all_record_batches = record_batches(all_types) all_tables = tables(all_types) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 387b6502eb9..c14d291374b 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -647,27 +647,26 @@ def test_cast_integers_safe(): def test_cast_none(): # ARROW-3735: Ensure that calling cast(None) doesn't segfault. 
arr = pa.array([1, 2, 3]) - col = pa.column('foo', [arr]) with pytest.raises(TypeError): arr.cast(None) - with pytest.raises(TypeError): - col.cast(None) - -def test_cast_column(): +def test_cast_chunked_array(): arrays = [pa.array([1, 2, 3]), pa.array([4, 5, 6])] - - col = pa.column('foo', arrays) + carr = pa.chunked_array(arrays) target = pa.float64() - casted = col.cast(target) - - expected = pa.column('foo', [x.cast(target) for x in arrays]) + casted = carr.cast(target) + expected = pa.chunked_array([x.cast(target) for x in arrays]) assert casted.equals(expected) +def test_chunked_array_data_warns(): + with pytest.warns(FutureWarning): + pa.chunked_array([[]]).data + + def test_cast_integers_unsafe(): # We let NumPy do the unsafe casting unsafe_cases = [ @@ -781,8 +780,6 @@ def test_unique_simple(): for arr, expected in cases: result = arr.unique() assert result.equals(expected) - result = pa.column("column", arr).unique() - assert result.equals(expected) result = pa.chunked_array([arr]).unique() assert result.equals(expected) @@ -801,8 +798,6 @@ def test_dictionary_encode_simple(): for arr, expected in cases: result = arr.dictionary_encode() assert result.equals(expected) - result = pa.column("column", arr).dictionary_encode() - assert result.data.chunk(0).equals(expected) result = pa.chunked_array([arr]).dictionary_encode() assert result.chunk(0).equals(expected) diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index 57a023237a4..9f0c08bd490 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -188,7 +188,7 @@ def read_bytes(self, b, **kwargs): def check_names(self, table, names): assert table.num_columns == len(names) - assert [c.name for c in table.columns] == names + assert table.column_names == names def test_file_object(self): data = b"a,b\n1,2\n" diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index f26f7ca95b6..137dfeaeaa9 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -69,7 +69,8 @@ def _get_null_counts(self, path, columns=None): counts = [] for i in range(reader.num_columns): col = reader.get_column(i) - if columns is None or col.name in columns: + name = reader.get_column_name(i) + if columns is None or name in columns: counts.append(col.null_count) return counts diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py index f7c316a8baf..8871d69cea1 100644 --- a/python/pyarrow/tests/test_misc.py +++ b/python/pyarrow/tests/test_misc.py @@ -45,7 +45,6 @@ def test_cpu_count(): @pytest.mark.parametrize('klass', [ pa.Field, pa.Schema, - pa.Column, pa.ChunkedArray, pa.RecordBatch, pa.Table, diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 1854898b372..931c2b10caa 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -147,7 +147,7 @@ class TestConvertMetadata(object): def test_non_string_columns(self): df = pd.DataFrame({0: [1, 2, 3]}) table = pa.Table.from_pandas(df) - assert table.column(0).name == '0' + assert table.field(0).name == '0' def test_from_pandas_with_columns(self): df = pd.DataFrame({0: [1, 2, 3], 1: [1, 3, 3], 2: [2, 4, 5]}, @@ -1034,7 +1034,7 @@ class MyDate(date): def test_datetime64_to_date32(self): # ARROW-1718 arr = pa.array([date(2017, 10, 23), None]) - c = pa.Column.from_array("d", arr) + c = pa.chunked_array([arr]) s = c.to_pandas() arr2 = pa.Array.from_pandas(s, type=pa.date32()) @@ -1090,8 
+1090,7 @@ def test_array_types_date_as_object(self): objects = [ # The second value is the expected value for date_as_object=False (pa.array(data), expected), - (pa.chunked_array([data]), expected), - (pa.column('date', [data]), expected.astype('M8[ns]'))] + (pa.chunked_array([data]), expected)] assert objects[0][0].equals(pa.array(expected)) @@ -2491,8 +2490,7 @@ def test_to_pandas_deduplicate_strings_array_types(): for arr in [pa.array(values, type=pa.binary()), pa.array(values, type=pa.utf8()), - pa.chunked_array([values, values]), - pa.column('foo', [values, values])]: + pa.chunked_array([values, values])]: _assert_nunique(arr.to_pandas(), nunique) _assert_nunique(arr.to_pandas(deduplicate_objects=False), len(arr)) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 436d5bfc515..3f7f4fd50ab 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -118,12 +118,13 @@ def test_single_pylist_column_roundtrip(tempdir, dtype): table = pa.Table.from_arrays(data, names=['a']) _write_table(table, filename) table_read = _read_table(filename) - for col_written, col_read in zip(table.itercolumns(), - table_read.itercolumns()): - assert col_written.name == col_read.name - assert col_read.data.num_chunks == 1 - data_written = col_written.data.chunk(0) - data_read = col_read.data.chunk(0) + for i in range(table.num_columns): + col_written = table[i] + col_read = table_read[i] + assert table.field(i).name == table_read.field(i).name + assert col_read.num_chunks == 1 + data_written = col_written.chunk(0) + data_read = col_read.chunk(0) assert data_written.equals(data_read) @@ -1953,12 +1954,14 @@ def read_multiple_files(paths, columns=None, use_threads=True, **kwargs): assert result3.equals(expected) # Read column subset - to_read = [result[0], result[2], result[6], result[result.num_columns - 1]] + to_read = [0, 2, 6, result.num_columns - 1] - result = pa.localfs.read_parquet( - dirpath, columns=[c.name for c in to_read]) - expected = pa.Table.from_arrays(to_read, metadata=result.schema.metadata) - assert result.equals(expected) + col_names = [result.field(i).name for i in to_read] + out = pa.localfs.read_parquet(dirpath, columns=col_names) + expected = pa.Table.from_arrays([result.column(i) for i in to_read], + names=col_names, + metadata=result.schema.metadata) + assert out.equals(expected) # Read with multiple threads pa.localfs.read_parquet(dirpath, use_threads=True) @@ -2965,4 +2968,4 @@ def test_filter_before_validate_schema(tempdir): # read single file using filter table = pq.read_table(tempdir, filters=[[('A', '==', 0)]]) - assert table.column('B').equals(pa.column('B', pa.array([1, 2, 3]))) + assert table.column('B').equals(pa.chunked_array([[1, 2, 3]])) diff --git a/python/pyarrow/tests/test_strategies.py b/python/pyarrow/tests/test_strategies.py index f1227d26e36..f4249df2462 100644 --- a/python/pyarrow/tests/test_strategies.py +++ b/python/pyarrow/tests/test_strategies.py @@ -46,11 +46,6 @@ def test_chunked_arrays(chunked_array): assert isinstance(chunked_array, pa.lib.ChunkedArray) -@h.given(past.all_columns) -def test_columns(column): - assert isinstance(column, pa.lib.Column) - - @h.given(past.all_record_batches) def test_record_batches(record_bath): assert isinstance(record_bath, pa.lib.RecordBatch) diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index c7216ea738f..0645fcbe180 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ 
-1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -186,9 +187,9 @@ def test_chunked_array_to_pandas(): pa.array([-10, -5, 0, 5, 10]) ] table = pa.Table.from_arrays(data, names=['a']) - chunked_arr = table.column(0).data - assert isinstance(chunked_arr, pa.ChunkedArray) - array = chunked_arr.to_pandas() + col = table.column(0) + assert isinstance(col, pa.ChunkedArray) + array = col.to_pandas() assert array.shape == (5,) assert array[0] == -10 @@ -223,124 +224,22 @@ def test_chunked_array_asarray(): assert np_arr.dtype == np.dtype('float64') -def test_column_basics(): - data = [ - pa.array([-10, -5, 0, 5, 10]) - ] - table = pa.Table.from_arrays(data, names=['a']) - column = table.column(0) - assert column.name == 'a' - assert column.length() == 5 - assert len(column) == 5 - assert column.shape == (5,) - assert column.to_pylist() == [-10, -5, 0, 5, 10] - assert column == pa.Column.from_array("a", column.data) - assert column != pa.Column.from_array("b", column.data) - assert column != column.data - assert not column.equals(None) - - -def test_column_factory_function(): - # ARROW-1575 - arr = pa.array([0, 1, 2, 3, 4]) - arr2 = pa.array([5, 6, 7, 8]) - - col1 = pa.Column.from_array('foo', arr) - col2 = pa.Column.from_array(pa.field('foo', arr.type), arr) - - assert col1.equals(col2) - - col3 = pa.column('foo', [arr, arr2]) - chunked_arr = pa.chunked_array([arr, arr2]) - col4 = pa.column('foo', chunked_arr) - assert col3.equals(col4) - - col5 = pa.column('foo', arr.to_pandas()) - assert col5.equals(pa.column('foo', arr)) - - # Type mismatch - with pytest.raises(ValueError): - pa.Column.from_array(pa.field('foo', pa.string()), arr) - - -def test_column_pickle(): - arr = pa.chunked_array([[1, 2], [5, 6, 7]], type=pa.int16()) - field = pa.field("ints", pa.int16()).add_metadata({b"foo": b"bar"}) - col = pa.column(field, arr) - - result = pickle.loads(pickle.dumps(col)) - assert result.equals(col) - assert result.data.num_chunks == 2 - assert result.field == field - - -@pytest.mark.pandas -def test_column_to_pandas(): - data = [ - pa.array([-10, -5, 0, 5, 10]) - ] - table = pa.Table.from_arrays(data, names=['a']) - column = table.column(0) - series = column.to_pandas() - assert series.name == 'a' - assert series.shape == (5,) - assert series.iloc[0] == -10 - - -def test_column_asarray(): - data = [ - pa.array([-10, -5, 0, 5, 10]) - ] - table = pa.Table.from_arrays(data, names=['a']) - column = table.column(0) - - np_arr = np.asarray(column) - assert np_arr.tolist() == [-10, -5, 0, 5, 10] - assert np_arr.dtype == np.dtype('int64') - - # An optional type can be specified when calling np.asarray - np_arr = np.asarray(column, dtype='str') - assert np_arr.tolist() == ['-10', '-5', '0', '5', '10'] - - -def test_column_flatten(): +def test_chunked_array_flatten(): ty = pa.struct([pa.field('x', pa.int16()), pa.field('y', pa.float32())]) a = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty) - col = pa.Column.from_array('foo', a) - x, y = col.flatten() - assert x == pa.column('foo.x', pa.array([1, 3, 5], type=pa.int16())) - assert y == pa.column('foo.y', pa.array([2.5, 4.5, 6.5], - type=pa.float32())) + carr = pa.chunked_array(a) + x, y = carr.flatten() + assert x.equals(pa.chunked_array(pa.array([1, 3, 5], type=pa.int16()))) + assert y.equals(pa.chunked_array(pa.array([2.5, 4.5, 6.5], + type=pa.float32()))) + # Empty column a = pa.array([], 
type=ty) - col = pa.Column.from_array('foo', a) - x, y = col.flatten() - assert x == pa.column('foo.x', pa.array([], type=pa.int16())) - assert y == pa.column('foo.y', pa.array([], type=pa.float32())) - - -def test_column_getitem(): - arr = pa.array([1, 2, 3, 4, 5, 6]) - col = pa.column('ints', arr) - - assert col[1].as_py() == 2 - assert col[-1].as_py() == 6 - assert col[-6].as_py() == 1 - with pytest.raises(IndexError): - col[6] - with pytest.raises(IndexError): - col[-7] - - data_slice = col[2:4] - assert data_slice.to_pylist() == [3, 4] - - data_slice = col[4:-1] - assert data_slice.to_pylist() == [5] - - data_slice = col[99:99] - assert data_slice.type == col.type - assert data_slice.to_pylist() == [] + carr = pa.chunked_array(a) + x, y = carr.flatten() + assert x.equals(pa.chunked_array(pa.array([], type=pa.int16()))) + assert y.equals(pa.chunked_array(pa.array([], type=pa.float32()))) def test_recordbatch_basics(): @@ -481,7 +380,7 @@ def test_recordbatchlist_schema_equals(): def test_table_equals(): - table = pa.Table.from_arrays([]) + table = pa.Table.from_arrays([], names=[]) assert table.equals(table) # ARROW-4822 @@ -497,8 +396,8 @@ def test_table_from_batches_and_schema(): names=['a', 'b']) table = pa.Table.from_batches([batch], schema) assert table.schema.equals(schema) - assert table.column(0) == pa.column('a', pa.array([1])) - assert table.column(1) == pa.column('b', pa.array([3.14])) + assert table.column(0) == pa.chunked_array([[1]]) + assert table.column(1) == pa.chunked_array([[3.14]]) incompatible_schema = pa.schema([pa.field('a', pa.int64())]) with pytest.raises(pa.ArrowInvalid): @@ -565,18 +464,19 @@ def test_table_basics(): columns = [] for col in table.itercolumns(): columns.append(col) - for chunk in col.data.iterchunks(): + for chunk in col.iterchunks(): assert chunk is not None with pytest.raises(IndexError): - col.data.chunk(-1) + col.chunk(-1) with pytest.raises(IndexError): - col.data.chunk(col.data.num_chunks) + col.chunk(col.num_chunks) assert table.columns == columns - assert table == pa.Table.from_arrays(columns) - assert table != pa.Table.from_arrays(columns[1:]) + assert table == pa.Table.from_arrays(columns, names=table.column_names) + assert table != pa.Table.from_arrays(columns[1:], + names=table.column_names[1:]) assert table != columns @@ -586,13 +486,10 @@ def test_table_from_arrays_preserves_column_metadata(): arr1 = pa.array([3, 4]) field0 = pa.field('field1', pa.int64(), metadata=dict(a="A", b="B")) field1 = pa.field('field2', pa.int64(), nullable=False) - columns = [ - pa.column(field0, arr0), - pa.column(field1, arr1) - ] - table = pa.Table.from_arrays(columns) - assert b"a" in table.column(0).field.metadata - assert table.column(1).field.nullable is False + table = pa.Table.from_arrays([arr0, arr1], + schema=pa.schema([field0, field1])) + assert b"a" in table.field(0).metadata + assert table.field(1).nullable is False def test_table_from_arrays_invalid_names(): @@ -664,16 +561,16 @@ def test_table_add_column(): ] table = pa.Table.from_arrays(data, names=('a', 'b', 'c')) - col = pa.Column.from_array('d', data[1]) - t2 = table.add_column(3, col) - t3 = table.append_column(col) + new_field = pa.field('d', data[1].type) + t2 = table.add_column(3, new_field, data[1]) + t3 = table.append_column(new_field, data[1]) expected = pa.Table.from_arrays(data + [data[1]], names=('a', 'b', 'c', 'd')) assert t2.equals(expected) assert t3.equals(expected) - t4 = table.add_column(0, col) + t4 = table.add_column(0, new_field, data[1]) expected = 
pa.Table.from_arrays([data[1]] + data, names=('d', 'a', 'b', 'c')) assert t4.equals(expected) @@ -687,8 +584,8 @@ def test_table_set_column(): ] table = pa.Table.from_arrays(data, names=('a', 'b', 'c')) - col = pa.Column.from_array('d', data[1]) - t2 = table.set_column(0, col) + new_field = pa.field('d', data[1].type) + t2 = table.set_column(0, new_field, data[1]) expected_data = list(data) expected_data[0] = data[1] @@ -739,7 +636,7 @@ def test_table_remove_column_empty(): t2._validate() assert len(t2) == len(table) - t3 = t2.add_column(0, table[0]) + t3 = t2.add_column(0, table.field(0), table[0]) t3._validate() assert t3.equals(table) @@ -791,7 +688,7 @@ def test_table_combine_chunks(): combined._validate() assert combined.equals(table) for c in combined.columns: - assert c.data.num_chunks == 1 + assert c.num_chunks == 1 def test_concat_tables(): @@ -1011,7 +908,12 @@ def test_table_from_pydict(): def test_table_factory_function(): import pandas as pd - d = {'a': [1, 2, 3], 'b': ['a', 'b', 'c']} + # Put in wrong order to make sure that lines up with schema + d = OrderedDict([('b', ['a', 'b', 'c']), ('a', [1, 2, 3])]) + + d_explicit = {'b': pa.array(['a', 'b', 'c'], type='string'), + 'a': pa.array([1, 2, 3], type='int32')} + schema = pa.schema([('a', pa.int32()), ('b', pa.string())]) df = pd.DataFrame(d) @@ -1022,9 +924,25 @@ def test_table_factory_function(): table2 = pa.Table.from_pandas(df, schema=schema) assert table1.equals(table2) - table1 = pa.table(d) - table2 = pa.Table.from_pydict(d) + table1 = pa.table(d_explicit) + table2 = pa.Table.from_pydict(d_explicit) assert table1.equals(table2) + + # schema coerces type table1 = pa.table(d, schema=schema) table2 = pa.Table.from_pydict(d, schema=schema) assert table1.equals(table2) + + +def test_table_function_unicode_schema(): + col_a = "äääh" + col_b = "öööf" + + # Put in wrong order to make sure that lines up with schema + d = OrderedDict([(col_b, ['a', 'b', 'c']), (col_a, [1, 2, 3])]) + + schema = pa.schema([(col_a, pa.int32()), (col_b, pa.string())]) + + result = pa.table(d, schema=schema) + assert result[0].chunk(0).equals(pa.array([1, 2, 3], type='int32')) + assert result[1].chunk(0).equals(pa.array(['a', 'b', 'c'], type='string')) diff --git a/r/R/Table.R b/r/R/Table.R index 51320fde98d..15ea48fe7c1 100644 --- a/r/R/Table.R +++ b/r/R/Table.R @@ -31,7 +31,8 @@ #' @name arrow__Table `arrow::Table` <- R6Class("arrow::Table", inherit = `arrow::Object`, public = list( - column = function(i) shared_ptr(`arrow::Column`, Table__column(self, i)), + column = function(i) shared_ptr(`arrow::ChunkedArray`, Table__column(self, i)), + field = function(i) shared_ptr(`arrow::Field`, Table__field(self, i)), serialize = function(output_stream, ...) 
write_table(self, output_stream, ...), diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 951b83b9080..2031feaaaa5 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -204,22 +204,6 @@ ChunkArray__Slice2 <- function(chunked_array, offset, length){ .Call(`_arrow_ChunkArray__Slice2` , chunked_array, offset, length) } -Column__length <- function(column){ - .Call(`_arrow_Column__length` , column) } - -Column__null_count <- function(column){ - .Call(`_arrow_Column__null_count` , column) -} - -Column__type <- function(column){ - .Call(`_arrow_Column__type` , column) -} - -Column__data <- function(column){ - .Call(`_arrow_Column__data` , column) -} - util___Codec__Create <- function(codec){ .Call(`_arrow_util___Codec__Create` , codec) } @@ -948,6 +932,10 @@ Table__column <- function(table, i){ .Call(`_arrow_Table__column` , table, i) } +Table__field <- function(table, i){ + .Call(`_arrow_Table__field` , table, i) } + Table__columns <- function(table){ .Call(`_arrow_Table__columns` , table) } diff --git a/r/R/feather.R b/r/R/feather.R index 75ab6104237..57c1dffae42 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -34,7 +34,7 @@ num_rows = function() ipc___feather___TableReader__num_rows(self), num_columns = function() ipc___feather___TableReader__num_columns(self), GetColumnName = function(i) ipc___feather___TableReader__GetColumnName(self, i), - GetColumn = function(i) shared_ptr(`arrow::Column`, ipc___feather___TableReader__GetColumn(self, i)), + GetColumn = function(i) shared_ptr(`arrow::ChunkedArray`, ipc___feather___TableReader__GetColumn(self, i)), Read = function(columns) { shared_ptr(`arrow::Table`, ipc___feather___TableReader__Read(self, columns)) } diff --git a/r/README.Rmd b/r/README.Rmd index 6b8381769dd..9a6c0b9728a 100644 --- a/r/README.Rmd +++ b/r/README.Rmd @@ -84,20 +84,19 @@ You can specify a particular commit, branch, or [release](https://github.com/apa ## Developing -If you need to alter both the Arrow C++ library and the R package code, or if you can't get a binary version of the latest C++ library elsewhere, you'll need to build it from source too. +If you need to alter both the Arrow C++ library and the R package code, or if +you can't get a binary version of the latest C++ library elsewhere, you'll need +to build it from source too. -First, clone the repository and install a release build of the C++ library. +First, install the C++ library. See the [C++ developer +guide](https://arrow.apache.org/docs/developers/cpp.html) for details. -```shell -git clone https://github.com/apache/arrow.git -mkdir arrow/cpp/build && cd arrow/cpp/build -cmake .. -DARROW_PARQUET=ON -DARROW_BOOST_USE_SHARED:BOOL=Off -DARROW_INSTALL_NAME_RPATH=OFF -make install -``` +Note that after any change to the C++ library, you must reinstall it and run +`make clean` or `git clean -fdx .` to remove any cached object code in the `r/` +directory. -This likely will require additional system libraries to be installed, the specifics of which are platform dependent. See the [C++ developer guide](https://arrow.apache.org/docs/developers/cpp.html) for details.
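In the R binding, `Table$column()` now hands back the chunked data directly, with the field metadata exposed separately through the new `Table$field()`, mirroring the Python change. For comparison, a sketch of the pyarrow side of the same split (assuming this revision):

```python
import pyarrow as pa

table = pa.table({'x': [1, 2, 3]})

col = table.column(0)              # a ChunkedArray now, not a Column
assert isinstance(col, pa.ChunkedArray)
assert table.field(0).name == 'x'  # the name lives on the schema's Field
```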
- -Once you've built the C++ library, you can install the R package and its dependencies, along with additional dev dependencies, from the git checkout: +Once you've built the C++ library, you can install the R package and its +dependencies, along with additional dev dependencies, from the git checkout: ```shell cd ../../r @@ -114,15 +113,27 @@ unable to load shared object '/Users/you/R/00LOCK-r/00new/arrow/libs/arrow.so': dlopen(/Users/you/R/00LOCK-r/00new/arrow/libs/arrow.so, 6): Library not loaded: @rpath/libarrow.14.dylib ``` -try setting the environment variable `LD_LIBRARY_PATH` (or `DYLD_LIBRARY_PATH` on macOS) to wherever Arrow C++ was put in `make install`, e.g. `export LD_LIBRARY_PATH=/usr/local/lib`, and retry installing the R package. +try setting the environment variable `LD_LIBRARY_PATH` (or `DYLD_LIBRARY_PATH` +on macOS) to wherever Arrow C++ was put in `make install`, e.g. `export +LD_LIBRARY_PATH=/usr/local/lib`, and retry installing the R package. -For any other build/configuration challenges, see the [C++ developer guide](https://arrow.apache.org/docs/developers/cpp.html#building). +For any other build/configuration challenges, see the [C++ developer +guide](https://arrow.apache.org/docs/developers/cpp.html#building). ### Editing Rcpp code -The `arrow` package uses some customized tools on top of `Rcpp` to prepare its C++ code in `src/`. If you change C++ code in the R package, you will need to set the `ARROW_R_DEV` environment variable to `TRUE` (optionally, add it to your`~/.Renviron` file to persist across sessions) so that the `data-raw/codegen.R` file is used for code generation. +The `arrow` package uses some customized tools on top of `Rcpp` to prepare its +C++ code in `src/`. If you change C++ code in the R package, you will need to +set the `ARROW_R_DEV` environment variable to `TRUE` (optionally, add it to +your `~/.Renviron` file to persist across sessions) so that the +`data-raw/codegen.R` file is used for code generation. -You'll also need `remotes::install_github("romainfrancois/decor")`. +The codegen.R script has these dependencies: + +``` +remotes::install_github("romainfrancois/decor") +install.packages(c("dplyr", "purrr", "glue")) +``` ### Useful functions @@ -137,7 +148,9 @@ pkgdown::build_site(run_dont_run=TRUE) # To preview the documentation website devtools::check() # All package checks; see also below ``` -Any of those can be run from the command line by wrapping them in `R -e '$COMMAND'`. There's also a `Makefile` to help with some common tasks from the command line (`make test`, `make doc`, `make clean`, etc.) +Any of those can be run from the command line by wrapping them in `R -e +'$COMMAND'`. There's also a `Makefile` to help with some common tasks from the +command line (`make test`, `make doc`, `make clean`, etc.) ### Full package validation @@ -145,3 +158,5 @@ Any of those can be run from the command line by wrapping them in `R -e '$COMMAN R CMD build --keep-empty-dirs . R CMD check arrow_*.tar.gz --as-cran --no-manual ``` + +[1]: https://github.com/apache/arrow/blob/master/docs/source/developers/cpp.rst \ No newline at end of file diff --git a/r/README.md b/r/README.md index ddae0992a4d..43280f33c1b 100644 --- a/r/README.md +++ b/r/README.md @@ -48,6 +48,14 @@ library.
``` r library(arrow) +#> +#> Attaching package: 'arrow' +#> The following object is masked from 'package:utils': +#> +#> timestamp +#> The following objects are masked from 'package:base': +#> +#> array, table set.seed(24) tab <- arrow::table(x = 1:10, y = rnorm(10)) @@ -125,20 +133,13 @@ If you need to alter both the Arrow C++ library and the R package code, or if you can’t get a binary version of the latest C++ library elsewhere, you’ll need to build it from source too. -First, clone the repository and install a release build of the C++ -library. - -``` shell -git clone https://github.com/apache/arrow.git -mkdir arrow/cpp/build && cd arrow/cpp/build -cmake .. -DARROW_PARQUET=ON -DARROW_BOOST_USE_SHARED:BOOL=Off -DARROW_INSTALL_NAME_RPATH=OFF -make install -``` - -This likely will require additional system libraries to be installed, -the specifics of which are platform dependent. See the [C++ developer +First, install the C++ library. See the [C++ developer guide](https://arrow.apache.org/docs/developers/cpp.html) for details. +Note that after any change to the C++ library, you must reinstall it and +run `make clean` or `git clean -fdx .` to remove any cached object code +in the `r/` directory. + Once you’ve built the C++ library, you can install the R package and its dependencies, along with additional dev dependencies, from the git checkout: @@ -173,7 +174,10 @@ you will need to set the `ARROW_R_DEV` environment variable to `TRUE` sessions) so that the `data-raw/codegen.R` file is used for code generation. -You’ll also need `remotes::install_github("romainfrancois/decor")`. +The codegen.R script has these dependencies: + + remotes::install_github("romainfrancois/decor") + install.packages(c("dplyr", "purrr", "glue")) ### Useful functions diff --git a/r/src/array__to_vector.cpp b/r/src/array__to_vector.cpp index 7fcb02bef3c..1458d012c22 100644 --- a/r/src/array__to_vector.cpp +++ b/r/src/array__to_vector.cpp @@ -816,8 +816,8 @@ Rcpp::List Table__to_dataframe(const std::shared_ptr<arrow::Table>& table, std::vector<std::shared_ptr<arrow::r::Converter>> converters(nc); for (int64_t i = 0; i < nc; i++) { - converters[i] = arrow::r::Converter::Make(table->column(i)->data()->chunks()); - names[i] = table->column(i)->name(); + converters[i] = arrow::r::Converter::Make(table->column(i)->chunks()); + names[i] = table->field(i)->name(); } if (use_threads) { diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 356f9ab2027..bcb0ac59b95 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -793,66 +793,6 @@ RcppExport SEXP _arrow_ChunkArray__Slice2(SEXP chunked_array_sexp, SEXP offset_s } #endif -// column.cpp -#if defined(ARROW_R_WITH_ARROW) -int Column__length(const std::shared_ptr<arrow::Column>& column); -RcppExport SEXP _arrow_Column__length(SEXP column_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter<const std::shared_ptr<arrow::Column>&>::type column(column_sexp); - return Rcpp::wrap(Column__length(column)); -END_RCPP -} -#else -RcppExport SEXP _arrow_Column__length(SEXP column_sexp){ - Rf_error("Cannot call Column__length(). Please use arrow::install_arrow() to install required runtime libraries. "); -} -#endif - -// column.cpp -#if defined(ARROW_R_WITH_ARROW) -int Column__null_count(const std::shared_ptr<arrow::Column>& column); -RcppExport SEXP _arrow_Column__null_count(SEXP column_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter<const std::shared_ptr<arrow::Column>&>::type column(column_sexp); - return Rcpp::wrap(Column__null_count(column)); -END_RCPP -} -#else -RcppExport SEXP _arrow_Column__null_count(SEXP column_sexp){ - Rf_error("Cannot call Column__null_count().
Please use arrow::install_arrow() to install required runtime libraries. "); -} -#endif - -// column.cpp -#if defined(ARROW_R_WITH_ARROW) -std::shared_ptr<arrow::DataType> Column__type(const std::shared_ptr<arrow::Column>& column); -RcppExport SEXP _arrow_Column__type(SEXP column_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter<const std::shared_ptr<arrow::Column>&>::type column(column_sexp); - return Rcpp::wrap(Column__type(column)); -END_RCPP -} -#else -RcppExport SEXP _arrow_Column__type(SEXP column_sexp){ - Rf_error("Cannot call Column__type(). Please use arrow::install_arrow() to install required runtime libraries. "); -} -#endif - -// column.cpp -#if defined(ARROW_R_WITH_ARROW) -std::shared_ptr<arrow::ChunkedArray> Column__data(const std::shared_ptr<arrow::Column>& column); -RcppExport SEXP _arrow_Column__data(SEXP column_sexp){ -BEGIN_RCPP - Rcpp::traits::input_parameter<const std::shared_ptr<arrow::Column>&>::type column(column_sexp); - return Rcpp::wrap(Column__data(column)); -END_RCPP -} -#else -RcppExport SEXP _arrow_Column__data(SEXP column_sexp){ - Rf_error("Cannot call Column__data(). Please use arrow::install_arrow() to install required runtime libraries. "); -} -#endif - // compression.cpp #if defined(ARROW_R_WITH_ARROW) std::unique_ptr<arrow::util::Codec> util___Codec__Create(arrow::Compression::type codec); @@ -1982,7 +1922,7 @@ RcppExport SEXP _arrow_ipc___feather___TableReader__GetColumnName(SEXP reader_se // feather.cpp #if defined(ARROW_R_WITH_ARROW) -std::shared_ptr<arrow::Column> ipc___feather___TableReader__GetColumn(const std::unique_ptr<arrow::ipc::feather::TableReader>& reader, int i); +std::shared_ptr<arrow::ChunkedArray> ipc___feather___TableReader__GetColumn(const std::unique_ptr<arrow::ipc::feather::TableReader>& reader, int i); RcppExport SEXP _arrow_ipc___feather___TableReader__GetColumn(SEXP reader_sexp, SEXP i_sexp){ BEGIN_RCPP Rcpp::traits::input_parameter<const std::unique_ptr<arrow::ipc::feather::TableReader>&>::type reader(reader_sexp); @@ -3641,7 +3581,7 @@ RcppExport SEXP _arrow_Table__schema(SEXP x_sexp){ // table.cpp #if defined(ARROW_R_WITH_ARROW) -std::shared_ptr<arrow::Column> Table__column(const std::shared_ptr<arrow::Table>& table, int i); +std::shared_ptr<arrow::ChunkedArray> Table__column(const std::shared_ptr<arrow::Table>& table, int i); RcppExport SEXP _arrow_Table__column(SEXP table_sexp, SEXP i_sexp){ BEGIN_RCPP Rcpp::traits::input_parameter<const std::shared_ptr<arrow::Table>&>::type table(table_sexp); @@ -3657,7 +3597,23 @@ RcppExport SEXP _arrow_Table__column(SEXP table_sexp, SEXP i_sexp){ // table.cpp #if defined(ARROW_R_WITH_ARROW) -std::vector<std::shared_ptr<arrow::Column>> Table__columns(const std::shared_ptr<arrow::Table>& table); +std::shared_ptr<arrow::Field> Table__field(const std::shared_ptr<arrow::Table>& table, int i); +RcppExport SEXP _arrow_Table__field(SEXP table_sexp, SEXP i_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter<const std::shared_ptr<arrow::Table>&>::type table(table_sexp); + Rcpp::traits::input_parameter<int>::type i(i_sexp); + return Rcpp::wrap(Table__field(table, i)); +END_RCPP +} +#else +RcppExport SEXP _arrow_Table__field(SEXP table_sexp, SEXP i_sexp){ + Rf_error("Cannot call Table__field(). Please use arrow::install_arrow() to install required runtime libraries.
"); } #endif + +// table.cpp +#if defined(ARROW_R_WITH_ARROW) +std::vector<std::shared_ptr<arrow::ChunkedArray>> Table__columns(const std::shared_ptr<arrow::Table>& table); RcppExport SEXP _arrow_Table__columns(SEXP table_sexp){ BEGIN_RCPP Rcpp::traits::input_parameter<const std::shared_ptr<arrow::Table>&>::type table(table_sexp); @@ -3811,10 -3767,6 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_ChunkedArray__type", (DL_FUNC) &_arrow_ChunkedArray__type, 1}, { "_arrow_ChunkArray__Slice1", (DL_FUNC) &_arrow_ChunkArray__Slice1, 2}, { "_arrow_ChunkArray__Slice2", (DL_FUNC) &_arrow_ChunkArray__Slice2, 3}, - { "_arrow_Column__length", (DL_FUNC) &_arrow_Column__length, 1}, - { "_arrow_Column__null_count", (DL_FUNC) &_arrow_Column__null_count, 1}, - { "_arrow_Column__type", (DL_FUNC) &_arrow_Column__type, 1}, - { "_arrow_Column__data", (DL_FUNC) &_arrow_Column__data, 1}, { "_arrow_util___Codec__Create", (DL_FUNC) &_arrow_util___Codec__Create, 1}, { "_arrow_io___CompressedOutputStream__Make", (DL_FUNC) &_arrow_io___CompressedOutputStream__Make, 2}, { "_arrow_io___CompressedInputStream__Make", (DL_FUNC) &_arrow_io___CompressedInputStream__Make, 2}, @@ -3997,6 +3949,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_Table__num_rows", (DL_FUNC) &_arrow_Table__num_rows, 1}, { "_arrow_Table__schema", (DL_FUNC) &_arrow_Table__schema, 1}, { "_arrow_Table__column", (DL_FUNC) &_arrow_Table__column, 2}, + { "_arrow_Table__field", (DL_FUNC) &_arrow_Table__field, 2}, { "_arrow_Table__columns", (DL_FUNC) &_arrow_Table__columns, 1}, { "_arrow_Table__column_names", (DL_FUNC) &_arrow_Table__column_names, 1}, { "_arrow_Table__select", (DL_FUNC) &_arrow_Table__select, 2}, diff --git a/r/src/column.cpp b/r/src/column.cpp deleted file mode 100644 index 026cb6904d4..00000000000 --- a/r/src/column.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License.
- -#include "./arrow_types.h" - -#if defined(ARROW_R_WITH_ARROW) - -// [[arrow::export]] -int Column__length(const std::shared_ptr<arrow::Column>& column) { - return column->length(); -} - -// [[arrow::export]] -int Column__null_count(const std::shared_ptr<arrow::Column>& column) { - return column->null_count(); -} - -// [[arrow::export]] -std::shared_ptr<arrow::DataType> Column__type( - const std::shared_ptr<arrow::Column>& column) { - return column->type(); -} - -// [[arrow::export]] -std::shared_ptr<arrow::ChunkedArray> Column__data( - const std::shared_ptr<arrow::Column>& column) { - return column->data(); -} - -#endif diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 4e4091017c1..2adb4f66f5d 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -74,14 +74,12 @@ std::shared_ptr<arrow::Table> Table__cast( const std::shared_ptr<arrow::compute::CastOptions>& options) { auto nc = table->num_columns(); - using ColumnVector = std::vector<std::shared_ptr<arrow::Column>>; + using ColumnVector = std::vector<std::shared_ptr<arrow::ChunkedArray>>; ColumnVector columns(nc); for (int i = 0; i < nc; i++) { - columns[i] = std::make_shared<arrow::Column>( - table->column(i)->name(), - ChunkedArray__cast(table->column(i)->data(), schema->field(i)->type(), options)); + columns[i] = ChunkedArray__cast(table->column(i), + schema->field(i)->type(), options); } - return arrow::Table::Make(schema, std::move(columns), table->num_rows()); } diff --git a/r/src/feather.cpp b/r/src/feather.cpp index a5198812647..7bdfeab72b2 100644 --- a/r/src/feather.cpp +++ b/r/src/feather.cpp @@ -106,9 +106,9 @@ std::string ipc___feather___TableReader__GetColumnName( } // [[arrow::export]] -std::shared_ptr<arrow::Column> ipc___feather___TableReader__GetColumn( +std::shared_ptr<arrow::ChunkedArray> ipc___feather___TableReader__GetColumn( const std::unique_ptr<arrow::ipc::feather::TableReader>& reader, int i) { - std::shared_ptr<arrow::Column> column; + std::shared_ptr<arrow::ChunkedArray> column; STOP_IF_NOT_OK(reader->GetColumn(i, &column)); return column; } diff --git a/r/src/table.cpp b/r/src/table.cpp index 1e958d03eff..c54b9d8abc9 100644 --- a/r/src/table.cpp +++ b/r/src/table.cpp @@ -47,16 +47,22 @@ std::shared_ptr<arrow::Schema> Table__schema(const std::shared_ptr<arrow::Table> } // [[arrow::export]] -std::shared_ptr<arrow::Column> Table__column(const std::shared_ptr<arrow::Table>& table, +std::shared_ptr<arrow::ChunkedArray> Table__column(const std::shared_ptr<arrow::Table>& table, int i) { return table->column(i); } // [[arrow::export]] -std::vector<std::shared_ptr<arrow::Column>> Table__columns( +std::shared_ptr<arrow::Field> Table__field(const std::shared_ptr<arrow::Table>& table, + int i) { + return table->field(i); +} + +// [[arrow::export]] +std::vector<std::shared_ptr<arrow::ChunkedArray>> Table__columns( const std::shared_ptr<arrow::Table>& table) { auto nc = table->num_columns(); - std::vector<std::shared_ptr<arrow::Column>> res(nc); + std::vector<std::shared_ptr<arrow::ChunkedArray>> res(nc); for (int i = 0; i < nc; i++) { res[i] = table->column(i); } @@ -68,7 +74,7 @@ Rcpp::CharacterVector Table__column_names(const std::shared_ptr<arrow::Table>& t int nc = table->num_columns(); Rcpp::CharacterVector res(nc); for (int i = 0; i < nc; i++) { - res[i] = table->column(i)->name(); + res[i] = table->field(i)->name(); } return res; } @@ -79,7 +85,7 @@ std::shared_ptr<arrow::Table> Table__select(const std::shared_ptr<arrow::Table>& R_xlen_t n = indices.size(); std::vector<std::shared_ptr<arrow::Field>> fields(n); - std::vector<std::shared_ptr<arrow::Column>> columns(n); + std::vector<std::shared_ptr<arrow::ChunkedArray>> columns(n); for (R_xlen_t i = 0; i < n; i++) { int pos = indices[i] - 1; @@ -120,7 +126,7 @@ std::shared_ptr<arrow::Table> Table__from_dots(SEXP lst, SEXP schema_sxp) { int num_fields; STOP_IF_NOT_OK(arrow::r::count_fields(lst, &num_fields)); - std::vector<std::shared_ptr<arrow::Column>> columns(num_fields); + std::vector<std::shared_ptr<arrow::ChunkedArray>> columns(num_fields); std::shared_ptr<arrow::Schema> schema; if (Rf_isNull(schema_sxp)) { @@ -129,21 +135,18 @@ std::shared_ptr<arrow::Table> Table__from_dots(SEXP lst, SEXP schema_sxp) { SEXP names = Rf_getAttrib(lst, R_NamesSymbol); auto fill_one_column = [&columns, &fields](int j, SEXP x, SEXP name) { - if (Rf_inherits(x, "arrow::Column")) { - columns[j] =
arrow::r::extract<arrow::Column>(x); - fields[j] = columns[j]->field(); - } else if (Rf_inherits(x, "arrow::ChunkedArray")) { + if (Rf_inherits(x, "arrow::ChunkedArray")) { auto chunked_array = arrow::r::extract<arrow::ChunkedArray>(x); - fields[j] = std::make_shared<arrow::Field>(CHAR(name), chunked_array->type()); - columns[j] = std::make_shared<arrow::Column>(fields[j], chunked_array); + fields[j] = arrow::field(CHAR(name), chunked_array->type()); + columns[j] = chunked_array; } else if (Rf_inherits(x, "arrow::Array")) { auto array = arrow::r::extract<arrow::Array>(x); - fields[j] = std::make_shared<arrow::Field>(CHAR(name), array->type()); - columns[j] = std::make_shared<arrow::Column>(fields[j], array); + fields[j] = arrow::field(CHAR(name), array->type()); + columns[j] = std::make_shared<arrow::ChunkedArray>(array); } else { auto array = Array__from_vector(x, R_NilValue); - fields[j] = std::make_shared<arrow::Field>(CHAR(name), array->type()); - columns[j] = std::make_shared<arrow::Column>(fields[j], array); + fields[j] = arrow::field(CHAR(name), array->type()); + columns[j] = std::make_shared<arrow::ChunkedArray>(array); } }; @@ -168,18 +171,16 @@ std::shared_ptr<arrow::Table> Table__from_dots(SEXP lst, SEXP schema_sxp) { schema = arrow::r::extract<arrow::Schema>(schema_sxp); auto fill_one_column = [&columns, &schema](int j, SEXP x) { - if (Rf_inherits(x, "arrow::Column")) { - columns[j] = arrow::r::extract<arrow::Column>(x); - } else if (Rf_inherits(x, "arrow::ChunkedArray")) { + if (Rf_inherits(x, "arrow::ChunkedArray")) { auto chunked_array = arrow::r::extract<arrow::ChunkedArray>(x); - columns[j] = std::make_shared<arrow::Column>(schema->field(j), chunked_array); + columns[j] = chunked_array; } else if (Rf_inherits(x, "arrow::Array")) { auto array = arrow::r::extract<arrow::Array>(x); - columns[j] = std::make_shared<arrow::Column>(schema->field(j), array); + columns[j] = std::make_shared<arrow::ChunkedArray>(array); } else { auto type = schema->field(j)->type(); auto array = arrow::r::Array__from_vector(x, type, false); - columns[j] = std::make_shared<arrow::Column>(schema->field(j), array); + columns[j] = std::make_shared<arrow::ChunkedArray>(array); } }; diff --git a/r/tests/testthat/test-json.R b/r/tests/testthat/test-json.R index 94139048192..b3e7d5638f5 100644 --- a/r/tests/testthat/test-json.R +++ b/r/tests/testthat/test-json.R @@ -113,7 +113,7 @@ test_that("Can read json file with nested columns (ARROW-5503)", { ) ) - struct_array <- tab1$column(1)$data()$chunk(0) + struct_array <- tab1$column(1)$chunk(0) ps <- array(c(NA, NA, 78, 90, NA, 19)) hello <- array(c(NA, NA, "hi", "bonjour", "ciao", NA)) expect_equal(struct_array$field(0L), ps) @@ -132,7 +132,7 @@ test_that("Can read json file with nested columns (ARROW-5503)", { 5, c(5, 6) ) - list_array <- tab1$column(0)$data() + list_array <- tab1$column(0) expect_identical( list_array$as_vector(), list_array_r diff --git a/r/tests/testthat/test-read-write.R b/r/tests/testthat/test-read-write.R index fb442a293cf..17d994deab2 100644 --- a/r/tests/testthat/test-read-write.R +++ b/r/tests/testthat/test-read-write.R @@ -28,14 +28,8 @@ test_that("arrow::table round trip", { expect_equal(tab$num_columns, 3L) expect_equal(tab$num_rows, 10L) - # arrow::Column - col_int <- tab$column(0) - expect_equal(col_int$length(), 10L) - expect_equal(col_int$null_count, 0L) - expect_equal(col_int$type, int32()) - # arrow::ChunkedArray - chunked_array_int <- col_int$data() + chunked_array_int <- tab$column(0) expect_equal(chunked_array_int$length(), 10L) expect_equal(chunked_array_int$null_count, 0L) expect_equal(chunked_array_int$as_vector(), tbl$int) @@ -47,14 +41,8 @@ test_that("arrow::table round trip", { expect_equal(chunked_array_int$chunk(i-1L), chunks_int[[i]]) } - # arrow::Column - col_dbl <- tab$column(1) - expect_equal(col_dbl$length(), 10L) -
expect_equal(col_dbl$null_count, 0L) - expect_equal(col_dbl$type, float64()) - # arrow::ChunkedArray - chunked_array_dbl <- col_dbl$data() + chunked_array_dbl <- tab$column(1) expect_equal(chunked_array_dbl$length(), 10L) expect_equal(chunked_array_dbl$null_count, 0L) expect_equal(chunked_array_dbl$as_vector(), tbl$dbl) @@ -66,14 +54,8 @@ test_that("arrow::table round trip", { expect_equal(chunked_array_dbl$chunk(i-1L), chunks_dbl[[i]]) } - # arrow::Colmumn - col_raw <- tab$column(2) - expect_equal(col_raw$length(), 10L) - expect_equal(col_raw$null_count, 0L) - expect_equal(col_raw$type, int8()) - # arrow::ChunkedArray - chunked_array_raw <- col_raw$data() + chunked_array_raw <- tab$column(2) expect_equal(chunked_array_raw$length(), 10L) expect_equal(chunked_array_raw$null_count, 0L) expect_equal(chunked_array_raw$as_vector(), as.integer(tbl$raw)) @@ -130,4 +112,3 @@ test_that("arrow::table round trip handles NA in integer and numeric", { expect_true(is.na(res$dbl[10])) unlink(tf) }) - diff --git a/ruby/red-arrow/ext/arrow/raw-records.cpp b/ruby/red-arrow/ext/arrow/raw-records.cpp index a2fab554536..9b94ec04d53 100644 --- a/ruby/red-arrow/ext/arrow/raw-records.cpp +++ b/ruby/red-arrow/ext/arrow/raw-records.cpp @@ -661,8 +661,7 @@ namespace red_arrow { rb_ary_push(records_, record); } for (int i = 0; i < n_columns_; ++i) { - const auto column = table.column(i).get(); - const auto chunked_array = column->data(); + const auto& chunked_array = table.column(i).get(); column_index_ = i; row_offset_ = 0; for (const auto array : chunked_array->chunks()) { diff --git a/ruby/red-arrow/lib/arrow/chunked-array.rb b/ruby/red-arrow/lib/arrow/chunked-array.rb index c720d229c39..1f55cb82522 100644 --- a/ruby/red-arrow/lib/arrow/chunked-array.rb +++ b/ruby/red-arrow/lib/arrow/chunked-array.rb @@ -19,6 +19,11 @@ module Arrow class ChunkedArray include Enumerable + alias_method :size, :n_rows + unless method_defined?(:length) + alias_method :length, :n_rows + end + alias_method :chunks_raw, :chunks def chunks @chunks ||= chunks_raw diff --git a/ruby/red-arrow/lib/arrow/column-containable.rb b/ruby/red-arrow/lib/arrow/column-containable.rb new file mode 100644 index 00000000000..51ad88e7080 --- /dev/null +++ b/ruby/red-arrow/lib/arrow/column-containable.rb @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
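Both the R round-trip tests and the Ruby glue above drop the intermediate `$data()`/`->data()` hop: a table column simply is a chunked array now. A sketch of the new traversal in pyarrow terms (assuming this revision):

```python
import pyarrow as pa

t = pa.Table.from_arrays([pa.array([1, 2])], names=['x'])
table = pa.concat_tables([t, t])   # gives the column two chunks

for name, col in zip(table.column_names, table.itercolumns()):
    # col is the ChunkedArray itself; there is no .data accessor in between
    assert isinstance(col, pa.ChunkedArray)
    for chunk in col.iterchunks():
        print(name, chunk)
```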
+ +module Arrow + module ColumnContainable + def columns + @columns ||= schema.n_fields.times.collect do |i| + Column.new(self, i) + end + end + + def each_column(&block) + columns.each(&block) + end + + def find_column(name_or_index) + case name_or_index + when String, Symbol + name = name_or_index.to_s + index = schema.get_field_index(name) + return nil if index == -1 + Column.new(self, index) + when Integer + index = name_or_index + index += n_columns if index < 0 + return nil if index < 0 or index >= n_columns + Column.new(self, index) + else + message = "column name or index must be String, Symbol or Integer" + raise ArgumentError, message + end + end + end +end diff --git a/ruby/red-arrow/lib/arrow/column.rb b/ruby/red-arrow/lib/arrow/column.rb index de385c04622..06f3dbdc05b 100644 --- a/ruby/red-arrow/lib/arrow/column.rb +++ b/ruby/red-arrow/lib/arrow/column.rb @@ -19,32 +19,58 @@ module Arrow class Column include Enumerable + attr_reader :container + attr_reader :field + attr_reader :data + def initialize(container, index) + @container = container + @index = index + @field = @container.schema[@index] + @data = @container.get_column_data(@index) + end + + def name + @field.name + end + + def data_type + @field.data_type + end + def null?(i) - data.null?(i) + @data.null?(i) end def valid?(i) - data.valid?(i) + @data.valid?(i) end def [](i) - data[i] + @data[i] end def each(&block) - return to_enum(__method__) unless block_given? - - data.each(&block) + @data.each(&block) end def reverse_each(&block) - return to_enum(__method__) unless block_given? + @data.reverse_each(&block) + end + + def n_rows + @data.n_rows + end + alias_method :size, :n_rows + alias_method :length, :n_rows - data.reverse_each(&block) + def n_nulls + @data.n_nulls end - def pack - self.class.new(field, data.pack) + def ==(other) + other.is_a?(self.class) and + @field == other.field and + @data == other.data end end end diff --git a/ruby/red-arrow/lib/arrow/data-type.rb b/ruby/red-arrow/lib/arrow/data-type.rb index 5b1c873029a..9411785b437 100644 --- a/ruby/red-arrow/lib/arrow/data-type.rb +++ b/ruby/red-arrow/lib/arrow/data-type.rb @@ -29,24 +29,33 @@ class << self # # @return [Arrow::DataType] The given data type itself. # - # @overload resolve(name, *arguments) + # @overload resolve(name) # # Creates a suitable data type from type name. For example, # you can create {Arrow::BooleanDataType} from `:boolean`. # # @param name [String, Symbol] The type name of the data type. # - # @param arguments [::Array] The additional information of the - # data type. + # @example Create a boolean data type + # Arrow::DataType.resolve(:boolean) + # + # @overload resolve(name_with_arguments) + # + # Creates a suitable data type from type name with arguments. + # + # @param name_with_arguments [::Array] + # The type name of the data type as the first element. + # + # The rest elements are additional information of the data type. # # For example, {Arrow::TimestampDataType} needs unit as # additional information. 
# # @example Create a boolean data type - # Arrow::DataType.resolve(:boolean) + # Arrow::DataType.resolve([:boolean]) # # @example Create a milliseconds unit timestamp data type - # Arrow::DataType.resolve(:timestamp, :milli) + # Arrow::DataType.resolve([:timestamp, :milli]) # # @overload resolve(description) # @@ -135,5 +144,13 @@ def resolve_class(data_type) Arrow.const_get(data_type_class_name) end end + + def build_array(values) + base_name = self.class.name.gsub(/DataType\z/, "") + builder_class = self.class.const_get("#{base_name}ArrayBuilder") + args = [values] + args.unshift(self) unless builder_class.buildable?(args) + builder_class.build(*args) + end end end diff --git a/ruby/red-arrow/lib/arrow/field-containable.rb b/ruby/red-arrow/lib/arrow/field-containable.rb index 1956dde12c9..e4dbf4ec26c 100644 --- a/ruby/red-arrow/lib/arrow/field-containable.rb +++ b/ruby/red-arrow/lib/arrow/field-containable.rb @@ -24,6 +24,8 @@ def find_field(name_or_index) get_field_by_name(name) when Integer index = name_or_index + index += n_fields if index < 0 + return nil if index < 0 or index >= n_fields get_field(index) else message = "field name or index must be String, Symbol or Integer" diff --git a/ruby/red-arrow/lib/arrow/group.rb b/ruby/red-arrow/lib/arrow/group.rb index 7ef8dc3d74c..568e0e8c3fe 100644 --- a/ruby/red-arrow/lib/arrow/group.rb +++ b/ruby/red-arrow/lib/arrow/group.rb @@ -152,24 +152,21 @@ def aggregate(target_columns) end grouped_key_arrays_raw = grouped_keys.transpose - columns = @keys.collect.with_index do |key, i| + fields = [] + arrays = [] + @keys.each_with_index do |key, i| key_column = @table[key] - key_column_array_class = key_column.data.chunks.first.class - if key_column_array_class == TimestampArray - builder = TimestampArrayBuilder.new(key_column.data_type) - key_column_array = builder.build(grouped_key_arrays_raw[i]) - else - key_column_array = - key_column_array_class.new(grouped_key_arrays_raw[i]) - end - Column.new(key_column.field, key_column_array) + key_column_array_raw = grouped_key_arrays_raw[i] + key_column_array = key_column.data_type.build_array(key_column_array_raw) + fields << key_column.field + arrays << key_column_array end target_columns.each_with_index do |column, i| array = ArrayBuilder.build(aggregated_arrays_raw[i]) - field = Field.new(column.name, array.value_data_type) - columns << Column.new(field, array) + arrays << array + fields << Field.new(column.field.name, array.value_data_type) end - Table.new(columns) + Table.new(fields, arrays) end end end diff --git a/ruby/red-arrow/lib/arrow/record-batch.rb b/ruby/red-arrow/lib/arrow/record-batch.rb index b577d4a41a6..60fd42ec4ac 100644 --- a/ruby/red-arrow/lib/arrow/record-batch.rb +++ b/ruby/red-arrow/lib/arrow/record-batch.rb @@ -15,10 +15,12 @@ # specific language governing permissions and limitations # under the License. +require "arrow/column-containable" require "arrow/record-containable" module Arrow class RecordBatch + include ColumnContainable include RecordContainable include Enumerable @@ -40,10 +42,10 @@ def new(*args) alias_method :each, :each_record - alias_method :columns_raw, :columns - def columns - @columns ||= columns_raw - end + alias_method :size, :n_rows + alias_method :length, :n_rows + + alias_method :[], :find_column # Converts the record batch to {Arrow::Table}.
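Ruby's new `DataType#build_array` dispatches to the builder that matches the data type. pyarrow expresses the same idea with an explicit `type=` argument instead of a per-type builder class (a sketch of the pattern, not of this patch's API):

```python
import pyarrow as pa

# Build values against an explicit type rather than letting inference
# pick one; a millisecond timestamp type here.
ts = pa.timestamp('ms')
arr = pa.array([0, 1000, None], type=ts)
assert arr.type == ts
```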
# diff --git a/ruby/red-arrow/lib/arrow/record-containable.rb b/ruby/red-arrow/lib/arrow/record-containable.rb index f73d1a8c126..20c9ac2f576 100644 --- a/ruby/red-arrow/lib/arrow/record-containable.rb +++ b/ruby/red-arrow/lib/arrow/record-containable.rb @@ -17,12 +17,6 @@ module Arrow module RecordContainable - def each_column(&block) - return to_enum(__method__) unless block_given? - - columns.each(&block) - end - def each_record(reuse_record: false) unless block_given? return to_enum(__method__, reuse_record: reuse_record) @@ -40,34 +34,5 @@ def each_record(reuse_record: false) end end end - - def find_column(name_or_index) - case name_or_index - when String, Symbol - name = name_or_index.to_s - index = resolve_column_name(name) - return nil if index.nil? - columns[index] - when Integer - index = name_or_index - columns[index] - else - message = "column name or index must be String, Symbol or Integer" - raise ArgumentError, message - end - end - - private - def resolve_column_name(name) - (@column_name_to_index ||= build_column_name_resolve_table)[name] - end - - def build_column_name_resolve_table - table = {} - schema.fields.each_with_index do |field, i| - table[field.name] = i - end - table - end end end diff --git a/ruby/red-arrow/lib/arrow/record.rb b/ruby/red-arrow/lib/arrow/record.rb index 70bd215033f..6f83dded0c3 100644 --- a/ruby/red-arrow/lib/arrow/record.rb +++ b/ruby/red-arrow/lib/arrow/record.rb @@ -17,38 +17,41 @@ module Arrow class Record + attr_reader :container attr_accessor :index - def initialize(record_container, index) - @record_container = record_container + def initialize(container, index) + @container = container @index = index end def [](column_name_or_column_index) - column = @record_container.find_column(column_name_or_column_index) + column = @container.find_column(column_name_or_column_index) return nil if column.nil? column[@index] end - def columns - @record_container.columns + def to_a + @container.columns.collect do |column| + column[@index] + end end def to_h attributes = {} - @record_container.schema.fields.each_with_index do |field, i| - attributes[field.name] = self[i] + @container.columns.each do |column| + attributes[column.name] = column[@index] end attributes end def respond_to_missing?(name, include_private) - return true if @record_container.find_column(name) + return true if @container.find_column(name) super end def method_missing(name, *args, &block) if args.empty? - column = @record_container.find_column(name) + column = @container.find_column(name) return column[@index] if column end super diff --git a/ruby/red-arrow/lib/arrow/slicer.rb b/ruby/red-arrow/lib/arrow/slicer.rb index fd2033d37cb..fa834766866 100644 --- a/ruby/red-arrow/lib/arrow/slicer.rb +++ b/ruby/red-arrow/lib/arrow/slicer.rb @@ -253,9 +253,9 @@ def evaluate case @value when nil if @column.n_nulls.zero? 
- raw_array = [true] * @column.length + raw_array = [true] * @column.n_rows else - raw_array = @column.length.times.collect do |i| + raw_array = @column.n_rows.times.collect do |i| @column.valid?(i) end end diff --git a/ruby/red-arrow/lib/arrow/table-loader.rb b/ruby/red-arrow/lib/arrow/table-loader.rb index 15bd9ee3a32..204b4f87754 100644 --- a/ruby/red-arrow/lib/arrow/table-loader.rb +++ b/ruby/red-arrow/lib/arrow/table-loader.rb @@ -88,17 +88,11 @@ def open_input_stream def load_raw(input, reader) schema = reader.schema - chunked_arrays = [] + record_batches = [] reader.each do |record_batch| - record_batch.columns.each_with_index do |array, i| - chunked_array = (chunked_arrays[i] ||= []) - chunked_array << array - end - end - columns = schema.fields.collect.with_index do |field, i| - Column.new(field, ChunkedArray.new(chunked_arrays[i])) + record_batches << record_batch end - table = Table.new(schema, columns) + table = Table.new(schema, record_batches) table.instance_variable_set(:@input, input) table end diff --git a/ruby/red-arrow/lib/arrow/table.rb b/ruby/red-arrow/lib/arrow/table.rb index 64f4b49fc51..c0ce502beca 100644 --- a/ruby/red-arrow/lib/arrow/table.rb +++ b/ruby/red-arrow/lib/arrow/table.rb @@ -15,11 +15,13 @@ # specific language governing permissions and limitations # under the License. +require "arrow/column-containable" require "arrow/group" require "arrow/record-containable" module Arrow class Table + include ColumnContainable include RecordContainable class << self @@ -74,6 +76,16 @@ def load(path, options={}) # Arrow::Table.new("count" => Arrow::ChunkedArray.new(count_chunks), # "visible" => Arrow::ChunkedArray.new(visible_chunks)) # + # @overload initialize(raw_table) + # + # @param raw_table [Hash] + # The pairs of column name and values of the table. Column values are + # `Array`s. + # + # @example Create a table from column names and values + # Arrow::Table.new("count" => [0, 2, nil, 4], + # "visible" => [true, nil, nil, false]) + # # @overload initialize(schema, columns) # # @param schema [Arrow::Schema] The schema of the table.
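The `Table.new(Hash)` overload documented above accepts plain Ruby arrays per column. pyarrow's `table()` factory behaves the same way, and an explicit schema both reorders and coerces the input, as the `test_table_factory_function` change earlier in this patch exercises. A sketch (assuming this revision):

```python
import pyarrow as pa
from collections import OrderedDict

# Data order deliberately disagrees with the schema; the schema wins.
d = OrderedDict([('b', ['a', 'b', 'c']), ('a', [1, 2, 3])])
schema = pa.schema([('a', pa.int32()), ('b', pa.string())])

table = pa.table(d, schema=schema)
assert table.column_names == ['a', 'b']
assert table.column(0).type == pa.int32()  # coerced by the schema
```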
@@ -152,17 +172,18 @@ def initialize(*args) case n_args when 1 if args[0][0].is_a?(Column) - values = args[0] - fields = values.collect(&:field) + columns = args[0] + fields = columns.collect(&:field) + values = columns.collect(&:data) schema = Schema.new(fields) else raw_table = args[0] fields = [] values = [] raw_table.each do |name, array| - field = Field.new(name.to_s, array.value_data_type) - fields << field - values << Column.new(field, array) + array = ArrayBuilder.build(array) if array.is_a?(::Array) + fields << Field.new(name.to_s, array.value_data_type) + values << array end schema = Schema.new(fields) end @@ -170,20 +191,19 @@ def initialize(*args) schema = args[0] schema = Schema.new(schema) unless schema.is_a?(Schema) values = args[1] - if values[0].is_a?(::Array) + case values[0] + when ::Array values = [RecordBatch.new(schema, values)] + when Column + values = values.collect(&:data) end else - message = "wrong number of arguments (given, #{n_args}, expected 1..2)" + message = "wrong number of arguments (given #{n_args}, expected 1..2)" raise ArgumentError, message end initialize_raw(schema, values) end - def columns - @columns ||= n_columns.times.collect {|i| get_column(i)} - end - def each_record_batch return to_enum(__method__) unless block_given? @@ -338,7 +358,7 @@ def merge(other) other.each do |name, value| name = name.to_s if value - added_columns[name] = ensure_column(name, value) + added_columns[name] = ensure_raw_column(name, value) else removed_columns[name] = true end @@ -346,7 +366,8 @@ def merge(other) when Table added_columns = {} other.columns.each do |column| - added_columns[column.name] = column + name = column.name + added_columns[name] = ensure_raw_column(name, column) end else message = "merge target must be Hash or Arrow::Table: " + @@ -363,15 +384,18 @@ def merge(other) next end next if removed_columns.key?(column_name) - new_columns << column + new_columns << ensure_raw_column(column_name, column) end added_columns.each do |name, new_column| new_columns << new_column end - new_fields = new_columns.collect do |new_column| - new_column.field + new_fields = [] + new_arrays = [] + new_columns.each do |new_column| + new_fields << new_column[:field] + new_arrays << new_column[:data] end - self.class.new(Schema.new(new_fields), new_columns) + self.class.new(new_fields, new_arrays) end alias_method :remove_column_raw, :remove_column @@ -447,10 +471,10 @@ def save(path, options={}) end def pack - packed_columns = columns.collect do |column| - column.pack + packed_arrays = columns.collect do |column| + column.data.pack end - self.class.new(schema, packed_columns) + self.class.new(schema, packed_arrays) end alias_method :to_s_raw, :to_s @@ -524,13 +548,26 @@ def slice_by_ranges(ranges) end end - def ensure_column(name, data) + def ensure_raw_column(name, data) case data when Array - field = Field.new(name, data.value_data_type) - Column.new(field, data) + { + field: Field.new(name, data.value_data_type), + data: ChunkedArray.new([data]), + } + when ChunkedArray + { + field: Field.new(name, data.value_data_type), + data: data, + } when Column - data + column = data + data = column.data + data = ChunkedArray.new([data]) unless data.is_a?(ChunkedArray) + { + field: column.field, + data: data, + } else message = "column must be Arrow::Array or Arrow::Column: " + "<#{name}>: <#{data.inspect}>: #{inspect}" diff --git a/ruby/red-arrow/lib/arrow/timestamp-array-builder.rb b/ruby/red-arrow/lib/arrow/timestamp-array-builder.rb index 4b22682efca..17efaa0c446 100644 --- 
a/ruby/red-arrow/lib/arrow/timestamp-array-builder.rb +++ b/ruby/red-arrow/lib/arrow/timestamp-array-builder.rb @@ -17,6 +17,25 @@ module Arrow class TimestampArrayBuilder + class << self + def build(unit_or_data_type, values) + builder = new(unit_or_data_type) + builder.build(values) + end + end + + alias_method :initialize_raw, :initialize + def initialize(unit_or_data_type) + case unit_or_data_type + when DataType + data_type = unit_or_data_type + else + unit = unit_or_data_type + data_type = TimestampDataType.new(unit) + end + initialize_raw(data_type) + end + private def unit_id @unit_id ||= value_data_type.unit.nick.to_sym diff --git a/ruby/red-arrow/lib/arrow/timestamp-array.rb b/ruby/red-arrow/lib/arrow/timestamp-array.rb index 6cffb8c261c..3262c23a918 100644 --- a/ruby/red-arrow/lib/arrow/timestamp-array.rb +++ b/ruby/red-arrow/lib/arrow/timestamp-array.rb @@ -17,14 +17,6 @@ module Arrow class TimestampArray - class << self - def new(unit, values) - data_type = TimestampDataType.new(unit) - builder = TimestampArrayBuilder.new(data_type) - builder.build(values) - end - end - def get_value(i) to_time(get_raw_value(i)) end diff --git a/ruby/red-arrow/test/raw-records/test-dense-union-array.rb b/ruby/red-arrow/test/raw-records/test-dense-union-array.rb index c79c093d550..5e267660eb1 100644 --- a/ruby/red-arrow/test/raw-records/test-dense-union-array.rb +++ b/ruby/red-arrow/test/raw-records/test-dense-union-array.rb @@ -52,7 +52,7 @@ def build_record_batch(type, records) end sub_record_batch = Arrow::RecordBatch.new(sub_schema, sub_records) - sub_record_batch.columns[0] + sub_record_batch.columns[0].data end records.each do |record| column = record[0] diff --git a/ruby/red-arrow/test/raw-records/test-sparse-union-array.rb b/ruby/red-arrow/test/raw-records/test-sparse-union-array.rb index f80592ff41a..c761cc64743 100644 --- a/ruby/red-arrow/test/raw-records/test-sparse-union-array.rb +++ b/ruby/red-arrow/test/raw-records/test-sparse-union-array.rb @@ -47,7 +47,7 @@ def build_record_batch(type, records) end sub_record_batch = Arrow::RecordBatch.new(sub_schema, sub_records) - sub_record_batch.columns[0] + sub_record_batch.columns[0].data end records.each do |record| column = record[0] diff --git a/ruby/red-arrow/test/test-column.rb b/ruby/red-arrow/test/test-column.rb index 81bf0e09a5b..613b01ccc7b 100644 --- a/ruby/red-arrow/test/test-column.rb +++ b/ruby/red-arrow/test/test-column.rb @@ -16,51 +16,76 @@ # under the License. 
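With `Column` now a lightweight view onto its container, equality in the rewritten tests below decomposes into field plus data. In pyarrow, where `Column` is removed outright, the comparable check is made on the pieces directly (a sketch):

```python
import pyarrow as pa

t1 = pa.table({'visible': [True, False]})
t2 = pa.table({'visible': [True, False]})

# Compare the schema field and the chunked data separately.
assert t1.field(0) == t2.field(0)
assert t1.column(0) == t2.column(0)
```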
class ColumnTest < Test::Unit::TestCase + def setup + table = Arrow::Table.new("visible" => [true, nil, false]) + @column = table.visible + end + + test("#name") do + assert_equal("visible", @column.name) + end + + test("#data_type") do + assert_equal(Arrow::BooleanDataType.new, @column.data_type) + end + + test("#null?") do + assert do + @column.null?(1) + end + end + + test("#valid?") do + assert do + @column.valid?(0) + end + end + test("#each") do - arrays = [ - Arrow::BooleanArray.new([true, false]), - Arrow::BooleanArray.new([nil, true]), - ] - chunked_array = Arrow::ChunkedArray.new(arrays) - column = Arrow::Column.new(Arrow::Field.new("visible", :boolean), - chunked_array) - assert_equal([true, false, nil, true], - column.to_a) + assert_equal([true, nil, false], @column.each.to_a) + end + + test("#reverse_each") do + assert_equal([false, nil, true], @column.reverse_each.to_a) end - test("#pack") do - arrays = [ - Arrow::BooleanArray.new([true, false]), - Arrow::BooleanArray.new([nil, true]), - ] - chunked_array = Arrow::ChunkedArray.new(arrays) - column = Arrow::Column.new(Arrow::Field.new("visible", :boolean), - chunked_array) - packed_column = column.pack - assert_equal([1, [true, false, nil, true]], - [packed_column.data.n_chunks, packed_column.to_a]) + test("#n_rows") do + assert_equal(3, @column.n_rows) + end + + test("#n_nulls") do + assert_equal(1, @column.n_nulls) end sub_test_case("#==") do - def setup - arrays = [ - Arrow::BooleanArray.new([true]), - Arrow::BooleanArray.new([false, true]), - ] - chunked_array = Arrow::ChunkedArray.new(arrays) - @column = Arrow::Column.new(Arrow::Field.new("visible", :boolean), - chunked_array) + test("same value") do + table1 = Arrow::Table.new("visible" => [true, false]) + table2 = Arrow::Table.new("visible" => [true, false]) + assert do + table1.visible == table2.visible + end + end + + test("different name") do + table1 = Arrow::Table.new("visible" => [true, false]) + table2 = Arrow::Table.new("invisible" => [true, false]) + assert do + not table1.visible == table2.invisible + end end - test("Arrow::Column") do + test("different value") do + table1 = Arrow::Table.new("visible" => [true, false]) + table2 = Arrow::Table.new("visible" => [true, true]) assert do - @column == @column + not table1.visible == table2.visible end end test("not Arrow::Column") do + table = Arrow::Table.new("visible" => [true, false]) assert do - not (@column == 29) + not table.visible == 29 end end end diff --git a/ruby/red-arrow/test/test-slicer.rb b/ruby/red-arrow/test/test-slicer.rb index ba035b604a0..a661888d3ec 100644 --- a/ruby/red-arrow/test/test-slicer.rb +++ b/ruby/red-arrow/test/test-slicer.rb @@ -36,9 +36,7 @@ def setup ] @count_array = Arrow::ChunkedArray.new(count_arrays) @visible_array = Arrow::ChunkedArray.new(visible_arrays) - @count_column = Arrow::Column.new(@count_field, @count_array) - @visible_column = Arrow::Column.new(@visible_field, @visible_array) - @table = Arrow::Table.new(schema, [@count_column, @visible_column]) + @table = Arrow::Table.new(schema, [@count_array, @visible_array]) end sub_test_case("column") do diff --git a/ruby/red-arrow/test/test-table.rb b/ruby/red-arrow/test/test-table.rb index dce5d25758c..e3b43cd31f3 100644 --- a/ruby/red-arrow/test/test-table.rb +++ b/ruby/red-arrow/test/test-table.rb @@ -37,14 +37,15 @@ def setup ] @count_array = Arrow::ChunkedArray.new(count_arrays) @visible_array = Arrow::ChunkedArray.new(visible_arrays) - @count_column = Arrow::Column.new(@count_field, @count_array) - @visible_column = 
Arrow::Column.new(@visible_field, @visible_array) - @table = Arrow::Table.new(schema, [@count_column, @visible_column]) + @table = Arrow::Table.new(schema, [@count_array, @visible_array]) end test("#columns") do - assert_equal(["count", "visible"], - @table.columns.collect(&:name)) + assert_equal([ + Arrow::Column.new(@table, 0), + Arrow::Column.new(@table, 1), + ], + @table.columns) end sub_test_case("#slice") do @@ -188,11 +189,18 @@ def setup sub_test_case("#[]") do test("[String]") do - assert_equal(@count_column, @table["count"]) + assert_equal(Arrow::Column.new(@table, 0), + @table["count"]) end test("[Symbol]") do - assert_equal(@visible_column, @table[:visible]) + assert_equal(Arrow::Column.new(@table, 1), + @table[:visible]) + end + + test("[Integer]") do + assert_equal(Arrow::Column.new(@table, 1), + @table[-1]) end end @@ -279,7 +287,8 @@ def setup end test("column name getter") do - assert_equal(@visible_column, @table.visible) + assert_equal(Arrow::Column.new(@table, 1), + @table.visible) end sub_test_case("#remove_column") do diff --git a/ruby/red-parquet/test/test-arrow-table.rb b/ruby/red-parquet/test/test-arrow-table.rb index 1a565b64451..fc2a6c998c6 100644 --- a/ruby/red-parquet/test/test-arrow-table.rb +++ b/ruby/red-parquet/test/test-arrow-table.rb @@ -35,9 +35,7 @@ def setup ] @count_array = Arrow::ChunkedArray.new(count_arrays) @visible_array = Arrow::ChunkedArray.new(visible_arrays) - @count_column = Arrow::Column.new(@count_field, @count_array) - @visible_column = Arrow::Column.new(@visible_field, @visible_array) - @table = Arrow::Table.new(schema, [@count_column, @visible_column]) + @table = Arrow::Table.new(schema, [@count_array, @visible_array]) end def test_save_load_path