Skip to content

Commit c7c3059

Browse files
committed
Fix performance issues
1 parent ebc348e commit c7c3059

File tree

1 file changed

+66
-36
lines changed

1 file changed

+66
-36
lines changed

cpp/src/arrow/type.cc

Lines changed: 66 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1066,17 +1066,29 @@ std::string FieldPath::ToString() const {
10661066
return repr;
10671067
}
10681068

1069-
static Status NonStructError() {
1070-
return Status::NotImplemented("Get child data of non-struct array");
1071-
}
1069+
struct NestedSelectorUtil {
1070+
static Status NonStructError() {
1071+
return Status::NotImplemented("Get child data of non-struct array");
1072+
}
1073+
1074+
template <typename T>
1075+
static const DataType* GetType(const T& input) {
1076+
if constexpr (std::is_same_v<T, ArrayData>) {
1077+
return input.type.get();
1078+
} else {
1079+
return input.type().get();
1080+
}
1081+
}
1082+
};
10721083

1073-
// Utility class for retrieving a child field/column from a top-level Field, Array, or
1074-
// ChunkedArray. The "root" value can either be a single parent or a vector of its
1075-
// children.
1084+
// Utility class for retrieving a child field/column from a top-level Field, Array,
1085+
// ArrayData, or ChunkedArray. The "root" value can either be a single parent or a vector
1086+
// of its children.
10761087
template <typename T, bool IsFlattening = false>
10771088
class NestedSelector {
10781089
public:
10791090
using ArrowType = T;
1091+
using Util = NestedSelectorUtil;
10801092

10811093
explicit NestedSelector(const std::vector<std::shared_ptr<T>>& children)
10821094
: parent_or_children_(&children) {}
@@ -1095,7 +1107,18 @@ class NestedSelector {
10951107
Result<NestedSelector> GetChild(int i) const {
10961108
std::shared_ptr<T> child;
10971109
if (auto parent = get_parent()) {
1098-
ARROW_ASSIGN_OR_RAISE(child, GetChild(*parent, i, pool_));
1110+
const DataType* type = Util::GetType(*parent);
1111+
// We avoid this check for schema fields since it's inconsequential (plus there are
1112+
// tests elsewhere that rely on it not happening)
1113+
if constexpr (!std::is_same_v<T, Field>) {
1114+
if (ARROW_PREDICT_FALSE(type->id() != Type::STRUCT)) {
1115+
return Util::NonStructError();
1116+
}
1117+
}
1118+
// Bounds-check the index *once* using the parent's type
1119+
if (ARROW_PREDICT_TRUE(i >= 0 && i < type->num_fields())) {
1120+
ARROW_ASSIGN_OR_RAISE(child, GetChild(*parent, i, pool_));
1121+
}
10991122
} else if (auto children = get_children()) {
11001123
if (ARROW_PREDICT_TRUE(i >= 0 && static_cast<size_t>(i) < children->size())) {
11011124
child = (*children)[i];
@@ -1129,10 +1152,10 @@ class NestedSelector {
11291152
*os << "column types: { ";
11301153
if (auto children = get_children()) {
11311154
for (const auto& child : *children) {
1132-
*os << *child->type() << ", ";
1155+
*os << *Util::GetType(*child) << ", ";
11331156
}
11341157
} else if (auto parent = get_parent()) {
1135-
for (const auto& field : parent->type()->fields()) {
1158+
for (const auto& field : Util::GetType(*parent)->fields()) {
11361159
*os << *field->type() << ", ";
11371160
}
11381161
}
@@ -1155,21 +1178,33 @@ class NestedSelector {
11551178
}
11561179

11571180
static Result<std::shared_ptr<Field>> GetChild(const Field& field, int i, MemoryPool*) {
1158-
if (ARROW_PREDICT_FALSE(i < 0 || i >= field.type()->num_fields())) {
1159-
return nullptr;
1160-
}
11611181
return field.type()->field(i);
11621182
}
11631183

1164-
static Result<std::shared_ptr<Array>> GetChild(const Array& array, int i,
1165-
MemoryPool* pool) {
1166-
if (ARROW_PREDICT_FALSE(array.type_id() != Type::STRUCT)) {
1167-
return NonStructError();
1168-
}
1169-
if (ARROW_PREDICT_FALSE(i < 0 || i >= array.num_fields())) {
1170-
return nullptr;
1184+
static Result<std::shared_ptr<ArrayData>> GetChild(const ArrayData& data, int i,
1185+
MemoryPool* pool) {
1186+
std::shared_ptr<ArrayData> child_data;
1187+
if constexpr (IsFlattening) {
1188+
// First, convert to an Array so we can use StructArray::GetFlattenedField
1189+
auto array = MakeArray(data.Copy());
1190+
ARROW_ASSIGN_OR_RAISE(auto child_array, GetChild(*array, i, pool));
1191+
child_data = child_array->data();
1192+
} else {
1193+
// We could achieve the same result by converting to an Array (via MakeArray),
1194+
// calling StructArray::field(i), and pulling out the new ArrayData. However, this
1195+
// process can be very expensive when there are many columns - so we just
1196+
// reimplement the functionality that we need
1197+
child_data = data.child_data[i];
1198+
if (data.offset != 0 || data.child_data[i]->length != data.length) {
1199+
child_data = child_data->Slice(data.offset, data.length);
1200+
}
11711201
}
11721202

1203+
return std::move(child_data);
1204+
}
1205+
1206+
static Result<std::shared_ptr<Array>> GetChild(const Array& array, int i,
1207+
MemoryPool* pool) {
11731208
const auto& struct_array = checked_cast<const StructArray&>(array);
11741209
if constexpr (IsFlattening) {
11751210
return struct_array.GetFlattenedField(i, pool);
@@ -1181,22 +1216,15 @@ class NestedSelector {
11811216
static Result<std::shared_ptr<ChunkedArray>> GetChild(const ChunkedArray& chunked_array,
11821217
int i, MemoryPool* pool) {
11831218
const auto& type = *chunked_array.type();
1184-
if (ARROW_PREDICT_FALSE(type.id() != Type::STRUCT)) {
1185-
return NonStructError();
1186-
}
1187-
if (ARROW_PREDICT_FALSE(i < 0 || i >= type.num_fields())) {
1188-
return nullptr;
1189-
}
11901219

11911220
ArrayVector chunks;
11921221
chunks.reserve(chunked_array.num_chunks());
11931222
for (const auto& parent_chunk : chunked_array.chunks()) {
11941223
ARROW_ASSIGN_OR_RAISE(auto chunk, GetChild(*parent_chunk, i, pool));
1195-
if (!chunk) return nullptr;
11961224
chunks.push_back(std::move(chunk));
11971225
}
11981226

1199-
return ChunkedArray::Make(std::move(chunks), type.field(i)->type());
1227+
return std::make_shared<ChunkedArray>(std::move(chunks), type.field(i)->type());
12001228
}
12011229

12021230
std::shared_ptr<T> owned_parent_;
@@ -1289,7 +1317,11 @@ Result<std::shared_ptr<Schema>> FieldPath::GetAll(const Schema& schm,
12891317
}
12901318

12911319
Result<std::shared_ptr<Array>> FieldPath::Get(const RecordBatch& batch) const {
1292-
return FieldPathGetImpl::Get(this, ZeroCopySelector<Array>(batch.columns()));
1320+
// Deliberately calling `column_data` here because `RecordBatch::columns` is nontrivial
1321+
ARROW_ASSIGN_OR_RAISE(
1322+
auto data,
1323+
FieldPathGetImpl::Get(this, ZeroCopySelector<ArrayData>(batch.column_data())));
1324+
return MakeArray(data);
12931325
}
12941326

12951327
Result<std::shared_ptr<ChunkedArray>> FieldPath::Get(const Table& table) const {
@@ -1301,11 +1333,7 @@ Result<std::shared_ptr<Array>> FieldPath::Get(const Array& array) const {
13011333
}
13021334

13031335
Result<std::shared_ptr<ArrayData>> FieldPath::Get(const ArrayData& data) const {
1304-
// We indirect from ArrayData to Array rather than vice-versa because, when selecting a
1305-
// nested column, the StructArray::field method does the work of adjusting the data's
1306-
// offset/length if necessary.
1307-
ARROW_ASSIGN_OR_RAISE(auto array, Get(*MakeArray(data.Copy())));
1308-
return array->data();
1336+
return FieldPathGetImpl::Get(this, ZeroCopySelector<ArrayData>(data));
13091337
}
13101338

13111339
Result<std::shared_ptr<ChunkedArray>> FieldPath::Get(
@@ -1320,8 +1348,7 @@ Result<std::shared_ptr<Array>> FieldPath::GetFlattened(const Array& array,
13201348

13211349
Result<std::shared_ptr<ArrayData>> FieldPath::GetFlattened(const ArrayData& data,
13221350
MemoryPool* pool) const {
1323-
ARROW_ASSIGN_OR_RAISE(auto array, GetFlattened(*MakeArray(data.Copy()), pool));
1324-
return array->data();
1351+
return FieldPathGetImpl::Get(this, FlatteningSelector<ArrayData>(data, pool));
13251352
}
13261353

13271354
Result<std::shared_ptr<ChunkedArray>> FieldPath::GetFlattened(
@@ -1332,7 +1359,10 @@ Result<std::shared_ptr<ChunkedArray>> FieldPath::GetFlattened(
13321359

13331360
Result<std::shared_ptr<Array>> FieldPath::GetFlattened(const RecordBatch& batch,
13341361
MemoryPool* pool) const {
1335-
return FieldPathGetImpl::Get(this, FlatteningSelector<Array>(batch.columns(), pool));
1362+
ARROW_ASSIGN_OR_RAISE(
1363+
auto data, FieldPathGetImpl::Get(
1364+
this, FlatteningSelector<ArrayData>(batch.column_data(), pool)));
1365+
return MakeArray(data);
13361366
}
13371367

13381368
Result<std::shared_ptr<ChunkedArray>> FieldPath::GetFlattened(const Table& table,

0 commit comments

Comments
 (0)