From 922067eaf41a29f483b79f7bf5337c85b1738fb7 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 27 Feb 2020 13:51:26 -0500 Subject: [PATCH 01/12] ARROW-7412: [C++][Dataset] Provide FieldRef to disambiuate field references --- cpp/src/arrow/array.h | 2 - cpp/src/arrow/buffer.h | 2 - cpp/src/arrow/flight/perf_server.cc | 2 - cpp/src/arrow/record_batch.cc | 2 + cpp/src/arrow/record_batch.h | 5 +- cpp/src/arrow/table.cc | 15 +- cpp/src/arrow/table.h | 6 +- cpp/src/arrow/testing/gtest_util.h | 2 - cpp/src/arrow/testing/util.h | 8 - cpp/src/arrow/type.cc | 383 +++++++++++++++++++++++++++- cpp/src/arrow/type.h | 102 +++++++- cpp/src/arrow/type_fwd.h | 15 +- cpp/src/arrow/type_test.cc | 83 ++++++ 13 files changed, 591 insertions(+), 36 deletions(-) diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 9c59c8c650a..eaabcf2f6b1 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -426,8 +426,6 @@ class ARROW_EXPORT Array { ARROW_DISALLOW_COPY_AND_ASSIGN(Array); }; -using ArrayVector = std::vector>; - namespace internal { /// Given a number of ArrayVectors, treat each ArrayVector as the diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index 465ca4ba160..fbc7b6b2148 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -332,8 +332,6 @@ class ARROW_EXPORT Buffer { ARROW_DISALLOW_COPY_AND_ASSIGN(Buffer); }; -using BufferVector = std::vector>; - /// \defgroup buffer-slicing-functions Functions for slicing buffers /// /// @{ diff --git a/cpp/src/arrow/flight/perf_server.cc b/cpp/src/arrow/flight/perf_server.cc index 2d4492c2a09..c899076029d 100644 --- a/cpp/src/arrow/flight/perf_server.cc +++ b/cpp/src/arrow/flight/perf_server.cc @@ -54,8 +54,6 @@ namespace flight { } \ } while (0) -using ArrayVector = std::vector>; - // Create record batches with a unique "a" column so we can verify on the // client side that the results are correct class PerfDataStream : public FlightDataStream { diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index ffcbdc459d1..b1a1708ba6c 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -80,6 +80,8 @@ class SimpleRecordBatch : public RecordBatch { std::shared_ptr column_data(int i) const override { return columns_[i]; } + ArrayDataVector column_data() const override { return columns_; } + Status AddColumn(int i, const std::shared_ptr& field, const std::shared_ptr& column, std::shared_ptr* out) const override { diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h index 4a59c239b63..ada1ad7eef4 100644 --- a/cpp/src/arrow/record_batch.h +++ b/cpp/src/arrow/record_batch.h @@ -99,11 +99,14 @@ class ARROW_EXPORT RecordBatch { /// \return an Array or null if no field was found std::shared_ptr GetColumnByName(const std::string& name) const; - /// \brief Retrieve an array's internaldata from the record batch + /// \brief Retrieve an array's internal data from the record batch /// \param[in] i field index, does not boundscheck /// \return an internal ArrayData object virtual std::shared_ptr column_data(int i) const = 0; + /// \brief Retrieve all arrays' internal data from the record batch. + virtual ArrayDataVector column_data() const = 0; + /// \brief Add column to the record batch, producing a new RecordBatch /// /// \param[in] i field index, which will be boundschecked diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index e68f6811c59..2948a9299a7 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -41,25 +41,24 @@ using internal::checked_cast; // ---------------------------------------------------------------------- // ChunkedArray methods -ChunkedArray::ChunkedArray(const ArrayVector& chunks) : chunks_(chunks) { +ChunkedArray::ChunkedArray(ArrayVector chunks) : chunks_(std::move(chunks)) { length_ = 0; null_count_ = 0; - ARROW_CHECK_GT(chunks.size(), 0) + ARROW_CHECK_GT(chunks_.size(), 0) << "cannot construct ChunkedArray from empty vector and omitted type"; - type_ = chunks[0]->type(); - for (const std::shared_ptr& chunk : chunks) { + type_ = chunks_[0]->type(); + for (const std::shared_ptr& chunk : chunks_) { length_ += chunk->length(); null_count_ += chunk->null_count(); } } -ChunkedArray::ChunkedArray(const ArrayVector& chunks, - const std::shared_ptr& type) - : chunks_(chunks), type_(type) { +ChunkedArray::ChunkedArray(ArrayVector chunks, std::shared_ptr type) + : chunks_(std::move(chunks)), type_(std::move(type)) { length_ = 0; null_count_ = 0; - for (const std::shared_ptr& chunk : chunks) { + for (const std::shared_ptr& chunk : chunks_) { length_ += chunk->length(); null_count_ += chunk->null_count(); } diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index 4b106a16b9d..880573adc6e 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -30,8 +30,6 @@ namespace arrow { -using ArrayVector = std::vector>; - /// \class ChunkedArray /// \brief A data structure managing a list of primitive Arrow arrays logically /// as one large array @@ -41,7 +39,7 @@ class ARROW_EXPORT ChunkedArray { /// /// The vector must be non-empty and all its elements must have the same /// data type. - explicit ChunkedArray(const ArrayVector& chunks); + explicit ChunkedArray(ArrayVector chunks); /// \brief Construct a chunked array from a single Array explicit ChunkedArray(const std::shared_ptr& chunk) @@ -50,7 +48,7 @@ class ARROW_EXPORT ChunkedArray { /// \brief Construct a chunked array from a vector of arrays and a data type /// /// As the data type is passed explicitly, the vector may be empty. - ChunkedArray(const ArrayVector& chunks, const std::shared_ptr& type); + ChunkedArray(ArrayVector chunks, std::shared_ptr type); /// \return the total length of the chunked array; computed on construction int64_t length() const { return length_; } diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 364ecc58d2d..93ea12ddcf8 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -164,8 +164,6 @@ struct Datum; using Datum = compute::Datum; -using ArrayVector = std::vector>; - #define ASSERT_ARRAYS_EQUAL(lhs, rhs) AssertArraysEqual((lhs), (rhs)) #define ASSERT_BATCHES_EQUAL(lhs, rhs) AssertBatchesEqual((lhs), (rhs)) #define ASSERT_TABLES_EQUAL(lhs, rhs) AssertTablesEqual((lhs), (rhs)) diff --git a/cpp/src/arrow/testing/util.h b/cpp/src/arrow/testing/util.h index e67231158bd..3ddf097a3a8 100644 --- a/cpp/src/arrow/testing/util.h +++ b/cpp/src/arrow/testing/util.h @@ -39,14 +39,6 @@ namespace arrow { -class Array; -class ChunkedArray; -class MemoryPool; -class RecordBatch; -class Table; - -using ArrayVector = std::vector>; - template Status CopyBufferFromVector(const std::vector& values, MemoryPool* pool, std::shared_ptr* result) { diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index b716033707c..2251f197a06 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -30,9 +30,12 @@ #include "arrow/array.h" #include "arrow/compare.h" +#include "arrow/record_batch.h" #include "arrow/result.h" #include "arrow/status.h" +#include "arrow/table.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/hashing.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/make_unique.h" @@ -608,6 +611,384 @@ std::string DictionaryType::ToString() const { std::string NullType::ToString() const { return name(); } +FieldRef::FieldRef(std::vector children) { + // flatten children + struct Visitor { + void operator()(std::string&& name) { *out++ = FieldRef(std::move(name)); } + + void operator()(Indices&& indices) { *out++ = FieldRef(std::move(indices)); } + + void operator()(std::vector&& children) { + for (auto& child : children) { + util::visit(*this, std::move(child.impl_)); + } + } + + std::back_insert_iterator> out; + }; + + std::vector out; + Visitor{std::back_inserter(out)}(std::move(children)); + + DCHECK(!out.empty()); + DCHECK(std::none_of(out.begin(), out.end(), + [](const FieldRef& ref) { return ref.IsNested(); })); + + if (out.size() == 1) { + impl_ = std::move(out[0].impl_); + } else { + impl_ = std::move(out); + } +} + +Result FieldRef::FromDotPath(const std::string& dot_path_arg) { + if (dot_path_arg.empty()) { + return Status::Invalid("Dot path was empty"); + } + + std::vector children; + + util::string_view dot_path = dot_path_arg; + + auto parse_name = [&] { + std::string name; + for (;;) { + auto segment_end = dot_path.find_first_of("\\[."); + if (segment_end == util::string_view::npos) { + // dot_path doesn't contain any other special characters; consume all + name.append(dot_path.begin(), dot_path.end()); + dot_path = ""; + break; + } + + if (dot_path[segment_end] != '\\') { + // segment_end points to a subscipt for a new FieldRef + name.append(dot_path.begin(), segment_end); + dot_path = dot_path.substr(segment_end); + break; + } + + if (dot_path.size() == segment_end + 1) { + // dot_path ends with backslash; consume it all + name.append(dot_path.begin(), dot_path.end()); + dot_path = ""; + break; + } + + // append all characters before backslash, then the character which follows it + name.append(dot_path.begin(), segment_end); + name.push_back(dot_path[segment_end + 1]); + dot_path = dot_path.substr(segment_end + 2); + } + return name; + }; + + while (!dot_path.empty()) { + auto subscipt = dot_path[0]; + dot_path = dot_path.substr(1); + switch (subscipt) { + case '.': { + // next element is a name + children.push_back(parse_name()); + continue; + } + case '[': { + auto subscipt_end = dot_path.find_first_not_of("0123456789"); + if (subscipt_end == util::string_view::npos || dot_path[subscipt_end] != ']') { + return Status::Invalid("Dot path '", dot_path_arg, + "' contained an unterminated index"); + } + children.emplace_back(std::atoi(dot_path.data())); + dot_path = dot_path.substr(subscipt_end + 1); + continue; + } + default: + return Status::Invalid("Dot path must begin with '[' or '.', got '", dot_path_arg, + "'"); + } + } + + return FieldRef(std::move(children)); +} + +size_t FieldRef::hash() const { + struct Visitor : std::hash { + using std::hash::operator(); + + size_t operator()(const Indices& indices) { + return internal::ComputeStringHash(indices.data(), indices.length() * sizeof(int)); + } + + size_t operator()(const std::vector& children) { + size_t hash = 0; + + for (const FieldRef& child : children) { + hash ^= child.hash(); + } + + return hash; + } + }; + + return util::visit(Visitor{}, impl_); +} + +std::string FieldRef::ToString() const { + struct Visitor { + std::string operator()(const Indices& indices) { + std::string repr = "Indices("; + for (auto index : indices) { + repr += std::to_string(index) + " "; + } + repr.resize(repr.size() - 1); + repr += ")"; + return repr; + } + + std::string operator()(const std::string& name) { return "Name(" + name + ")"; } + + std::string operator()(const std::vector& children) { + std::string repr = "Nested("; + for (const auto& child : children) { + repr += child.ToString() + " "; + } + repr.resize(repr.size() - 1); + repr += ")"; + return repr; + } + }; + + return util::visit(Visitor{}, impl_); +} + +struct FieldRefGetImpl { + static std::string Summarize(const Field& field) { return field.ToString(); } + + static std::string Summarize(const ArrayData& data) { return data.type->ToString(); } + + static std::string Summarize(const ChunkedArray& array) { + return array.type()->ToString(); + } + + template + static Status IndexError(const FieldRef::Indices& indices, int out_of_range_depth, + const std::vector& children) { + std::stringstream ss; + ss << "index out of range. "; + + ss << "indices=[ "; + int depth = 0; + for (int i : indices) { + if (depth != out_of_range_depth) { + ss << i << " "; + continue; + } + ss << ">" << i << "< "; + ++depth; + } + ss << "] "; + + ss << "child "; + if (std::is_same>::value) { + ss << "fields were: { "; + for (const auto& field : children) { + ss << Summarize(*field) << ", "; + } + ss << "}"; + } else { + ss << "columns had types: { "; + for (const auto& column : children) { + ss << Summarize(*column) << ", "; + } + ss << "}"; + } + + return Status::IndexError(ss.str()); + } + + template + static Result Get(const FieldRef::Indices& indices, const std::vector* children, + GetChildren&& get_children, bool verbose_error = true) { + if (indices.empty()) { + return Status::Invalid("empty indices cannot be traversed"); + } + + int depth = 0; + const T* out; + for (int index : indices) { + if (index < 0 || static_cast(index) >= children->size()) { + // verbose_error is a hack to allow this function to be used for cheap validation + // in FieldRef::FindAll + if (verbose_error) { + return IndexError(indices, depth, *children); + } + return Status::IndexError("index out of range"); + } + + out = &children->at(index); + children = get_children(*out); + ++depth; + } + + return *out; + } + + static Result> Get(const FieldRef::Indices& indices, + const FieldVector& fields, + bool verbose_error = true) { + return FieldRefGetImpl::Get( + indices, &fields, + [](const std::shared_ptr& field) { return &field->type()->children(); }, + verbose_error); + } + + static Result> Get(const FieldRef::Indices& indices, + const ArrayDataVector& child_data) { + return FieldRefGetImpl::Get( + indices, &child_data, + [](const std::shared_ptr& data) { return &data->child_data; }); + } + + static Result> Get( + const FieldRef::Indices& indices, const ChunkedArrayVector& columns_arg) { + ChunkedArrayVector columns = columns_arg; + + return FieldRefGetImpl::Get( + indices, &columns, [&](const std::shared_ptr& a) { + columns.clear(); + + for (int i = 0; i < a->type()->num_children(); ++i) { + ArrayVector child_chunks; + + for (const auto& chunk : a->chunks()) { + auto child_chunk = MakeArray(chunk->data()->child_data[i]); + child_chunks.push_back(std::move(child_chunk)); + } + + auto child_column = std::make_shared( + std::move(child_chunks), a->type()->child(i)->type()); + + columns.emplace_back(std::move(child_column)); + } + + return &columns; + }); + } +}; + +Result> FieldRef::Get(const Indices& indices, + const Schema& schema) { + return FieldRefGetImpl::Get(indices, schema.fields()); +} + +Result> FieldRef::Get(const Indices& indices, const Field& field) { + return FieldRefGetImpl::Get(indices, field.type()->children()); +} + +Result> FieldRef::Get(const Indices& indices, + const DataType& type) { + return FieldRefGetImpl::Get(indices, type.children()); +} + +Result> FieldRef::Get(const Indices& indices, + const FieldVector& fields) { + return FieldRefGetImpl::Get(indices, fields); +} + +Result> FieldRef::Get(const Indices& indices, + const RecordBatch& batch) { + ARROW_ASSIGN_OR_RAISE(auto data, FieldRefGetImpl::Get(indices, batch.column_data())); + return MakeArray(data); +} + +Result> FieldRef::Get(const Indices& indices, + const Table& table) { + return FieldRefGetImpl::Get(indices, table.columns()); +} + +std::vector FieldRef::FindAll(const Schema& schema) const { + return FindAll(schema.fields()); +} + +std::vector FieldRef::FindAll(const DataType& type) const { + return FindAll(type.children()); +} + +std::vector FieldRef::FindAll(const Field& field) const { + return FindAll(field.type()->children()); +} + +std::vector FieldRef::FindAll(const FieldVector& fields) const { + struct Visitor { + std::vector operator()(const FieldRef::Indices& indices) { + if (FieldRefGetImpl::Get(indices, fields_, /* verbose_error = */ false).ok()) { + return {indices}; + } + return {}; + } + + std::vector operator()(const std::string& name) { + std::vector out; + + for (int i = 0; i < static_cast(fields_.size()); ++i) { + if (fields_[i]->name() == name) { + out.push_back({i}); + } + } + + return out; + } + + struct Matches { + // referents[i] is referenced by prefixes[i] + std::vector prefixes; + FieldVector referents; + + Matches(std::vector indices, const FieldVector& fields) { + for (auto& indices : indices) { + Add({}, std::move(indices), fields); + } + } + + Matches() = default; + + size_t size() const { return referents.size(); } + + void Add(FieldRef::Indices prefix, FieldRef::Indices indices, + const FieldVector& fields) { + auto maybe_field = FieldRef::Get(indices, fields); + DCHECK_OK(maybe_field.status()); + referents.push_back(std::move(maybe_field).ValueOrDie()); + prefixes.push_back(prefix + indices); + } + }; + + std::vector operator()(const std::vector& refs) { + Matches matches(refs.front().FindAll(fields_), fields_); + + for (auto ref_it = refs.begin() + 1; ref_it != refs.end(); ++ref_it) { + Matches next_matches; + for (size_t i = 0; i < matches.size(); ++i) { + const auto& referent = *matches.referents[i]; + + for (const Indices& indices : ref_it->FindAll(referent)) { + next_matches.Add(matches.prefixes[i], indices, referent.type()->children()); + } + } + matches = std::move(next_matches); + } + + return matches.prefixes; + } + + const FieldVector& fields_; + }; + + return util::visit(Visitor{fields}, impl_); +} + +void PrintTo(const FieldRef& ref, std::ostream* os) { *os << ref.ToString(); } + // ---------------------------------------------------------------------- // Schema implementation @@ -636,7 +1017,7 @@ Schema::~Schema() {} int Schema::num_fields() const { return static_cast(impl_->fields_.size()); } -std::shared_ptr Schema::field(int i) const { +const std::shared_ptr& Schema::field(int i) const { DCHECK_GE(i, 0); DCHECK_LT(i, num_fields()); return impl_->fields_[i]; diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 7cd1b651994..13dd71d1dec 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -29,6 +29,7 @@ #include "arrow/type_fwd.h" // IWYU pragma: export #include "arrow/util/checked_cast.h" #include "arrow/util/macros.h" +#include "arrow/util/variant.h" #include "arrow/util/visibility.h" #include "arrow/visitor.h" // IWYU pragma: keep @@ -243,7 +244,7 @@ class ARROW_EXPORT DataType : public detail::Fingerprintable { /// \brief Return whether the types are equal bool Equals(const std::shared_ptr& other) const; - std::shared_ptr child(int i) const { return children_[i]; } + const std::shared_ptr& child(int i) const { return children_[i]; } const std::vector>& children() const { return children_; } @@ -1398,6 +1399,103 @@ class ARROW_EXPORT DictionaryUnifier { std::shared_ptr* out_dict) = 0; }; +// ---------------------------------------------------------------------- +// FieldRef + +/// \class FieldRef +/// \brief Descriptor of a (potentially nested) field within a schema. +class ARROW_EXPORT FieldRef { + public: + /// Represents a path to a nested field using indices of child fields. + /// For example, given indices {5, 9, 3} the field would be retrieved with + /// schema->field(5)->type()->child(9)->type()->child(3) + /// + /// std::basic_string is used to take advantage + /// of the short string approximation: up to four indices can be stored without memory + /// allocation. + using Indices = std::basic_string; + + /// Construct a FieldRef using a string of indices. The reference will be retrieved as: + /// schema.fields[self.indices[0]].type.fields[self.indices[1]] ... + FieldRef(Indices indices) : impl_(std::move(indices)) {} // NOLINT runtime/explicit + + /// Construct a by-name FieldRef. Multiple fields may match a by-name FieldRef: + /// [f for f in schema.fields where f.name == self.name] + FieldRef(std::string name) : impl_(std::move(name)) {} // NOLINT runtime/explicit + + /// Equivalent to a single index string of indices. + FieldRef(int index) : impl_(Indices({index})) {} // NOLINT runtime/explicit + + /// Construct a nested FieldRef. + explicit FieldRef(std::vector children); + + /// Convenience constructor for nested FieldRefs: each argument will be used to + /// construct a FieldRef + template + FieldRef(A0&& a0, A1&& a1, A&&... a) + : FieldRef(std::vector{FieldRef(std::forward(a0)), + FieldRef(std::forward(a1)), + FieldRef(std::forward(a))...}) {} + + /// Parse a dot path into a FieldRef. + /// + /// dot_path = '.' name + /// | '[' digit+ ']' + /// | dot_path+ + /// + /// Examples: + /// ".alpha" => FieldRef("alpha") + /// "[2]" => FieldRef(2) + /// ".beta[3]" => FieldRef("beta", 3) + /// "[5].gamma.delta[7]" => FieldRef(5, "gamma", "delta", 7) + /// ".hello world" => FieldRef("hello world") + /// R"(.\[y\]\\tho\.\)" => FieldRef(R"([y]\tho.\)") + static Result FromDotPath(const std::string& dot_path); + + bool Equals(const FieldRef& other) const { return impl_ == other.impl_; } + bool operator==(const FieldRef& other) const { return Equals(other); } + + std::string ToString() const; + + size_t hash() const; + + bool IsIndices() const { return util::holds_alternative(impl_); } + bool IsName() const { return util::holds_alternative(impl_); } + bool IsNested() const { return util::holds_alternative>(impl_); } + + /// \brief Retrieve the referenced child Field from a Schema, Field, or DataType + static Result> Get(const Indices& indices, const Schema& schema); + static Result> Get(const Indices& indices, const Field& field); + static Result> Get(const Indices& indices, const DataType& type); + static Result> Get(const Indices& indices, + const FieldVector& fields); + + /// \brief Retrieve the referenced column from a RecordBatch or Table + static Result> Get(const Indices& indices, + const RecordBatch& batch); + static Result> Get(const Indices& indices, + const Table& table); + + /// \brief Retrieve the referenced child Array from an Array or ChunkedArray + static Result> Get(const Indices& indices, const Array& array); + static Result> Get(const Indices& indices, + const ChunkedArray& array); + + /// \brief Retrieve Indices of child fields matching this FieldRef. + /// + /// TODO(bkietz) this will usually be a zero or one element vector. Vendor a + /// small_vector or hack one with variant, T> + std::vector FindAll(const Schema& schema) const; + std::vector FindAll(const Field& field) const; + std::vector FindAll(const DataType& type) const; + std::vector FindAll(const FieldVector& fields) const; + + private: + util::variant> impl_; + + friend void PrintTo(const FieldRef& ref, std::ostream* os); +}; + // ---------------------------------------------------------------------- // Schema @@ -1423,7 +1521,7 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable, int num_fields() const; /// Return the ith schema element. Does not boundscheck - std::shared_ptr field(int i) const; + const std::shared_ptr& field(int i) const; const std::vector>& fields() const; diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 4ea81ee1937..2139db1292e 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -15,10 +15,10 @@ // specific language governing permissions and limitations // under the License. -#ifndef ARROW_TYPE_FWD_H -#define ARROW_TYPE_FWD_H +#pragma once #include +#include #include "arrow/util/visibility.h" @@ -39,21 +39,30 @@ class MemoryPool; class MutableBuffer; class ResizableBuffer; +using BufferVector = std::vector>; + class DataType; class Field; class KeyValueMetadata; class Schema; +using FieldVector = std::vector>; + class Array; struct ArrayData; class ArrayBuilder; class Tensor; struct Scalar; +using ArrayDataVector = std::vector>; +using ArrayVector = std::vector>; + class ChunkedArray; class RecordBatch; class Table; +using ChunkedArrayVector = std::vector>; +using RecordBatchVector = std::vector>; using RecordBatchIterator = Iterator>; class DictionaryType; @@ -261,5 +270,3 @@ ARROW_EXPORT MemoryPool* default_memory_pool(); #endif } // namespace arrow - -#endif // ARROW_TYPE_FWD_H diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index f0a12dc7514..797e382e8bc 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -22,6 +22,8 @@ #include #include +#include + #include "arrow/memory_pool.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" @@ -32,6 +34,8 @@ namespace arrow { +using testing::ElementsAre; + using internal::checked_cast; using internal::checked_pointer_cast; @@ -311,6 +315,85 @@ TEST(TestField, TestMerge) { } } +TEST(TestFieldRef, Basics) { + auto f0 = field("alpha", int32()); + auto f1 = field("beta", int32()); + auto f2 = field("alpha", int32()); + auto f3 = field("beta", int32()); + Schema s({f0, f1, f2, f3}); + + using I = FieldRef::Indices; + + // lookup by index returns Indices{index} + for (int index = 0; index < s.num_fields(); ++index) { + EXPECT_THAT(FieldRef(index).FindAll(s), ElementsAre(I{index})); + } + // out of range index results in a failure to match + EXPECT_THAT(FieldRef(s.num_fields() * 2).FindAll(s), ElementsAre()); + + // lookup by name returns the Indices of both matching fields + EXPECT_THAT(FieldRef("alpha").FindAll(s), ElementsAre(I{0}, I{2})); + EXPECT_THAT(FieldRef("beta").FindAll(s), ElementsAre(I{1}, I{3})); + EXPECT_THAT(FieldRef("beta").FindAll(s), ElementsAre(I{1}, I{3})); + + // retrieving a field with single-element Indices is equivalent to Schema::field + for (int index = 0; index < s.num_fields(); ++index) { + ASSERT_OK_AND_EQ(s.field(index), FieldRef::Get(I{index}, s)); + } + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, + testing::HasSubstr("empty indices cannot be traversed"), + FieldRef::Get(I{}, s)); + EXPECT_RAISES_WITH_MESSAGE_THAT(IndexError, testing::HasSubstr("index out of range"), + FieldRef::Get(I{s.num_fields() * 2}, s)); +} + +TEST(TestFieldRef, FromDotPath) { + ASSERT_OK_AND_EQ(FieldRef("alpha"), FieldRef::FromDotPath(R"(.alpha)")); + + ASSERT_OK_AND_EQ(FieldRef("", ""), FieldRef::FromDotPath(R"(..)")); + + ASSERT_OK_AND_EQ(FieldRef(2), FieldRef::FromDotPath(R"([2])")); + + ASSERT_OK_AND_EQ(FieldRef("beta", 3), FieldRef::FromDotPath(R"(.beta[3])")); + + ASSERT_OK_AND_EQ(FieldRef(5, "gamma", "delta", 7), + FieldRef::FromDotPath(R"([5].gamma.delta[7])")); + + ASSERT_OK_AND_EQ(FieldRef("hello world"), FieldRef::FromDotPath(R"(.hello world)")); + + ASSERT_OK_AND_EQ(FieldRef(R"([y]\tho.\)"), FieldRef::FromDotPath(R"(.\[y\]\\tho\.\)")); + + ASSERT_RAISES(Invalid, FieldRef::FromDotPath(R"()")); + ASSERT_RAISES(Invalid, FieldRef::FromDotPath(R"(alpha)")); + ASSERT_RAISES(Invalid, FieldRef::FromDotPath(R"([134234)")); + ASSERT_RAISES(Invalid, FieldRef::FromDotPath(R"([1stuf])")); +} + +TEST(TestFieldRef, Nested) { + auto f0 = field("alpha", int32()); + auto f1_0 = field("alpha", int32()); + auto f1 = field("beta", struct_({f1_0})); + auto f2_0 = field("alpha", int32()); + auto f2_1_0 = field("alpha", int32()); + auto f2_1_1 = field("alpha", int32()); + auto f2_1 = field("gamma", struct_({f2_1_0, f2_1_1})); + auto f2 = field("beta", struct_({f2_0, f2_1})); + Schema s({f0, f1, f2}); + + // retrieving fields with nested indices + EXPECT_EQ(FieldRef::Get({0}, s), f0); + EXPECT_EQ(FieldRef::Get({1, 0}, s), f1_0); + EXPECT_EQ(FieldRef::Get({2, 0}, s), f2_0); + EXPECT_EQ(FieldRef::Get({2, 1, 0}, s), f2_1_0); + EXPECT_EQ(FieldRef::Get({2, 1, 1}, s), f2_1_1); + + using I = FieldRef::Indices; + + EXPECT_THAT(FieldRef("beta", "alpha").FindAll(s), ElementsAre(I{1, 0}, I{2, 0})); + EXPECT_THAT(FieldRef("beta", "gamma", "alpha").FindAll(s), + ElementsAre(I{2, 1, 0}, I{2, 1, 1})); +} + using TestSchema = ::testing::Test; TEST_F(TestSchema, Basics) { From 31d09adf5e38c720c85cdd8c5939c650ff6a97bc Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 4 Mar 2020 16:10:23 -0500 Subject: [PATCH 02/12] add a small_vector implementation for FindAll's return --- cpp/src/arrow/type.cc | 20 +++--- cpp/src/arrow/type.h | 12 ++-- cpp/src/arrow/util/small_vector.h | 115 ++++++++++++++++++++++++++++++ 3 files changed, 130 insertions(+), 17 deletions(-) create mode 100644 cpp/src/arrow/util/small_vector.h diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 2251f197a06..cebc43e09a6 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -906,29 +906,29 @@ Result> FieldRef::Get(const Indices& indices, return FieldRefGetImpl::Get(indices, table.columns()); } -std::vector FieldRef::FindAll(const Schema& schema) const { +util::small_vector FieldRef::FindAll(const Schema& schema) const { return FindAll(schema.fields()); } -std::vector FieldRef::FindAll(const DataType& type) const { +util::small_vector FieldRef::FindAll(const DataType& type) const { return FindAll(type.children()); } -std::vector FieldRef::FindAll(const Field& field) const { +util::small_vector FieldRef::FindAll(const Field& field) const { return FindAll(field.type()->children()); } -std::vector FieldRef::FindAll(const FieldVector& fields) const { +util::small_vector FieldRef::FindAll(const FieldVector& fields) const { struct Visitor { - std::vector operator()(const FieldRef::Indices& indices) { + util::small_vector operator()(const FieldRef::Indices& indices) { if (FieldRefGetImpl::Get(indices, fields_, /* verbose_error = */ false).ok()) { return {indices}; } return {}; } - std::vector operator()(const std::string& name) { - std::vector out; + util::small_vector operator()(const std::string& name) { + util::small_vector out; for (int i = 0; i < static_cast(fields_.size()); ++i) { if (fields_[i]->name() == name) { @@ -941,10 +941,10 @@ std::vector FieldRef::FindAll(const FieldVector& fields) cons struct Matches { // referents[i] is referenced by prefixes[i] - std::vector prefixes; + util::small_vector prefixes; FieldVector referents; - Matches(std::vector indices, const FieldVector& fields) { + Matches(util::small_vector indices, const FieldVector& fields) { for (auto& indices : indices) { Add({}, std::move(indices), fields); } @@ -963,7 +963,7 @@ std::vector FieldRef::FindAll(const FieldVector& fields) cons } }; - std::vector operator()(const std::vector& refs) { + util::small_vector operator()(const std::vector& refs) { Matches matches(refs.front().FindAll(fields_), fields_); for (auto ref_it = refs.begin() + 1; ref_it != refs.end(); ++ref_it) { diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 13dd71d1dec..7b341033f44 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -29,6 +29,7 @@ #include "arrow/type_fwd.h" // IWYU pragma: export #include "arrow/util/checked_cast.h" #include "arrow/util/macros.h" +#include "arrow/util/small_vector.h" #include "arrow/util/variant.h" #include "arrow/util/visibility.h" #include "arrow/visitor.h" // IWYU pragma: keep @@ -1482,13 +1483,10 @@ class ARROW_EXPORT FieldRef { const ChunkedArray& array); /// \brief Retrieve Indices of child fields matching this FieldRef. - /// - /// TODO(bkietz) this will usually be a zero or one element vector. Vendor a - /// small_vector or hack one with variant, T> - std::vector FindAll(const Schema& schema) const; - std::vector FindAll(const Field& field) const; - std::vector FindAll(const DataType& type) const; - std::vector FindAll(const FieldVector& fields) const; + util::small_vector FindAll(const Schema& schema) const; + util::small_vector FindAll(const Field& field) const; + util::small_vector FindAll(const DataType& type) const; + util::small_vector FindAll(const FieldVector& fields) const; private: util::variant> impl_; diff --git a/cpp/src/arrow/util/small_vector.h b/cpp/src/arrow/util/small_vector.h new file mode 100644 index 00000000000..182c6755a13 --- /dev/null +++ b/cpp/src/arrow/util/small_vector.h @@ -0,0 +1,115 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/util/variant.h" + +namespace arrow { +namespace util { + +/// A minimal vector-like container which holds up to a single element without allocation. +template +class small_vector { + public: + using value_type = T; + using iterator = T*; + using const_iterator = const T*; + + small_vector() = default; + + small_vector(std::initializer_list list) { + if (list.size() == 1) { + storage_ = *list.begin(); + } else { + set_vector(list); + } + } + + explicit small_vector(std::vector vector) { + if (vector.size() == 1) { + storage_ = std::move(vector[0]); + } else { + set_vector(std::move(vector)); + } + } + + size_t size() const { + if (holds_alternative(storage_)) { + return 1; + } + return get_vector().size(); + } + + bool empty() const { return size() == 0; } + + void clear() { set_vector({}); } + + void push_back(value_type element) { + switch (size()) { + case 0: + storage_ = std::move(element); + return; + case 1: + set_vector({std::move(get(storage_)), std::move(element)}); + return; + default: + get_vector().push_back(std::move(element)); + return; + } + } + + template + void emplace_back(Args&&... args) { + push_back(value_type(std::forward(args)...)); + } + + value_type* data() { + return size() != 1 ? get_vector().data() : &get(storage_); + } + + const value_type* data() const { + return size() != 1 ? get_vector().data() : &get(storage_); + } + + value_type& operator[](size_t i) { return data()[i]; } + const value_type& operator[](size_t i) const { return data()[i]; } + + iterator begin() { return data(); } + iterator end() { return data() + size(); } + + const_iterator begin() const { return data(); } + const_iterator end() const { return data() + size(); } + + private: + std::vector& get_vector() { return get>(storage_); } + + const std::vector& get_vector() const { + return get>(storage_); + } + + void set_vector(std::vector v) { storage_ = std::move(v); } + + variant, value_type> storage_; +}; + +} // namespace util +} // namespace arrow From f5380d1b88a2ffad25439ff53634f4a938374e5a Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 5 Mar 2020 09:42:20 -0500 Subject: [PATCH 03/12] add more convenience accessors, use FieldRef in dataset:: --- cpp/src/arrow/dataset/dataset_internal.h | 7 +- cpp/src/arrow/dataset/filter.cc | 3 +- cpp/src/arrow/dataset/partition.cc | 8 +- cpp/src/arrow/dataset/projector.cc | 31 ++++--- cpp/src/arrow/dataset/projector.h | 4 +- cpp/src/arrow/type.cc | 104 +++++++++++++++-------- cpp/src/arrow/type.h | 72 +++++++++++++++- cpp/src/arrow/type_fwd.h | 1 + 8 files changed, 170 insertions(+), 60 deletions(-) diff --git a/cpp/src/arrow/dataset/dataset_internal.h b/cpp/src/arrow/dataset/dataset_internal.h index 269f5ae6f30..910695fc5e1 100644 --- a/cpp/src/arrow/dataset/dataset_internal.h +++ b/cpp/src/arrow/dataset/dataset_internal.h @@ -54,9 +54,10 @@ static inline FragmentIterator GetFragmentsFromDatasets( inline std::shared_ptr SchemaFromColumnNames( const std::shared_ptr& input, const std::vector& column_names) { std::vector> columns; - for (const auto& name : column_names) { - if (auto field = input->GetFieldByName(name)) { - columns.push_back(std::move(field)); + for (FieldRef ref : column_names) { + auto maybe_field = ref.GetOne(*input); + if (maybe_field.ok()) { + columns.push_back(std::move(maybe_field).ValueOrDie()); } } diff --git a/cpp/src/arrow/dataset/filter.cc b/cpp/src/arrow/dataset/filter.cc index ea7ab869b1b..83a607308e4 100644 --- a/cpp/src/arrow/dataset/filter.cc +++ b/cpp/src/arrow/dataset/filter.cc @@ -898,7 +898,8 @@ Result> ScalarExpression::Validate(const Schema& schem } Result> FieldExpression::Validate(const Schema& schema) const { - if (auto field = schema.GetFieldByName(name_)) { + ARROW_ASSIGN_OR_RAISE(auto field, FieldRef(name_).GetOneOrNone(schema)); + if (field != nullptr) { return field->type(); } return null(); diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc index eea841f2c78..8879263de35 100644 --- a/cpp/src/arrow/dataset/partition.cc +++ b/cpp/src/arrow/dataset/partition.cc @@ -71,7 +71,7 @@ Result> SegmentDictionaryPartitioning::Parse( Result> KeyValuePartitioning::ConvertKey( const Key& key, const Schema& schema) { - auto field = schema.GetFieldByName(key.name); + ARROW_ASSIGN_OR_RAISE(auto field, FieldRef(key.name).GetOneOrNone(schema)); if (field == nullptr) { return scalar(true); } @@ -141,10 +141,8 @@ class DirectoryPartitioningFactory : public PartitioningFactory { Result> Finish( const std::shared_ptr& schema) const override { - for (const auto& field_name : field_names_) { - if (schema->GetFieldIndex(field_name) == -1) { - return Status::TypeError("no field named '", field_name, "' in schema", *schema); - } + for (FieldRef ref : field_names_) { + RETURN_NOT_OK(ref.FindOne(*schema).status()); } // drop fields which aren't in field_names_ diff --git a/cpp/src/arrow/dataset/projector.cc b/cpp/src/arrow/dataset/projector.cc index 7148c333d84..e7f1b38381a 100644 --- a/cpp/src/arrow/dataset/projector.cc +++ b/cpp/src/arrow/dataset/projector.cc @@ -40,8 +40,15 @@ RecordBatchProjector::RecordBatchProjector(std::shared_ptr to) column_indices_(to_->num_fields(), kNoMatch), scalars_(to_->num_fields(), nullptr) {} -Status RecordBatchProjector::SetDefaultValue(int index, std::shared_ptr scalar) { +Status RecordBatchProjector::SetDefaultValue(FieldRef ref, + std::shared_ptr scalar) { DCHECK_NE(scalar, nullptr); + if (ref.IsNested()) { + return Status::NotImplemented("setting default values for nested columns"); + } + + ARROW_ASSIGN_OR_RAISE(auto match, ref.FindOne(*to_)); + auto index = match[0]; auto field_type = to_->field(index)->type(); if (!field_type->Equals(scalar->type)) { @@ -83,9 +90,19 @@ Status RecordBatchProjector::SetInputSchema(std::shared_ptr from, for (int i = 0; i < to_->num_fields(); ++i) { const auto& field = to_->field(i); - int matching_index = from_->GetFieldIndex(field->name()); + FieldRef ref(field->name()); + auto matches = ref.FindAll(*from_); + + if (matches.empty()) { + // Mark column i as missing by setting missing_columns_[i] + // to a non-null placeholder. + RETURN_NOT_OK( + MakeArrayOfNull(pool, to_->field(i)->type(), 0, &missing_columns_[i])); + column_indices_[i] = kNoMatch; + } else { + RETURN_NOT_OK(ref.CheckNonMultiple(matches, *from_)); + int matching_index = matches[0][0]; - if (matching_index != kNoMatch) { if (!from_->field(matching_index)->Equals(field)) { return Status::TypeError("fields had matching names but were not equivalent ", from_->field(matching_index)->ToString(), " vs ", @@ -94,14 +111,8 @@ Status RecordBatchProjector::SetInputSchema(std::shared_ptr from, // Mark column i as not missing by setting missing_columns_[i] to nullptr missing_columns_[i] = nullptr; - } else { - // Mark column i as missing by setting missing_columns_[i] - // to a non-null placeholder. - RETURN_NOT_OK( - MakeArrayOfNull(pool, to_->field(i)->type(), 0, &missing_columns_[i])); + column_indices_[i] = matching_index; } - - column_indices_[i] = matching_index; } return Status::OK(); } diff --git a/cpp/src/arrow/dataset/projector.h b/cpp/src/arrow/dataset/projector.h index b64f2923110..4f2ec3b78b4 100644 --- a/cpp/src/arrow/dataset/projector.h +++ b/cpp/src/arrow/dataset/projector.h @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -48,7 +49,7 @@ class ARROW_DS_EXPORT RecordBatchProjector { /// If the indexed field is absent from a record batch it will be added to the projected /// record batch with all its slots equal to the provided scalar (instead of null). - Status SetDefaultValue(int index, std::shared_ptr scalar); + Status SetDefaultValue(FieldRef ref, std::shared_ptr scalar); Result> Project(const RecordBatch& batch, MemoryPool* pool = default_memory_pool()); @@ -63,6 +64,7 @@ class ARROW_DS_EXPORT RecordBatchProjector { std::shared_ptr from_, to_; int64_t missing_columns_length_ = 0; + // these vectors are indexed parallel to to_->fields() std::vector> missing_columns_; std::vector column_indices_; std::vector> scalars_; diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index cebc43e09a6..10d2373ef14 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -611,6 +611,13 @@ std::string DictionaryType::ToString() const { std::string NullType::ToString() const { return name(); } +// ---------------------------------------------------------------------- +// FieldRef + +FieldRef::FieldRef(Indices indices) : impl_(std::move(indices)) { + DCHECK_GT(util::get(impl_).size(), 0); +} + FieldRef::FieldRef(std::vector children) { // flatten children struct Visitor { @@ -716,7 +723,8 @@ size_t FieldRef::hash() const { using std::hash::operator(); size_t operator()(const Indices& indices) { - return internal::ComputeStringHash(indices.data(), indices.length() * sizeof(int)); + return internal::ComputeStringHash<0>(indices.data(), + indices.length() * sizeof(int)); } size_t operator()(const std::vector& children) { @@ -762,12 +770,25 @@ std::string FieldRef::ToString() const { } struct FieldRefGetImpl { - static std::string Summarize(const Field& field) { return field.ToString(); } + static const DataType& GetType(const ArrayData& data) { return *data.type; } - static std::string Summarize(const ArrayData& data) { return data.type->ToString(); } + static const DataType& GetType(const ChunkedArray& array) { return *array.type(); } + + static void Summarize(const FieldVector& fields, std::stringstream* ss) { + *ss << "{ "; + for (const auto& field : fields) { + *ss << field->ToString() << ", "; + } + *ss << "}"; + } - static std::string Summarize(const ChunkedArray& array) { - return array.type()->ToString(); + template + static void Summarize(const std::vector& columns, std::stringstream* ss) { + *ss << "{ "; + for (const auto& column : columns) { + *ss << GetType(*column) << ", "; + } + *ss << "}"; } template @@ -788,27 +809,19 @@ struct FieldRefGetImpl { } ss << "] "; - ss << "child "; if (std::is_same>::value) { - ss << "fields were: { "; - for (const auto& field : children) { - ss << Summarize(*field) << ", "; - } - ss << "}"; + ss << "fields were: "; } else { - ss << "columns had types: { "; - for (const auto& column : children) { - ss << Summarize(*column) << ", "; - } - ss << "}"; + ss << "columns had types: "; } + Summarize(children, &ss); return Status::IndexError(ss.str()); } template static Result Get(const FieldRef::Indices& indices, const std::vector* children, - GetChildren&& get_children, bool verbose_error = true) { + GetChildren&& get_children, int* out_of_range_depth) { if (indices.empty()) { return Status::Invalid("empty indices cannot be traversed"); } @@ -817,12 +830,8 @@ struct FieldRefGetImpl { const T* out; for (int index : indices) { if (index < 0 || static_cast(index) >= children->size()) { - // verbose_error is a hack to allow this function to be used for cheap validation - // in FieldRef::FindAll - if (verbose_error) { - return IndexError(indices, depth, *children); - } - return Status::IndexError("index out of range"); + *out_of_range_depth = depth; + return nullptr; } out = &children->at(index); @@ -833,13 +842,24 @@ struct FieldRefGetImpl { return *out; } + template + static Result Get(const FieldRef::Indices& indices, const std::vector* children, + GetChildren&& get_children) { + int out_of_range_depth; + ARROW_ASSIGN_OR_RAISE(auto child, + Get(indices, children, std::forward(get_children), + &out_of_range_depth)); + if (child != nullptr) { + return std::move(child); + } + return IndexError(indices, out_of_range_depth, *children); + } + static Result> Get(const FieldRef::Indices& indices, - const FieldVector& fields, - bool verbose_error = true) { + const FieldVector& fields) { return FieldRefGetImpl::Get( indices, &fields, - [](const std::shared_ptr& field) { return &field->type()->children(); }, - verbose_error); + [](const std::shared_ptr& field) { return &field->type()->children(); }); } static Result> Get(const FieldRef::Indices& indices, @@ -910,18 +930,26 @@ util::small_vector FieldRef::FindAll(const Schema& schema) co return FindAll(schema.fields()); } -util::small_vector FieldRef::FindAll(const DataType& type) const { - return FindAll(type.children()); -} - util::small_vector FieldRef::FindAll(const Field& field) const { return FindAll(field.type()->children()); } +util::small_vector FieldRef::FindAll(const DataType& type) const { + return FindAll(type.children()); +} + util::small_vector FieldRef::FindAll(const FieldVector& fields) const { struct Visitor { util::small_vector operator()(const FieldRef::Indices& indices) { - if (FieldRefGetImpl::Get(indices, fields_, /* verbose_error = */ false).ok()) { + int out_of_range_depth; + auto maybe_field = FieldRefGetImpl::Get( + indices, &fields_, + [](const std::shared_ptr& field) { return &field->type()->children(); }, + &out_of_range_depth); + + DCHECK_OK(maybe_field.status()); + + if (maybe_field.ValueOrDie() != nullptr) { return {indices}; } return {}; @@ -954,16 +982,18 @@ util::small_vector FieldRef::FindAll(const FieldVector& field size_t size() const { return referents.size(); } - void Add(FieldRef::Indices prefix, FieldRef::Indices indices, + void Add(FieldRef::Indices prefix, FieldRef::Indices match, const FieldVector& fields) { - auto maybe_field = FieldRef::Get(indices, fields); + auto maybe_field = FieldRef::Get(match, fields); DCHECK_OK(maybe_field.status()); + + prefixes.push_back(prefix + match); referents.push_back(std::move(maybe_field).ValueOrDie()); - prefixes.push_back(prefix + indices); } }; util::small_vector operator()(const std::vector& refs) { + DCHECK_GE(refs.size(), 1); Matches matches(refs.front().FindAll(fields_), fields_); for (auto ref_it = refs.begin() + 1; ref_it != refs.end(); ++ref_it) { @@ -971,8 +1001,8 @@ util::small_vector FieldRef::FindAll(const FieldVector& field for (size_t i = 0; i < matches.size(); ++i) { const auto& referent = *matches.referents[i]; - for (const Indices& indices : ref_it->FindAll(referent)) { - next_matches.Add(matches.prefixes[i], indices, referent.type()->children()); + for (const Indices& match : ref_it->FindAll(referent)) { + next_matches.Add(matches.prefixes[i], match, referent.type()->children()); } } matches = std::move(next_matches); diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 7b341033f44..f56f5517e49 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -26,6 +26,7 @@ #include #include +#include "arrow/result.h" #include "arrow/type_fwd.h" // IWYU pragma: export #include "arrow/util/checked_cast.h" #include "arrow/util/macros.h" @@ -1418,7 +1419,9 @@ class ARROW_EXPORT FieldRef { /// Construct a FieldRef using a string of indices. The reference will be retrieved as: /// schema.fields[self.indices[0]].type.fields[self.indices[1]] ... - FieldRef(Indices indices) : impl_(std::move(indices)) {} // NOLINT runtime/explicit + /// + /// Empty indices are not valid. + FieldRef(Indices indices); // NOLINT runtime/explicit /// Construct a by-name FieldRef. Multiple fields may match a by-name FieldRef: /// [f for f in schema.fields where f.name == self.name] @@ -1462,7 +1465,11 @@ class ARROW_EXPORT FieldRef { bool IsIndices() const { return util::holds_alternative(impl_); } bool IsName() const { return util::holds_alternative(impl_); } - bool IsNested() const { return util::holds_alternative>(impl_); } + bool IsNested() const { + if (IsName()) return false; + if (IsIndices()) return util::get(impl_).size() > 1; + return true; + } /// \brief Retrieve the referenced child Field from a Schema, Field, or DataType static Result> Get(const Indices& indices, const Schema& schema); @@ -1482,12 +1489,71 @@ class ARROW_EXPORT FieldRef { static Result> Get(const Indices& indices, const ChunkedArray& array); - /// \brief Retrieve Indices of child fields matching this FieldRef. + /// \brief Retrieve Indices of every child field which matches this FieldRef. util::small_vector FindAll(const Schema& schema) const; util::small_vector FindAll(const Field& field) const; util::small_vector FindAll(const DataType& type) const; util::small_vector FindAll(const FieldVector& fields) const; + template + Status CheckNonEmpty(const util::small_vector& matches, const T& root) const { + if (matches.empty()) { + return Status::Invalid("No match for ", ToString(), " in ", root.ToString()); + } + return Status::OK(); + } + + template + Status CheckNonMultiple(const util::small_vector& matches, + const T& root) const { + if (matches.size() > 1) { + return Status::Invalid("Multiple matches for ", ToString(), " in ", + root.ToString()); + } + return Status::OK(); + } + + /// \brief Retrieve Indices of a single child field which matches this + /// FieldRef. Emit an error if none or multiple match. + template + Result FindOne(const T& root) const { + auto matches = FindAll(root); + RETURN_NOT_OK(CheckNonEmpty(matches, root)); + RETURN_NOT_OK(CheckNonMultiple(matches, root)); + return std::move(matches[0]); + } + + template + using GetType = decltype(Get(Indices{}, std::declval()).ValueOrDie()); + + /// \brief Get all children matching this FieldRef. + template + util::small_vector> GetAll(const T& root) const { + util::small_vector> out; + for (const auto& match : FindAll(root)) { + out.push_back(Get(match, root).ValueOrDie()); + } + return out; + } + + /// \brief Get the single child matching this FieldRef. + /// Emit an error if none or multiple match. + template + Result> GetOne(const T& root) const { + ARROW_ASSIGN_OR_RAISE(auto match, FindOne(root)); + return Get(match, root); + } + + /// \brief Get the single child matching this FieldRef. + /// Return nullptr if none match, emit an error if multiple match. + template + Result> GetOneOrNone(const T& root) const { + auto matches = FindAll(root); + if (matches.empty()) return NULLPTR; + RETURN_NOT_OK(CheckNonMultiple(matches, root)); + return Get(matches[0], root); + } + private: util::variant> impl_; diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 2139db1292e..918b28960d5 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -43,6 +43,7 @@ using BufferVector = std::vector>; class DataType; class Field; +class FieldRef; class KeyValueMetadata; class Schema; From fecfbe718404443498570d78903b6bd0e893a3a3 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 5 Mar 2020 10:24:38 -0500 Subject: [PATCH 04/12] lint fixes --- cpp/src/arrow/type.cc | 3 ++- cpp/src/arrow/type.h | 14 ++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 10d2373ef14..15ad0b05464 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -635,7 +635,8 @@ FieldRef::FieldRef(std::vector children) { }; std::vector out; - Visitor{std::back_inserter(out)}(std::move(children)); + Visitor visitor{std::back_inserter(out)}; + visitor(std::move(children)); DCHECK(!out.empty()); DCHECK(std::none_of(out.begin(), out.end(), diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index f56f5517e49..c676a7ba25b 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1437,9 +1437,11 @@ class ARROW_EXPORT FieldRef { /// construct a FieldRef template FieldRef(A0&& a0, A1&& a1, A&&... a) - : FieldRef(std::vector{FieldRef(std::forward(a0)), - FieldRef(std::forward(a1)), - FieldRef(std::forward(a))...}) {} + : FieldRef(std::vector{ + // cpplint things the following are constructor decls + FieldRef(std::forward(a0)), // NOLINT runtime/explicit + FieldRef(std::forward(a1)), // NOLINT runtime/explicit + FieldRef(std::forward(a))...}) {} // NOLINT runtime/explicit /// Parse a dot path into a FieldRef. /// @@ -1518,8 +1520,8 @@ class ARROW_EXPORT FieldRef { template Result FindOne(const T& root) const { auto matches = FindAll(root); - RETURN_NOT_OK(CheckNonEmpty(matches, root)); - RETURN_NOT_OK(CheckNonMultiple(matches, root)); + ARROW_RETURN_NOT_OK(CheckNonEmpty(matches, root)); + ARROW_RETURN_NOT_OK(CheckNonMultiple(matches, root)); return std::move(matches[0]); } @@ -1550,7 +1552,7 @@ class ARROW_EXPORT FieldRef { Result> GetOneOrNone(const T& root) const { auto matches = FindAll(root); if (matches.empty()) return NULLPTR; - RETURN_NOT_OK(CheckNonMultiple(matches, root)); + ARROW_RETURN_NOT_OK(CheckNonMultiple(matches, root)); return Get(matches[0], root); } From 06d0fb622b025748d03ee5578b30f70b7889c50c Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 5 Mar 2020 10:42:50 -0500 Subject: [PATCH 05/12] GCC 5.4 fix: loop var doesn't shadow --- cpp/src/arrow/type.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 15ad0b05464..b67f8c41e54 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -973,9 +973,9 @@ util::small_vector FieldRef::FindAll(const FieldVector& field util::small_vector prefixes; FieldVector referents; - Matches(util::small_vector indices, const FieldVector& fields) { - for (auto& indices : indices) { - Add({}, std::move(indices), fields); + Matches(util::small_vector matches, const FieldVector& fields) { + for (auto& match : matches) { + Add({}, std::move(match), fields); } } From cd2b4b703da976fcf689aa66a1d8cafbfdf2e3af Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 5 Mar 2020 10:44:25 -0500 Subject: [PATCH 06/12] visibility fix: export FieldRef::PrintTo --- cpp/src/arrow/type.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index c676a7ba25b..b3b22ee8e50 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1559,7 +1559,7 @@ class ARROW_EXPORT FieldRef { private: util::variant> impl_; - friend void PrintTo(const FieldRef& ref, std::ostream* os); + ARROW_EXPORT friend void PrintTo(const FieldRef& ref, std::ostream* os); }; // ---------------------------------------------------------------------- From 127db04b25b58a4bed35c418fb93a5e48f5175b8 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 5 Mar 2020 12:12:32 -0500 Subject: [PATCH 07/12] fix typos, add escaping note to FromDotPath --- cpp/src/arrow/type.cc | 12 ++++++------ cpp/src/arrow/type.h | 6 +++++- cpp/src/arrow/type_test.cc | 1 - 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index b67f8c41e54..fedab739b4f 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -670,7 +670,7 @@ Result FieldRef::FromDotPath(const std::string& dot_path_arg) { } if (dot_path[segment_end] != '\\') { - // segment_end points to a subscipt for a new FieldRef + // segment_end points to a subscript for a new FieldRef name.append(dot_path.begin(), segment_end); dot_path = dot_path.substr(segment_end); break; @@ -692,22 +692,22 @@ Result FieldRef::FromDotPath(const std::string& dot_path_arg) { }; while (!dot_path.empty()) { - auto subscipt = dot_path[0]; + auto subscript = dot_path[0]; dot_path = dot_path.substr(1); - switch (subscipt) { + switch (subscript) { case '.': { // next element is a name children.push_back(parse_name()); continue; } case '[': { - auto subscipt_end = dot_path.find_first_not_of("0123456789"); - if (subscipt_end == util::string_view::npos || dot_path[subscipt_end] != ']') { + auto subscript_end = dot_path.find_first_not_of("0123456789"); + if (subscript_end == util::string_view::npos || dot_path[subscript_end] != ']') { return Status::Invalid("Dot path '", dot_path_arg, "' contained an unterminated index"); } children.emplace_back(std::atoi(dot_path.data())); - dot_path = dot_path.substr(subscipt_end + 1); + dot_path = dot_path.substr(subscript_end + 1); continue; } default: diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index b3b22ee8e50..d44dcc81ac0 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1438,7 +1438,7 @@ class ARROW_EXPORT FieldRef { template FieldRef(A0&& a0, A1&& a1, A&&... a) : FieldRef(std::vector{ - // cpplint things the following are constructor decls + // cpplint thinks the following are constructor decls FieldRef(std::forward(a0)), // NOLINT runtime/explicit FieldRef(std::forward(a1)), // NOLINT runtime/explicit FieldRef(std::forward(a))...}) {} // NOLINT runtime/explicit @@ -1456,6 +1456,10 @@ class ARROW_EXPORT FieldRef { /// "[5].gamma.delta[7]" => FieldRef(5, "gamma", "delta", 7) /// ".hello world" => FieldRef("hello world") /// R"(.\[y\]\\tho\.\)" => FieldRef(R"([y]\tho.\)") + /// + /// Note: When parsing a name, a '\' preceding any other character will be dropped from + /// the resulting name. Therefore if a name must contain the characters '.', '\', or '[' + /// those must be escaped with a preceding '\'. static Result FromDotPath(const std::string& dot_path); bool Equals(const FieldRef& other) const { return impl_ == other.impl_; } diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 797e382e8bc..8677f4ae8bc 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -334,7 +334,6 @@ TEST(TestFieldRef, Basics) { // lookup by name returns the Indices of both matching fields EXPECT_THAT(FieldRef("alpha").FindAll(s), ElementsAre(I{0}, I{2})); EXPECT_THAT(FieldRef("beta").FindAll(s), ElementsAre(I{1}, I{3})); - EXPECT_THAT(FieldRef("beta").FindAll(s), ElementsAre(I{1}, I{3})); // retrieving a field with single-element Indices is equivalent to Schema::field for (int index = 0; index < s.num_fields(); ++index) { From 6bbbfc4c1c9d192d4fa5906e7887f34c9ba02d6c Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 6 Mar 2020 12:34:40 -0500 Subject: [PATCH 08/12] remove small_vector for now --- cpp/src/arrow/type.cc | 20 +++--- cpp/src/arrow/type.h | 18 +++-- cpp/src/arrow/util/small_vector.h | 115 ------------------------------ 3 files changed, 18 insertions(+), 135 deletions(-) delete mode 100644 cpp/src/arrow/util/small_vector.h diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index fedab739b4f..654962c18ba 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -927,21 +927,21 @@ Result> FieldRef::Get(const Indices& indices, return FieldRefGetImpl::Get(indices, table.columns()); } -util::small_vector FieldRef::FindAll(const Schema& schema) const { +std::vector FieldRef::FindAll(const Schema& schema) const { return FindAll(schema.fields()); } -util::small_vector FieldRef::FindAll(const Field& field) const { +std::vector FieldRef::FindAll(const Field& field) const { return FindAll(field.type()->children()); } -util::small_vector FieldRef::FindAll(const DataType& type) const { +std::vector FieldRef::FindAll(const DataType& type) const { return FindAll(type.children()); } -util::small_vector FieldRef::FindAll(const FieldVector& fields) const { +std::vector FieldRef::FindAll(const FieldVector& fields) const { struct Visitor { - util::small_vector operator()(const FieldRef::Indices& indices) { + std::vector operator()(const FieldRef::Indices& indices) { int out_of_range_depth; auto maybe_field = FieldRefGetImpl::Get( indices, &fields_, @@ -956,8 +956,8 @@ util::small_vector FieldRef::FindAll(const FieldVector& field return {}; } - util::small_vector operator()(const std::string& name) { - util::small_vector out; + std::vector operator()(const std::string& name) { + std::vector out; for (int i = 0; i < static_cast(fields_.size()); ++i) { if (fields_[i]->name() == name) { @@ -970,10 +970,10 @@ util::small_vector FieldRef::FindAll(const FieldVector& field struct Matches { // referents[i] is referenced by prefixes[i] - util::small_vector prefixes; + std::vector prefixes; FieldVector referents; - Matches(util::small_vector matches, const FieldVector& fields) { + Matches(std::vector matches, const FieldVector& fields) { for (auto& match : matches) { Add({}, std::move(match), fields); } @@ -993,7 +993,7 @@ util::small_vector FieldRef::FindAll(const FieldVector& field } }; - util::small_vector operator()(const std::vector& refs) { + std::vector operator()(const std::vector& refs) { DCHECK_GE(refs.size(), 1); Matches matches(refs.front().FindAll(fields_), fields_); diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index d44dcc81ac0..4bcdff2d5e8 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -30,7 +30,6 @@ #include "arrow/type_fwd.h" // IWYU pragma: export #include "arrow/util/checked_cast.h" #include "arrow/util/macros.h" -#include "arrow/util/small_vector.h" #include "arrow/util/variant.h" #include "arrow/util/visibility.h" #include "arrow/visitor.h" // IWYU pragma: keep @@ -1496,13 +1495,13 @@ class ARROW_EXPORT FieldRef { const ChunkedArray& array); /// \brief Retrieve Indices of every child field which matches this FieldRef. - util::small_vector FindAll(const Schema& schema) const; - util::small_vector FindAll(const Field& field) const; - util::small_vector FindAll(const DataType& type) const; - util::small_vector FindAll(const FieldVector& fields) const; + std::vector FindAll(const Schema& schema) const; + std::vector FindAll(const Field& field) const; + std::vector FindAll(const DataType& type) const; + std::vector FindAll(const FieldVector& fields) const; template - Status CheckNonEmpty(const util::small_vector& matches, const T& root) const { + Status CheckNonEmpty(const std::vector& matches, const T& root) const { if (matches.empty()) { return Status::Invalid("No match for ", ToString(), " in ", root.ToString()); } @@ -1510,8 +1509,7 @@ class ARROW_EXPORT FieldRef { } template - Status CheckNonMultiple(const util::small_vector& matches, - const T& root) const { + Status CheckNonMultiple(const std::vector& matches, const T& root) const { if (matches.size() > 1) { return Status::Invalid("Multiple matches for ", ToString(), " in ", root.ToString()); @@ -1534,8 +1532,8 @@ class ARROW_EXPORT FieldRef { /// \brief Get all children matching this FieldRef. template - util::small_vector> GetAll(const T& root) const { - util::small_vector> out; + std::vector> GetAll(const T& root) const { + std::vector> out; for (const auto& match : FindAll(root)) { out.push_back(Get(match, root).ValueOrDie()); } diff --git a/cpp/src/arrow/util/small_vector.h b/cpp/src/arrow/util/small_vector.h deleted file mode 100644 index 182c6755a13..00000000000 --- a/cpp/src/arrow/util/small_vector.h +++ /dev/null @@ -1,115 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include - -#include "arrow/util/variant.h" - -namespace arrow { -namespace util { - -/// A minimal vector-like container which holds up to a single element without allocation. -template -class small_vector { - public: - using value_type = T; - using iterator = T*; - using const_iterator = const T*; - - small_vector() = default; - - small_vector(std::initializer_list list) { - if (list.size() == 1) { - storage_ = *list.begin(); - } else { - set_vector(list); - } - } - - explicit small_vector(std::vector vector) { - if (vector.size() == 1) { - storage_ = std::move(vector[0]); - } else { - set_vector(std::move(vector)); - } - } - - size_t size() const { - if (holds_alternative(storage_)) { - return 1; - } - return get_vector().size(); - } - - bool empty() const { return size() == 0; } - - void clear() { set_vector({}); } - - void push_back(value_type element) { - switch (size()) { - case 0: - storage_ = std::move(element); - return; - case 1: - set_vector({std::move(get(storage_)), std::move(element)}); - return; - default: - get_vector().push_back(std::move(element)); - return; - } - } - - template - void emplace_back(Args&&... args) { - push_back(value_type(std::forward(args)...)); - } - - value_type* data() { - return size() != 1 ? get_vector().data() : &get(storage_); - } - - const value_type* data() const { - return size() != 1 ? get_vector().data() : &get(storage_); - } - - value_type& operator[](size_t i) { return data()[i]; } - const value_type& operator[](size_t i) const { return data()[i]; } - - iterator begin() { return data(); } - iterator end() { return data() + size(); } - - const_iterator begin() const { return data(); } - const_iterator end() const { return data() + size(); } - - private: - std::vector& get_vector() { return get>(storage_); } - - const std::vector& get_vector() const { - return get>(storage_); - } - - void set_vector(std::vector v) { storage_ = std::move(v); } - - variant, value_type> storage_; -}; - -} // namespace util -} // namespace arrow From 08013e84948156b94778409a5e26f66ff40c2f7d Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 13 Mar 2020 11:31:45 -0400 Subject: [PATCH 09/12] extract FieldRef::Indices -> FieldPath, add comments --- cpp/src/arrow/dataset/projector.h | 3 - cpp/src/arrow/type.cc | 390 +++++++++++++++--------------- cpp/src/arrow/type.h | 178 +++++++++----- cpp/src/arrow/type_test.cc | 65 +++-- 4 files changed, 358 insertions(+), 278 deletions(-) diff --git a/cpp/src/arrow/dataset/projector.h b/cpp/src/arrow/dataset/projector.h index 4f2ec3b78b4..13a0ffb1938 100644 --- a/cpp/src/arrow/dataset/projector.h +++ b/cpp/src/arrow/dataset/projector.h @@ -18,9 +18,6 @@ #pragma once #include -#include -#include -#include #include #include "arrow/dataset/type_fwd.h" diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 654962c18ba..0ca8617ff30 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -614,16 +614,182 @@ std::string NullType::ToString() const { return name(); } // ---------------------------------------------------------------------- // FieldRef -FieldRef::FieldRef(Indices indices) : impl_(std::move(indices)) { - DCHECK_GT(util::get(impl_).size(), 0); +size_t FieldPath::hash() const { + return internal::ComputeStringHash<0>(data(), length() * sizeof(int)); } -FieldRef::FieldRef(std::vector children) { +std::string FieldPath::ToString() const { + std::string repr = "FieldPath("; + for (auto index : *this) { + repr += std::to_string(index) + " "; + } + repr.resize(repr.size() - 1); + repr += ")"; + return repr; +} + +struct FieldPathGetImpl { + static const DataType& GetType(const ArrayData& data) { return *data.type; } + + static const DataType& GetType(const ChunkedArray& array) { return *array.type(); } + + static void Summarize(const FieldVector& fields, std::stringstream* ss) { + *ss << "{ "; + for (const auto& field : fields) { + *ss << field->ToString() << ", "; + } + *ss << "}"; + } + + template + static void Summarize(const std::vector& columns, std::stringstream* ss) { + *ss << "{ "; + for (const auto& column : columns) { + *ss << GetType(*column) << ", "; + } + *ss << "}"; + } + + template + static Status IndexError(const FieldPath* path, int out_of_range_depth, + const std::vector& children) { + std::stringstream ss; + ss << "index out of range. "; + + ss << "indices=[ "; + int depth = 0; + for (int i : *path) { + if (depth != out_of_range_depth) { + ss << i << " "; + continue; + } + ss << ">" << i << "< "; + ++depth; + } + ss << "] "; + + if (std::is_same>::value) { + ss << "fields were: "; + } else { + ss << "columns had types: "; + } + Summarize(children, &ss); + + return Status::IndexError(ss.str()); + } + + template + static Result Get(const FieldPath* path, const std::vector* children, + GetChildren&& get_children, int* out_of_range_depth) { + if (path->empty()) { + return Status::Invalid("empty indices cannot be traversed"); + } + + int depth = 0; + const T* out; + for (int index : *path) { + if (index < 0 || static_cast(index) >= children->size()) { + *out_of_range_depth = depth; + return nullptr; + } + + out = &children->at(index); + children = get_children(*out); + ++depth; + } + + return *out; + } + + template + static Result Get(const FieldPath* path, const std::vector* children, + GetChildren&& get_children) { + int out_of_range_depth; + ARROW_ASSIGN_OR_RAISE(auto child, + Get(path, children, std::forward(get_children), + &out_of_range_depth)); + if (child != nullptr) { + return std::move(child); + } + return IndexError(path, out_of_range_depth, *children); + } + + static Result> Get(const FieldPath* path, + const FieldVector& fields) { + return FieldPathGetImpl::Get(path, &fields, [](const std::shared_ptr& field) { + return &field->type()->children(); + }); + } + + static Result> Get(const FieldPath* path, + const ArrayDataVector& child_data) { + return FieldPathGetImpl::Get( + path, &child_data, + [](const std::shared_ptr& data) { return &data->child_data; }); + } + + static Result> Get( + const FieldPath* path, const ChunkedArrayVector& columns_arg) { + ChunkedArrayVector columns = columns_arg; + + return FieldPathGetImpl::Get( + path, &columns, [&](const std::shared_ptr& a) { + columns.clear(); + + for (int i = 0; i < a->type()->num_children(); ++i) { + ArrayVector child_chunks; + + for (const auto& chunk : a->chunks()) { + auto child_chunk = MakeArray(chunk->data()->child_data[i]); + child_chunks.push_back(std::move(child_chunk)); + } + + auto child_column = std::make_shared( + std::move(child_chunks), a->type()->child(i)->type()); + + columns.emplace_back(std::move(child_column)); + } + + return &columns; + }); + } +}; + +Result> FieldPath::Get(const Schema& schema) const { + return FieldPathGetImpl::Get(this, schema.fields()); +} + +Result> FieldPath::Get(const Field& field) const { + return FieldPathGetImpl::Get(this, field.type()->children()); +} + +Result> FieldPath::Get(const DataType& type) const { + return FieldPathGetImpl::Get(this, type.children()); +} + +Result> FieldPath::Get(const FieldVector& fields) const { + return FieldPathGetImpl::Get(this, fields); +} + +Result> FieldPath::Get(const RecordBatch& batch) const { + ARROW_ASSIGN_OR_RAISE(auto data, FieldPathGetImpl::Get(this, batch.column_data())); + return MakeArray(data); +} + +Result> FieldPath::Get(const Table& table) const { + return FieldPathGetImpl::Get(this, table.columns()); +} + +FieldRef::FieldRef(FieldPath indices) : impl_(std::move(indices)) { + DCHECK_GT(util::get(impl_).size(), 0); +} + +void FieldRef::Flatten(std::vector children) { // flatten children struct Visitor { void operator()(std::string&& name) { *out++ = FieldRef(std::move(name)); } - void operator()(Indices&& indices) { *out++ = FieldRef(std::move(indices)); } + void operator()(FieldPath&& indices) { *out++ = FieldRef(std::move(indices)); } void operator()(std::vector&& children) { for (auto& child : children) { @@ -697,7 +863,7 @@ Result FieldRef::FromDotPath(const std::string& dot_path_arg) { switch (subscript) { case '.': { // next element is a name - children.push_back(parse_name()); + children.emplace_back(parse_name()); continue; } case '[': { @@ -716,17 +882,16 @@ Result FieldRef::FromDotPath(const std::string& dot_path_arg) { } } - return FieldRef(std::move(children)); + FieldRef out; + out.Flatten(std::move(children)); + return out; } size_t FieldRef::hash() const { struct Visitor : std::hash { using std::hash::operator(); - size_t operator()(const Indices& indices) { - return internal::ComputeStringHash<0>(indices.data(), - indices.length() * sizeof(int)); - } + size_t operator()(const FieldPath& path) { return path.hash(); } size_t operator()(const std::vector& children) { size_t hash = 0; @@ -744,15 +909,7 @@ size_t FieldRef::hash() const { std::string FieldRef::ToString() const { struct Visitor { - std::string operator()(const Indices& indices) { - std::string repr = "Indices("; - for (auto index : indices) { - repr += std::to_string(index) + " "; - } - repr.resize(repr.size() - 1); - repr += ")"; - return repr; - } + std::string operator()(const FieldPath& path) { return path.ToString(); } std::string operator()(const std::string& name) { return "Name(" + name + ")"; } @@ -767,197 +924,41 @@ std::string FieldRef::ToString() const { } }; - return util::visit(Visitor{}, impl_); -} - -struct FieldRefGetImpl { - static const DataType& GetType(const ArrayData& data) { return *data.type; } - - static const DataType& GetType(const ChunkedArray& array) { return *array.type(); } - - static void Summarize(const FieldVector& fields, std::stringstream* ss) { - *ss << "{ "; - for (const auto& field : fields) { - *ss << field->ToString() << ", "; - } - *ss << "}"; - } - - template - static void Summarize(const std::vector& columns, std::stringstream* ss) { - *ss << "{ "; - for (const auto& column : columns) { - *ss << GetType(*column) << ", "; - } - *ss << "}"; - } - - template - static Status IndexError(const FieldRef::Indices& indices, int out_of_range_depth, - const std::vector& children) { - std::stringstream ss; - ss << "index out of range. "; - - ss << "indices=[ "; - int depth = 0; - for (int i : indices) { - if (depth != out_of_range_depth) { - ss << i << " "; - continue; - } - ss << ">" << i << "< "; - ++depth; - } - ss << "] "; - - if (std::is_same>::value) { - ss << "fields were: "; - } else { - ss << "columns had types: "; - } - Summarize(children, &ss); - - return Status::IndexError(ss.str()); - } - - template - static Result Get(const FieldRef::Indices& indices, const std::vector* children, - GetChildren&& get_children, int* out_of_range_depth) { - if (indices.empty()) { - return Status::Invalid("empty indices cannot be traversed"); - } - - int depth = 0; - const T* out; - for (int index : indices) { - if (index < 0 || static_cast(index) >= children->size()) { - *out_of_range_depth = depth; - return nullptr; - } - - out = &children->at(index); - children = get_children(*out); - ++depth; - } - - return *out; - } - - template - static Result Get(const FieldRef::Indices& indices, const std::vector* children, - GetChildren&& get_children) { - int out_of_range_depth; - ARROW_ASSIGN_OR_RAISE(auto child, - Get(indices, children, std::forward(get_children), - &out_of_range_depth)); - if (child != nullptr) { - return std::move(child); - } - return IndexError(indices, out_of_range_depth, *children); - } - - static Result> Get(const FieldRef::Indices& indices, - const FieldVector& fields) { - return FieldRefGetImpl::Get( - indices, &fields, - [](const std::shared_ptr& field) { return &field->type()->children(); }); - } - - static Result> Get(const FieldRef::Indices& indices, - const ArrayDataVector& child_data) { - return FieldRefGetImpl::Get( - indices, &child_data, - [](const std::shared_ptr& data) { return &data->child_data; }); - } - - static Result> Get( - const FieldRef::Indices& indices, const ChunkedArrayVector& columns_arg) { - ChunkedArrayVector columns = columns_arg; - - return FieldRefGetImpl::Get( - indices, &columns, [&](const std::shared_ptr& a) { - columns.clear(); - - for (int i = 0; i < a->type()->num_children(); ++i) { - ArrayVector child_chunks; - - for (const auto& chunk : a->chunks()) { - auto child_chunk = MakeArray(chunk->data()->child_data[i]); - child_chunks.push_back(std::move(child_chunk)); - } - - auto child_column = std::make_shared( - std::move(child_chunks), a->type()->child(i)->type()); - - columns.emplace_back(std::move(child_column)); - } - - return &columns; - }); - } -}; - -Result> FieldRef::Get(const Indices& indices, - const Schema& schema) { - return FieldRefGetImpl::Get(indices, schema.fields()); -} - -Result> FieldRef::Get(const Indices& indices, const Field& field) { - return FieldRefGetImpl::Get(indices, field.type()->children()); -} - -Result> FieldRef::Get(const Indices& indices, - const DataType& type) { - return FieldRefGetImpl::Get(indices, type.children()); -} - -Result> FieldRef::Get(const Indices& indices, - const FieldVector& fields) { - return FieldRefGetImpl::Get(indices, fields); -} - -Result> FieldRef::Get(const Indices& indices, - const RecordBatch& batch) { - ARROW_ASSIGN_OR_RAISE(auto data, FieldRefGetImpl::Get(indices, batch.column_data())); - return MakeArray(data); -} - -Result> FieldRef::Get(const Indices& indices, - const Table& table) { - return FieldRefGetImpl::Get(indices, table.columns()); + return "FieldRef." + util::visit(Visitor{}, impl_); } -std::vector FieldRef::FindAll(const Schema& schema) const { +std::vector FieldRef::FindAll(const Schema& schema) const { return FindAll(schema.fields()); } -std::vector FieldRef::FindAll(const Field& field) const { +std::vector FieldRef::FindAll(const Field& field) const { return FindAll(field.type()->children()); } -std::vector FieldRef::FindAll(const DataType& type) const { +std::vector FieldRef::FindAll(const DataType& type) const { return FindAll(type.children()); } -std::vector FieldRef::FindAll(const FieldVector& fields) const { +std::vector FieldRef::FindAll(const FieldVector& fields) const { struct Visitor { - std::vector operator()(const FieldRef::Indices& indices) { + std::vector operator()(const FieldPath& path) { + // skip long IndexError construction if path is out of range int out_of_range_depth; - auto maybe_field = FieldRefGetImpl::Get( - indices, &fields_, + auto maybe_field = FieldPathGetImpl::Get( + &path, &fields_, [](const std::shared_ptr& field) { return &field->type()->children(); }, &out_of_range_depth); DCHECK_OK(maybe_field.status()); if (maybe_field.ValueOrDie() != nullptr) { - return {indices}; + return {path}; } return {}; } - std::vector operator()(const std::string& name) { - std::vector out; + std::vector operator()(const std::string& name) { + std::vector out; for (int i = 0; i < static_cast(fields_.size()); ++i) { if (fields_[i]->name() == name) { @@ -970,10 +971,10 @@ std::vector FieldRef::FindAll(const FieldVector& fields) cons struct Matches { // referents[i] is referenced by prefixes[i] - std::vector prefixes; + std::vector prefixes; FieldVector referents; - Matches(std::vector matches, const FieldVector& fields) { + Matches(std::vector matches, const FieldVector& fields) { for (auto& match : matches) { Add({}, std::move(match), fields); } @@ -983,17 +984,16 @@ std::vector FieldRef::FindAll(const FieldVector& fields) cons size_t size() const { return referents.size(); } - void Add(FieldRef::Indices prefix, FieldRef::Indices match, - const FieldVector& fields) { - auto maybe_field = FieldRef::Get(match, fields); + void Add(FieldPath prefix, FieldPath match, const FieldVector& fields) { + auto maybe_field = match.Get(fields); DCHECK_OK(maybe_field.status()); - prefixes.push_back(prefix + match); + prefixes.emplace_back(prefix + match); referents.push_back(std::move(maybe_field).ValueOrDie()); } }; - std::vector operator()(const std::vector& refs) { + std::vector operator()(const std::vector& refs) { DCHECK_GE(refs.size(), 1); Matches matches(refs.front().FindAll(fields_), fields_); @@ -1002,7 +1002,7 @@ std::vector FieldRef::FindAll(const FieldVector& fields) cons for (size_t i = 0; i < matches.size(); ++i) { const auto& referent = *matches.referents[i]; - for (const Indices& match : ref_it->FindAll(referent)) { + for (const FieldPath& match : ref_it->FindAll(referent)) { next_matches.Add(matches.prefixes[i], match, referent.type()->children()); } } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 4bcdff2d5e8..e17ac65e446 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1403,44 +1403,110 @@ class ARROW_EXPORT DictionaryUnifier { // ---------------------------------------------------------------------- // FieldRef +/// \class FieldPath +/// +/// Represents a path to a nested field using indices of child fields. +/// For example, given indices {5, 9, 3} the field would be retrieved with +/// schema->field(5)->type()->child(9)->type()->child(3) +/// +/// Attempting to retrieve a child field using a FieldPath which is not valid for +/// a given schema will raise an error. Invalid FieldPaths include: +/// - an index is out of range +/// - the path is empty (note: a default constructed FieldPath will be empty) +/// +/// FieldPaths provide a number of accessors for drilling down to potentially nested +/// children. They are overloaded for convenience to support Schema (returns a field), +/// DataType (returns a child field), Field (returns a child field of this field's type) +/// Array (returns a child array), RecordBatch (returns a column), ChunkedArray (returns a +/// ChunkedArray where each chunk is a child array of the corresponding original chunk) +/// and Table (returns a column). +class ARROW_EXPORT FieldPath : public std::basic_string { + public: + // std::basic_string is used to take advantage of the short string approximation: up to + // four indices can be stored without memory allocation. + using std::basic_string::basic_string; + + FieldPath(std::basic_string indices) // NOLINT runtime/explicit + : std::basic_string(std::move(indices)) {} + + std::string ToString() const; + + size_t hash() const; + + explicit operator bool() const { return !empty(); } + + /// \brief Retrieve the referenced child Field from a Schema, Field, or DataType + Result> Get(const Schema& schema) const; + Result> Get(const Field& field) const; + Result> Get(const DataType& type) const; + Result> Get(const FieldVector& fields) const; + + /// \brief Retrieve the referenced column from a RecordBatch or Table + Result> Get(const RecordBatch& batch) const; + Result> Get(const Table& table) const; + + /// \brief Retrieve the referenced child Array from an Array or ChunkedArray + Result> Get(const Array& array) const; + Result> Get(const ChunkedArray& array) const; +}; + /// \class FieldRef /// \brief Descriptor of a (potentially nested) field within a schema. +/// +/// Unlike FieldPath (which exclusively uses indices of child fields), FieldRef may +/// reference a field by name. It is intended to replace parameters like `int field_index` +/// and `const std::string& field_name`; it can be implicitly constructed from either a +/// field index or a name. +/// +/// Nested fields can be referenced as well. Given +/// schema({field("a", struct_({field("n", null())})), field("b", int32())}) +/// +/// the following all indicate the nested field named "n": +/// FieldRef ref1(0, 0); +/// FieldRef ref2("a", 0); +/// FieldRef ref3("a", "n"); +/// FieldRef ref4(0, "n"); +/// ARROW_ASSIGN_OR_RAISE(FieldRef ref5, +/// FieldRef::FromDotPath(".a[0]")); +/// +/// FieldPaths matching a FieldRef are retrieved using the member function FindAll. +/// Multiple matches are possible because field names may be duplicated within a schema. +/// For example: +/// Schema a_is_ambiguous({field("a", int32()), field("a", float32())}); +/// auto matches = FieldRef("a").FindAll(a_is_ambiguous); +/// assert(matches.size() == 2); +/// assert(matches[0].Get(a_is_ambiguous)->Equals(a_is_ambiguous.field(0))); +/// assert(matches[1].Get(a_is_ambiguous)->Equals(a_is_ambiguous.field(1))); +/// +/// Convenience accessors are available which raise a helpful error if the field is not +/// found or ambiguous, and for immediately calling FieldPath::Get to retrieve any +/// matching children: +/// auto maybe_match = FieldRef("struct", "field_i32").FindOneOrNone(schema); +/// auto maybe_column = FieldRef("struct", "field_i32").GetOne(some_table); class ARROW_EXPORT FieldRef { public: - /// Represents a path to a nested field using indices of child fields. - /// For example, given indices {5, 9, 3} the field would be retrieved with - /// schema->field(5)->type()->child(9)->type()->child(3) - /// - /// std::basic_string is used to take advantage - /// of the short string approximation: up to four indices can be stored without memory - /// allocation. - using Indices = std::basic_string; + FieldRef() = default; /// Construct a FieldRef using a string of indices. The reference will be retrieved as: /// schema.fields[self.indices[0]].type.fields[self.indices[1]] ... /// /// Empty indices are not valid. - FieldRef(Indices indices); // NOLINT runtime/explicit + FieldRef(FieldPath indices); // NOLINT runtime/explicit /// Construct a by-name FieldRef. Multiple fields may match a by-name FieldRef: /// [f for f in schema.fields where f.name == self.name] FieldRef(std::string name) : impl_(std::move(name)) {} // NOLINT runtime/explicit /// Equivalent to a single index string of indices. - FieldRef(int index) : impl_(Indices({index})) {} // NOLINT runtime/explicit - - /// Construct a nested FieldRef. - explicit FieldRef(std::vector children); + FieldRef(int index) : impl_(FieldPath({index})) {} // NOLINT runtime/explicit /// Convenience constructor for nested FieldRefs: each argument will be used to /// construct a FieldRef template - FieldRef(A0&& a0, A1&& a1, A&&... a) - : FieldRef(std::vector{ - // cpplint thinks the following are constructor decls - FieldRef(std::forward(a0)), // NOLINT runtime/explicit - FieldRef(std::forward(a1)), // NOLINT runtime/explicit - FieldRef(std::forward(a))...}) {} // NOLINT runtime/explicit + FieldRef(A0&& a0, A1&& a1, A&&... a) { + Flatten({FieldRef(std::forward(a0)), FieldRef(std::forward(a1)), + FieldRef(std::forward(a))...}); + } /// Parse a dot path into a FieldRef. /// @@ -1468,48 +1534,32 @@ class ARROW_EXPORT FieldRef { size_t hash() const; - bool IsIndices() const { return util::holds_alternative(impl_); } + bool IsFieldPath() const { return util::holds_alternative(impl_); } bool IsName() const { return util::holds_alternative(impl_); } bool IsNested() const { if (IsName()) return false; - if (IsIndices()) return util::get(impl_).size() > 1; + if (IsFieldPath()) return util::get(impl_).size() > 1; return true; } - /// \brief Retrieve the referenced child Field from a Schema, Field, or DataType - static Result> Get(const Indices& indices, const Schema& schema); - static Result> Get(const Indices& indices, const Field& field); - static Result> Get(const Indices& indices, const DataType& type); - static Result> Get(const Indices& indices, - const FieldVector& fields); - - /// \brief Retrieve the referenced column from a RecordBatch or Table - static Result> Get(const Indices& indices, - const RecordBatch& batch); - static Result> Get(const Indices& indices, - const Table& table); - - /// \brief Retrieve the referenced child Array from an Array or ChunkedArray - static Result> Get(const Indices& indices, const Array& array); - static Result> Get(const Indices& indices, - const ChunkedArray& array); - - /// \brief Retrieve Indices of every child field which matches this FieldRef. - std::vector FindAll(const Schema& schema) const; - std::vector FindAll(const Field& field) const; - std::vector FindAll(const DataType& type) const; - std::vector FindAll(const FieldVector& fields) const; + /// \brief Retrieve FieldPath of every child field which matches this FieldRef. + std::vector FindAll(const Schema& schema) const; + std::vector FindAll(const Field& field) const; + std::vector FindAll(const DataType& type) const; + std::vector FindAll(const FieldVector& fields) const; + /// \brief Convenience function: raise an error if matches is empty. template - Status CheckNonEmpty(const std::vector& matches, const T& root) const { + Status CheckNonEmpty(const std::vector& matches, const T& root) const { if (matches.empty()) { return Status::Invalid("No match for ", ToString(), " in ", root.ToString()); } return Status::OK(); } + /// \brief Convenience function: raise an error if matches contains multiple FieldPaths. template - Status CheckNonMultiple(const std::vector& matches, const T& root) const { + Status CheckNonMultiple(const std::vector& matches, const T& root) const { if (matches.size() > 1) { return Status::Invalid("Multiple matches for ", ToString(), " in ", root.ToString()); @@ -1517,25 +1567,38 @@ class ARROW_EXPORT FieldRef { return Status::OK(); } - /// \brief Retrieve Indices of a single child field which matches this + /// \brief Retrieve FieldPath of a single child field which matches this /// FieldRef. Emit an error if none or multiple match. template - Result FindOne(const T& root) const { + Result FindOne(const T& root) const { auto matches = FindAll(root); ARROW_RETURN_NOT_OK(CheckNonEmpty(matches, root)); ARROW_RETURN_NOT_OK(CheckNonMultiple(matches, root)); return std::move(matches[0]); } + /// \brief Retrieve FieldPath of a single child field which matches this + /// FieldRef. Emit an error if multiple match. An empty (invalid) FieldPath + /// will be returned if none match. + template + Result FindOneOrNone(const T& root) const { + auto matches = FindAll(root); + ARROW_RETURN_NOT_OK(CheckNonMultiple(matches, root)); + if (matches.empty()) { + return FieldPath(); + } + return std::move(matches[0]); + } + template - using GetType = decltype(Get(Indices{}, std::declval()).ValueOrDie()); + using GetType = decltype(FieldPath().Get(std::declval()).ValueOrDie()); /// \brief Get all children matching this FieldRef. template std::vector> GetAll(const T& root) const { std::vector> out; for (const auto& match : FindAll(root)) { - out.push_back(Get(match, root).ValueOrDie()); + out.push_back(match.Get(root).ValueOrDie()); } return out; } @@ -1545,21 +1608,24 @@ class ARROW_EXPORT FieldRef { template Result> GetOne(const T& root) const { ARROW_ASSIGN_OR_RAISE(auto match, FindOne(root)); - return Get(match, root); + return match.Get(root).ValueOrDie(); } /// \brief Get the single child matching this FieldRef. /// Return nullptr if none match, emit an error if multiple match. template Result> GetOneOrNone(const T& root) const { - auto matches = FindAll(root); - if (matches.empty()) return NULLPTR; - ARROW_RETURN_NOT_OK(CheckNonMultiple(matches, root)); - return Get(matches[0], root); + ARROW_ASSIGN_OR_RAISE(auto match, FindOneOrNone(root)); + if (match) { + return match.Get(root).ValueOrDie(); + } + return NULLPTR; } private: - util::variant> impl_; + void Flatten(std::vector children); + + util::variant> impl_; ARROW_EXPORT friend void PrintTo(const FieldRef& ref, std::ostream* os); }; diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 8677f4ae8bc..fd715f2b4fb 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -315,35 +315,41 @@ TEST(TestField, TestMerge) { } } -TEST(TestFieldRef, Basics) { +TEST(TestFieldPath, Basics) { auto f0 = field("alpha", int32()); auto f1 = field("beta", int32()); auto f2 = field("alpha", int32()); auto f3 = field("beta", int32()); Schema s({f0, f1, f2, f3}); - using I = FieldRef::Indices; + // retrieving a field with single-element FieldPath is equivalent to Schema::field + for (int index = 0; index < s.num_fields(); ++index) { + ASSERT_OK_AND_EQ(s.field(index), FieldPath({index}).Get(s)); + } + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, + testing::HasSubstr("empty indices cannot be traversed"), + FieldPath().Get(s)); + EXPECT_RAISES_WITH_MESSAGE_THAT(IndexError, testing::HasSubstr("index out of range"), + FieldPath({s.num_fields() * 2}).Get(s)); +} + +TEST(TestFieldRef, Basics) { + auto f0 = field("alpha", int32()); + auto f1 = field("beta", int32()); + auto f2 = field("alpha", int32()); + auto f3 = field("beta", int32()); + Schema s({f0, f1, f2, f3}); // lookup by index returns Indices{index} for (int index = 0; index < s.num_fields(); ++index) { - EXPECT_THAT(FieldRef(index).FindAll(s), ElementsAre(I{index})); + EXPECT_THAT(FieldRef(index).FindAll(s), ElementsAre(FieldPath{index})); } // out of range index results in a failure to match EXPECT_THAT(FieldRef(s.num_fields() * 2).FindAll(s), ElementsAre()); // lookup by name returns the Indices of both matching fields - EXPECT_THAT(FieldRef("alpha").FindAll(s), ElementsAre(I{0}, I{2})); - EXPECT_THAT(FieldRef("beta").FindAll(s), ElementsAre(I{1}, I{3})); - - // retrieving a field with single-element Indices is equivalent to Schema::field - for (int index = 0; index < s.num_fields(); ++index) { - ASSERT_OK_AND_EQ(s.field(index), FieldRef::Get(I{index}, s)); - } - EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, - testing::HasSubstr("empty indices cannot be traversed"), - FieldRef::Get(I{}, s)); - EXPECT_RAISES_WITH_MESSAGE_THAT(IndexError, testing::HasSubstr("index out of range"), - FieldRef::Get(I{s.num_fields() * 2}, s)); + EXPECT_THAT(FieldRef("alpha").FindAll(s), ElementsAre(FieldPath{0}, FieldPath{2})); + EXPECT_THAT(FieldRef("beta").FindAll(s), ElementsAre(FieldPath{1}, FieldPath{3})); } TEST(TestFieldRef, FromDotPath) { @@ -368,7 +374,7 @@ TEST(TestFieldRef, FromDotPath) { ASSERT_RAISES(Invalid, FieldRef::FromDotPath(R"([1stuf])")); } -TEST(TestFieldRef, Nested) { +TEST(TestFieldPath, Nested) { auto f0 = field("alpha", int32()); auto f1_0 = field("alpha", int32()); auto f1 = field("beta", struct_({f1_0})); @@ -380,17 +386,28 @@ TEST(TestFieldRef, Nested) { Schema s({f0, f1, f2}); // retrieving fields with nested indices - EXPECT_EQ(FieldRef::Get({0}, s), f0); - EXPECT_EQ(FieldRef::Get({1, 0}, s), f1_0); - EXPECT_EQ(FieldRef::Get({2, 0}, s), f2_0); - EXPECT_EQ(FieldRef::Get({2, 1, 0}, s), f2_1_0); - EXPECT_EQ(FieldRef::Get({2, 1, 1}, s), f2_1_1); + EXPECT_EQ(FieldPath({0}).Get(s), f0); + EXPECT_EQ(FieldPath({1, 0}).Get(s), f1_0); + EXPECT_EQ(FieldPath({2, 0}).Get(s), f2_0); + EXPECT_EQ(FieldPath({2, 1, 0}).Get(s), f2_1_0); + EXPECT_EQ(FieldPath({2, 1, 1}).Get(s), f2_1_1); +} - using I = FieldRef::Indices; +TEST(TestFieldRef, Nested) { + auto f0 = field("alpha", int32()); + auto f1_0 = field("alpha", int32()); + auto f1 = field("beta", struct_({f1_0})); + auto f2_0 = field("alpha", int32()); + auto f2_1_0 = field("alpha", int32()); + auto f2_1_1 = field("alpha", int32()); + auto f2_1 = field("gamma", struct_({f2_1_0, f2_1_1})); + auto f2 = field("beta", struct_({f2_0, f2_1})); + Schema s({f0, f1, f2}); - EXPECT_THAT(FieldRef("beta", "alpha").FindAll(s), ElementsAre(I{1, 0}, I{2, 0})); + EXPECT_THAT(FieldRef("beta", "alpha").FindAll(s), + ElementsAre(FieldPath{1, 0}, FieldPath{2, 0})); EXPECT_THAT(FieldRef("beta", "gamma", "alpha").FindAll(s), - ElementsAre(I{2, 1, 0}, I{2, 1, 1})); + ElementsAre(FieldPath{2, 1, 0}, FieldPath{2, 1, 1})); } using TestSchema = ::testing::Test; From f9ecb5d0dbe008029e52102c52badfc407c548a4 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 13 Mar 2020 11:46:32 -0400 Subject: [PATCH 10/12] mark non constructors with NOLINT again --- cpp/src/arrow/type.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index e17ac65e446..1a378977b47 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1504,8 +1504,10 @@ class ARROW_EXPORT FieldRef { /// construct a FieldRef template FieldRef(A0&& a0, A1&& a1, A&&... a) { - Flatten({FieldRef(std::forward(a0)), FieldRef(std::forward(a1)), - FieldRef(std::forward(a))...}); + Flatten({// cpplint thinks the following are constructor decls + FieldRef(std::forward(a0)), // NOLINT runtime/explicit + FieldRef(std::forward(a1)), // NOLINT runtime/explicit + FieldRef(std::forward(a))...}); // NOLINT runtime/explicit } /// Parse a dot path into a FieldRef. From 8b7de20ed030e28b13d8f54d22fa683007c5fab8 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 13 Mar 2020 11:52:09 -0400 Subject: [PATCH 11/12] more explicit construction of FieldPath for GCC 5.4 --- cpp/src/arrow/type.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 1a378977b47..3ab5b39ef45 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1426,6 +1426,8 @@ class ARROW_EXPORT FieldPath : public std::basic_string { // four indices can be stored without memory allocation. using std::basic_string::basic_string; + FieldPath() = default; + FieldPath(std::basic_string indices) // NOLINT runtime/explicit : std::basic_string(std::move(indices)) {} @@ -1593,7 +1595,7 @@ class ARROW_EXPORT FieldRef { } template - using GetType = decltype(FieldPath().Get(std::declval()).ValueOrDie()); + using GetType = decltype(std::declval().Get(std::declval()).ValueOrDie()); /// \brief Get all children matching this FieldRef. template From 7e81ee6d45428cc7eb742ba0d946f90a2b6fca77 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Fri, 13 Mar 2020 12:14:24 -0400 Subject: [PATCH 12/12] remove basic_string for MSVC --- cpp/src/arrow/type.cc | 8 +++++--- cpp/src/arrow/type.h | 10 ++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 0ca8617ff30..108758927fe 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -615,7 +615,7 @@ std::string NullType::ToString() const { return name(); } // FieldRef size_t FieldPath::hash() const { - return internal::ComputeStringHash<0>(data(), length() * sizeof(int)); + return internal::ComputeStringHash<0>(data(), size() * sizeof(int)); } std::string FieldPath::ToString() const { @@ -984,11 +984,13 @@ std::vector FieldRef::FindAll(const FieldVector& fields) const { size_t size() const { return referents.size(); } - void Add(FieldPath prefix, FieldPath match, const FieldVector& fields) { + void Add(FieldPath prefix, const FieldPath& match, const FieldVector& fields) { auto maybe_field = match.Get(fields); DCHECK_OK(maybe_field.status()); - prefixes.emplace_back(prefix + match); + prefix.resize(prefix.size() + match.size()); + std::copy(match.begin(), match.end(), prefix.end() - match.size()); + prefixes.push_back(std::move(prefix)); referents.push_back(std::move(maybe_field).ValueOrDie()); } }; diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 3ab5b39ef45..7f58369b38b 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1420,16 +1420,14 @@ class ARROW_EXPORT DictionaryUnifier { /// Array (returns a child array), RecordBatch (returns a column), ChunkedArray (returns a /// ChunkedArray where each chunk is a child array of the corresponding original chunk) /// and Table (returns a column). -class ARROW_EXPORT FieldPath : public std::basic_string { +class ARROW_EXPORT FieldPath : public std::vector { public: - // std::basic_string is used to take advantage of the short string approximation: up to - // four indices can be stored without memory allocation. - using std::basic_string::basic_string; + using std::vector::vector; FieldPath() = default; - FieldPath(std::basic_string indices) // NOLINT runtime/explicit - : std::basic_string(std::move(indices)) {} + FieldPath(std::vector indices) // NOLINT runtime/explicit + : std::vector(std::move(indices)) {} std::string ToString() const;