From 1e0152da643941290ffd43ab0d56decbca083dd7 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 19 Apr 2016 20:40:30 +0000 Subject: [PATCH 1/4] wip --- cpp/src/arrow/type.cc | 23 +++- cpp/src/arrow/type.h | 69 ++++++++--- cpp/src/arrow/types/construct.cc | 2 +- cpp/src/arrow/types/decimal.h | 1 - cpp/src/arrow/types/list.h | 8 +- cpp/src/arrow/types/string-test.cc | 183 ++++++++++++++++++++++++++--- cpp/src/arrow/types/string.cc | 47 ++++++-- cpp/src/arrow/types/string.h | 142 +++++++++++++++------- cpp/src/arrow/util/macros.h | 2 +- format/Message.fbs | 9 +- 10 files changed, 397 insertions(+), 89 deletions(-) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 4e686d9cf4a..ad5829a119e 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -31,7 +31,18 @@ std::string Field::ToString() const { DataType::~DataType() {} -StringType::StringType() : DataType(Type::STRING) {} +bool DataType::Equals(const DataType* other) const { + bool equals = other && ((this == other) || + ((this->type == other->type) && + ((this->num_children() == other->num_children())))); + if (equals) { + for (int i = 0; i < num_children(); ++i) { + // TODO(emkornfield) limit recursion + if (!children_[i]->Equals(other->children_[i])) { return false; } + } + } + return equals; +} std::string StringType::ToString() const { std::string result(name()); @@ -44,6 +55,16 @@ std::string ListType::ToString() const { return s.str(); } +std::string BinaryType::ToString() const { + return std::string(name()); +} + +std::string CharType::ToString() const { + std::stringstream s; + s << "char(" << size << ")"; + return s.str(); +} + std::string StructType::ToString() const { std::stringstream s; s << "struct<"; diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index f366645cd5c..e92c007c8f4 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -23,6 +23,8 @@ #include #include +#include "arrow/util/macros.h" + namespace arrow { // Data types in this library are all *logical*. They can be expressed as @@ -59,9 +61,6 @@ struct Type { // UTF8 variable-length string as List STRING = 13, - // VARCHAR(N): Null-terminated string type embedded in a CHAR(N + 1) - VARCHAR = 14, - // Variable-length bytes (no guarantee of UTF8-ness) BINARY = 15, @@ -114,12 +113,15 @@ struct DataType { virtual ~DataType(); - bool Equals(const DataType* other) { - // Call with a pointer so more friendly to subclasses - return other && ((this == other) || (this->type == other->type)); - } + // Return whether the types are equal + // + // Types that are logically convertable from one to another e.g. List + // and Binary are NOT equal). + virtual bool Equals(const DataType* other) const; - bool Equals(const std::shared_ptr& other) { return Equals(other.get()); } + bool Equals(const std::shared_ptr& other) const { + return Equals(other.get()); + } const std::shared_ptr& child(int i) const { return children_[i]; } @@ -236,9 +238,8 @@ struct DoubleType : public PrimitiveType { struct ListType : public DataType { // List can contain any other logical value type - explicit ListType(const std::shared_ptr& value_type) : DataType(Type::LIST) { - children_ = {std::make_shared("item", value_type)}; - } + explicit ListType(const std::shared_ptr& value_type) + : ListType(value_type, Type::LIST) {} explicit ListType(const std::shared_ptr& value_field) : DataType(Type::LIST) { children_ = {value_field}; @@ -251,15 +252,55 @@ struct ListType : public DataType { static char const* name() { return "list"; } std::string ToString() const override; + + protected: + // Constructor for classes that are implemented as List Arrays. + ListType(const std::shared_ptr& value_type, Type::type logical_type) + : DataType(logical_type) { + // TODO ARROW-187 this can technically fail, make a constructor method ? + children_ = {std::make_shared("item", value_type)}; + } +}; + +// BinaryType type is reprsents lists of 1-byte values. +struct BinaryType : public ListType { + BinaryType() : BinaryType(Type::BINARY) {} + static char const* name() { return "binary"; } + std::string ToString() const override; + + protected: + // Allow subclasses to change the logical type. + explicit BinaryType(Type::type logical_type) + : ListType(std::shared_ptr(new UInt8Type()), logical_type) {} }; -// String is a logical type consisting of a physical list of 1-byte values -struct StringType : public DataType { - StringType(); +// UTF encoded strings +struct StringType : public BinaryType { + StringType() : BinaryType(Type::STRING) {} static char const* name() { return "string"; } std::string ToString() const override; + + protected: + explicit StringType(Type::type logical_type) : BinaryType(logical_type) {} +}; + +struct CharType : public StringType { + int size; + + explicit CharType(int size) : StringType(Type::CHAR), size(size) {} + + CharType(const CharType& other) : CharType(other.size) {} + bool Equals(const DataType* other) const override { + return StringType::Equals(other) && + (size == static_cast(other)->size); + } + static char const* name() { return "char"; } + std::string ToString() const override; + // TODO(emkornfield) This is byte size not character size, which generally isn't how + // DBMSes work. + int value_size() const override { return size; } }; struct StructType : public DataType { diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index bcb0ec49090..dd6dd9ae6b1 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -128,9 +128,9 @@ Status MakeListArray(const TypePtr& type, int32_t length, out->reset(new ListArray(type, length, offsets, values, null_count, null_bitmap)); break; case Type::CHAR: + out->reset(new CharArray(type, length, offsets, values, null_count, null_bitmap)); case Type::DECIMAL_TEXT: case Type::STRING: - case Type::VARCHAR: out->reset(new StringArray(type, length, offsets, values, null_count, null_bitmap)); break; default: diff --git a/cpp/src/arrow/types/decimal.h b/cpp/src/arrow/types/decimal.h index 1be489d4f51..598df3ef70d 100644 --- a/cpp/src/arrow/types/decimal.h +++ b/cpp/src/arrow/types/decimal.h @@ -29,7 +29,6 @@ struct DecimalType : public DataType { : DataType(Type::DECIMAL), precision(precision_), scale(scale_) {} int precision; int scale; - static char const* name() { return "decimal"; } std::string ToString() const override; diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index 0a3941633eb..2f6f85d66ca 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -66,8 +66,8 @@ class ListArray : public Array { int32_t offset(int i) const { return offsets_[i]; } // Neither of these functions will perform boundschecking - int32_t value_offset(int i) { return offsets_[i]; } - int32_t value_length(int i) { return offsets_[i + 1] - offsets_[i]; } + int32_t value_offset(int i) const { return offsets_[i]; } + int32_t value_length(int i) const { return offsets_[i + 1] - offsets_[i]; } bool EqualsExact(const ListArray& other) const; bool Equals(const std::shared_ptr& arr) const override; @@ -92,9 +92,9 @@ class ListArray : public Array { // a sequence of offests and null values. // // A note on types. Per arrow/type.h all types in the c++ implementation are -// logical so even though this class always builds an Array of lists, this can +// logical so even though this class always builds list array, this can // represent multiple different logical types. If no logical type is provided -// at construction time, the class defaults to List where t is take from the +// at construction time, the class defaults to List where t is taken from the // value_builder/values that the object is constructed with. class ListBuilder : public ArrayBuilder { public: diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc index ee4307c4d16..f1d421cbc00 100644 --- a/cpp/src/arrow/types/string-test.cc +++ b/cpp/src/arrow/types/string-test.cc @@ -34,8 +34,23 @@ namespace arrow { class Buffer; +TEST(TypesTest, BinaryType) { + BinaryType t1; + BinaryType e1; + StringType t2; + EXPECT_TRUE(t1.Equals(&e1)); + EXPECT_FALSE(t1.Equals(&t2)); + ASSERT_EQ(t1.type, Type::BINARY); + ASSERT_EQ(t1.ToString(), std::string("binary")); +} + TEST(TypesTest, TestCharType) { CharType t1(5); + CharType e1(5); + CharType ne1(6); + + ASSERT_TRUE(t1.Equals(&e1)); + EXPECT_FALSE(t1.Equals(&ne1)); ASSERT_EQ(t1.type, Type::CHAR); ASSERT_EQ(t1.size, 5); @@ -46,20 +61,8 @@ TEST(TypesTest, TestCharType) { CharType t2 = t1; ASSERT_EQ(t2.type, Type::CHAR); ASSERT_EQ(t2.size, 5); -} - -TEST(TypesTest, TestVarcharType) { - VarcharType t1(5); - - ASSERT_EQ(t1.type, Type::VARCHAR); - ASSERT_EQ(t1.size, 5); - - ASSERT_EQ(t1.ToString(), std::string("varchar(5)")); - // Test copy constructor - VarcharType t2 = t1; - ASSERT_EQ(t2.type, Type::VARCHAR); - ASSERT_EQ(t2.size, 5); + // Test CharType(5) } TEST(TypesTest, TestStringType) { @@ -119,6 +122,7 @@ class TestStringContainer : public ::testing::Test { TEST_F(TestStringContainer, TestArrayBasics) { ASSERT_EQ(length_, strings_->length()); ASSERT_EQ(1, strings_->null_count()); + ASSERT_OK(strings_->Validate()); } TEST_F(TestStringContainer, TestType) { @@ -163,7 +167,10 @@ class TestStringBuilder : public TestBuilder { builder_.reset(new StringBuilder(pool_, type_)); } - void Done() { result_ = std::dynamic_pointer_cast(builder_->Finish()); } + void Done() { + result_ = std::dynamic_pointer_cast(builder_->Finish()); + result_->Validate(); + } protected: TypePtr type_; @@ -216,4 +223,152 @@ TEST_F(TestStringBuilder, TestZeroLength) { Done(); } +// Binary container type +// TODO(emkornfield) there should be some way to refactor these to avoid code duplicating +// with String +class TestBinaryContainer : public ::testing::Test { + public: + void SetUp() { + chars_ = {'a', 'b', 'b', 'c', 'c', 'c'}; + offsets_ = {0, 1, 1, 1, 3, 6}; + valid_bytes_ = {1, 1, 0, 1, 1}; + expected_ = {"a", "", "", "bb", "ccc"}; + + MakeArray(); + } + + void MakeArray() { + length_ = offsets_.size() - 1; + int nchars = chars_.size(); + + value_buf_ = test::to_buffer(chars_); + values_ = ArrayPtr(new UInt8Array(nchars, value_buf_)); + + offsets_buf_ = test::to_buffer(offsets_); + + null_bitmap_ = test::bytes_to_null_buffer(valid_bytes_); + null_count_ = test::null_count(valid_bytes_); + + strings_ = std::make_shared( + length_, offsets_buf_, values_, null_count_, null_bitmap_); + } + + protected: + std::vector offsets_; + std::vector chars_; + std::vector valid_bytes_; + + std::vector expected_; + + std::shared_ptr value_buf_; + std::shared_ptr offsets_buf_; + std::shared_ptr null_bitmap_; + + int null_count_; + int length_; + + ArrayPtr values_; + std::shared_ptr strings_; +}; + +TEST_F(TestBinaryContainer, TestArrayBasics) { + ASSERT_EQ(length_, strings_->length()); + ASSERT_EQ(1, strings_->null_count()); + ASSERT_OK(strings_->Validate()); +} + +TEST_F(TestBinaryContainer, TestType) { + TypePtr type = strings_->type(); + + ASSERT_EQ(Type::BINARY, type->type); + ASSERT_EQ(Type::BINARY, strings_->type_enum()); +} + +TEST_F(TestBinaryContainer, TestListFunctions) { + int pos = 0; + for (size_t i = 0; i < expected_.size(); ++i) { + ASSERT_EQ(pos, strings_->value_offset(i)); + ASSERT_EQ(expected_[i].size(), strings_->value_length(i)); + pos += expected_[i].size(); + } +} + +TEST_F(TestBinaryContainer, TestDestructor) { + auto arr = std::make_shared( + length_, offsets_buf_, values_, null_count_, null_bitmap_); +} + +TEST_F(TestBinaryContainer, TestGetValue) { + for (size_t i = 0; i < expected_.size(); ++i) { + if (valid_bytes_[i] == 0) { + ASSERT_TRUE(strings_->IsNull(i)); + } else { + int32_t len = -1; + const uint8_t* bytes = strings_->GetValue(i, &len); + ASSERT_EQ(0, std::memcmp(expected_[i].data(), bytes, len)); + } + } +} + +class TestBinaryBuilder : public TestBuilder { + public: + void SetUp() { + TestBuilder::SetUp(); + type_ = TypePtr(new BinaryType()); + builder_.reset(new BinaryBuilder(pool_, type_)); + } + + void Done() { + result_ = std::dynamic_pointer_cast(builder_->Finish()); + result_->Validate(); + } + + protected: + TypePtr type_; + + std::unique_ptr builder_; + std::shared_ptr result_; +}; + +TEST_F(TestBinaryBuilder, TestScalarAppend) { + std::vector strings = {"", "bb", "a", "", "ccc"}; + std::vector is_null = {0, 0, 0, 1, 0}; + + int N = strings.size(); + int reps = 1000; + + for (int j = 0; j < reps; ++j) { + for (int i = 0; i < N; ++i) { + if (is_null[i]) { + builder_->AppendNull(); + } else { + builder_->Append( + reinterpret_cast(strings[i].data()), strings[i].size()); + } + } + } + Done(); + ASSERT_OK(result_->Validate()); + ASSERT_EQ(reps * N, result_->length()); + ASSERT_EQ(reps, result_->null_count()); + ASSERT_EQ(reps * 6, result_->values()->length()); + + int32_t length; + for (int i = 0; i < N * reps; ++i) { + if (is_null[i % N]) { + ASSERT_TRUE(result_->IsNull(i)); + } else { + ASSERT_FALSE(result_->IsNull(i)); + const uint8_t* vals = result_->GetValue(i, &length); + ASSERT_EQ(strings[i % N].size(), length); + ASSERT_EQ(0, std::memcmp(vals, strings[i % N].data(), length)); + } + } +} + +TEST_F(TestBinaryBuilder, TestZeroLength) { + // All buffers are null + Done(); +} + } // namespace arrow diff --git a/cpp/src/arrow/types/string.cc b/cpp/src/arrow/types/string.cc index 29d97d03947..3cf08f2abad 100644 --- a/cpp/src/arrow/types/string.cc +++ b/cpp/src/arrow/types/string.cc @@ -24,25 +24,54 @@ namespace arrow { +const std::shared_ptr BINARY(new BinaryType()); const std::shared_ptr STRING(new StringType()); +BinaryArray::BinaryArray(int32_t length, const std::shared_ptr& offsets, + const ArrayPtr& values, int32_t null_count, + const std::shared_ptr& null_bitmap) + : BinaryArray(BINARY, length, offsets, values, null_count, null_bitmap) {} + +BinaryArray::BinaryArray(const TypePtr& type, int32_t length, + const std::shared_ptr& offsets, const ArrayPtr& values, int32_t null_count, + const std::shared_ptr& null_bitmap) + : ListArray(type, length, offsets, values, null_count, null_bitmap), + bytes_(std::dynamic_pointer_cast(values).get()), + raw_bytes_(bytes_->raw_data()) {} + +Status BinaryArray::Validate() const { + if (values()->null_count() > 0) { + std::stringstream ss; + ss << type()->ToString() << " can have null values in the value array"; + Status::Invalid(ss.str()); + } + return ListArray::Validate(); +} + StringArray::StringArray(int32_t length, const std::shared_ptr& offsets, const ArrayPtr& values, int32_t null_count, const std::shared_ptr& null_bitmap) : StringArray(STRING, length, offsets, values, null_count, null_bitmap) {} -std::string CharType::ToString() const { - std::stringstream s; - s << "char(" << size << ")"; - return s.str(); +Status StringArray::Validate() const { + // TODO(emkornfield) Validate proper UTF8 code points? + return BinaryArray::Validate(); } -std::string VarcharType::ToString() const { - std::stringstream s; - s << "varchar(" << size << ")"; - return s.str(); +Status CharArray::Validate() const { + const int32_t num_slots = length(); + const int32_t fixed_length = std::static_pointer_cast(type())->size; + for (int i = 0; i < num_slots; ++i) { + int str_len = value_length(i); + if (str_len != fixed_length) { + std::stringstream ss; + ss << "String length ( " << str_len << ") != fixed size (" << fixed_length + << ") at element: " << i; + } + } + return StringArray::Validate(); } -TypePtr StringBuilder::value_type_ = TypePtr(new UInt8Type()); +TypePtr BinaryBuilder::value_type_ = TypePtr(new UInt8Type()); } // namespace arrow diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index d2d3c5b6b5a..1cce5e5b382 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -34,85 +34,141 @@ namespace arrow { class Buffer; class MemoryPool; -struct CharType : public DataType { - int size; - - explicit CharType(int size) : DataType(Type::CHAR), size(size) {} - - CharType(const CharType& other) : CharType(other.size) {} - - virtual std::string ToString() const; -}; - -// Variable-length, null-terminated strings, up to a certain length -struct VarcharType : public DataType { - int size; - - explicit VarcharType(int size) : DataType(Type::VARCHAR), size(size) {} - VarcharType(const VarcharType& other) : VarcharType(other.size) {} - - virtual std::string ToString() const; -}; - -// TODO(wesm): add a BinaryArray layer in between -class StringArray : public ListArray { +class BinaryArray : public ListArray { public: - StringArray(const TypePtr& type, int32_t length, const std::shared_ptr& offsets, + BinaryArray(int32_t length, const std::shared_ptr& offsets, const ArrayPtr& values, int32_t null_count = 0, - const std::shared_ptr& null_bitmap = nullptr) - : ListArray(type, length, offsets, values, null_count, null_bitmap) { - // For convenience - bytes_ = static_cast(values.get()); - raw_bytes_ = bytes_->raw_data(); - } - - StringArray(int32_t length, const std::shared_ptr& offsets, + const std::shared_ptr& null_bitmap = nullptr); + // Constructor that allows sub-classes/builders to propagate there logical type up the + // class hierarchy. + BinaryArray(const TypePtr& type, int32_t length, const std::shared_ptr& offsets, const ArrayPtr& values, int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); - // Compute the pointer t + // Return the pointer to the given elements bytes + // TODO(emkornfield) introduce a StringPiece or something similar to capture zero-copy + // pointer + offset const uint8_t* GetValue(int i, int32_t* out_length) const { - int32_t pos = offsets_[i]; + DCHECK(out_length); + const int32_t pos = offsets_[i]; *out_length = offsets_[i + 1] - pos; return raw_bytes_ + pos; } + Status Validate() const override; + + private: + UInt8Array* bytes_; + const uint8_t* raw_bytes_; +}; + +class StringArray : public BinaryArray { + public: + StringArray(int32_t length, const std::shared_ptr& offsets, + const ArrayPtr& values, int32_t null_count = 0, + const std::shared_ptr& null_bitmap = nullptr); + // Constructor that allows overriding the logical type, so subclasses can propagate + // there + // up the class hierarchy. + StringArray(const TypePtr& type, int32_t length, const std::shared_ptr& offsets, + const ArrayPtr& values, int32_t null_count = 0, + const std::shared_ptr& null_bitmap = nullptr) + : BinaryArray(type, length, offsets, values, null_count, null_bitmap) {} + // Construct a std::string + // TODO: std::bad_alloc possibility std::string GetString(int i) const { int32_t nchars; const uint8_t* str = GetValue(i, &nchars); return std::string(reinterpret_cast(str), nchars); } + Status Validate() const override; +}; + +class CharArray : public StringArray { + public: + CharArray(const TypePtr& type, int32_t length, const std::shared_ptr& offsets, + const ArrayPtr& values, int32_t null_count = 0, + const std::shared_ptr& null_bitmap = nullptr) + : StringArray(type, length, offsets, values, null_count, null_bitmap), + char_type(std::dynamic_pointer_cast(type).get()) {} + + // The fixed size of each string + // (TODO), right now this is in bytes, DB systems seems to + // generally store this as characters. Which would mean 3-6x the number of bytes + // stored) + int32_t string_size() const { return char_type->size; } + Status Validate() const override; + private: - UInt8Array* bytes_; - const uint8_t* raw_bytes_; + // for convenience + CharType* char_type; }; -// String builder -class StringBuilder : public ListBuilder { +// BinaryBuilder : public ListBuilder +class BinaryBuilder : public ListBuilder { public: - explicit StringBuilder(MemoryPool* pool, const TypePtr& type) + explicit BinaryBuilder(MemoryPool* pool, const TypePtr& type) : ListBuilder(pool, std::make_shared(pool, value_type_), type) { byte_builder_ = static_cast(value_builder_.get()); } + Status Append(const uint8_t* value, int32_t length) { + RETURN_NOT_OK(ListBuilder::Append()); + return byte_builder_->Append(value, length); + } + + std::shared_ptr Finish() override { + return ListBuilder::Transfer(); + } + + protected: + UInt8Builder* byte_builder_; + static TypePtr value_type_; +}; + +// String builder +class StringBuilder : public BinaryBuilder { + public: + explicit StringBuilder(MemoryPool* pool, const TypePtr& type) + : BinaryBuilder(pool, type) {} + Status Append(const std::string& value) { return Append(value.c_str(), value.size()); } Status Append(const char* value, int32_t length) { - RETURN_NOT_OK(ListBuilder::Append()); - return byte_builder_->Append(reinterpret_cast(value), length); + return BinaryBuilder::Append(reinterpret_cast(value), length); } + Status Append(const std::vector& values, uint8_t* null_bytes); std::shared_ptr Finish() override { return ListBuilder::Transfer(); } +}; - protected: - UInt8Builder* byte_builder_; +// Fixed width char builder +class CharBuilder : public StringBuilder { + public: + explicit CharBuilder(MemoryPool* pool, const std::shared_ptr& type) + : StringBuilder(pool, std::static_pointer_cast(type)) { + DCHECK(type); + fixed_length_ = type->size; + } - static TypePtr value_type_; + Status Append(const std::string& value) { return Append(value.c_str(), value.size()); } + + Status Append(const char* value, int32_t length) { + if (length != fixed_length_) { + return Status::Invalid("Invalid length specified for fixed character type"); + } + return BinaryBuilder::Append(reinterpret_cast(value), length); + } + + std::shared_ptr Finish() override { return ListBuilder::Transfer(); } + + private: + int32_t fixed_length_; }; } // namespace arrow diff --git a/cpp/src/arrow/util/macros.h b/cpp/src/arrow/util/macros.h index 51e605ee50a..69ecda16ceb 100644 --- a/cpp/src/arrow/util/macros.h +++ b/cpp/src/arrow/util/macros.h @@ -21,6 +21,6 @@ // From Google gutil #define DISALLOW_COPY_AND_ASSIGN(TypeName) \ TypeName(const TypeName&) = delete; \ - void operator=(const TypeName&) = delete + TypeName& operator=(const TypeName&) = delete #endif // ARROW_UTIL_MACROS_H diff --git a/format/Message.fbs b/format/Message.fbs index fc849eedf79..532e3e03349 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -37,6 +37,12 @@ table FloatingPoint { table Utf8 { } +/// Equivelant of a CHAR(N) +/// in databases +table FixedWidthUtf8 { + width: int; +} + table Binary { } @@ -72,7 +78,8 @@ union Type { List, Tuple, Union, - JSONScalar + JSONScalar, + FixedWidthUtf8 } /// ---------------------------------------------------------------------- From 58bfcc960d678eb1fd6e5be778d0c49d4ada15c5 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 7 Jun 2016 08:18:45 +0000 Subject: [PATCH 2/4] fix style of char_type_ --- cpp/src/arrow/types/string.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index 1cce5e5b382..2db167b8271 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -92,18 +92,18 @@ class CharArray : public StringArray { const ArrayPtr& values, int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr) : StringArray(type, length, offsets, values, null_count, null_bitmap), - char_type(std::dynamic_pointer_cast(type).get()) {} + char_type_(std::dynamic_pointer_cast(type).get()) {} // The fixed size of each string // (TODO), right now this is in bytes, DB systems seems to // generally store this as characters. Which would mean 3-6x the number of bytes // stored) - int32_t string_size() const { return char_type->size; } + int32_t string_size() const { return char_type_->size; } Status Validate() const override; private: // for convenience - CharType* char_type; + CharType* char_type_; }; // BinaryBuilder : public ListBuilder From 6f0634ca57a755711683cd7c85b82a6e655196e9 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 15 Jun 2016 04:13:59 +0000 Subject: [PATCH 3/4] remove char type and add dcheck --- cpp/src/arrow/type.cc | 6 ---- cpp/src/arrow/type.h | 20 -------------- cpp/src/arrow/types/construct.cc | 2 -- cpp/src/arrow/types/string-test.cc | 21 -------------- cpp/src/arrow/types/string.cc | 19 +++---------- cpp/src/arrow/types/string.h | 44 ------------------------------ format/Message.fbs | 9 +----- 7 files changed, 5 insertions(+), 116 deletions(-) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index ad5829a119e..4fd50b7c193 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -59,12 +59,6 @@ std::string BinaryType::ToString() const { return std::string(name()); } -std::string CharType::ToString() const { - std::stringstream s; - s << "char(" << size << ")"; - return s.str(); -} - std::string StructType::ToString() const { std::stringstream s; s << "struct<"; diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index e92c007c8f4..8fb41211ba9 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -55,9 +55,6 @@ struct Type { // 8-byte floating point value DOUBLE = 11, - // CHAR(N): fixed-length UTF8 string with length N - CHAR = 12, - // UTF8 variable-length string as List STRING = 13, @@ -286,23 +283,6 @@ struct StringType : public BinaryType { explicit StringType(Type::type logical_type) : BinaryType(logical_type) {} }; -struct CharType : public StringType { - int size; - - explicit CharType(int size) : StringType(Type::CHAR), size(size) {} - - CharType(const CharType& other) : CharType(other.size) {} - bool Equals(const DataType* other) const override { - return StringType::Equals(other) && - (size == static_cast(other)->size); - } - static char const* name() { return "char"; } - std::string ToString() const override; - // TODO(emkornfield) This is byte size not character size, which generally isn't how - // DBMSes work. - int value_size() const override { return size; } -}; - struct StructType : public DataType { explicit StructType(const std::vector>& fields) : DataType(Type::STRUCT) { diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index dd6dd9ae6b1..2d913a73748 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -127,8 +127,6 @@ Status MakeListArray(const TypePtr& type, int32_t length, case Type::LIST: out->reset(new ListArray(type, length, offsets, values, null_count, null_bitmap)); break; - case Type::CHAR: - out->reset(new CharArray(type, length, offsets, values, null_count, null_bitmap)); case Type::DECIMAL_TEXT: case Type::STRING: out->reset(new StringArray(type, length, offsets, values, null_count, null_bitmap)); diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc index f1d421cbc00..a141fc11321 100644 --- a/cpp/src/arrow/types/string-test.cc +++ b/cpp/src/arrow/types/string-test.cc @@ -44,27 +44,6 @@ TEST(TypesTest, BinaryType) { ASSERT_EQ(t1.ToString(), std::string("binary")); } -TEST(TypesTest, TestCharType) { - CharType t1(5); - CharType e1(5); - CharType ne1(6); - - ASSERT_TRUE(t1.Equals(&e1)); - EXPECT_FALSE(t1.Equals(&ne1)); - - ASSERT_EQ(t1.type, Type::CHAR); - ASSERT_EQ(t1.size, 5); - - ASSERT_EQ(t1.ToString(), std::string("char(5)")); - - // Test copy constructor - CharType t2 = t1; - ASSERT_EQ(t2.type, Type::CHAR); - ASSERT_EQ(t2.size, 5); - - // Test CharType(5) -} - TEST(TypesTest, TestStringType) { StringType str; ASSERT_EQ(str.type, Type::STRING); diff --git a/cpp/src/arrow/types/string.cc b/cpp/src/arrow/types/string.cc index 3cf08f2abad..da02c7d1d8a 100644 --- a/cpp/src/arrow/types/string.cc +++ b/cpp/src/arrow/types/string.cc @@ -37,7 +37,10 @@ BinaryArray::BinaryArray(const TypePtr& type, int32_t length, const std::shared_ptr& null_bitmap) : ListArray(type, length, offsets, values, null_count, null_bitmap), bytes_(std::dynamic_pointer_cast(values).get()), - raw_bytes_(bytes_->raw_data()) {} + raw_bytes_(bytes_->raw_data()) { + // Check in case the dynamic cast fails. + DCHECK(bytes_); +} Status BinaryArray::Validate() const { if (values()->null_count() > 0) { @@ -58,20 +61,6 @@ Status StringArray::Validate() const { return BinaryArray::Validate(); } -Status CharArray::Validate() const { - const int32_t num_slots = length(); - const int32_t fixed_length = std::static_pointer_cast(type())->size; - for (int i = 0; i < num_slots; ++i) { - int str_len = value_length(i); - if (str_len != fixed_length) { - std::stringstream ss; - ss << "String length ( " << str_len << ") != fixed size (" << fixed_length - << ") at element: " << i; - } - } - return StringArray::Validate(); -} - TypePtr BinaryBuilder::value_type_ = TypePtr(new UInt8Type()); } // namespace arrow diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index 2db167b8271..b3c00d298b3 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -86,26 +86,6 @@ class StringArray : public BinaryArray { Status Validate() const override; }; -class CharArray : public StringArray { - public: - CharArray(const TypePtr& type, int32_t length, const std::shared_ptr& offsets, - const ArrayPtr& values, int32_t null_count = 0, - const std::shared_ptr& null_bitmap = nullptr) - : StringArray(type, length, offsets, values, null_count, null_bitmap), - char_type_(std::dynamic_pointer_cast(type).get()) {} - - // The fixed size of each string - // (TODO), right now this is in bytes, DB systems seems to - // generally store this as characters. Which would mean 3-6x the number of bytes - // stored) - int32_t string_size() const { return char_type_->size; } - Status Validate() const override; - - private: - // for convenience - CharType* char_type_; -}; - // BinaryBuilder : public ListBuilder class BinaryBuilder : public ListBuilder { public: @@ -147,30 +127,6 @@ class StringBuilder : public BinaryBuilder { } }; -// Fixed width char builder -class CharBuilder : public StringBuilder { - public: - explicit CharBuilder(MemoryPool* pool, const std::shared_ptr& type) - : StringBuilder(pool, std::static_pointer_cast(type)) { - DCHECK(type); - fixed_length_ = type->size; - } - - Status Append(const std::string& value) { return Append(value.c_str(), value.size()); } - - Status Append(const char* value, int32_t length) { - if (length != fixed_length_) { - return Status::Invalid("Invalid length specified for fixed character type"); - } - return BinaryBuilder::Append(reinterpret_cast(value), length); - } - - std::shared_ptr Finish() override { return ListBuilder::Transfer(); } - - private: - int32_t fixed_length_; -}; - } // namespace arrow #endif // ARROW_TYPES_STRING_H diff --git a/format/Message.fbs b/format/Message.fbs index 532e3e03349..fc849eedf79 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -37,12 +37,6 @@ table FloatingPoint { table Utf8 { } -/// Equivelant of a CHAR(N) -/// in databases -table FixedWidthUtf8 { - width: int; -} - table Binary { } @@ -78,8 +72,7 @@ union Type { List, Tuple, Union, - JSONScalar, - FixedWidthUtf8 + JSONScalar } /// ---------------------------------------------------------------------- From 441481611143b8da8271b776a301a6e355246278 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 16 Jun 2016 03:48:10 +0000 Subject: [PATCH 4/4] remove CHAR from parquet --- cpp/src/arrow/parquet/schema.cc | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cpp/src/arrow/parquet/schema.cc b/cpp/src/arrow/parquet/schema.cc index fd758940c9f..c7979db3494 100644 --- a/cpp/src/arrow/parquet/schema.cc +++ b/cpp/src/arrow/parquet/schema.cc @@ -250,11 +250,6 @@ Status FieldToNode(const std::shared_ptr& field, NodePtr* out) { case Type::DOUBLE: type = ParquetType::DOUBLE; break; - case Type::CHAR: - type = ParquetType::FIXED_LEN_BYTE_ARRAY; - logical_type = LogicalType::UTF8; - length = static_cast(field->type.get())->size; - break; case Type::STRING: type = ParquetType::BYTE_ARRAY; logical_type = LogicalType::UTF8;