From 8c279436a1dbb45ea0bdb9225ceb4fa3020f16ae Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Fri, 12 Aug 2016 16:54:40 -0700 Subject: [PATCH 1/3] ARROW-255: Finalize Dictionary representation --- cpp/src/arrow/ipc/metadata-internal.cc | 2 +- cpp/src/arrow/type.h | 10 +++++++--- format/Layout.md | 6 ++++++ format/Message.fbs | 4 ++++ 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index 1d3edf0117f..778db94e5b5 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -220,7 +220,7 @@ static Status FieldToFlatbuffer( auto fb_children = fbb.CreateVector(children); *offset = flatbuf::CreateField( - fbb, fb_name, field->nullable, type_enum, type_data, fb_children); + fbb, fb_name, field->nullable, type_enum, type_data, field->dictionary, fb_children); return Status::OK(); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 4cb37fd1dea..a461fed3488 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -144,8 +144,12 @@ struct ARROW_EXPORT Field { // Fields can be nullable bool nullable; - Field(const std::string& name, const TypePtr& type, bool nullable = true) - : name(name), type(type), nullable(nullable) {} + // optional dictionary id if the field is dictionary encoded + // 0 means it's not dictionary encoded + int64_t dictionary; + + Field(const std::string& name, const TypePtr& type, bool nullable = true, int64_t dictionary = 0) + : name(name), type(type), nullable(nullable), dictionary(dictionary) {} bool operator==(const Field& other) const { return this->Equals(other); } @@ -154,7 +158,7 @@ struct ARROW_EXPORT Field { bool Equals(const Field& other) const { return (this == &other) || (this->name == other.name && this->nullable == other.nullable && - this->type->Equals(other.type.get())); + this->dictionary == other.dictionary && this->type->Equals(other.type.get())); } bool Equals(const std::shared_ptr& 
other) const { return Equals(*other.get()); } diff --git a/format/Layout.md b/format/Layout.md index 5eaefeebf21..5a18acebc48 100644 --- a/format/Layout.md +++ b/format/Layout.md @@ -583,6 +583,12 @@ even if the null bitmap of the parent union array indicates the slot is null. Additionally, a child array may have a non-null slot even if the the types array indicates that a slot contains a different type at the index. +## Dictionary encoding + +When a field is dictionary encoded, the values are represented by an array of Int32 representing the index of the value in the dictionary. +The Dictionary is received as a DIctionaryBacth whose id is referenced byt the dictionary field in the Field. +The dictionary has the same layout as the type of the field would dictate. Each entry in the dictionary can be accessed by its indexed in the DictionaryBatch. + ## References Apache Drill Documentation - [Value Vectors][6] diff --git a/format/Message.fbs b/format/Message.fbs index 3f688c156e3..14794ef2baf 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -84,6 +84,10 @@ table Field { name: string; nullable: bool; type: Type; + // present only if the field is dictionary encoded + // will point to a dictionary provided by a DictionaryBatch message + dictionary: long; + // children apply only to Nested data types like Struct, List and Union children: [Field]; } From e28a3c853163e48c2188e0c1f350e797f04559ea Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Sat, 20 Aug 2016 10:55:55 -0700 Subject: [PATCH 2/3] ARROW-255: review feedback --- format/Layout.md | 35 +++++++++++++++++++++++++++++++++-- format/Message.fbs | 2 +- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/format/Layout.md b/format/Layout.md index 5a18acebc48..91f25db2401 100644 --- a/format/Layout.md +++ b/format/Layout.md @@ -586,8 +586,39 @@ the the types array indicates that a slot contains a different type at the index ## Dictionary encoding When a field is dictionary encoded, the values are 
represented by an array of Int32 representing the index of the value in the dictionary. -The Dictionary is received as a DIctionaryBacth whose id is referenced byt the dictionary field in the Field. -The dictionary has the same layout as the type of the field would dictate. Each entry in the dictionary can be accessed by its indexed in the DictionaryBatch. +The Dictionary is received as a DictionaryBatch whose id is referenced by a dictionary attribute defined in the metadata (Message.fbs) in the Field table. +The dictionary has the same layout as the type of the field would dictate. Each entry in the dictionary can be accessed by its index in the DictionaryBatch. +When a Schema references a Dictionary id, it must dend a DictionaryBatch for this id before any RecordBatch. + +As an example, you could have the following data: +``` +type: List + +[ + ['a', 'b'], + ['a', 'b'], + ['a', 'b'], + ['c', 'd', 'e'], + ['c', 'd', 'e'], + ['c', 'd', 'e'], + ['c', 'd', 'e'], + ['a', 'b'] +] +``` +In dictionary-encoded form, this could appear as: +``` +data List (dictionary-encoded, dictionary id i) +indices: [0, 0, 0, 1, 1, 1, 1, 0] + +dictionary i + +type: List + +[ + ['a', 'b'], + ['c', 'd', 'e'], +] +``` ## References diff --git a/format/Message.fbs b/format/Message.fbs index 14794ef2baf..d650dd4c2c7 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -169,8 +169,8 @@ table RecordBatch { /// For sending dictionary encoding information. Any Field can be /// dictionary-encoded, but in this case none of its children may be /// dictionary-encoded. 
+/// There is one dictionary batch per dictionary /// -/// TODO(wesm): To be documented in more detail table DictionaryBatch { id: long; From 316745d98d89a1d640dd2fdaa5306a5a194e03e3 Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Sat, 20 Aug 2016 12:38:08 -0700 Subject: [PATCH 3/3] ARROW-255: fix typo and linter errors --- cpp/src/arrow/ipc/metadata-internal.cc | 3 ++- cpp/src/arrow/type.h | 3 ++- format/Layout.md | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index 778db94e5b5..970c31476fc 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -220,7 +220,8 @@ static Status FieldToFlatbuffer( auto fb_children = fbb.CreateVector(children); *offset = flatbuf::CreateField( - fbb, fb_name, field->nullable, type_enum, type_data, field->dictionary, fb_children); + fbb, fb_name, field->nullable, type_enum, type_data, field->dictionary, + fb_children); return Status::OK(); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index a461fed3488..02677d5e18b 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -148,7 +148,8 @@ struct ARROW_EXPORT Field { // 0 means it's not dictionary encoded int64_t dictionary; - Field(const std::string& name, const TypePtr& type, bool nullable = true, int64_t dictionary = 0) + Field(const std::string& name, const TypePtr& type, bool nullable = true, + int64_t dictionary = 0) : name(name), type(type), nullable(nullable), dictionary(dictionary) {} bool operator==(const Field& other) const { return this->Equals(other); } diff --git a/format/Layout.md b/format/Layout.md index 91f25db2401..a953930e172 100644 --- a/format/Layout.md +++ b/format/Layout.md @@ -588,7 +588,7 @@ the the types array indicates that a slot contains a different type at the index When a field is dictionary encoded, the values are represented by an array of Int32 representing the index of the value in the 
dictionary. The Dictionary is received as a DictionaryBatch whose id is referenced by a dictionary attribute defined in the metadata (Message.fbs) in the Field table. The dictionary has the same layout as the type of the field would dictate. Each entry in the dictionary can be accessed by its index in the DictionaryBatch. -When a Schema references a Dictionary id, it must dend a DictionaryBatch for this id before any RecordBatch. +When a Schema references a Dictionary id, it must send a DictionaryBatch for this id before any RecordBatch. As an example, you could have the following data: ```