diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index 1d3edf0117f..970c31476fc 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -220,7 +220,8 @@ static Status FieldToFlatbuffer( auto fb_children = fbb.CreateVector(children); *offset = flatbuf::CreateField( - fbb, fb_name, field->nullable, type_enum, type_data, fb_children); + fbb, fb_name, field->nullable, type_enum, type_data, field->dictionary, + fb_children); return Status::OK(); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 4cb37fd1dea..02677d5e18b 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -144,8 +144,13 @@ struct ARROW_EXPORT Field { // Fields can be nullable bool nullable; - Field(const std::string& name, const TypePtr& type, bool nullable = true) - : name(name), type(type), nullable(nullable) {} + // Optional dictionary id, set when the field is dictionary encoded. + // 0 means the field is not dictionary encoded. + int64_t dictionary; + + Field(const std::string& name, const TypePtr& type, bool nullable = true, + int64_t dictionary = 0) + : name(name), type(type), nullable(nullable), dictionary(dictionary) {} bool operator==(const Field& other) const { return this->Equals(other); } @@ -154,7 +159,7 @@ struct ARROW_EXPORT Field { bool Equals(const Field& other) const { return (this == &other) || (this->name == other.name && this->nullable == other.nullable && - this->type->Equals(other.type.get())); + this->dictionary == other.dictionary && this->type->Equals(other.type.get())); } bool Equals(const std::shared_ptr<Field>& other) const { return Equals(*other.get()); } diff --git a/format/Layout.md b/format/Layout.md index 5eaefeebf21..a953930e172 100644 --- a/format/Layout.md +++ b/format/Layout.md @@ -583,6 +583,43 @@ even if the null bitmap of the parent union array indicates the slot is null. 
Additionally, a child array may have a non-null slot even if the the types array indicates that a slot contains a different type at the index. +## Dictionary encoding + +When a field is dictionary encoded, the values are represented by an array of Int32 indices, each giving the position of the corresponding value in the dictionary. +The Dictionary is received as a DictionaryBatch whose id is referenced by a dictionary attribute defined in the metadata (Message.fbs) in the Field table. +The dictionary has the same layout as the type of the field would dictate. Each entry in the dictionary can be accessed by its index in the DictionaryBatch. +When a Schema references a Dictionary id, a DictionaryBatch for this id must be sent before any RecordBatch. + +As an example, you could have the following data: +``` +type: List + +[ + ['a', 'b'], + ['a', 'b'], + ['a', 'b'], + ['c', 'd', 'e'], + ['c', 'd', 'e'], + ['c', 'd', 'e'], + ['c', 'd', 'e'], + ['a', 'b'] +] +``` +In dictionary-encoded form, this could appear as: +``` +data List (dictionary-encoded, dictionary id i) +indices: [0, 0, 0, 1, 1, 1, 1, 0] + +dictionary i + +type: List + +[ + ['a', 'b'], + ['c', 'd', 'e'], +] +``` + ## References Apache Drill Documentation - [Value Vectors][6] diff --git a/format/Message.fbs b/format/Message.fbs index 3f688c156e3..d650dd4c2c7 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -84,6 +84,10 @@ table Field { name: string; nullable: bool; type: Type; + // present only if the field is dictionary encoded + // will point to a dictionary provided by a DictionaryBatch message + dictionary: long; + // children apply only to Nested data types like Struct, List and Union children: [Field]; } @@ -165,8 +169,8 @@ table RecordBatch { /// For sending dictionary encoding information. Any Field can be /// dictionary-encoded, but in this case none of its children may be /// dictionary-encoded. 
+/// There is one dictionary batch per dictionary /// -/// TODO(wesm): To be documented in more detail table DictionaryBatch { id: long;