diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h
new file mode 100644
index 00000000000000..0d8fd722541ec0
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef DORIS_BE_SRC_OLAP_ROWSET_SEGMENT_V2_COLUMN_READER_H
+#define DORIS_BE_SRC_OLAP_ROWSET_SEGMENT_V2_COLUMN_READER_H
+
+#include <cstddef>
+
+#include "runtime/vectorized_row_batch.h"
+#include "common/status.h"
+#include "olap/rowset/segment_v2/common.h"
+
+namespace doris {
+
+namespace segment_v2 {
+
+// ColumnReader declares the operations for reading one column: seeking to
+// an entry and fetching values in batches. next_batch() is pure virtual,
+// so the class cannot be instantiated directly.
+class ColumnReader {
+public:
+    ColumnReader() { }
+
+    // Virtual because the class has a pure virtual member: deleting a derived
+    // reader through a ColumnReader* would otherwise be undefined behavior.
+    virtual ~ColumnReader() { }
+
+    // Initialize the reader.
+    doris::Status init();
+
+    // Seek to the first entry in the column.
+    doris::Status seek_to_first();
+
+    // Seek to the given ordinal entry in the column.
+    // Entry 0 is the first entry written to the column.
+    // If the provided seek point is past the end of the file,
+    // a non-OK Status is expected (the function returns Status, not bool).
+    doris::Status seek_to_ordinal(rowid_t ord_idx);
+
+    // Fetch the next vector of values from the page into 'dst'.
+    // The output vector must have space for up to n cells.
+    //
+    // The number of entries is reported back to the caller (presumably
+    // through 'n', which is passed by pointer -- TODO confirm).
+    //
+    // In the case that the values are themselves references
+    // to other memory (eg Slices), the referred-to memory is
+    // allocated in the dst column vector's arena.
+    // NOTE(review): a 'mem_pool' is also passed in; confirm which of the
+    // two actually owns that memory.
+    virtual doris::Status next_batch(size_t* n, doris::ColumnVector* dst, doris::MemPool* mem_pool) = 0;
+
+    // Get the current ordinal.
+    // NOTE(review): "oridinal" in the name is a misspelling of "ordinal";
+    // it is kept as-is so the declared interface does not change.
+    size_t get_current_oridinal();
+
+    // Call this function every time before next_batch.
+    // This function will preload pages from disk into memory if necessary.
+    doris::Status prepare_batch(size_t n);
+
+    // Release the resources related to next_batch.
+    doris::Status finish_batch();
+};
+
+} // namespace segment_v2
+
+} // namespace doris
+
+#endif // DORIS_BE_SRC_OLAP_ROWSET_SEGMENT_V2_COLUMN_READER_H
diff --git a/be/src/olap/rowset/segment_v2/column_writer.h b/be/src/olap/rowset/segment_v2/column_writer.h
new file mode 100644
index 00000000000000..d85ac5f669d507
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/column_writer.h
@@ -0,0 +1,75 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef DORIS_BE_SRC_OLAP_ROWSET_SEGMENT_V2_COLUMN_WRITER_H
+#define DORIS_BE_SRC_OLAP_ROWSET_SEGMENT_V2_COLUMN_WRITER_H
+
+#include <cstddef>
+#include <vector>
+
+#include "gen_cpp/doris.pb.h"
+#include "gen_cpp/segment_v2.pb.h"
+#include "util/slice.h"
+#include "common/status.h"
+#include "olap/rowset/segment_v2/options.h"
+
+namespace doris {
+
+// Only used through a pointer below, so a forward declaration is enough.
+class RowBlock;
+
+namespace segment_v2 {
+
+// ColumnWriter is used to write data of a column
+class ColumnWriter {
+public:
+    // 'builder_options' is copied into the writer. 'column_schema' is stored
+    // as a raw pointer; this class declares no destructor, so nothing here
+    // releases it and it must stay valid while the writer is in use.
+    explicit ColumnWriter(BuilderOptions builder_options, ColumnSchemaPB* column_schema)
+            : _builder_options(builder_options),
+              _column_schema(column_schema) { }
+
+    // Initialize the writer.
+    doris::Status init();
+
+    // close the writer
+    doris::Status finish();
+
+    // Caller will loop all the ColumnWriter and call the following get page api
+    // to get page data and get the page pointer
+    // NOTE(review): the element type of the vector is inferred from the
+    // single-page getters below, which hand pages out as doris::Slice;
+    // confirm against the implementation.
+    doris::Status get_data_pages(std::vector<doris::Slice>* data_buffers);
+
+    // Get the dictionary page for under dictionary encoding mode column.
+    doris::Status get_dictionary_page(doris::Slice* dictionary_page);
+
+    // Get the bloom filter pages for under bloom filter indexed column.
+    // NOTE(review): element type inferred, see get_data_pages().
+    doris::Status get_bloom_filter_pages(std::vector<doris::Slice>* bf_pages);
+
+    // Get the bitmap page for under bitmap indexed column.
+    doris::Status get_bitmap_page(doris::Slice* bitmap_page);
+
+    // Get the statistic page for under statistic column.
+    doris::Status get_statistic_page(doris::Slice* statistic_page);
+
+    // Write the given block through this writer.
+    doris::Status write_batch(doris::RowBlock* block);
+
+    // Size written so far (presumably in bytes -- TODO confirm).
+    size_t written_size() const;
+
+    // Number of values written so far.
+    size_t written_value_count() const;
+
+private:
+    BuilderOptions _builder_options;
+    ColumnSchemaPB* _column_schema;
+};
+
+} // namespace segment_v2
+
+} // namespace doris
+
+
+#endif // DORIS_BE_SRC_OLAP_ROWSET_SEGMENT_V2_COLUMN_WRITER_H
diff --git a/be/src/olap/rowset/segment_v2/common.h b/be/src/olap/rowset/segment_v2/common.h
new file mode 100644
index 00000000000000..abbc2baafd4ea2
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/common.h
@@ -0,0 +1,31 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. 
See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef DORIS_BE_SRC_OLAP_ROWSET_SEGMENT_V2_COMMON_H
+#define DORIS_BE_SRC_OLAP_ROWSET_SEGMENT_V2_COMMON_H
+
+// uint32_t is only guaranteed to be declared once <cstdint> is included.
+#include <cstdint>
+
+namespace doris {
+
+namespace segment_v2 {
+
+// Type used for row ids (ordinals) by the segment_v2 headers:
+// an unsigned 32-bit integer.
+using rowid_t = uint32_t;
+
+} // namespace segment_v2
+
+} // namespace doris
+
+#endif // DORIS_BE_SRC_OLAP_ROWSET_SEGMENT_V2_COMMON_H
diff --git a/be/src/olap/rowset/segment_v2/options.h b/be/src/olap/rowset/segment_v2/options.h
new file mode 100644
index 00000000000000..f159a21addcec5
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/options.h
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. 
See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef DORIS_BE_SRC_OLAP_ROWSET_SEGMENT_V2_OPTIONS_H
+#define DORIS_BE_SRC_OLAP_ROWSET_SEGMENT_V2_OPTIONS_H
+
+// size_t is used below; include its header instead of relying on the
+// generated protobuf header to pull it in.
+#include <cstddef>
+
+#include "gen_cpp/segment_v2.pb.h"
+
+namespace doris {
+
+namespace segment_v2 {
+
+// Plain bundle of settings for building the pages of a column.
+// NOTE: no member has a default initializer, so every field must be set
+// by the code that creates a BuilderOptions.
+struct BuilderOptions {
+    // Size of a data page (presumably a byte threshold -- TODO confirm).
+    size_t data_page_size;
+
+    // Size of a dictionary page (presumably a byte threshold -- TODO confirm).
+    size_t dict_page_size;
+
+    // Presumably: whether a positional (ordinal) index is written -- TODO confirm.
+    bool write_posidx;
+
+    // Encoding to use, one of the EncodingTypePB values.
+    EncodingTypePB encoding;
+
+    // Compression to use, one of the CompressionTypePB values.
+    CompressionTypePB compression_type;
+
+    // Presumably: whether the column may contain nulls -- TODO confirm.
+    bool is_nullable;
+
+    // Presumably: whether the column has a dictionary page -- TODO confirm.
+    bool has_dictionary;
+};
+
+} // namespace segment_v2
+
+} // namespace doris
+
+#endif // DORIS_BE_SRC_OLAP_ROWSET_SEGMENT_V2_OPTIONS_H
diff --git a/be/src/olap/rowset/segment_v2/ordinal_index.h b/be/src/olap/rowset/segment_v2/ordinal_index.h
new file mode 100644
index 00000000000000..cbbd235e35166e
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/ordinal_index.h
@@ -0,0 +1,81 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef DORIS_BE_SRC_OLAP_ROWSET_SEGMENT_V2_ORDINAL_INDEX_H
+#define DORIS_BE_SRC_OLAP_ROWSET_SEGMENT_V2_ORDINAL_INDEX_H
+
+// The includes live inside the include guard so that a repeated inclusion
+// of this header is skipped as a whole.
+#include <cstddef>
+#include <memory>
+
+#include "util/slice.h"
+#include "gen_cpp/segment_v2.pb.h"
+#include "common/status.h"
+#include "olap/rowset/segment_v2/common.h"
+
+namespace doris {
+
+namespace segment_v2 {
+
+// Declared further down; named here because OrdinalIndexReader returns it.
+class OrdinalIndex;
+
+// Reads an ordinal index (rowid -> page pointer entries) from its
+// serialized form.
+class OrdinalIndexReader {
+public:
+    // parse the data
+    doris::Status init(const Slice& data);
+
+    // return the entry number of the index
+    size_t count();
+
+    // compare the row_id in idx_in_block to the row_id
+    // (presumably <0 / 0 / >0 like a three-way comparison -- TODO confirm)
+    int compare_key(int idx_in_block, const rowid_t row_id);
+
+    // get the OrdinalIndex from the reader
+    // NOTE(review): the name says "short key" although an OrdinalIndex is
+    // returned; it is kept so the declared interface does not change.
+    std::unique_ptr<OrdinalIndex> get_short_key_index();
+};
+
+// Builds the serialized form of an ordinal index.
+class OrdinalIndexWriter {
+public:
+    doris::Status init();
+
+    // add a rowid -> page_pointer entry to the index
+    doris::Status add_entry(rowid_t rowid, const PagePointerPB& page_pointer);
+
+    // return the index data
+    doris::Slice finish();
+};
+
+// Cursor over the entries exposed by an OrdinalIndexReader.
+class OrdinalIndex {
+public:
+    // NOTE(review): no member declared below stores 'reader'; confirm how
+    // the implementation keeps access to it.
+    OrdinalIndex(OrdinalIndexReader* reader);
+
+    // seek to the first entry whose rowid is equal to or greater than row_id
+    // if equal, matched will be set to true, else false
+    doris::Status seek_at_or_after(const rowid_t row_id, bool* matched);
+
+    // seek to the first entry whose rowid is equal to or less than row_id
+    // if equal, matched will be set to true, else false
+    doris::Status seek_at_or_before(const rowid_t row_id, bool* matched);
+
+    // return the page pointer of the currently seeked index entry
+    void get_current_page_pointer(PagePointerPB* page_pointer);
+
+private:
+    bool _seeked;
+    size_t _cur_idx;
+};
+
+} // namespace segment_v2
+
+} // namespace doris
+
+#endif // DORIS_BE_SRC_OLAP_ROWSET_SEGMENT_V2_ORDINAL_INDEX_H
diff --git a/be/src/olap/rowset/segment_v2/page_builder.h b/be/src/olap/rowset/segment_v2/page_builder.h
new file mode 100644
index 00000000000000..38b60413c31be2
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/page_builder.h
@@ -0,0 +1,80 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more 
contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef DORIS_BE_SRC_OLAP_ROWSET_SEGMENT_V2_PAGE_BUILDER_H
+#define DORIS_BE_SRC_OLAP_ROWSET_SEGMENT_V2_PAGE_BUILDER_H
+
+#include <cstddef>
+#include <cstdint>
+
+#include "util/slice.h"
+#include "common/status.h"
+#include "olap/rowset/segment_v2/common.h"
+
+namespace doris {
+
+namespace segment_v2 {
+
+// PageBuilder is used to build page
+// Page is a data management unit, including:
+// 1. Data Page: store encoded and compressed data
+// 2. BloomFilter Page: store bloom filter of data
+// 3. Ordinal Index Page: store ordinal index of data
+// 4. Short Key Index Page: store short key index of data
+// 5. Bitmap Index Page: store bitmap index of data
+class PageBuilder {
+public:
+    // Declared explicitly: DISALLOW_COPY_AND_ASSIGN below presumably declares
+    // a copy constructor (as its name suggests), and any user-declared
+    // constructor suppresses the implicit default constructor that derived
+    // builders need.
+    PageBuilder() { }
+
+    virtual ~PageBuilder() { }
+
+    // Used by column writer to determine whether the current page is full.
+    // Column writer depends on the result to decide whether to flush current page.
+    virtual bool is_page_full() = 0;
+
+    // Add a sequence of 'count' values, stored at 'vals', to the page.
+    // NOTE(review): fewer values than requested may fit if the page is full,
+    // but with 'count' passed by value and a Status return type this
+    // signature cannot report how many were actually added; confirm the
+    // intended contract.
+    //
+    // vals size should be decided according to the page build type
+    virtual doris::Status add(const uint8_t* vals, size_t count) = 0;
+
+    // Get the dictionary page for under dictionary encoding mode column.
+    // NOTE(review): virtual but neither pure nor defined in this header, so a
+    // definition has to be provided elsewhere.
+    virtual doris::Status get_dictionary_page(doris::Slice* dictionary_page);
+
+    // Get the bitmap page for under bitmap indexed column.
+    // NOTE(review): virtual but neither pure nor defined in this header, so a
+    // definition has to be provided elsewhere.
+    virtual doris::Status get_bitmap_page(doris::Slice* bitmap_page);
+
+    // Return a Slice which represents the encoded data of current page.
+    //
+    // This Slice points to internal data of this builder.
+    virtual Slice finish(rowid_t page_first_rowid) = 0;
+
+    // Reset the internal state of the page builder.
+    //
+    // Any data previously returned by finish may be invalidated by this call.
+    virtual void reset() = 0;
+
+    // Return the number of entries that have been added to the page.
+    virtual size_t count() const = 0;
+
+private:
+    DISALLOW_COPY_AND_ASSIGN(PageBuilder);
+};
+
+} // namespace segment_v2
+
+} // namespace doris
+
+#endif // DORIS_BE_SRC_OLAP_ROWSET_SEGMENT_V2_PAGE_BUILDER_H
diff --git a/be/src/olap/rowset/segment_v2/page_decoder.h b/be/src/olap/rowset/segment_v2/page_decoder.h
new file mode 100644
index 00000000000000..216a01566c78c0
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/page_decoder.h
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef DORIS_BE_SRC_OLAP_ROWSET_SEGMENT_V2_PAGE_DECODER_H
+#define DORIS_BE_SRC_OLAP_ROWSET_SEGMENT_V2_PAGE_DECODER_H
+
+#include <algorithm>
+#include <cstddef>
+
+#include "runtime/vectorized_row_batch.h"
+#include "common/status.h"
+#include "olap/rowset/segment_v2/common.h"
+
+namespace doris {
+
+namespace segment_v2 {
+
+// PageDecoder is used to decode a page.
+class PageDecoder {
+public:
+    // Declared explicitly: DISALLOW_COPY_AND_ASSIGN below presumably declares
+    // a copy constructor (as its name suggests), and any user-declared
+    // constructor suppresses the implicit default constructor that derived
+    // decoders need.
+    PageDecoder() { }
+
+    virtual ~PageDecoder() { }
+
+    // Call this to do some preparation for decoder.
+    // eg: parse data page header
+    virtual doris::Status init() = 0;
+
+    // Seek the decoder to the given positional index of the page.
+    // For example, seek_to_position_in_page(0) seeks to the first
+    // stored entry.
+    //
+    // It is an error to call this with a value larger than count().
+    // Doing so has undefined results.
+    virtual doris::Status seek_to_position_in_page(size_t pos) = 0;
+
+    // Seek the decoder forward by a given number of rows, or to the end
+    // of the page. This is primarily used to skip over data.
+    //
+    // Return the step skipped. Returns 0 when the decoder is already at (or
+    // past) the end of the page, or when the underlying seek fails.
+    virtual size_t seek_forward(size_t n) {
+        const size_t total = count();
+        const size_t cur = current_index();
+        // Both values are unsigned: without this guard 'total - cur' would
+        // wrap around to a huge number if cur were ever larger than total.
+        if (cur >= total) {
+            return 0;
+        }
+        const size_t step = std::min(n, total - cur);
+        // Do not report rows as skipped when the seek itself did not succeed.
+        if (!seek_to_position_in_page(cur + step).ok()) {
+            return 0;
+        }
+        return step;
+    }
+
+    // Fetch the next vector of values from the page into 'dst'.
+    // The output vector must have space for up to n cells.
+    //
+    // The number of entries is reported back to the caller (presumably
+    // through 'n', which is passed by pointer -- TODO confirm).
+    //
+    // In the case that the values are themselves references
+    // to other memory (eg Slices), the referred-to memory is
+    // allocated in the mem_pool.
+    virtual doris::Status next_batch(size_t* n, doris::ColumnVector* dst, doris::MemPool* mem_pool) = 0;
+
+    // Return the number of elements in this page.
+    virtual size_t count() const = 0;
+
+    // Return the position within the page of the currently seeked
+    // entry (ie the entry that will next be returned by next_batch())
+    virtual size_t current_index() const = 0;
+
+    // Return the first rowid stored in this page.
+    virtual rowid_t get_first_rowid() const = 0;
+
+private:
+    DISALLOW_COPY_AND_ASSIGN(PageDecoder);
+};
+
+} // namespace segment_v2
+
+} // namespace doris
+
+#endif // DORIS_BE_SRC_OLAP_ROWSET_SEGMENT_V2_PAGE_DECODER_H
diff --git a/be/src/olap/rowset/segment_v2/short_index.h b/be/src/olap/rowset/segment_v2/short_index.h
new file mode 100644
index 00000000000000..cd963c3861810e
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/short_index.h
@@ -0,0 +1,87 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef DORIS_BE_SRC_OLAP_ROWSET_SEGMENT_V2_SHORT_INDEX_H
+#define DORIS_BE_SRC_OLAP_ROWSET_SEGMENT_V2_SHORT_INDEX_H
+
+#include <cstddef>
+#include <memory>
+
+#include "util/slice.h"
+#include "olap/rowset/segment_v2/common.h"
+
+namespace doris {
+
+namespace segment_v2 {
+
+// Declared further down; named here because ShortKeyIndexReader returns it.
+class ShortKeyIndex;
+
+// Reads a short key index (short key -> rowid entries) from its
+// serialized form.
+class ShortKeyIndexReader {
+public:
+    // parse index data
+    bool init(const Slice& data);
+
+    // return the entry number of the index
+    size_t count();
+
+    // compare the short key in idx_in_block to the key
+    // (presumably <0 / 0 / >0 like a three-way comparison -- TODO confirm)
+    int compare_key(int idx_in_block, const Slice& key);
+
+    // get the ShortKeyIndex from the reader
+    std::unique_ptr<ShortKeyIndex> get_short_key_index();
+};
+
+// Builds the serialized form of a short key index.
+class ShortKeyIndexWriter {
+public:
+    bool init();
+
+    // add a short key -> rowid entry to the index
+    bool add_entry(doris::Slice* key, rowid_t rowid);
+
+    // return the index data
+    doris::Slice finish();
+};
+
+// Cursor over the entries exposed by a ShortKeyIndexReader.
+class ShortKeyIndex {
+public:
+    // NOTE(review): no member declared below stores 'reader'; confirm how
+    // the implementation keeps access to it.
+    ShortKeyIndex(ShortKeyIndexReader* reader);
+
+    // seek to the first entry whose short key is equal to or greater than key
+    // if equal, matched will be set to true, else false
+    bool seek_at_or_after(const doris::Slice& key, bool* matched);
+
+    // seek to the first entry whose short key is equal to or less than key
+    // if equal, matched will be set to true, else false
+    bool seek_at_or_before(const doris::Slice& key, bool* matched);
+
+    // return the current row id of current index entry
+    rowid_t get_current_rowid();
+
+    // Seek the index to previous one
+    // If the current index is 0, return false
+    bool prev();
+
+    // Seek the index to next one
+    // If the current index is the last one, return false
+    bool next();
+
+private:
+    bool _seeked;
+    size_t _cur_idx;
+};
+
+} // namespace segment_v2
+
+} // namespace doris
+
+#endif // DORIS_BE_SRC_OLAP_ROWSET_SEGMENT_V2_SHORT_INDEX_H
diff --git a/docs/documentation/cn/extending-doris/doris_storage_optimization.md b/docs/documentation/cn/extending-doris/doris_storage_optimization.md
new file mode 100644
index 00000000000000..54fae85fc0ff29
--- /dev/null
+++ 
b/docs/documentation/cn/extending-doris/doris_storage_optimization.md @@ -0,0 +1,198 @@ +# Doris存储文件格式优化 # + +## 文件格式 ## + +![](../../../resources/segment_v2.png) +
图1. doris segment文件格式
+ +文件包括: +- 文件开始是8个字节的magic code,用于识别文件格式和版本 +- Data Region:用于存储各个列的数据信息,这里的数据是按需分page加载的 +- Index Region: doris中将各个列的index数据统一存储在Index Region,这里的数据会按照列粒度进行加载,所以跟列的数据信息分开存储 +- Footer信息 + - FileFooterPB:定义文件的元数据信息 + - 4个字节的footer pb内容的checksum + - 4个字节的FileFooterPB消息长度,用于读取FileFooterPB + - 8个字节的MAGIC CODE,之所以在末位存储,是方便不同的场景进行文件类型的识别 + +文件中的数据按照page的方式进行组织,page是编码和压缩的基本单位。现在的page类型包括以下几种: + +### DataPage ### + +DataPage分为两种:nullable和non-nullable的data page。 + +nullable的data page内容包括: +``` + + +----------------+ + | value count | + |----------------| + | bitmap length | + |----------------| + | null bitmap | + |----------------| + | data | + |----------------| + | checksum | + +----------------+ +``` + +non-nullable data page结构如下: + +``` + |----------------| + | data | + |----------------| + | checksum | + +----------------+ +``` + +其中各个字段含义如下: + +- value count + - 表示page中的行数 +- bitmap length + - 表示接下来bitmap的字节数 +- null bitmap + - 表示null信息的bitmap +- data + - 存储经过encoding和compress之后的数据 + - 需要在数据的头部信息中写入:is_compressed + - 各种不同编码的data需要在头部信息写入一些字段信息,以实现数据的解析 + - TODO:添加各种encoding的header信息 +- checksum + - 存储page粒度的校验和,包括page第一个字节和之后的实际数据 + + +### Bloom Filter Pages ### + +针对每个bloom filter列,会在page的粒度相应的生成一个bloom filter的page,保存在bloom filter pages区域 + +### Ordinal Index Page ### + +针对每个列,都会按照page粒度,建立行号的稀疏索引。内容为这个page的起始行的行号到这个block的指针(包括offset和length) + +### Short Key Index page ### + +我们会每隔N行(可配置)生成一个short key的稀疏索引,索引的内容为:short key->行号(ordinal) + +### Column的其他索引 ### + +该格式设计支持后续扩展其他的索引信息,比如bitmap索引,spatial索引等等,只需要将需要的数据写到现有的列数据后面,并且添加对应的元数据字段到FileFooterPB中 + +### 元数据定义 ### +FileFooterPB的定义为: + +``` +message ColumnPB { + optional uint32 column_id = 1; // 这里使用column id,不使用column name是因为计划支持修改列名 + optional string type = 2; // 列类型 + optional string aggregation = 3; // 是否聚合 + optional uint32 length = 4; // 长度 + optional bool is_key = 5; // 是否是主键列 + optional string default_value = 6; // 默认值 + optional uint32 precision = 9 [default = 27]; // 精度 + optional uint32 frac = 10 [default 
= 9];
+    optional bool is_nullable = 11 [default=false]; // 是否有null
+    optional bool is_bf_column = 15 [default=false]; // 是否有bf词典
+    optional bool is_bitmap_column = 16 [default=false]; // 是否有bitmap索引
+}
+
+// page偏移
+message PagePointerPB {
+    required uint64 offset; // page在文件中的偏移
+    required uint32 length; // page的大小
+}
+
+message MetadataPairPB {
+    optional string key = 1;
+    optional bytes value = 2;
+}
+
+message ColumnMetaPB {
+    optional ColumnMessage encoding; // 编码方式
+
+    optional PagePointerPB dict_page; // 词典page
+    repeated PagePointerPB bloom_filter_pages; // bloom filter词典信息
+    optional PagePointerPB ordinal_index_page; // 行号索引数据
+    optional PagePointerPB page_zone_map_page; // page级别统计信息索引数据
+
+    optional PagePointerPB bitmap_index_page; // bitmap索引数据
+
+    optional uint64 data_footprint; // 列中数据的大小
+    optional uint64 index_footprint; // 列中索引的大小
+    optional uint64 raw_data_footprint; // 原始列数据大小
+
+    optional CompressKind compress_kind; // 列的压缩方式
+
+    optional ZoneMapPB column_zone_map; //文件级别的过滤条件
+    repeated MetadataPairPB column_meta_datas;
+}
+
+message FileFooterPB {
+    optional uint32 version = 2 [default = 1]; // 用于版本兼容和升级使用
+    repeated ColumnPB schema = 5; // 列Schema
+    optional uint64 num_values = 4; // 文件中保存的行数
+    optional uint64 index_footprint = 7; // 索引大小
+    optional uint64 data_footprint = 8; // 数据大小
+    optional uint64 raw_data_footprint = 6; // 原始数据大小
+
+    optional CompressKind compress_kind = 9 [default = COMPRESS_LZO]; // 压缩方式
+    repeated ColumnMetaPB column_metas = 10; // 列元数据
+    optional PagePointerPB key_index_page; // short key索引page
+}
+
+```
+
+## 读写逻辑 ##
+
+### 写入 ###
+
+大体的写入流程如下:
+1. 写入magic
+2. 根据schema信息,生成对应的ColumnWriter,每个ColumnWriter按照不同的类型,获取对应的encoding信息(可配置),根据encoding,生成对应的encoder
+3. 调用encoder->add(value)进行数据写入,每隔K行,生成一个short key index entry,并且,如果当前的page满足一定条件(大小超过1M或者行数为K),就生成一个新的page,缓存在内存中。
+4. 不断的循环步骤3,直到数据写入完成。将各个列的数据依序刷入文件中
+5. 生成FileFooterPB信息,写入文件中。
+
+相关的问题:
+
+- short key的索引如何生成? 
+    - 现在还是按照每隔多少行生成一个short key的稀疏索引,保持每隔1024行生成一个short key的稀疏索引,具体的内容是:short key -> ordinal
+
+- ordinal索引里面应该存什么?
+    - 存储page的第一个ordinal到page pointer的映射信息
+- 不同encoding类型的page里存什么?
+    - 词典压缩
+    - plain
+    - rle
+    - bshuf
+
+### 读取 ###
+
+1. 读取文件的magic,判断文件类型和版本
+2. 读取FileFooterPB,进行checksum校验
+3. 按照需要的列,读取short key索引和对应列的数据ordinal索引信息
+4. 使用start key和end key,通过short key索引定位到要读取的行号,然后通过ordinal索引确定需要读取的row ranges, 同时需要通过统计信息、bitmap索引等过滤需要读取的row ranges
+5. 然后按照row ranges通过ordinal索引读取行的数据
+
+相关的问题:
+1. 如何实现在page内部快速的定位到某一行?
+
+    page内部的数据是经过encoding的,无法快速进行行级数据的定位。不同的encoding方式,在内部进行快速的行号定位的方案不一样,需要具体分析:
+    - 如果是rle编码的,需要通过解析rle的header进行skip,直到到达包含该行的那个rle块之后,再进行反解。
+    - binary plain encoding:会在page中存储offset信息,并且会在page header中指定offset信息的offset,读取的时候会先解析offset信息到数组中,这样子就可以通过各个行的offset数据信息快速的定位block某一行的数据
+2. 如何实现块的高效读取?可以考虑将相邻的块在读取的时候进行merge,一次性读取?
+    这个需要在读取的时候,判断block是否连续,如果连续,就一次性的读取
+
+## 编码 ##
+
+现有的doris存储中,针对string类型的编码,采用plain encoding的方式,效率比较低。经过对比,发现在百度统计的场景下,数据会因为string类型的编码膨胀超过一倍。所以,计划引入基于词典的编码压缩。
+
+## 压缩 ##
+
+实现可扩展的压缩框架,支持多种压缩算法,方便后续添加新的压缩算法,计划引入zstd压缩。
+
+## TODO ##
+1. 如何实现嵌套类型?如何在嵌套类型中进行行号定位?
+2. 如何优化现在的ScanRange拆分导致的下游bitmap、column statistic统计等进行多次?
diff --git a/docs/resources/segment_v2.png b/docs/resources/segment_v2.png
new file mode 100644
index 00000000000000..72f30ac2c17852
Binary files /dev/null and b/docs/resources/segment_v2.png differ
diff --git a/gensrc/proto/segment_v2.proto b/gensrc/proto/segment_v2.proto
new file mode 100644
index 00000000000000..f02ac348648452
--- /dev/null
+++ b/gensrc/proto/segment_v2.proto
@@ -0,0 +1,103 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. 
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// Define file format struct, like data header, index header.
+
+// The messages use required/optional labels and [default = ...] options,
+// which are proto2 features; state the syntax explicitly.
+syntax = "proto2";
+
+// The generated C++ namespace follows the package name. The segment_v2
+// headers refer to these types unqualified from inside doris::segment_v2,
+// so the package has to map to that namespace.
+package doris.segment_v2;
+
+// Schema of one column.
+message ColumnSchemaPB {
+    optional uint32 column_id = 1;
+    optional string type = 2;
+    optional string aggregation = 3;
+    optional uint32 length = 4;
+    optional bool is_key = 5;
+    optional string default_value = 6;
+    optional uint32 precision = 9 [default = 27];
+    optional uint32 frac = 10 [default = 9];
+    optional bool is_nullable = 11 [default=false];
+    optional bool is_bf_column = 15 [default=false]; // is bloom filter indexed column
+    optional bool is_bitmap_column = 16 [default=false]; // is bitmap indexed column
+}
+
+// page position info
+message PagePointerPB {
+    required uint64 offset = 1; // offset in segment file
+    required uint32 length = 2; // size of page in byte
+}
+
+// Free-form key/value metadata entry.
+message MetadataPairPB {
+    optional string key = 1;
+    optional bytes value = 2;
+}
+
+// Encoding of the values of a column.
+enum EncodingTypePB {
+    PLAIN_ENCODING = 1;
+    PREFIX_ENCODING = 2;
+    RLE = 4;
+    DICT_ENCODING = 5;
+    BIT_SHUFFLE = 6;
+    UNKNOWN_ENCODING = 1000;
+}
+
+// Compression applied to a column or file.
+enum CompressionTypePB {
+    DEFAULT_COMPRESSION = 0;
+    NO_COMPRESSION = 1;
+    SNAPPY = 2;
+    LZ4 = 3;
+    ZLIB = 4;
+    ZSTD = 5; // was misspelled "ZSTB"; the numeric value is unchanged
+    LZO = 6;
+    UNKNOWN_COMPRESSION = 1000;
+}
+
+// Min/max summary of a range of values.
+message ZoneMapPB {
+    optional bytes min = 1;
+    optional bytes max = 2;
+    optional bool null_flag = 3;
+}
+
+// Per-column metadata stored in the file footer.
+message ColumnMetaPB {
+    optional EncodingTypePB encoding = 1;
+
+    optional PagePointerPB dict_page = 2; // dictionary page for DICT_ENCODING
+    repeated PagePointerPB bloom_filter_pages = 3; // bloom filter pages for bloom filter column
+    optional PagePointerPB ordinal_index_page = 4; // ordinal index page
+    optional PagePointerPB page_zonemap_page = 5; // page zonemap info of column
+
+    optional PagePointerPB bitmap_index_page = 6; // bitmap index page
+
+    optional uint64 data_footprint = 7; // data footprint of column after encoding and compress
+    optional uint64 index_footprint = 8; // index footprint of column after encoding and compress
+    optional uint64 raw_data_footprint = 9; // raw column data footprint
+
+    optional CompressionTypePB compress_type = 10; // compress type for column
+
+    optional ZoneMapPB column_zonemap = 11; // column zonemap info
+    repeated MetadataPairPB column_meta_datas = 12;
+}
+
+// Footer of a segment file.
+message FileFooterPB {
+    optional uint32 version = 1 [default = 1]; // file version
+    repeated ColumnSchemaPB schema = 2; // tablet schema
+    optional uint64 num_values = 3; // number of values
+    optional uint64 index_footprint = 4; // total index footprint of all columns
+    optional uint64 data_footprint = 5; // total data footprint of all columns
+    optional uint64 raw_data_footprint = 6; // raw data footprint
+
+    optional CompressionTypePB compress_type = 7 [default = LZO]; // default compression type for file columns
+    repeated MetadataPairPB file_meta_datas = 8; // meta data of file
+    optional PagePointerPB key_index_page = 9; // short key index page
+}