diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index f6e2ca7249ded4..5290acd01cbac0 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1298,6 +1298,9 @@ DEFINE_mBool(skip_loading_stale_rowset_meta, "false"); DEFINE_Bool(enable_file_logger, "true"); +// The minimum row group size when exporting Parquet files. default 128MB +DEFINE_Int64(min_row_group_size, "134217728"); + // clang-format off #ifdef BE_TEST // test s3 diff --git a/be/src/common/config.h b/be/src/common/config.h index cd7ef3c2f47dd2..785cdfe8be54ff 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1381,6 +1381,9 @@ DECLARE_mBool(skip_loading_stale_rowset_meta); // Only works when starting BE with --console. DECLARE_Bool(enable_file_logger); +// The minimum row group size when exporting Parquet files. +DECLARE_Int64(min_row_group_size); + #ifdef BE_TEST // test s3 DECLARE_String(test_s3_resource); diff --git a/be/src/vec/runtime/vparquet_transformer.cpp b/be/src/vec/runtime/vparquet_transformer.cpp index 77068adeebec95..fbb24e009688ae 100644 --- a/be/src/vec/runtime/vparquet_transformer.cpp +++ b/be/src/vec/runtime/vparquet_transformer.cpp @@ -35,6 +35,7 @@ #include #include +#include "common/config.h" #include "common/status.h" #include "gutil/endian.h" #include "io/fs/file_writer.h" @@ -70,8 +71,6 @@ namespace doris::vectorized { -const uint64_t min_row_group_size = 128 * 1024 * 1024; // 128MB - ParquetOutputStream::ParquetOutputStream(doris::io::FileWriter* file_writer) : _file_writer(file_writer), _cur_pos(0), _written_len(0) { set_mode(arrow::io::FileMode::WRITE); @@ -247,6 +246,7 @@ Status VParquetTransformer::_parse_properties() { } builder.created_by( fmt::format("{}({})", doris::get_short_version(), parquet::DEFAULT_CREATED_BY)); + builder.max_row_group_length(std::numeric_limits::max()); _parquet_writer_properties = builder.build(); _arrow_properties = parquet::ArrowWriterProperties::Builder() .enable_deprecated_int96_timestamps() @@ -296,7 +296,7 @@ Status VParquetTransformer::write(const Block& block) { RETURN_DORIS_STATUS_IF_ERROR(_writer->WriteRecordBatch(*result)); _write_size += block.bytes(); - if (_write_size >= min_row_group_size) { + if (_write_size >= doris::config::min_row_group_size) { RETURN_DORIS_STATUS_IF_ERROR(_writer->NewBufferedRowGroup()); _write_size = 0; }