-
Notifications
You must be signed in to change notification settings - Fork 14
split large parquet files on part export, preserve entire settings object in part export #1229
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
d2d6d34
5f29e0a
e2092c8
0785184
f021160
ff8a74e
a456c8a
189433c
c4e0888
c6eee2c
57ca34c
b0e53bc
49d9740
b7f6b57
04e2145
fad2e88
5aa5178
ea40929
5c3f42e
497b488
20376f2
8a3179e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -62,14 +62,14 @@ struct ExportReplicatedMergeTreePartitionProcessingPartEntry | |
| struct ExportReplicatedMergeTreePartitionProcessedPartEntry | ||
| { | ||
| String part_name; | ||
| String path_in_destination; | ||
| std::vector<String> paths_in_destination; | ||
| String finished_by; | ||
|
|
||
| std::string toJsonString() const | ||
| { | ||
| Poco::JSON::Object json; | ||
| json.set("part_name", part_name); | ||
| json.set("path_in_destination", path_in_destination); | ||
| json.set("paths_in_destination", paths_in_destination); | ||
| json.set("finished_by", finished_by); | ||
| std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM | ||
| oss.exceptions(std::ios::failbit); | ||
|
|
@@ -86,7 +86,11 @@ struct ExportReplicatedMergeTreePartitionProcessedPartEntry | |
| ExportReplicatedMergeTreePartitionProcessedPartEntry entry; | ||
|
|
||
| entry.part_name = json->getValue<String>("part_name"); | ||
| entry.path_in_destination = json->getValue<String>("path_in_destination"); | ||
|
|
||
| const auto paths_in_destination_array = json->getArray("paths_in_destination"); | ||
| for (size_t i = 0; i < paths_in_destination_array->size(); ++i) | ||
| entry.paths_in_destination.emplace_back(paths_in_destination_array->getElement<String>(static_cast<unsigned int>(i))); | ||
|
|
||
| entry.finished_by = json->getValue<String>("finished_by"); | ||
|
|
||
| return entry; | ||
|
|
@@ -108,6 +112,8 @@ struct ExportReplicatedMergeTreePartitionManifest | |
| size_t max_threads; | ||
| bool parallel_formatting; | ||
| bool parquet_parallel_encoding; | ||
| size_t max_bytes_per_file; | ||
| size_t max_rows_per_file; | ||
| MergeTreePartExportManifest::FileAlreadyExistsPolicy file_already_exists_policy; | ||
|
|
||
| std::string toJsonString() const | ||
|
|
@@ -127,6 +133,8 @@ struct ExportReplicatedMergeTreePartitionManifest | |
| json.set("parallel_formatting", parallel_formatting); | ||
| json.set("max_threads", max_threads); | ||
| json.set("parquet_parallel_encoding", parquet_parallel_encoding); | ||
| json.set("max_bytes_per_file", max_bytes_per_file); | ||
| json.set("max_rows_per_file", max_rows_per_file); | ||
| json.set("file_already_exists_policy", String(magic_enum::enum_name(file_already_exists_policy))); | ||
| json.set("create_time", create_time); | ||
| json.set("max_retries", max_retries); | ||
|
|
@@ -160,7 +168,8 @@ struct ExportReplicatedMergeTreePartitionManifest | |
| manifest.max_threads = json->getValue<size_t>("max_threads"); | ||
| manifest.parallel_formatting = json->getValue<bool>("parallel_formatting"); | ||
| manifest.parquet_parallel_encoding = json->getValue<bool>("parquet_parallel_encoding"); | ||
|
|
||
| manifest.max_bytes_per_file = json->getValue<size_t>("max_bytes_per_file"); | ||
| manifest.max_rows_per_file = json->getValue<size_t>("max_rows_per_file"); | ||
|
Comment on lines
168
to
+172
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Useful? React with 👍 / 👎.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is not a problem. Nobody is using the export partition feature for now, and even if they were, the ZooKeeper entries would have expired (unless a big TTL was set, which I doubt). |
||
| if (json->has("file_already_exists_policy")) | ||
| { | ||
| const auto file_already_exists_policy = magic_enum::enum_cast<MergeTreePartExportManifest::FileAlreadyExistsPolicy>(json->getValue<String>("file_already_exists_policy")); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`fromJsonString` now assumes every processed entry contains a `paths_in_destination` array and immediately dereferences it. Existing ZooKeeper nodes written by previous versions stored a single `path_in_destination` string, so after an upgrade `getArray` will return null (or throw), crashing `ExportPartitionUtils::getExportedPaths` when it reads legacy entries and preventing pending exports from ever committing. Please keep parsing the old field or guard against missing arrays before dereferencing. Useful? React with 👍 / 👎.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sigh we've been through that multiple times