From 8214444a74d99428845d0fadf46eb2a54016c203 Mon Sep 17 00:00:00 2001 From: Peter Vary Date: Sun, 8 Dec 2024 10:13:58 +0100 Subject: [PATCH 01/11] Core, Spark: Refactor RewriteFileGroup planner to core --- .../actions/RewriteFileGroupPlanner.java | 177 +++++++++++++++++ .../actions/SizeBasedFileRewriter.java | 2 +- .../actions/TestRewriteFileGroupPlanner.java | 161 ++++++++++++++++ .../actions/RewriteDataFilesSparkAction.java | 182 +++--------------- .../actions/TestRewriteDataFilesAction.java | 7 +- 5 files changed, 369 insertions(+), 160 deletions(-) create mode 100644 core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java create mode 100644 core/src/test/java/org/apache/iceberg/actions/TestRewriteFileGroupPlanner.java diff --git a/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java b/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java new file mode 100644 index 000000000000..6d1e27503da7 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.actions; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Stream; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.RewriteJobOrder; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.Table; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.StructLikeMap; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Checks the files in the table, and using the {@link FileRewriter} plans the groups for + * compaction. 
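+ *
+ * <p>A minimal usage sketch (the {@code rewriter} instance is assumed to be already configured;
+ * executing the planned groups is engine-specific):
+ *
+ * <pre>{@code
+ * RewriteFileGroupPlanner planner =
+ *     new RewriteFileGroupPlanner(rewriter, RewriteJobOrder.FILES_DESC);
+ * RewritePlanResult result =
+ *     planner.plan(table, Expressions.alwaysTrue(), table.currentSnapshot().snapshotId(), true);
+ * result.groups().forEach(group -> rewriteFiles(result, group));
+ * }</pre>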
+ */ +public class RewriteFileGroupPlanner { + private static final Logger LOG = LoggerFactory.getLogger(RewriteFileGroupPlanner.class); + + private final FileRewriter rewriter; + private final RewriteJobOrder rewriteJobOrder; + + public RewriteFileGroupPlanner( + FileRewriter rewriter, RewriteJobOrder rewriteJobOrder) { + this.rewriter = rewriter; + this.rewriteJobOrder = rewriteJobOrder; + } + + public RewritePlanResult plan( + Table table, Expression filter, long startingSnapshotId, boolean caseSensitive) { + StructLikeMap>> plan = + planFileGroups(table, filter, startingSnapshotId, caseSensitive); + RewriteExecutionContext ctx = new RewriteExecutionContext(); + Stream groups = + plan.entrySet().stream() + .filter(e -> !e.getValue().isEmpty()) + .flatMap( + e -> { + StructLike partition = e.getKey(); + List> scanGroups = e.getValue(); + return scanGroups.stream().map(tasks -> newRewriteGroup(ctx, partition, tasks)); + }) + .sorted(RewriteFileGroup.comparator(rewriteJobOrder)); + Map groupsInPartition = plan.transformValues(List::size); + int totalGroupCount = groupsInPartition.values().stream().reduce(Integer::sum).orElse(0); + return new RewritePlanResult(groups, totalGroupCount, groupsInPartition); + } + + private StructLikeMap>> planFileGroups( + Table table, Expression filter, long startingSnapshotId, boolean caseSensitive) { + CloseableIterable fileScanTasks = + table + .newScan() + .useSnapshot(startingSnapshotId) + .caseSensitive(caseSensitive) + .filter(filter) + .ignoreResiduals() + .planFiles(); + + try { + Types.StructType partitionType = table.spec().partitionType(); + StructLikeMap> filesByPartition = + groupByPartition(table, partitionType, fileScanTasks); + return filesByPartition.transformValues( + tasks -> ImmutableList.copyOf(rewriter.planFileGroups(tasks))); + } finally { + try { + fileScanTasks.close(); + } catch (IOException io) { + LOG.error("Cannot properly close file iterable while planning for rewrite", io); + } + } + } + + private StructLikeMap> groupByPartition( + Table table, Types.StructType partitionType, Iterable tasks) { + StructLikeMap> filesByPartition = StructLikeMap.create(partitionType); + StructLike emptyStruct = GenericRecord.create(partitionType); + + for (FileScanTask task : tasks) { + // If a task uses an incompatible partition spec the data inside could contain values + // which belong to multiple partitions in the current spec. Treating all such files as + // un-partitioned and grouping them together helps to minimize new files made. + StructLike taskPartition = + task.file().specId() == table.spec().specId() ? 
task.file().partition() : emptyStruct; + + filesByPartition.computeIfAbsent(taskPartition, unused -> Lists.newArrayList()).add(task); + } + + return filesByPartition; + } + + private RewriteFileGroup newRewriteGroup( + RewriteExecutionContext ctx, StructLike partition, List tasks) { + RewriteDataFiles.FileGroupInfo info = + ImmutableRewriteDataFiles.FileGroupInfo.builder() + .globalIndex(ctx.currentGlobalIndex()) + .partitionIndex(ctx.currentPartitionIndex(partition)) + .partition(partition) + .build(); + return new RewriteFileGroup(info, Lists.newArrayList(tasks)); + } + + public static class RewritePlanResult { + private final Stream groups; + private final int totalGroupCount; + private final Map groupsInPartition; + + private RewritePlanResult( + Stream groups, + int totalGroupCount, + Map groupsInPartition) { + this.groups = groups; + this.totalGroupCount = totalGroupCount; + this.groupsInPartition = groupsInPartition; + } + + public Stream groups() { + return groups; + } + + public int groupsInPartition(StructLike partition) { + return groupsInPartition.get(partition); + } + + public int totalGroupCount() { + return totalGroupCount; + } + } + + private static class RewriteExecutionContext { + private final Map partitionIndexMap; + private final AtomicInteger groupIndex; + + private RewriteExecutionContext() { + this.partitionIndexMap = Maps.newConcurrentMap(); + this.groupIndex = new AtomicInteger(1); + } + + private int currentGlobalIndex() { + return groupIndex.getAndIncrement(); + } + + private int currentPartitionIndex(StructLike partition) { + return partitionIndexMap.merge(partition, 1, Integer::sum); + } + } +} diff --git a/core/src/main/java/org/apache/iceberg/actions/SizeBasedFileRewriter.java b/core/src/main/java/org/apache/iceberg/actions/SizeBasedFileRewriter.java index cea7003c1a38..5d45392c5487 100644 --- a/core/src/main/java/org/apache/iceberg/actions/SizeBasedFileRewriter.java +++ b/core/src/main/java/org/apache/iceberg/actions/SizeBasedFileRewriter.java @@ -191,7 +191,7 @@ protected long inputSize(List group) { * of output files. The final split size is adjusted to be at least as big as the target file size * but less than the max write file size. */ - protected long splitSize(long inputSize) { + public long splitSize(long inputSize) { long estimatedSplitSize = (inputSize / numOutputFiles(inputSize)) + SPLIT_OVERHEAD; if (estimatedSplitSize < targetFileSize) { return targetFileSize; diff --git a/core/src/test/java/org/apache/iceberg/actions/TestRewriteFileGroupPlanner.java b/core/src/test/java/org/apache/iceberg/actions/TestRewriteFileGroupPlanner.java new file mode 100644 index 000000000000..d3382fb8b349 --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/actions/TestRewriteFileGroupPlanner.java @@ -0,0 +1,161 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.actions; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.UUID; +import java.util.stream.Collectors; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DataFiles; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.RewriteJobOrder; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.TestBase; +import org.apache.iceberg.TestTables; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +class TestRewriteFileGroupPlanner { + private static final DataFile FILE_1 = newDataFile("data_bucket=0", 10); + private static final DataFile FILE_2 = newDataFile("data_bucket=0", 10); + private static final DataFile FILE_3 = newDataFile("data_bucket=0", 10); + private static final DataFile FILE_4 = newDataFile("data_bucket=1", 11); + private static final DataFile FILE_5 = newDataFile("data_bucket=1", 11); + private static final DataFile FILE_6 = newDataFile("data_bucket=2", 50); + + private static final Map> EXPECTED = + ImmutableMap.of( + RewriteJobOrder.FILES_DESC, + ImmutableList.of(FILE_1.partition(), FILE_4.partition(), FILE_6.partition()), + RewriteJobOrder.FILES_ASC, + ImmutableList.of(FILE_6.partition(), FILE_4.partition(), FILE_1.partition()), + RewriteJobOrder.BYTES_DESC, + ImmutableList.of(FILE_6.partition(), FILE_1.partition(), FILE_4.partition()), + RewriteJobOrder.BYTES_ASC, + ImmutableList.of(FILE_4.partition(), FILE_1.partition(), FILE_6.partition())); + + @TempDir private File tableDir = null; + private TestTables.TestTable table = null; + + @BeforeEach + public void setupTable() throws Exception { + this.table = TestTables.create(tableDir, "test", TestBase.SCHEMA, TestBase.SPEC, 3); + } + + @AfterEach + public void cleanupTables() { + TestTables.clearTables(); + } + + @ParameterizedTest + @EnumSource( + value = RewriteJobOrder.class, + names = {"FILES_DESC", "FILES_ASC", "BYTES_DESC", "BYTES_ASC"}) + void testGroups(RewriteJobOrder order) { + table + .newAppend() + .appendFile(FILE_1) + .appendFile(FILE_2) + .appendFile(FILE_3) + .appendFile(FILE_4) + .appendFile(FILE_5) + .appendFile(FILE_6) + .commit(); + RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(new DummyRewriter(false), order); + RewriteFileGroupPlanner.RewritePlanResult result = + planner.plan(table, Expressions.alwaysTrue(), table.currentSnapshot().snapshotId(), false); + List groups = result.groups().collect(Collectors.toList()); + assertThat(groups.stream().map(group -> group.info().partition()).collect(Collectors.toList())) + .isEqualTo(EXPECTED.get(order)); + assertThat(result.totalGroupCount()).isEqualTo(3); + EXPECTED.get(order).forEach(s -> assertThat(result.groupsInPartition(s)).isEqualTo(1)); + } + + @Test + void testContext() { + table + .newAppend() + .appendFile(FILE_1) + .appendFile(FILE_2) + .appendFile(FILE_3) + 
.appendFile(FILE_4) + .appendFile(FILE_5) + .appendFile(FILE_6) + .commit(); + RewriteFileGroupPlanner planner = + new RewriteFileGroupPlanner(new DummyRewriter(true), RewriteJobOrder.FILES_DESC); + RewriteFileGroupPlanner.RewritePlanResult result = + planner.plan(table, Expressions.alwaysTrue(), table.currentSnapshot().snapshotId(), false); + assertThat(result.totalGroupCount()).isEqualTo(6); + assertThat(result.groupsInPartition(FILE_1.partition())).isEqualTo(3); + assertThat(result.groupsInPartition(FILE_4.partition())).isEqualTo(2); + assertThat(result.groupsInPartition(FILE_6.partition())).isEqualTo(1); + } + + private static class DummyRewriter implements FileRewriter { + private final boolean split; + + private DummyRewriter(boolean split) { + this.split = split; + } + + @Override + public Set validOptions() { + return Set.of(); + } + + @Override + public void init(Map options) {} + + @Override + public Iterable> planFileGroups(Iterable tasks) { + List taskList = Lists.newArrayList(tasks); + return split + ? taskList.stream().map(ImmutableList::of).collect(Collectors.toList()) + : ImmutableList.of(taskList); + } + + @Override + public Set rewrite(List group) { + return Set.of(); + } + } + + private static DataFile newDataFile(String partitionPath, long fileSize) { + return DataFiles.builder(TestBase.SPEC) + .withPath("/path/to/data-" + UUID.randomUUID() + ".parquet") + .withFileSizeInBytes(fileSize) + .withPartitionPath(partitionPath) + .withRecordCount(1) + .build(); + } +} diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java index e04a0c88b4bb..fe0cbdaa4c46 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java @@ -18,20 +18,16 @@ */ package org.apache.iceberg.spark.actions; -import java.io.IOException; import java.math.RoundingMode; import java.util.Arrays; import java.util.Collection; import java.util.List; -import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.ThreadPoolExecutor; -import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; -import java.util.stream.Stream; import org.apache.iceberg.DataFile; import org.apache.iceberg.FileScanTask; import org.apache.iceberg.RewriteJobOrder; @@ -44,28 +40,24 @@ import org.apache.iceberg.actions.RewriteDataFiles; import org.apache.iceberg.actions.RewriteDataFilesCommitManager; import org.apache.iceberg.actions.RewriteFileGroup; -import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.actions.RewriteFileGroupPlanner; +import org.apache.iceberg.actions.RewriteFileGroupPlanner.RewritePlanResult; import org.apache.iceberg.exceptions.CommitFailedException; import org.apache.iceberg.exceptions.ValidationException; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import 
org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.relocated.com.google.common.collect.Queues; import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.relocated.com.google.common.math.IntMath; import org.apache.iceberg.relocated.com.google.common.util.concurrent.MoreExecutors; import org.apache.iceberg.relocated.com.google.common.util.concurrent.ThreadFactoryBuilder; import org.apache.iceberg.spark.SparkUtil; -import org.apache.iceberg.types.Types.StructType; import org.apache.iceberg.util.PropertyUtil; -import org.apache.iceberg.util.StructLikeMap; import org.apache.iceberg.util.Tasks; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; @@ -171,21 +163,17 @@ public RewriteDataFiles.Result execute() { validateAndInitOptions(); - StructLikeMap>> fileGroupsByPartition = - planFileGroups(startingSnapshotId); - RewriteExecutionContext ctx = new RewriteExecutionContext(fileGroupsByPartition); + RewritePlanResult result = plan(startingSnapshotId); - if (ctx.totalGroupCount() == 0) { + if (result.totalGroupCount() == 0) { LOG.info("Nothing found to rewrite in {}", table.name()); return EMPTY_RESULT; } - Stream groupStream = toGroupStream(ctx, fileGroupsByPartition); - Builder resultBuilder = partialProgressEnabled - ? doExecuteWithPartialProgress(ctx, groupStream, commitManager(startingSnapshotId)) - : doExecute(ctx, groupStream, commitManager(startingSnapshotId)); + ? doExecuteWithPartialProgress(result, commitManager(startingSnapshotId)) + : doExecute(result, commitManager(startingSnapshotId)); if (removeDanglingDeletes) { RemoveDanglingDeletesSparkAction action = @@ -193,68 +181,18 @@ public RewriteDataFiles.Result execute() { int removedCount = Iterables.size(action.execute().removedDeleteFiles()); resultBuilder.removedDeleteFilesCount(removedCount); } - return resultBuilder.build(); - } - - StructLikeMap>> planFileGroups(long startingSnapshotId) { - CloseableIterable fileScanTasks = - table - .newScan() - .useSnapshot(startingSnapshotId) - .caseSensitive(caseSensitive) - .filter(filter) - .ignoreResiduals() - .planFiles(); - try { - StructType partitionType = table.spec().partitionType(); - StructLikeMap> filesByPartition = - groupByPartition(partitionType, fileScanTasks); - return fileGroupsByPartition(filesByPartition); - } finally { - try { - fileScanTasks.close(); - } catch (IOException io) { - LOG.error("Cannot properly close file iterable while planning for rewrite", io); - } - } - } - - private StructLikeMap> groupByPartition( - StructType partitionType, Iterable tasks) { - StructLikeMap> filesByPartition = StructLikeMap.create(partitionType); - StructLike emptyStruct = GenericRecord.create(partitionType); - - for (FileScanTask task : tasks) { - // If a task uses an incompatible partition spec the data inside could contain values - // which belong to multiple partitions in the current spec. Treating all such files as - // un-partitioned and grouping them together helps to minimize new files made. - StructLike taskPartition = - task.file().specId() == table.spec().specId() ? 
task.file().partition() : emptyStruct; - - List files = filesByPartition.get(taskPartition); - if (files == null) { - files = Lists.newArrayList(); - } - - files.add(task); - filesByPartition.put(taskPartition, files); - } - return filesByPartition; - } - - private StructLikeMap>> fileGroupsByPartition( - StructLikeMap> filesByPartition) { - return filesByPartition.transformValues(this::planFileGroups); + return resultBuilder.build(); } - private List> planFileGroups(List tasks) { - return ImmutableList.copyOf(rewriter.planFileGroups(tasks)); + RewritePlanResult plan(long startingSnapshotId) { + return new RewriteFileGroupPlanner(rewriter, rewriteJobOrder) + .plan(table, filter, startingSnapshotId, caseSensitive); } @VisibleForTesting - RewriteFileGroup rewriteFiles(RewriteExecutionContext ctx, RewriteFileGroup fileGroup) { - String desc = jobDesc(fileGroup, ctx); + RewriteFileGroup rewriteFiles(RewritePlanResult planResult, RewriteFileGroup fileGroup) { + String desc = jobDesc(fileGroup, planResult); Set addedFiles = withJobGroupInfo( newJobGroupInfo("REWRITE-DATA-FILES", desc), @@ -280,29 +218,25 @@ RewriteDataFilesCommitManager commitManager(long startingSnapshotId) { } private Builder doExecute( - RewriteExecutionContext ctx, - Stream groupStream, - RewriteDataFilesCommitManager commitManager) { + RewritePlanResult planResult, RewriteDataFilesCommitManager commitManager) { ExecutorService rewriteService = rewriteService(); ConcurrentLinkedQueue rewrittenGroups = Queues.newConcurrentLinkedQueue(); Tasks.Builder rewriteTaskBuilder = - Tasks.foreach(groupStream) + Tasks.foreach(planResult.groups()) .executeWith(rewriteService) .stopOnFailure() .noRetry() .onFailure( - (fileGroup, exception) -> { - LOG.warn( - "Failure during rewrite process for group {}", fileGroup.info(), exception); - }); + (fileGroup, exception) -> + LOG.warn( + "Failure during rewrite process for group {}", + fileGroup.info(), + exception)); try { - rewriteTaskBuilder.run( - fileGroup -> { - rewrittenGroups.add(rewriteFiles(ctx, fileGroup)); - }); + rewriteTaskBuilder.run(fileGroup -> rewrittenGroups.add(rewriteFiles(planResult, fileGroup))); } catch (Exception e) { // At least one rewrite group failed, clean up all completed rewrites LOG.error( @@ -345,20 +279,19 @@ private Builder doExecute( } private Builder doExecuteWithPartialProgress( - RewriteExecutionContext ctx, - Stream groupStream, - RewriteDataFilesCommitManager commitManager) { + RewritePlanResult planResult, RewriteDataFilesCommitManager commitManager) { ExecutorService rewriteService = rewriteService(); // start commit service - int groupsPerCommit = IntMath.divide(ctx.totalGroupCount(), maxCommits, RoundingMode.CEILING); + int groupsPerCommit = + IntMath.divide(planResult.totalGroupCount(), maxCommits, RoundingMode.CEILING); RewriteDataFilesCommitManager.CommitService commitService = commitManager.service(groupsPerCommit); commitService.start(); Collection rewriteFailures = new ConcurrentLinkedQueue<>(); // start rewrite tasks - Tasks.foreach(groupStream) + Tasks.foreach(planResult.groups()) .suppressFailureWhenFinished() .executeWith(rewriteService) .noRetry() @@ -371,7 +304,7 @@ private Builder doExecuteWithPartialProgress( .dataFilesCount(fileGroup.numFiles()) .build()); }) - .run(fileGroup -> commitService.offer(rewriteFiles(ctx, fileGroup))); + .run(fileGroup -> commitService.offer(rewriteFiles(planResult, fileGroup))); rewriteService.shutdown(); // stop commit service @@ -404,32 +337,6 @@ private Builder doExecuteWithPartialProgress( 
.rewriteFailures(rewriteFailures); } - Stream toGroupStream( - RewriteExecutionContext ctx, Map>> groupsByPartition) { - return groupsByPartition.entrySet().stream() - .filter(e -> !e.getValue().isEmpty()) - .flatMap( - e -> { - StructLike partition = e.getKey(); - List> scanGroups = e.getValue(); - return scanGroups.stream().map(tasks -> newRewriteGroup(ctx, partition, tasks)); - }) - .sorted(RewriteFileGroup.comparator(rewriteJobOrder)); - } - - private RewriteFileGroup newRewriteGroup( - RewriteExecutionContext ctx, StructLike partition, List tasks) { - int globalIndex = ctx.currentGlobalIndex(); - int partitionIndex = ctx.currentPartitionIndex(partition); - FileGroupInfo info = - ImmutableRewriteDataFiles.FileGroupInfo.builder() - .globalIndex(globalIndex) - .partitionIndex(partitionIndex) - .partition(partition) - .build(); - return new RewriteFileGroup(info, tasks); - } - private Iterable toRewriteResults(List commitResults) { return commitResults.stream().map(RewriteFileGroup::asResult).collect(Collectors.toList()); } @@ -492,7 +399,7 @@ void validateAndInitOptions() { PARTIAL_PROGRESS_ENABLED); } - private String jobDesc(RewriteFileGroup group, RewriteExecutionContext ctx) { + private String jobDesc(RewriteFileGroup group, RewritePlanResult planResult) { StructLike partition = group.info().partition(); if (partition.size() > 0) { return String.format( @@ -500,10 +407,10 @@ private String jobDesc(RewriteFileGroup group, RewriteExecutionContext ctx) { group.rewrittenFiles().size(), rewriter.description(), group.info().globalIndex(), - ctx.totalGroupCount(), + planResult.totalGroupCount(), partition, group.info().partitionIndex(), - ctx.groupsInPartition(partition), + planResult.groupsInPartition(partition), table.name()); } else { return String.format( @@ -511,39 +418,8 @@ private String jobDesc(RewriteFileGroup group, RewriteExecutionContext ctx) { group.rewrittenFiles().size(), rewriter.description(), group.info().globalIndex(), - ctx.totalGroupCount(), + planResult.totalGroupCount(), table.name()); } } - - @VisibleForTesting - static class RewriteExecutionContext { - private final StructLikeMap numGroupsByPartition; - private final int totalGroupCount; - private final Map partitionIndexMap; - private final AtomicInteger groupIndex; - - RewriteExecutionContext(StructLikeMap>> fileGroupsByPartition) { - this.numGroupsByPartition = fileGroupsByPartition.transformValues(List::size); - this.totalGroupCount = numGroupsByPartition.values().stream().reduce(Integer::sum).orElse(0); - this.partitionIndexMap = Maps.newConcurrentMap(); - this.groupIndex = new AtomicInteger(1); - } - - public int currentGlobalIndex() { - return groupIndex.getAndIncrement(); - } - - public int currentPartitionIndex(StructLike partition) { - return partitionIndexMap.merge(partition, 1, Integer::sum); - } - - public int groupsInPartition(StructLike partition) { - return numGroupsByPartition.get(partition); - } - - public int totalGroupCount() { - return totalGroupCount; - } - } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java index 38c4d32a90d2..2127b20aa9b1 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java @@ -108,7 +108,6 @@ import org.apache.iceberg.spark.SparkTableUtil; import 
org.apache.iceberg.spark.SparkWriteOptions; import org.apache.iceberg.spark.TestBase; -import org.apache.iceberg.spark.actions.RewriteDataFilesSparkAction.RewriteExecutionContext; import org.apache.iceberg.spark.data.TestHelpers; import org.apache.iceberg.spark.source.ThreeColumnRecord; import org.apache.iceberg.types.Comparators; @@ -117,7 +116,6 @@ import org.apache.iceberg.types.Types.NestedField; import org.apache.iceberg.util.ArrayUtil; import org.apache.iceberg.util.Pair; -import org.apache.iceberg.util.StructLikeMap; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Row; @@ -1852,11 +1850,8 @@ protected List currentDataFiles(Table table) { private Stream toGroupStream(Table table, RewriteDataFilesSparkAction rewrite) { rewrite.validateAndInitOptions(); - StructLikeMap>> fileGroupsByPartition = - rewrite.planFileGroups(table.currentSnapshot().snapshotId()); - return rewrite.toGroupStream( - new RewriteExecutionContext(fileGroupsByPartition), fileGroupsByPartition); + return rewrite.plan(table.currentSnapshot().snapshotId()).groups(); } protected List currentData() { From ef646fbcd9ce2c6af1ada35008ca6dbcc3d43310 Mon Sep 17 00:00:00 2001 From: Peter Vary Date: Thu, 14 Nov 2024 10:46:52 +0100 Subject: [PATCH 02/11] Russell's comments --- .../org/apache/iceberg/EmptyStructLike.java | 4 +- .../actions/RewriteFileGroupPlanner.java | 39 ++++++++++++------- .../actions/TestRewriteFileGroupPlanner.java | 4 +- .../actions/RewriteDataFilesSparkAction.java | 30 +++++++------- 4 files changed, 44 insertions(+), 33 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/EmptyStructLike.java b/api/src/main/java/org/apache/iceberg/EmptyStructLike.java index 2d57f4c01a66..8b046780aa7a 100644 --- a/api/src/main/java/org/apache/iceberg/EmptyStructLike.java +++ b/api/src/main/java/org/apache/iceberg/EmptyStructLike.java @@ -20,13 +20,13 @@ import java.io.Serializable; -class EmptyStructLike implements StructLike, Serializable { +public class EmptyStructLike implements StructLike, Serializable { private static final EmptyStructLike INSTANCE = new EmptyStructLike(); private EmptyStructLike() {} - static EmptyStructLike get() { + public static EmptyStructLike get() { return INSTANCE; } diff --git a/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java b/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java index 6d1e27503da7..d80a6163dc94 100644 --- a/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java +++ b/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java @@ -24,11 +24,11 @@ import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Stream; import org.apache.iceberg.DataFile; +import org.apache.iceberg.EmptyStructLike; import org.apache.iceberg.FileScanTask; import org.apache.iceberg.RewriteJobOrder; import org.apache.iceberg.StructLike; import org.apache.iceberg.Table; -import org.apache.iceberg.data.GenericRecord; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; @@ -40,8 +40,8 @@ import org.slf4j.LoggerFactory; /** - * Checks the files in the table, and using the {@link FileRewriter} plans the groups for - * compaction. + * Checks the files in the {@link Table}. The {@link RewriteFileGroup}s are grouped by partitions + * and split by the {@link FileRewriter}. 
*/ public class RewriteFileGroupPlanner { private static final Logger LOG = LoggerFactory.getLogger(RewriteFileGroupPlanner.class); @@ -55,10 +55,18 @@ public RewriteFileGroupPlanner( this.rewriteJobOrder = rewriteJobOrder; } - public RewritePlanResult plan( - Table table, Expression filter, long startingSnapshotId, boolean caseSensitive) { + /** + * Generates the plan for the current table. + * + * @param table to plan for + * @param filter to exclude files from planning + * @param snapshotId of the last snapshot included in the plan + * @param caseSensitive setting for filtering + * @return the generated plan which could be executed during the compaction + */ + public RewritePlan plan(Table table, Expression filter, long snapshotId, boolean caseSensitive) { StructLikeMap>> plan = - planFileGroups(table, filter, startingSnapshotId, caseSensitive); + planFileGroups(table, filter, snapshotId, caseSensitive); RewriteExecutionContext ctx = new RewriteExecutionContext(); Stream groups = plan.entrySet().stream() @@ -72,15 +80,15 @@ public RewritePlanResult plan( .sorted(RewriteFileGroup.comparator(rewriteJobOrder)); Map groupsInPartition = plan.transformValues(List::size); int totalGroupCount = groupsInPartition.values().stream().reduce(Integer::sum).orElse(0); - return new RewritePlanResult(groups, totalGroupCount, groupsInPartition); + return new RewritePlan(groups, totalGroupCount, groupsInPartition); } private StructLikeMap>> planFileGroups( - Table table, Expression filter, long startingSnapshotId, boolean caseSensitive) { + Table table, Expression filter, long snapshotId, boolean caseSensitive) { CloseableIterable fileScanTasks = table .newScan() - .useSnapshot(startingSnapshotId) + .useSnapshot(snapshotId) .caseSensitive(caseSensitive) .filter(filter) .ignoreResiduals() @@ -104,14 +112,15 @@ private StructLikeMap>> planFileGroups( private StructLikeMap> groupByPartition( Table table, Types.StructType partitionType, Iterable tasks) { StructLikeMap> filesByPartition = StructLikeMap.create(partitionType); - StructLike emptyStruct = GenericRecord.create(partitionType); for (FileScanTask task : tasks) { // If a task uses an incompatible partition spec the data inside could contain values // which belong to multiple partitions in the current spec. Treating all such files as // un-partitioned and grouping them together helps to minimize new files made. StructLike taskPartition = - task.file().specId() == table.spec().specId() ? task.file().partition() : emptyStruct; + task.file().specId() == table.spec().specId() + ? task.file().partition() + : EmptyStructLike.get(); filesByPartition.computeIfAbsent(taskPartition, unused -> Lists.newArrayList()).add(task); } @@ -130,12 +139,13 @@ private RewriteFileGroup newRewriteGroup( return new RewriteFileGroup(info, Lists.newArrayList(tasks)); } - public static class RewritePlanResult { + /** Result of the data file rewrite planning. */ + public static class RewritePlan { private final Stream groups; private final int totalGroupCount; private final Map groupsInPartition; - private RewritePlanResult( + private RewritePlan( Stream groups, int totalGroupCount, Map groupsInPartition) { @@ -144,14 +154,17 @@ private RewritePlanResult( this.groupsInPartition = groupsInPartition; } + /** The stream of the generated {@link RewriteFileGroup}s. */ public Stream groups() { return groups; } + /** The number of the generated groups in the given partition. 
*/ public int groupsInPartition(StructLike partition) { return groupsInPartition.get(partition); } + /** The total number of the groups generated by this plan. */ public int totalGroupCount() { return totalGroupCount; } diff --git a/core/src/test/java/org/apache/iceberg/actions/TestRewriteFileGroupPlanner.java b/core/src/test/java/org/apache/iceberg/actions/TestRewriteFileGroupPlanner.java index d3382fb8b349..8bf7018eccc4 100644 --- a/core/src/test/java/org/apache/iceberg/actions/TestRewriteFileGroupPlanner.java +++ b/core/src/test/java/org/apache/iceberg/actions/TestRewriteFileGroupPlanner.java @@ -91,7 +91,7 @@ void testGroups(RewriteJobOrder order) { .appendFile(FILE_6) .commit(); RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(new DummyRewriter(false), order); - RewriteFileGroupPlanner.RewritePlanResult result = + RewriteFileGroupPlanner.RewritePlan result = planner.plan(table, Expressions.alwaysTrue(), table.currentSnapshot().snapshotId(), false); List groups = result.groups().collect(Collectors.toList()); assertThat(groups.stream().map(group -> group.info().partition()).collect(Collectors.toList())) @@ -113,7 +113,7 @@ void testContext() { .commit(); RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(new DummyRewriter(true), RewriteJobOrder.FILES_DESC); - RewriteFileGroupPlanner.RewritePlanResult result = + RewriteFileGroupPlanner.RewritePlan result = planner.plan(table, Expressions.alwaysTrue(), table.currentSnapshot().snapshotId(), false); assertThat(result.totalGroupCount()).isEqualTo(6); assertThat(result.groupsInPartition(FILE_1.partition())).isEqualTo(3); diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java index fe0cbdaa4c46..84520187d3fc 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java @@ -41,7 +41,7 @@ import org.apache.iceberg.actions.RewriteDataFilesCommitManager; import org.apache.iceberg.actions.RewriteFileGroup; import org.apache.iceberg.actions.RewriteFileGroupPlanner; -import org.apache.iceberg.actions.RewriteFileGroupPlanner.RewritePlanResult; +import org.apache.iceberg.actions.RewriteFileGroupPlanner.RewritePlan; import org.apache.iceberg.exceptions.CommitFailedException; import org.apache.iceberg.exceptions.ValidationException; import org.apache.iceberg.expressions.Expression; @@ -163,17 +163,17 @@ public RewriteDataFiles.Result execute() { validateAndInitOptions(); - RewritePlanResult result = plan(startingSnapshotId); + RewritePlan plan = plan(startingSnapshotId); - if (result.totalGroupCount() == 0) { + if (plan.totalGroupCount() == 0) { LOG.info("Nothing found to rewrite in {}", table.name()); return EMPTY_RESULT; } Builder resultBuilder = partialProgressEnabled - ? doExecuteWithPartialProgress(result, commitManager(startingSnapshotId)) - : doExecute(result, commitManager(startingSnapshotId)); + ? 
doExecuteWithPartialProgress(plan, commitManager(startingSnapshotId)) + : doExecute(plan, commitManager(startingSnapshotId)); if (removeDanglingDeletes) { RemoveDanglingDeletesSparkAction action = @@ -185,13 +185,13 @@ public RewriteDataFiles.Result execute() { return resultBuilder.build(); } - RewritePlanResult plan(long startingSnapshotId) { + RewritePlan plan(long startingSnapshotId) { return new RewriteFileGroupPlanner(rewriter, rewriteJobOrder) .plan(table, filter, startingSnapshotId, caseSensitive); } @VisibleForTesting - RewriteFileGroup rewriteFiles(RewritePlanResult planResult, RewriteFileGroup fileGroup) { + RewriteFileGroup rewriteFiles(RewritePlan planResult, RewriteFileGroup fileGroup) { String desc = jobDesc(fileGroup, planResult); Set addedFiles = withJobGroupInfo( @@ -217,8 +217,7 @@ RewriteDataFilesCommitManager commitManager(long startingSnapshotId) { table, startingSnapshotId, useStartingSequenceNumber, commitSummary()); } - private Builder doExecute( - RewritePlanResult planResult, RewriteDataFilesCommitManager commitManager) { + private Builder doExecute(RewritePlan planResult, RewriteDataFilesCommitManager commitManager) { ExecutorService rewriteService = rewriteService(); ConcurrentLinkedQueue rewrittenGroups = Queues.newConcurrentLinkedQueue(); @@ -229,11 +228,10 @@ private Builder doExecute( .stopOnFailure() .noRetry() .onFailure( - (fileGroup, exception) -> - LOG.warn( - "Failure during rewrite process for group {}", - fileGroup.info(), - exception)); + (fileGroup, exception) -> { + LOG.warn( + "Failure during rewrite process for group {}", fileGroup.info(), exception); + }); try { rewriteTaskBuilder.run(fileGroup -> rewrittenGroups.add(rewriteFiles(planResult, fileGroup))); @@ -279,7 +277,7 @@ private Builder doExecute( } private Builder doExecuteWithPartialProgress( - RewritePlanResult planResult, RewriteDataFilesCommitManager commitManager) { + RewritePlan planResult, RewriteDataFilesCommitManager commitManager) { ExecutorService rewriteService = rewriteService(); // start commit service @@ -399,7 +397,7 @@ void validateAndInitOptions() { PARTIAL_PROGRESS_ENABLED); } - private String jobDesc(RewriteFileGroup group, RewritePlanResult planResult) { + private String jobDesc(RewriteFileGroup group, RewritePlan planResult) { StructLike partition = group.info().partition(); if (partition.size() > 0) { return String.format( From 4e01ed713bf2ac574fbdb1cb00076ccd19fcf45e Mon Sep 17 00:00:00 2001 From: Peter Vary Date: Thu, 14 Nov 2024 13:24:47 +0100 Subject: [PATCH 03/11] Reverting EmptyStructLike changes --- api/src/main/java/org/apache/iceberg/EmptyStructLike.java | 4 ++-- .../apache/iceberg/actions/RewriteFileGroupPlanner.java | 7 +++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/EmptyStructLike.java b/api/src/main/java/org/apache/iceberg/EmptyStructLike.java index 8b046780aa7a..2d57f4c01a66 100644 --- a/api/src/main/java/org/apache/iceberg/EmptyStructLike.java +++ b/api/src/main/java/org/apache/iceberg/EmptyStructLike.java @@ -20,13 +20,13 @@ import java.io.Serializable; -public class EmptyStructLike implements StructLike, Serializable { +class EmptyStructLike implements StructLike, Serializable { private static final EmptyStructLike INSTANCE = new EmptyStructLike(); private EmptyStructLike() {} - public static EmptyStructLike get() { + static EmptyStructLike get() { return INSTANCE; } diff --git a/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java 
b/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java index d80a6163dc94..56ba48ca431f 100644 --- a/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java +++ b/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java @@ -24,11 +24,11 @@ import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Stream; import org.apache.iceberg.DataFile; -import org.apache.iceberg.EmptyStructLike; import org.apache.iceberg.FileScanTask; import org.apache.iceberg.RewriteJobOrder; import org.apache.iceberg.StructLike; import org.apache.iceberg.Table; +import org.apache.iceberg.data.GenericRecord; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; @@ -112,15 +112,14 @@ private StructLikeMap>> planFileGroups( private StructLikeMap> groupByPartition( Table table, Types.StructType partitionType, Iterable tasks) { StructLikeMap> filesByPartition = StructLikeMap.create(partitionType); + StructLike emptyStruct = GenericRecord.create(partitionType); for (FileScanTask task : tasks) { // If a task uses an incompatible partition spec the data inside could contain values // which belong to multiple partitions in the current spec. Treating all such files as // un-partitioned and grouping them together helps to minimize new files made. StructLike taskPartition = - task.file().specId() == table.spec().specId() - ? task.file().partition() - : EmptyStructLike.get(); + task.file().specId() == table.spec().specId() ? task.file().partition() : emptyStruct; filesByPartition.computeIfAbsent(taskPartition, unused -> Lists.newArrayList()).add(task); } From 7de2eb5690d2827216b6fcb0cfd7c843dc3ba843 Mon Sep 17 00:00:00 2001 From: Peter Vary Date: Mon, 18 Nov 2024 15:22:12 +0100 Subject: [PATCH 04/11] Szehon's comments --- .../actions/RewriteFileGroupPlanner.java | 4 +-- .../actions/RewriteDataFilesSparkAction.java | 27 +++++++++---------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java b/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java index 56ba48ca431f..e7f0f8ea6518 100644 --- a/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java +++ b/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java @@ -40,8 +40,8 @@ import org.slf4j.LoggerFactory; /** - * Checks the files in the {@link Table}. The {@link RewriteFileGroup}s are grouped by partitions - * and split by the {@link FileRewriter}. + * Groups specified files in the {@link Table} by {@link RewriteFileGroup}s. These will be grouped + * by partitions. 
*/ public class RewriteFileGroupPlanner { private static final Logger LOG = LoggerFactory.getLogger(RewriteFileGroupPlanner.class); diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java index 84520187d3fc..442ebc09670f 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java @@ -191,8 +191,8 @@ RewritePlan plan(long startingSnapshotId) { } @VisibleForTesting - RewriteFileGroup rewriteFiles(RewritePlan planResult, RewriteFileGroup fileGroup) { - String desc = jobDesc(fileGroup, planResult); + RewriteFileGroup rewriteFiles(RewritePlan plan, RewriteFileGroup fileGroup) { + String desc = jobDesc(fileGroup, plan); Set addedFiles = withJobGroupInfo( newJobGroupInfo("REWRITE-DATA-FILES", desc), @@ -217,13 +217,13 @@ RewriteDataFilesCommitManager commitManager(long startingSnapshotId) { table, startingSnapshotId, useStartingSequenceNumber, commitSummary()); } - private Builder doExecute(RewritePlan planResult, RewriteDataFilesCommitManager commitManager) { + private Builder doExecute(RewritePlan plan, RewriteDataFilesCommitManager commitManager) { ExecutorService rewriteService = rewriteService(); ConcurrentLinkedQueue rewrittenGroups = Queues.newConcurrentLinkedQueue(); Tasks.Builder rewriteTaskBuilder = - Tasks.foreach(planResult.groups()) + Tasks.foreach(plan.groups()) .executeWith(rewriteService) .stopOnFailure() .noRetry() @@ -234,7 +234,7 @@ private Builder doExecute(RewritePlan planResult, RewriteDataFilesCommitManager }); try { - rewriteTaskBuilder.run(fileGroup -> rewrittenGroups.add(rewriteFiles(planResult, fileGroup))); + rewriteTaskBuilder.run(fileGroup -> rewrittenGroups.add(rewriteFiles(plan, fileGroup))); } catch (Exception e) { // At least one rewrite group failed, clean up all completed rewrites LOG.error( @@ -277,19 +277,18 @@ private Builder doExecute(RewritePlan planResult, RewriteDataFilesCommitManager } private Builder doExecuteWithPartialProgress( - RewritePlan planResult, RewriteDataFilesCommitManager commitManager) { + RewritePlan plan, RewriteDataFilesCommitManager commitManager) { ExecutorService rewriteService = rewriteService(); // start commit service - int groupsPerCommit = - IntMath.divide(planResult.totalGroupCount(), maxCommits, RoundingMode.CEILING); + int groupsPerCommit = IntMath.divide(plan.totalGroupCount(), maxCommits, RoundingMode.CEILING); RewriteDataFilesCommitManager.CommitService commitService = commitManager.service(groupsPerCommit); commitService.start(); Collection rewriteFailures = new ConcurrentLinkedQueue<>(); // start rewrite tasks - Tasks.foreach(planResult.groups()) + Tasks.foreach(plan.groups()) .suppressFailureWhenFinished() .executeWith(rewriteService) .noRetry() @@ -302,7 +301,7 @@ private Builder doExecuteWithPartialProgress( .dataFilesCount(fileGroup.numFiles()) .build()); }) - .run(fileGroup -> commitService.offer(rewriteFiles(planResult, fileGroup))); + .run(fileGroup -> commitService.offer(rewriteFiles(plan, fileGroup))); rewriteService.shutdown(); // stop commit service @@ -397,7 +396,7 @@ void validateAndInitOptions() { PARTIAL_PROGRESS_ENABLED); } - private String jobDesc(RewriteFileGroup group, RewritePlan planResult) { + private String jobDesc(RewriteFileGroup group, RewritePlan plan) { StructLike partition = 
group.info().partition(); if (partition.size() > 0) { return String.format( @@ -405,10 +404,10 @@ private String jobDesc(RewriteFileGroup group, RewritePlan planResult) { group.rewrittenFiles().size(), rewriter.description(), group.info().globalIndex(), - planResult.totalGroupCount(), + plan.totalGroupCount(), partition, group.info().partitionIndex(), - planResult.groupsInPartition(partition), + plan.groupsInPartition(partition), table.name()); } else { return String.format( @@ -416,7 +415,7 @@ private String jobDesc(RewriteFileGroup group, RewritePlan planResult) { group.rewrittenFiles().size(), rewriter.description(), group.info().globalIndex(), - planResult.totalGroupCount(), + plan.totalGroupCount(), table.name()); } } From e7f633d59a1c63811d31270aab0f935a44ba0848 Mon Sep 17 00:00:00 2001 From: Peter Vary Date: Thu, 21 Nov 2024 09:18:24 +0100 Subject: [PATCH 05/11] First version of the refactor --- ...Rewriter.java => FileRewriteExecutor.java} | 31 +-- .../iceberg/actions/FileRewriteGroup.java | 87 ++++++ .../iceberg/actions/FileRewritePlan.java | 86 ++++++ .../iceberg/actions/FileRewritePlanner.java | 69 +++++ .../iceberg/actions/RewriteFileGroup.java | 59 +---- .../actions/RewriteFileGroupPlanner.java | 199 +++++++++----- .../actions/RewritePositionDeletesGroup.java | 57 +--- .../RewritePositionDeletesGroupPlanner.java | 235 +++++++++++++++++ .../actions/SizeBasedDataRewriter.java | 109 -------- ....java => SizeBasedFileRewritePlanner.java} | 44 ++-- .../SizeBasedPositionDeletesRewriter.java | 58 ---- .../actions/TestRewriteFileGroupPlanner.java | 57 ++-- .../actions/TestSizeBasedRewriter.java | 68 ++--- .../TestRewritePositionDeleteFiles.java | 6 +- .../IcebergSortCompactionBenchmark.java | 30 +-- .../actions/RewriteDataFilesSparkAction.java | 73 ++--- ...RewritePositionDeleteFilesSparkAction.java | 206 ++++----------- ...a => SparkBinPackDataRewriteExecutor.java} | 9 +- ...inPackPositionDeletesRewriteExecutor.java} | 19 +- .../spark/actions/SparkRewriteExecutor.java | 83 ++++++ ...=> SparkShufflingDataRewriteExecutor.java} | 19 +- ...=> SparkSizeBasedDataRewriteExecutor.java} | 17 +- ...java => SparkSortDataRewriteExecutor.java} | 6 +- ...va => SparkZOrderDataRewriteExecutor.java} | 6 +- .../actions/TestRewriteDataFilesAction.java | 98 +++---- .../TestRewritePositionDeleteFilesAction.java | 52 ++-- ...java => TestSparkFileRewriteExecutor.java} | 249 +++++++++--------- .../spark/source/TestCompressionSettings.java | 4 +- 28 files changed, 1176 insertions(+), 860 deletions(-) rename core/src/main/java/org/apache/iceberg/actions/{FileRewriter.java => FileRewriteExecutor.java} (62%) create mode 100644 core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java create mode 100644 core/src/main/java/org/apache/iceberg/actions/FileRewritePlan.java create mode 100644 core/src/main/java/org/apache/iceberg/actions/FileRewritePlanner.java create mode 100644 core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletesGroupPlanner.java delete mode 100644 core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java rename core/src/main/java/org/apache/iceberg/actions/{SizeBasedFileRewriter.java => SizeBasedFileRewritePlanner.java} (92%) delete mode 100644 core/src/main/java/org/apache/iceberg/actions/SizeBasedPositionDeletesRewriter.java rename spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/{SparkBinPackDataRewriter.java => SparkBinPackDataRewriteExecutor.java} (88%) rename 
spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/{SparkBinPackPositionDeletesRewriter.java => SparkBinPackPositionDeletesRewriteExecutor.java} (88%) create mode 100644 spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkRewriteExecutor.java rename spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/{SparkShufflingDataRewriter.java => SparkShufflingDataRewriteExecutor.java} (93%) rename spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/{SparkSizeBasedDataRewriter.java => SparkSizeBasedDataRewriteExecutor.java} (74%) rename spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/{SparkSortDataRewriter.java => SparkSortDataRewriteExecutor.java} (89%) rename spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/{SparkZOrderDataRewriter.java => SparkZOrderDataRewriteExecutor.java} (97%) rename spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/{TestSparkFileRewriter.java => TestSparkFileRewriteExecutor.java} (57%) diff --git a/core/src/main/java/org/apache/iceberg/actions/FileRewriter.java b/core/src/main/java/org/apache/iceberg/actions/FileRewriteExecutor.java similarity index 62% rename from core/src/main/java/org/apache/iceberg/actions/FileRewriter.java rename to core/src/main/java/org/apache/iceberg/actions/FileRewriteExecutor.java index 7c6b4e8d7ef5..c024cb42877a 100644 --- a/core/src/main/java/org/apache/iceberg/actions/FileRewriter.java +++ b/core/src/main/java/org/apache/iceberg/actions/FileRewriteExecutor.java @@ -18,24 +18,24 @@ */ package org.apache.iceberg.actions; -import java.util.List; import java.util.Map; import java.util.Set; import org.apache.iceberg.ContentFile; import org.apache.iceberg.ContentScanTask; /** - * A class for rewriting content files. + * A class for rewriting content file groups ({@link FileRewriteGroup}). * - *

The entire rewrite operation is broken down into pieces based on partitioning, and size-based - * groups within a partition. These subunits of the rewrite are referred to as file groups. A file - * group will be processed by a single framework "action". For example, in Spark this means that - * each group would be rewritten in its own Spark job. - * - * @param the Java type of tasks to read content files - * @param the Java type of content files + * @param the Java type of the plan info + * @param the Java type of the tasks to read content files + * @param the Java type of the content files + * @param the Java type of the planned groups */ -public interface FileRewriter, F extends ContentFile> { +public interface FileRewriteExecutor< + I, + T extends ContentScanTask, + F extends ContentFile, + G extends FileRewriteGroup> { /** Returns a description for this rewriter. */ default String description() { @@ -56,14 +56,11 @@ default String description() { void init(Map options); /** - * Selects files which this rewriter believes are valid targets to be rewritten based on their - * scan tasks and groups those scan tasks into file groups. The file groups are then rewritten in - * a single executable unit, such as a Spark job. + * Initializes the rewriter using the information generated during planning. * - * @param tasks an iterable of scan task for files in a partition - * @return groups of scan tasks for files to be rewritten in a single executable unit + * @param plan containing the configuration data */ - Iterable> planFileGroups(Iterable tasks); + void initPlan(FileRewritePlan plan); /** * Rewrite a group of files represented by the given list of scan tasks. @@ -73,5 +70,5 @@ default String description() { * @param group a group of scan tasks for files to be rewritten together * @return a set of newly written files */ - Set rewrite(List group); + Set rewrite(G group); } diff --git a/core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java b/core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java new file mode 100644 index 000000000000..c43bf5cd85f6 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.actions; + +import java.util.Comparator; +import java.util.List; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.ContentScanTask; +import org.apache.iceberg.RewriteJobOrder; + +/** + * Container class representing a set of files to be rewritten by a {@link FileRewriteExecutor}. 
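+ *
+ * <p>Groups may be ordered for execution with {@link #comparator(RewriteJobOrder)}, for example
+ * {@code groups.sorted(FileRewriteGroup.comparator(RewriteJobOrder.BYTES_DESC))}.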
+ * + * @param the Java type of the plan info + * @param the Java type of the tasks to read content files + * @param the Java type of the content files + */ +public abstract class FileRewriteGroup, F extends ContentFile> { + private final I info; + private final List fileScanTasks; + private final long splitSize; + private final int expectedOutputFiles; + + protected FileRewriteGroup( + I info, List fileScanTasks, long splitSize, int expectedOutputFiles) { + this.info = info; + this.fileScanTasks = fileScanTasks; + this.splitSize = splitSize; + this.expectedOutputFiles = expectedOutputFiles; + } + + public I info() { + return info; + } + + public List fileScans() { + return fileScanTasks; + } + + public long splitSize() { + return splitSize; + } + + public int expectedOutputFiles() { + return expectedOutputFiles; + } + + public long sizeInBytes() { + return fileScanTasks.stream().mapToLong(T::length).sum(); + } + + public int numInputFiles() { + return fileScanTasks.size(); + } + + public static , F extends ContentFile> + Comparator> comparator(RewriteJobOrder rewriteJobOrder) { + switch (rewriteJobOrder) { + case BYTES_ASC: + return Comparator.comparing(FileRewriteGroup::sizeInBytes); + case BYTES_DESC: + return Comparator.comparing(FileRewriteGroup::sizeInBytes, Comparator.reverseOrder()); + case FILES_ASC: + return Comparator.comparing(FileRewriteGroup::numInputFiles); + case FILES_DESC: + return Comparator.comparing(FileRewriteGroup::numInputFiles, Comparator.reverseOrder()); + default: + return (unused, unused2) -> 0; + } + } +} diff --git a/core/src/main/java/org/apache/iceberg/actions/FileRewritePlan.java b/core/src/main/java/org/apache/iceberg/actions/FileRewritePlan.java new file mode 100644 index 000000000000..ad6349de2f80 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/actions/FileRewritePlan.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.actions; + +import java.util.Map; +import java.util.stream.Stream; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.ContentScanTask; +import org.apache.iceberg.StructLike; + +/** + * Result of the file rewrite planning. + * + *
>
Contains the planned groups, calculated values required by the {@link FileRewriteExecutor}s + * and statistics. + * + * @param the Java type of the plan info + * @param the Java type of the tasks to read content files + * @param the Java type of the content files + * @param the Java type of the planned groups + */ +public class FileRewritePlan< + I, + T extends ContentScanTask, + F extends ContentFile, + G extends FileRewriteGroup> { + private final Stream groups; + private final int totalGroupCount; + private final Map groupsInPartition; + private final long writeMaxFileSize; + private final int outputSpecId; + + protected FileRewritePlan( + Stream groups, + int totalGroupCount, + Map groupsInPartition, + long writeMaxFileSize, + int outputSpecId) { + this.groups = groups; + this.totalGroupCount = totalGroupCount; + this.groupsInPartition = groupsInPartition; + this.writeMaxFileSize = writeMaxFileSize; + this.outputSpecId = outputSpecId; + } + + /** The stream of the generated {@link RewriteFileGroup}s. */ + public Stream groups() { + return groups; + } + + /** The number of the generated groups in the given partition. */ + public int groupsInPartition(StructLike partition) { + return groupsInPartition.get(partition); + } + + /** The total number of the groups generated by this plan. */ + public int totalGroupCount() { + return totalGroupCount; + } + + /** Calculated maximum file size for the target files */ + public long writeMaxFileSize() { + return writeMaxFileSize; + } + + /** Partition specification id for the target files */ + public int outputSpecId() { + return outputSpecId; + } +} diff --git a/core/src/main/java/org/apache/iceberg/actions/FileRewritePlanner.java b/core/src/main/java/org/apache/iceberg/actions/FileRewritePlanner.java new file mode 100644 index 000000000000..ff770874a9a3 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/actions/FileRewritePlanner.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.actions; + +import java.util.Map; +import java.util.Set; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.ContentScanTask; + +/** + * A class for planning content file rewrites. + * + *
>
The entire rewrite operation is broken down into pieces based on partitioning, and size-based + * groups within a partition. These subunits of the rewrite are referred to as file groups. A file + * group will be processed by a {@link FileRewriteExecutor} in a single framework "action". For + * example, in Spark this means that each group would be rewritten in its own Spark job. + * + * @param the Java type of the plan info + * @param the Java type of the tasks to read content files + * @param the Java type of the content files + * @param the Java type of the planned groups + */ +public interface FileRewritePlanner< + I, + T extends ContentScanTask, + F extends ContentFile, + G extends FileRewriteGroup> { + + /** Returns a description for this rewriter. */ + default String description() { + return getClass().getName(); + } + + /** + * Returns a set of supported options for this rewriter. Only options specified in this list will + * be accepted at runtime. Any other options will be rejected. + */ + Set validOptions(); + + /** + * Initializes this rewriter using provided options. + * + * @param options options to initialize this rewriter + */ + void init(Map options); + + /** + * Generates the plan for rewrite. + * + * @return the generated plan which could be executed during the compaction + */ + FileRewritePlan plan(); +} diff --git a/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroup.java b/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroup.java index dfc9842780f5..b43d94a2bb8c 100644 --- a/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroup.java +++ b/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroup.java @@ -18,39 +18,29 @@ */ package org.apache.iceberg.actions; -import java.util.Comparator; import java.util.List; import java.util.Set; import java.util.stream.Collectors; import org.apache.iceberg.DataFile; import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.RewriteJobOrder; import org.apache.iceberg.actions.RewriteDataFiles.FileGroupInfo; import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.util.DataFileSet; /** - * Container class representing a set of files to be rewritten by a RewriteAction and the new files - * which have been written by the action. + * Container class representing a set of data files to be rewritten by a RewriteAction and the new + * files which have been written by the action. 
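With the comparator moved from the concrete group classes to the FileRewriteGroup base class, one comparator now serves every group type. A short sketch, assuming the groups come from an already-generated plan (the GroupOrdering helper is hypothetical):

import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.RewriteJobOrder;
import org.apache.iceberg.actions.FileRewriteGroup;
import org.apache.iceberg.actions.RewriteDataFiles.FileGroupInfo;
import org.apache.iceberg.actions.RewriteFileGroup;

class GroupOrdering {
  // Order planned groups so that the largest ones are rewritten first.
  static List<RewriteFileGroup> largestFirst(List<RewriteFileGroup> groups) {
    Comparator<FileRewriteGroup<FileGroupInfo, FileScanTask, DataFile>> byBytesDesc =
        FileRewriteGroup.comparator(RewriteJobOrder.BYTES_DESC);
    return groups.stream().sorted(byBytesDesc).collect(Collectors.toList());
  }
}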
*/ -public class RewriteFileGroup { - private final FileGroupInfo info; - private final List fileScanTasks; - +public class RewriteFileGroup extends FileRewriteGroup { private DataFileSet addedFiles = DataFileSet.create(); - public RewriteFileGroup(FileGroupInfo info, List fileScanTasks) { - this.info = info; - this.fileScanTasks = fileScanTasks; - } - - public FileGroupInfo info() { - return info; - } - - public List fileScans() { - return fileScanTasks; + public RewriteFileGroup( + FileGroupInfo info, + List fileScanTasks, + long splitSize, + int expectedOutputFiles) { + super(info, fileScanTasks, splitSize, expectedOutputFiles); } public void setOutputFiles(Set files) { @@ -70,9 +60,9 @@ public Set addedFiles() { public RewriteDataFiles.FileGroupRewriteResult asResult() { Preconditions.checkState(addedFiles != null, "Cannot get result, Group was never rewritten"); return ImmutableRewriteDataFiles.FileGroupRewriteResult.builder() - .info(info) + .info(info()) .addedDataFilesCount(addedFiles.size()) - .rewrittenDataFilesCount(fileScanTasks.size()) + .rewrittenDataFilesCount(fileScans().size()) .rewrittenBytesCount(sizeInBytes()) .build(); } @@ -80,35 +70,12 @@ public RewriteDataFiles.FileGroupRewriteResult asResult() { @Override public String toString() { return MoreObjects.toStringHelper(this) - .add("info", info) - .add("numRewrittenFiles", fileScanTasks.size()) + .add("info", info()) + .add("numRewrittenFiles", fileScans().size()) .add( "numAddedFiles", addedFiles == null ? "Rewrite Incomplete" : Integer.toString(addedFiles.size())) .add("numRewrittenBytes", sizeInBytes()) .toString(); } - - public long sizeInBytes() { - return fileScanTasks.stream().mapToLong(FileScanTask::length).sum(); - } - - public int numFiles() { - return fileScanTasks.size(); - } - - public static Comparator comparator(RewriteJobOrder rewriteJobOrder) { - switch (rewriteJobOrder) { - case BYTES_ASC: - return Comparator.comparing(RewriteFileGroup::sizeInBytes); - case BYTES_DESC: - return Comparator.comparing(RewriteFileGroup::sizeInBytes, Comparator.reverseOrder()); - case FILES_ASC: - return Comparator.comparing(RewriteFileGroup::numFiles); - case FILES_DESC: - return Comparator.comparing(RewriteFileGroup::numFiles, Comparator.reverseOrder()); - default: - return (unused, unused2) -> 0; - } - } } diff --git a/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java b/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java index e7f0f8ea6518..38df04217d98 100644 --- a/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java +++ b/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Stream; import org.apache.iceberg.DataFile; @@ -28,13 +29,20 @@ import org.apache.iceberg.RewriteJobOrder; import org.apache.iceberg.StructLike; import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.actions.RewriteDataFiles.FileGroupInfo; import org.apache.iceberg.data.GenericRecord; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import 
org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.PropertyUtil; import org.apache.iceberg.util.StructLikeMap; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -43,30 +51,86 @@ * Groups specified files in the {@link Table} by {@link RewriteFileGroup}s. These will be grouped * by partitions. */ -public class RewriteFileGroupPlanner { +public class RewriteFileGroupPlanner + extends SizeBasedFileRewritePlanner { + /** + * The minimum number of deletes that needs to be associated with a data file for it to be + * considered for rewriting. If a data file has this number of deletes or more, it will be + * rewritten regardless of its file size determined by {@link #MIN_FILE_SIZE_BYTES} and {@link + * #MAX_FILE_SIZE_BYTES}. If a file group contains a file that satisfies this condition, the file + * group will be rewritten regardless of the number of files in the file group determined by + * {@link #MIN_INPUT_FILES}. + * + *
>
Defaults to Integer.MAX_VALUE, which means this feature is not enabled by default. + */ + public static final String DELETE_FILE_THRESHOLD = "delete-file-threshold"; + + public static final int DELETE_FILE_THRESHOLD_DEFAULT = Integer.MAX_VALUE; + private static final Logger LOG = LoggerFactory.getLogger(RewriteFileGroupPlanner.class); - private final FileRewriter rewriter; - private final RewriteJobOrder rewriteJobOrder; + private final Expression filter; + private final long snapshotId; + private final boolean caseSensitive; + + private int deleteFileThreshold; + private RewriteJobOrder rewriteJobOrder; public RewriteFileGroupPlanner( - FileRewriter rewriter, RewriteJobOrder rewriteJobOrder) { - this.rewriter = rewriter; - this.rewriteJobOrder = rewriteJobOrder; + Table table, Expression filter, long snapshotId, boolean caseSensitive) { + super(table); + this.filter = filter; + this.snapshotId = snapshotId; + this.caseSensitive = caseSensitive; + } + + @Override + public Set validOptions() { + return ImmutableSet.builder() + .addAll(super.validOptions()) + .add(DELETE_FILE_THRESHOLD) + .add(RewriteDataFiles.REWRITE_JOB_ORDER) + .build(); + } + + @Override + public void init(Map options) { + super.init(options); + this.deleteFileThreshold = deleteFileThreshold(options); + this.rewriteJobOrder = + RewriteJobOrder.fromName( + PropertyUtil.propertyAsString( + options, + RewriteDataFiles.REWRITE_JOB_ORDER, + RewriteDataFiles.REWRITE_JOB_ORDER_DEFAULT)); + } + + @Override + protected Iterable filterFiles(Iterable tasks) { + return Iterables.filter(tasks, task -> wronglySized(task) || tooManyDeletes(task)); + } + + @Override + protected Iterable> filterFileGroups(List> groups) { + return Iterables.filter(groups, this::shouldRewrite); + } + + @Override + protected long defaultTargetFileSize() { + return PropertyUtil.propertyAsLong( + table().properties(), + TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, + TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT); } /** * Generates the plan for the current table. 
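As a usage sketch of the options wiring above: the delete-file threshold and job order are supplied through init() like any other option. The table handle and the threshold value of 3 are illustrative only.

import org.apache.iceberg.RewriteJobOrder;
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.RewriteDataFiles;
import org.apache.iceberg.actions.RewriteFileGroupPlanner;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;

class PlannerSetup {
  static RewriteFileGroupPlanner newPlanner(Table table) {
    RewriteFileGroupPlanner planner =
        new RewriteFileGroupPlanner(
            table, Expressions.alwaysTrue(), table.currentSnapshot().snapshotId(), true);
    // rewrite any file with at least 3 associated deletes, largest groups first
    planner.init(
        ImmutableMap.of(
            RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, "3",
            RewriteDataFiles.REWRITE_JOB_ORDER, RewriteJobOrder.BYTES_DESC.name()));
    return planner;
  }
}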
* - * @param table to plan for - * @param filter to exclude files from planning - * @param snapshotId of the last snapshot included in the plan - * @param caseSensitive setting for filtering * @return the generated plan which could be executed during the compaction */ - public RewritePlan plan(Table table, Expression filter, long snapshotId, boolean caseSensitive) { - StructLikeMap>> plan = - planFileGroups(table, filter, snapshotId, caseSensitive); + @Override + public FileRewritePlan plan() { + StructLikeMap>> plan = planFileGroups(); RewriteExecutionContext ctx = new RewriteExecutionContext(); Stream groups = plan.entrySet().stream() @@ -75,31 +139,67 @@ public RewritePlan plan(Table table, Expression filter, long snapshotId, boolean e -> { StructLike partition = e.getKey(); List> scanGroups = e.getValue(); - return scanGroups.stream().map(tasks -> newRewriteGroup(ctx, partition, tasks)); + return scanGroups.stream() + .map( + tasks -> { + long inputSize = inputSize(tasks); + return newRewriteGroup( + ctx, + partition, + tasks, + splitSize(inputSize), + numOutputFiles(inputSize)); + }); }) .sorted(RewriteFileGroup.comparator(rewriteJobOrder)); Map groupsInPartition = plan.transformValues(List::size); int totalGroupCount = groupsInPartition.values().stream().reduce(Integer::sum).orElse(0); - return new RewritePlan(groups, totalGroupCount, groupsInPartition); + return new FileRewritePlan<>( + groups, totalGroupCount, groupsInPartition, writeMaxFileSize(), outputSpecId()); } - private StructLikeMap>> planFileGroups( - Table table, Expression filter, long snapshotId, boolean caseSensitive) { - CloseableIterable fileScanTasks = - table - .newScan() - .useSnapshot(snapshotId) - .caseSensitive(caseSensitive) - .filter(filter) - .ignoreResiduals() - .planFiles(); + @VisibleForTesting + CloseableIterable tasks() { + return table() + .newScan() + .useSnapshot(snapshotId) + .caseSensitive(caseSensitive) + .filter(filter) + .ignoreResiduals() + .planFiles(); + } + + private int deleteFileThreshold(Map options) { + int value = + PropertyUtil.propertyAsInt(options, DELETE_FILE_THRESHOLD, DELETE_FILE_THRESHOLD_DEFAULT); + Preconditions.checkArgument( + value >= 0, "'%s' is set to %s but must be >= 0", DELETE_FILE_THRESHOLD, value); + return value; + } + + private boolean tooManyDeletes(FileScanTask task) { + return task.deletes() != null && task.deletes().size() >= deleteFileThreshold; + } + + private boolean shouldRewrite(List group) { + return enoughInputFiles(group) + || enoughContent(group) + || tooMuchContent(group) + || anyTaskHasTooManyDeletes(group); + } + + private boolean anyTaskHasTooManyDeletes(List group) { + return group.stream().anyMatch(this::tooManyDeletes); + } + + private StructLikeMap>> planFileGroups() { + CloseableIterable fileScanTasks = tasks(); try { - Types.StructType partitionType = table.spec().partitionType(); + Types.StructType partitionType = table().spec().partitionType(); StructLikeMap> filesByPartition = - groupByPartition(table, partitionType, fileScanTasks); - return filesByPartition.transformValues( - tasks -> ImmutableList.copyOf(rewriter.planFileGroups(tasks))); + groupByPartition(table(), partitionType, fileScanTasks); + return filesByPartition.transformValues(tasks -> ImmutableList.copyOf(planFileGroups(tasks))); } finally { try { fileScanTasks.close(); @@ -128,45 +228,18 @@ private StructLikeMap> groupByPartition( } private RewriteFileGroup newRewriteGroup( - RewriteExecutionContext ctx, StructLike partition, List tasks) { - RewriteDataFiles.FileGroupInfo info 
= + RewriteExecutionContext ctx, + StructLike partition, + List tasks, + long splitSize, + int numOutputSize) { + FileGroupInfo info = ImmutableRewriteDataFiles.FileGroupInfo.builder() .globalIndex(ctx.currentGlobalIndex()) .partitionIndex(ctx.currentPartitionIndex(partition)) .partition(partition) .build(); - return new RewriteFileGroup(info, Lists.newArrayList(tasks)); - } - - /** Result of the data file rewrite planning. */ - public static class RewritePlan { - private final Stream groups; - private final int totalGroupCount; - private final Map groupsInPartition; - - private RewritePlan( - Stream groups, - int totalGroupCount, - Map groupsInPartition) { - this.groups = groups; - this.totalGroupCount = totalGroupCount; - this.groupsInPartition = groupsInPartition; - } - - /** The stream of the generated {@link RewriteFileGroup}s. */ - public Stream groups() { - return groups; - } - - /** The number of the generated groups in the given partition. */ - public int groupsInPartition(StructLike partition) { - return groupsInPartition.get(partition); - } - - /** The total number of the groups generated by this plan. */ - public int totalGroupCount() { - return totalGroupCount; - } + return new RewriteFileGroup(info, Lists.newArrayList(tasks), splitSize, numOutputSize); } private static class RewriteExecutionContext { diff --git a/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletesGroup.java b/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletesGroup.java index d1c688417a64..96640bb5d9b6 100644 --- a/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletesGroup.java +++ b/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletesGroup.java @@ -18,13 +18,11 @@ */ package org.apache.iceberg.actions; -import java.util.Comparator; import java.util.List; import java.util.Set; import java.util.stream.Collectors; import org.apache.iceberg.DeleteFile; import org.apache.iceberg.PositionDeletesScanTask; -import org.apache.iceberg.RewriteJobOrder; import org.apache.iceberg.actions.RewritePositionDeleteFiles.FileGroupInfo; import org.apache.iceberg.actions.RewritePositionDeleteFiles.FileGroupRewriteResult; import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; @@ -35,29 +33,23 @@ * Container class representing a set of position delete files to be rewritten by a {@link * RewritePositionDeleteFiles} and the new files which have been written by the action. 
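Downstream code consumes the result through the FileRewritePlan accessors. A hedged sketch, assuming a configured planner; the PlanInspector helper and the log wording are illustrative:

import org.apache.iceberg.DataFile;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.actions.FileRewritePlan;
import org.apache.iceberg.actions.RewriteDataFiles.FileGroupInfo;
import org.apache.iceberg.actions.RewriteFileGroup;

class PlanInspector {
  static void describe(
      FileRewritePlan<FileGroupInfo, FileScanTask, DataFile, RewriteFileGroup> plan) {
    System.out.printf("planned %d group(s)%n", plan.totalGroupCount());
    // the stream is consumed once; collect it if multiple passes are needed
    plan.groups()
        .forEach(
            group ->
                System.out.printf(
                    "group %d: %d input file(s), %d byte(s), %d expected output file(s)%n",
                    group.info().globalIndex(),
                    group.numInputFiles(),
                    group.sizeInBytes(),
                    group.expectedOutputFiles()));
  }
}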
*/ -public class RewritePositionDeletesGroup { - private final FileGroupInfo info; - private final List tasks; +public class RewritePositionDeletesGroup + extends FileRewriteGroup { private final long maxRewrittenDataSequenceNumber; private DeleteFileSet addedDeleteFiles = DeleteFileSet.create(); - public RewritePositionDeletesGroup(FileGroupInfo info, List tasks) { + public RewritePositionDeletesGroup( + FileGroupInfo info, + List tasks, + long splitSize, + int expectedOutputFiles) { + super(info, tasks, splitSize, expectedOutputFiles); Preconditions.checkArgument(!tasks.isEmpty(), "Tasks must not be empty"); - this.info = info; - this.tasks = tasks; this.maxRewrittenDataSequenceNumber = tasks.stream().mapToLong(t -> t.file().dataSequenceNumber()).max().getAsLong(); } - public FileGroupInfo info() { - return info; - } - - public List tasks() { - return tasks; - } - public void setOutputFiles(Set files) { addedDeleteFiles = DeleteFileSet.of(files); } @@ -67,7 +59,7 @@ public long maxRewrittenDataSequenceNumber() { } public Set rewrittenDeleteFiles() { - return tasks().stream() + return fileScans().stream() .map(PositionDeletesScanTask::file) .collect(Collectors.toCollection(DeleteFileSet::create)); } @@ -81,9 +73,9 @@ public FileGroupRewriteResult asResult() { addedDeleteFiles != null, "Cannot get result, Group was never rewritten"); return ImmutableRewritePositionDeleteFiles.FileGroupRewriteResult.builder() - .info(info) + .info(info()) .addedDeleteFilesCount(addedDeleteFiles.size()) - .rewrittenDeleteFilesCount(tasks.size()) + .rewrittenDeleteFilesCount(fileScans().size()) .rewrittenBytesCount(rewrittenBytes()) .addedBytesCount(addedBytes()) .build(); @@ -92,8 +84,8 @@ public FileGroupRewriteResult asResult() { @Override public String toString() { return MoreObjects.toStringHelper(this) - .add("info", info) - .add("numRewrittenPositionDeleteFiles", tasks.size()) + .add("info", info()) + .add("numRewrittenPositionDeleteFiles", fileScans().size()) .add( "numAddedPositionDeleteFiles", addedDeleteFiles == null @@ -105,31 +97,10 @@ public String toString() { } public long rewrittenBytes() { - return tasks.stream().mapToLong(PositionDeletesScanTask::length).sum(); + return fileScans().stream().mapToLong(PositionDeletesScanTask::length).sum(); } public long addedBytes() { return addedDeleteFiles.stream().mapToLong(DeleteFile::fileSizeInBytes).sum(); } - - public int numRewrittenDeleteFiles() { - return tasks.size(); - } - - public static Comparator comparator(RewriteJobOrder order) { - switch (order) { - case BYTES_ASC: - return Comparator.comparing(RewritePositionDeletesGroup::rewrittenBytes); - case BYTES_DESC: - return Comparator.comparing( - RewritePositionDeletesGroup::rewrittenBytes, Comparator.reverseOrder()); - case FILES_ASC: - return Comparator.comparing(RewritePositionDeletesGroup::numRewrittenDeleteFiles); - case FILES_DESC: - return Comparator.comparing( - RewritePositionDeletesGroup::numRewrittenDeleteFiles, Comparator.reverseOrder()); - default: - return (unused, unused2) -> 0; - } - } } diff --git a/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletesGroupPlanner.java b/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletesGroupPlanner.java new file mode 100644 index 000000000000..d83677139a37 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletesGroupPlanner.java @@ -0,0 +1,235 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.actions; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Stream; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.MetadataTableType; +import org.apache.iceberg.MetadataTableUtils; +import org.apache.iceberg.Partitioning; +import org.apache.iceberg.PositionDeletesScanTask; +import org.apache.iceberg.PositionDeletesTable; +import org.apache.iceberg.RewriteJobOrder; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.actions.RewritePositionDeleteFiles.FileGroupInfo; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.PartitionUtil; +import org.apache.iceberg.util.PropertyUtil; +import org.apache.iceberg.util.StructLikeMap; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Groups specified files in the {@link Table} by {@link RewriteFileGroup}s. These will be grouped + * by partitions. + */ +public class RewritePositionDeletesGroupPlanner + extends SizeBasedFileRewritePlanner< + FileGroupInfo, PositionDeletesScanTask, DeleteFile, RewritePositionDeletesGroup> { + private static final Logger LOG = + LoggerFactory.getLogger(RewritePositionDeletesGroupPlanner.class); + + private final Expression filter; + private final boolean caseSensitive; + private RewriteJobOrder rewriteJobOrder; + + public RewritePositionDeletesGroupPlanner(Table table, Expression filter, boolean caseSensitive) { + super(table); + this.caseSensitive = caseSensitive; + this.filter = filter; + } + + @Override + public Set validOptions() { + return ImmutableSet.builder() + .addAll(super.validOptions()) + .add(RewriteDataFiles.REWRITE_JOB_ORDER) + .build(); + } + + @Override + public void init(Map options) { + super.init(options); + this.rewriteJobOrder = + RewriteJobOrder.fromName( + PropertyUtil.propertyAsString( + options, + RewritePositionDeleteFiles.REWRITE_JOB_ORDER, + RewritePositionDeleteFiles.REWRITE_JOB_ORDER_DEFAULT)); + } + + /** + * Generates the plan for the current table. 
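The position-delete planner follows the same lifecycle as the data-file planner. A minimal sketch, assuming REWRITE_ALL to bypass the size-based filtering; the PositionDeletePlanning helper is hypothetical:

import org.apache.iceberg.DeleteFile;
import org.apache.iceberg.PositionDeletesScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.FileRewritePlan;
import org.apache.iceberg.actions.RewritePositionDeleteFiles.FileGroupInfo;
import org.apache.iceberg.actions.RewritePositionDeletesGroup;
import org.apache.iceberg.actions.RewritePositionDeletesGroupPlanner;
import org.apache.iceberg.actions.SizeBasedFileRewritePlanner;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;

class PositionDeletePlanning {
  static FileRewritePlan<
          FileGroupInfo, PositionDeletesScanTask, DeleteFile, RewritePositionDeletesGroup>
      planAll(Table table) {
    RewritePositionDeletesGroupPlanner planner =
        new RewritePositionDeletesGroupPlanner(table, Expressions.alwaysTrue(), true);
    // REWRITE_ALL skips the size-based filtering and plans every delete file
    planner.init(ImmutableMap.of(SizeBasedFileRewritePlanner.REWRITE_ALL, "true"));
    return planner.plan();
  }
}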
+ * + * @return the generated plan which could be executed during the compaction + */ + @Override + public FileRewritePlan< + FileGroupInfo, PositionDeletesScanTask, DeleteFile, RewritePositionDeletesGroup> + plan() { + StructLikeMap>> plan = planFileGroups(); + RewriteExecutionContext ctx = new RewriteExecutionContext(); + Stream groups = + plan.entrySet().stream() + .filter(e -> !e.getValue().isEmpty()) + .flatMap( + e -> { + StructLike partition = e.getKey(); + List> scanGroups = e.getValue(); + return scanGroups.stream() + .map( + tasks -> { + long inputSize = inputSize(tasks); + return newRewriteGroup( + ctx, + partition, + tasks, + splitSize(inputSize), + numOutputFiles(inputSize)); + }); + }) + .sorted(RewritePositionDeletesGroup.comparator(rewriteJobOrder)); + Map groupsInPartition = plan.transformValues(List::size); + int totalGroupCount = groupsInPartition.values().stream().reduce(Integer::sum).orElse(0); + return new FileRewritePlan<>( + groups, totalGroupCount, groupsInPartition, writeMaxFileSize(), outputSpecId()); + } + + private StructLikeMap>> planFileGroups() { + Table deletesTable = + MetadataTableUtils.createMetadataTableInstance(table(), MetadataTableType.POSITION_DELETES); + CloseableIterable fileTasks = planFiles(deletesTable); + + try { + Types.StructType partitionType = Partitioning.partitionType(deletesTable); + StructLikeMap> fileTasksByPartition = + groupByPartition(partitionType, fileTasks); + return fileTasksByPartition.transformValues( + tasks -> ImmutableList.copyOf(planFileGroups(tasks))); + } finally { + try { + fileTasks.close(); + } catch (IOException io) { + LOG.error("Cannot properly close file iterable while planning for rewrite", io); + } + } + } + + @Override + protected Iterable filterFiles(Iterable tasks) { + return Iterables.filter(tasks, this::wronglySized); + } + + @Override + protected Iterable> filterFileGroups( + List> groups) { + return Iterables.filter(groups, this::shouldRewrite); + } + + private boolean shouldRewrite(List group) { + return enoughInputFiles(group) || enoughContent(group) || tooMuchContent(group); + } + + @Override + protected long defaultTargetFileSize() { + return PropertyUtil.propertyAsLong( + table().properties(), + TableProperties.DELETE_TARGET_FILE_SIZE_BYTES, + TableProperties.DELETE_TARGET_FILE_SIZE_BYTES_DEFAULT); + } + + private CloseableIterable planFiles(Table deletesTable) { + PositionDeletesTable.PositionDeletesBatchScan scan = + (PositionDeletesTable.PositionDeletesBatchScan) deletesTable.newBatchScan(); + return CloseableIterable.transform( + scan.baseTableFilter(filter).caseSensitive(caseSensitive).ignoreResiduals().planFiles(), + PositionDeletesScanTask.class::cast); + } + + private StructLikeMap> groupByPartition( + Types.StructType partitionType, Iterable tasks) { + StructLikeMap> filesByPartition = + StructLikeMap.create(partitionType); + + for (PositionDeletesScanTask task : tasks) { + StructLike coerced = coercePartition(task, partitionType); + + List partitionTasks = filesByPartition.get(coerced); + if (partitionTasks == null) { + partitionTasks = Lists.newArrayList(); + } + partitionTasks.add(task); + filesByPartition.put(coerced, partitionTasks); + } + + return filesByPartition; + } + + private RewritePositionDeletesGroup newRewriteGroup( + RewriteExecutionContext ctx, + StructLike partition, + List tasks, + long splitSize, + int numOutputSize) { + ImmutableRewritePositionDeleteFiles.FileGroupInfo info = + ImmutableRewritePositionDeleteFiles.FileGroupInfo.builder() + 
.globalIndex(ctx.currentGlobalIndex()) + .partitionIndex(ctx.currentPartitionIndex(partition)) + .partition(partition) + .build(); + return new RewritePositionDeletesGroup( + info, Lists.newArrayList(tasks), splitSize, numOutputSize); + } + + private static class RewriteExecutionContext { + private final Map partitionIndexMap; + private final AtomicInteger groupIndex; + + private RewriteExecutionContext() { + this.partitionIndexMap = Maps.newConcurrentMap(); + this.groupIndex = new AtomicInteger(1); + } + + private int currentGlobalIndex() { + return groupIndex.getAndIncrement(); + } + + private int currentPartitionIndex(StructLike partition) { + return partitionIndexMap.merge(partition, 1, Integer::sum); + } + } + + private StructLike coercePartition(PositionDeletesScanTask task, Types.StructType partitionType) { + return PartitionUtil.coercePartition(partitionType, task.spec(), task.partition()); + } +} diff --git a/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java b/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java deleted file mode 100644 index e5b5908804e7..000000000000 --- a/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.actions; - -import java.util.List; -import java.util.Map; -import java.util.Set; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.util.PropertyUtil; - -public abstract class SizeBasedDataRewriter extends SizeBasedFileRewriter { - - /** - * The minimum number of deletes that needs to be associated with a data file for it to be - * considered for rewriting. If a data file has this number of deletes or more, it will be - * rewritten regardless of its file size determined by {@link #MIN_FILE_SIZE_BYTES} and {@link - * #MAX_FILE_SIZE_BYTES}. If a file group contains a file that satisfies this condition, the file - * group will be rewritten regardless of the number of files in the file group determined by - * {@link #MIN_INPUT_FILES}. - * - *
>
Defaults to Integer.MAX_VALUE, which means this feature is not enabled by default. - */ - public static final String DELETE_FILE_THRESHOLD = "delete-file-threshold"; - - public static final int DELETE_FILE_THRESHOLD_DEFAULT = Integer.MAX_VALUE; - - private int deleteFileThreshold; - - protected SizeBasedDataRewriter(Table table) { - super(table); - } - - @Override - public Set validOptions() { - return ImmutableSet.builder() - .addAll(super.validOptions()) - .add(DELETE_FILE_THRESHOLD) - .build(); - } - - @Override - public void init(Map options) { - super.init(options); - this.deleteFileThreshold = deleteFileThreshold(options); - } - - @Override - protected Iterable filterFiles(Iterable tasks) { - return Iterables.filter(tasks, task -> wronglySized(task) || tooManyDeletes(task)); - } - - private boolean tooManyDeletes(FileScanTask task) { - return task.deletes() != null && task.deletes().size() >= deleteFileThreshold; - } - - @Override - protected Iterable> filterFileGroups(List> groups) { - return Iterables.filter(groups, this::shouldRewrite); - } - - private boolean shouldRewrite(List group) { - return enoughInputFiles(group) - || enoughContent(group) - || tooMuchContent(group) - || anyTaskHasTooManyDeletes(group); - } - - private boolean anyTaskHasTooManyDeletes(List group) { - return group.stream().anyMatch(this::tooManyDeletes); - } - - @Override - protected long defaultTargetFileSize() { - return PropertyUtil.propertyAsLong( - table().properties(), - TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, - TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT); - } - - private int deleteFileThreshold(Map options) { - int value = - PropertyUtil.propertyAsInt(options, DELETE_FILE_THRESHOLD, DELETE_FILE_THRESHOLD_DEFAULT); - Preconditions.checkArgument( - value >= 0, "'%s' is set to %s but must be >= 0", DELETE_FILE_THRESHOLD, value); - return value; - } -} diff --git a/core/src/main/java/org/apache/iceberg/actions/SizeBasedFileRewriter.java b/core/src/main/java/org/apache/iceberg/actions/SizeBasedFileRewritePlanner.java similarity index 92% rename from core/src/main/java/org/apache/iceberg/actions/SizeBasedFileRewriter.java rename to core/src/main/java/org/apache/iceberg/actions/SizeBasedFileRewritePlanner.java index 5d45392c5487..f743c689da35 100644 --- a/core/src/main/java/org/apache/iceberg/actions/SizeBasedFileRewriter.java +++ b/core/src/main/java/org/apache/iceberg/actions/SizeBasedFileRewritePlanner.java @@ -24,7 +24,6 @@ import java.util.Set; import org.apache.iceberg.ContentFile; import org.apache.iceberg.ContentScanTask; -import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Table; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; @@ -36,7 +35,7 @@ import org.slf4j.LoggerFactory; /** - * A file rewriter that determines which files to rewrite based on their size. + * A file rewrite planner that determines which files to rewrite based on their size. * *
>
If files are smaller than the {@link #MIN_FILE_SIZE_BYTES} threshold or larger than the {@link * #MAX_FILE_SIZE_BYTES} threshold, they are considered targets for being rewritten. @@ -48,10 +47,14 @@ * *
>
Note that implementations may add extra conditions for selecting files or filtering groups. */ -public abstract class SizeBasedFileRewriter, F extends ContentFile> - implements FileRewriter { +public abstract class SizeBasedFileRewritePlanner< + I, + T extends ContentScanTask, + F extends ContentFile, + G extends FileRewriteGroup> + implements FileRewritePlanner { - private static final Logger LOG = LoggerFactory.getLogger(SizeBasedFileRewriter.class); + private static final Logger LOG = LoggerFactory.getLogger(SizeBasedFileRewritePlanner.class); /** The target output file size that this file rewriter will attempt to generate. */ public static final String TARGET_FILE_SIZE_BYTES = "target-file-size-bytes"; @@ -102,7 +105,7 @@ public abstract class SizeBasedFileRewriter, F exte public static final long MAX_FILE_GROUP_SIZE_BYTES_DEFAULT = 100L * 1024 * 1024 * 1024; // 100 GB - private static final long SPLIT_OVERHEAD = 5 * 1024; + private static final long SPLIT_OVERHEAD = 5L * 1024; private final Table table; private long targetFileSize; @@ -114,7 +117,7 @@ public abstract class SizeBasedFileRewriter, F exte private int outputSpecId; - protected SizeBasedFileRewriter(Table table) { + protected SizeBasedFileRewritePlanner(Table table) { this.table = table; } @@ -145,7 +148,6 @@ public void init(Map options) { this.targetFileSize = sizeThresholds.get(TARGET_FILE_SIZE_BYTES); this.minFileSize = sizeThresholds.get(MIN_FILE_SIZE_BYTES); this.maxFileSize = sizeThresholds.get(MAX_FILE_SIZE_BYTES); - this.minInputFiles = minInputFiles(options); this.rewriteAll = rewriteAll(options); this.maxGroupSize = maxGroupSize(options); @@ -160,7 +162,6 @@ protected boolean wronglySized(T task) { return task.length() < minFileSize || task.length() > maxFileSize; } - @Override public Iterable> planFileGroups(Iterable tasks) { Iterable filteredTasks = rewriteAll ? tasks : filterFiles(tasks); BinPacking.ListPacker packer = new BinPacking.ListPacker<>(maxGroupSize, 1, false); @@ -191,14 +192,12 @@ protected long inputSize(List group) { * of output files. The final split size is adjusted to be at least as big as the target file size * but less than the max write file size. 
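To make the sizing arithmetic concrete, take the thresholds used by the split-size test later in this patch (minimum 256 MB, target 512 MB, maximum 768 MB) and a group of four 145 MB files, i.e. 580 MB of input:

  writeMaxFileSize = 512 MB + (768 MB - 512 MB) * 0.5 = 640 MB
  numOutputFiles   = 2: ceil(580 / 512) = 2, and the 68 MB remainder cannot be folded
                     into a single file because 580 MB / 1 file = 580 MB exceeds
                     min(1.1 * 512 MB, 640 MB) = 563.2 MB
  splitSize        = (580 MB / 2) + 5 KB, clamped up to the 512 MB target

The group is therefore read with 512 MB splits and is expected to produce two output files.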
*/ - public long splitSize(long inputSize) { + protected long splitSize(long inputSize) { long estimatedSplitSize = (inputSize / numOutputFiles(inputSize)) + SPLIT_OVERHEAD; if (estimatedSplitSize < targetFileSize) { return targetFileSize; - } else if (estimatedSplitSize > writeMaxFileSize()) { - return writeMaxFileSize(); } else { - return estimatedSplitSize; + return Math.min(estimatedSplitSize, writeMaxFileSize()); } } @@ -216,7 +215,7 @@ public long splitSize(long inputSize) { * @param inputSize a total input size for a file group * @return the number of files this rewriter should create */ - protected long numOutputFiles(long inputSize) { + protected int numOutputFiles(long inputSize) { if (inputSize < targetFileSize) { return 1; } @@ -227,18 +226,17 @@ protected long numOutputFiles(long inputSize) { if (LongMath.mod(inputSize, targetFileSize) > minFileSize) { // the remainder file is of a valid size for this rewrite so keep it - return numFilesWithRemainder; + return (int) numFilesWithRemainder; - } else if (avgFileSizeWithoutRemainder - < Math.min(1.1 * targetFileSize, (double) writeMaxFileSize())) { + } else if (avgFileSizeWithoutRemainder < Math.min(1.1 * targetFileSize, writeMaxFileSize())) { // if the reminder is distributed amongst other files, // the average file size will be no more than 10% bigger than the target file size // so round down and distribute remainder amongst other files - return numFilesWithoutRemainder; + return (int) numFilesWithoutRemainder; } else { // keep the remainder file as it is not OK to distribute it amongst other files - return numFilesWithRemainder; + return (int) numFilesWithRemainder; } } @@ -259,15 +257,11 @@ protected long numOutputFiles(long inputSize) { * * @return the target size plus one half of the distance between max and target */ - protected long writeMaxFileSize() { + public long writeMaxFileSize() { return (long) (targetFileSize + ((maxFileSize - targetFileSize) * 0.5)); } - protected PartitionSpec outputSpec() { - return table.specs().get(outputSpecId); - } - - protected int outputSpecId() { + public int outputSpecId() { return outputSpecId; } diff --git a/core/src/main/java/org/apache/iceberg/actions/SizeBasedPositionDeletesRewriter.java b/core/src/main/java/org/apache/iceberg/actions/SizeBasedPositionDeletesRewriter.java deleted file mode 100644 index c08a31a731f4..000000000000 --- a/core/src/main/java/org/apache/iceberg/actions/SizeBasedPositionDeletesRewriter.java +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.actions; - -import java.util.List; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.PositionDeletesScanTask; -import org.apache.iceberg.Table; -import org.apache.iceberg.TableProperties; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.util.PropertyUtil; - -public abstract class SizeBasedPositionDeletesRewriter - extends SizeBasedFileRewriter { - - protected SizeBasedPositionDeletesRewriter(Table table) { - super(table); - } - - @Override - protected Iterable filterFiles(Iterable tasks) { - return Iterables.filter(tasks, this::wronglySized); - } - - @Override - protected Iterable> filterFileGroups( - List> groups) { - return Iterables.filter(groups, this::shouldRewrite); - } - - private boolean shouldRewrite(List group) { - return enoughInputFiles(group) || enoughContent(group) || tooMuchContent(group); - } - - @Override - protected long defaultTargetFileSize() { - return PropertyUtil.propertyAsLong( - table().properties(), - TableProperties.DELETE_TARGET_FILE_SIZE_BYTES, - TableProperties.DELETE_TARGET_FILE_SIZE_BYTES_DEFAULT); - } -} diff --git a/core/src/test/java/org/apache/iceberg/actions/TestRewriteFileGroupPlanner.java b/core/src/test/java/org/apache/iceberg/actions/TestRewriteFileGroupPlanner.java index 8bf7018eccc4..903e7b27313c 100644 --- a/core/src/test/java/org/apache/iceberg/actions/TestRewriteFileGroupPlanner.java +++ b/core/src/test/java/org/apache/iceberg/actions/TestRewriteFileGroupPlanner.java @@ -18,12 +18,12 @@ */ package org.apache.iceberg.actions; +import static org.apache.iceberg.actions.RewriteDataFiles.REWRITE_JOB_ORDER; import static org.assertj.core.api.Assertions.assertThat; import java.io.File; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.UUID; import java.util.stream.Collectors; import org.apache.iceberg.DataFile; @@ -33,10 +33,10 @@ import org.apache.iceberg.StructLike; import org.apache.iceberg.TestBase; import org.apache.iceberg.TestTables; +import org.apache.iceberg.actions.RewriteDataFiles.FileGroupInfo; import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -90,9 +90,14 @@ void testGroups(RewriteJobOrder order) { .appendFile(FILE_5) .appendFile(FILE_6) .commit(); - RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(new DummyRewriter(false), order); - RewriteFileGroupPlanner.RewritePlan result = - planner.plan(table, Expressions.alwaysTrue(), table.currentSnapshot().snapshotId(), false); + RewriteFileGroupPlanner planner = + new RewriteFileGroupPlanner( + table, Expressions.alwaysTrue(), table.currentSnapshot().snapshotId(), false); + planner.init( + ImmutableMap.of( + RewriteFileGroupPlanner.REWRITE_ALL, "true", REWRITE_JOB_ORDER, order.name())); + FileRewritePlan result = + planner.plan(); List groups = result.groups().collect(Collectors.toList()); assertThat(groups.stream().map(group -> group.info().partition()).collect(Collectors.toList())) .isEqualTo(EXPECTED.get(order)); @@ -112,44 +117,22 @@ void testContext() { .appendFile(FILE_6) .commit(); RewriteFileGroupPlanner planner = - new RewriteFileGroupPlanner(new DummyRewriter(true), RewriteJobOrder.FILES_DESC); - 
RewriteFileGroupPlanner.RewritePlan result = - planner.plan(table, Expressions.alwaysTrue(), table.currentSnapshot().snapshotId(), false); + new RewriteFileGroupPlanner( + table, Expressions.alwaysTrue(), table.currentSnapshot().snapshotId(), false); + planner.init( + ImmutableMap.of( + RewriteFileGroupPlanner.REWRITE_ALL, + "true", + RewriteFileGroupPlanner.MAX_FILE_GROUP_SIZE_BYTES, + "10")); + FileRewritePlan result = + planner.plan(); assertThat(result.totalGroupCount()).isEqualTo(6); assertThat(result.groupsInPartition(FILE_1.partition())).isEqualTo(3); assertThat(result.groupsInPartition(FILE_4.partition())).isEqualTo(2); assertThat(result.groupsInPartition(FILE_6.partition())).isEqualTo(1); } - private static class DummyRewriter implements FileRewriter { - private final boolean split; - - private DummyRewriter(boolean split) { - this.split = split; - } - - @Override - public Set validOptions() { - return Set.of(); - } - - @Override - public void init(Map options) {} - - @Override - public Iterable> planFileGroups(Iterable tasks) { - List taskList = Lists.newArrayList(tasks); - return split - ? taskList.stream().map(ImmutableList::of).collect(Collectors.toList()) - : ImmutableList.of(taskList); - } - - @Override - public Set rewrite(List group) { - return Set.of(); - } - } - private static DataFile newDataFile(String partitionPath, long fileSize) { return DataFiles.builder(TestBase.SPEC) .withPath("/path/to/data-" + UUID.randomUUID() + ".parquet") diff --git a/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedRewriter.java b/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedRewriter.java index 77d16d3bc821..82286d250574 100644 --- a/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedRewriter.java +++ b/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedRewriter.java @@ -19,25 +19,30 @@ package org.apache.iceberg.actions; import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.when; import java.util.Arrays; import java.util.List; import java.util.Map; -import java.util.Set; import org.apache.iceberg.DataFile; import org.apache.iceberg.FileScanTask; import org.apache.iceberg.MockFileScanTask; import org.apache.iceberg.ParameterizedTestExtension; import org.apache.iceberg.Parameters; +import org.apache.iceberg.StructLike; import org.apache.iceberg.Table; import org.apache.iceberg.TestBase; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.junit.jupiter.api.TestTemplate; import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mockito; @ExtendWith(ParameterizedTestExtension.class) -public class TestSizeBasedRewriter extends TestBase { +class TestSizeBasedRewriter extends TestBase { @Parameters(name = "formatVersion = {0}") protected static List parameters() { @@ -45,54 +50,57 @@ protected static List parameters() { } @TestTemplate - public void testSplitSizeLowerBound() { - SizeBasedDataFileRewriterImpl rewriter = new SizeBasedDataFileRewriterImpl(table); - - FileScanTask task1 = new MockFileScanTask(145L * 1024 * 1024); - FileScanTask task2 = new MockFileScanTask(145L * 1024 * 1024); - FileScanTask task3 = new MockFileScanTask(145L * 1024 * 1024); - FileScanTask task4 = new MockFileScanTask(145L * 1024 * 1024); + void 
testSplitSizeLowerBound() { + FileScanTask task1 = new MockFileScanTask(mockDataFile()); + FileScanTask task2 = new MockFileScanTask(mockDataFile()); + FileScanTask task3 = new MockFileScanTask(mockDataFile()); + FileScanTask task4 = new MockFileScanTask(mockDataFile()); List tasks = ImmutableList.of(task1, task2, task3, task4); + RewriteFileGroupPlanner planner = new TestingPlanner(table, Expressions.alwaysTrue(), 1, tasks); + long minFileSize = 256L * 1024 * 1024; long targetFileSize = 512L * 1024 * 1024; long maxFileSize = 768L * 1024 * 1024; Map options = ImmutableMap.of( - SizeBasedDataRewriter.MIN_FILE_SIZE_BYTES, String.valueOf(minFileSize), - SizeBasedDataRewriter.TARGET_FILE_SIZE_BYTES, String.valueOf(targetFileSize), - SizeBasedDataRewriter.MAX_FILE_SIZE_BYTES, String.valueOf(maxFileSize)); - rewriter.init(options); + RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, String.valueOf(minFileSize), + RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, String.valueOf(targetFileSize), + RewriteFileGroupPlanner.MAX_FILE_SIZE_BYTES, String.valueOf(maxFileSize)); + planner.init(options); // the total task size is 580 MB and the target file size is 512 MB // the remainder must be written into a separate file as it exceeds 10% - long numOutputFiles = rewriter.computeNumOutputFiles(tasks); - assertThat(numOutputFiles).isEqualTo(2); + + RewriteFileGroup group = planner.plan().groups().iterator().next(); + + assertThat(group.expectedOutputFiles()).isEqualTo(2); // the split size must be >= targetFileSize and < maxFileSize - long splitSize = rewriter.computeSplitSize(tasks); - assertThat(splitSize).isGreaterThanOrEqualTo(targetFileSize); - assertThat(splitSize).isLessThan(maxFileSize); + long splitSize = group.sizeInBytes(); + assertThat(splitSize).isGreaterThanOrEqualTo(targetFileSize).isLessThan(maxFileSize); } - private static class SizeBasedDataFileRewriterImpl extends SizeBasedDataRewriter { + private static class TestingPlanner extends RewriteFileGroupPlanner { + private final List tasks; - SizeBasedDataFileRewriterImpl(Table table) { - super(table); + private TestingPlanner( + Table table, Expression filter, long snapshotId, List tasks) { + super(table, filter, snapshotId, false); + this.tasks = tasks; } @Override - public Set rewrite(List group) { - throw new UnsupportedOperationException("Not implemented"); - } - - public long computeSplitSize(List group) { - return splitSize(inputSize(group)); + CloseableIterable tasks() { + return CloseableIterable.withNoopClose(tasks); } + } - public long computeNumOutputFiles(List group) { - return numOutputFiles(inputSize(group)); - } + private DataFile mockDataFile() { + DataFile file = Mockito.mock(DataFile.class); + when(file.partition()).thenReturn(Mockito.mock(StructLike.class)); + when(file.fileSizeInBytes()).thenReturn(145L * 1024 * 1024); + return file; } } diff --git a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewritePositionDeleteFiles.java b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewritePositionDeleteFiles.java index f3be0a870972..5a1bdb983f7c 100644 --- a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewritePositionDeleteFiles.java +++ b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRewritePositionDeleteFiles.java @@ -51,7 +51,7 @@ import org.apache.iceberg.Table; import org.apache.iceberg.actions.RewritePositionDeleteFiles.FileGroupRewriteResult; import 
org.apache.iceberg.actions.RewritePositionDeleteFiles.Result; -import org.apache.iceberg.actions.SizeBasedFileRewriter; +import org.apache.iceberg.actions.SizeBasedFileRewritePlanner; import org.apache.iceberg.data.GenericAppenderFactory; import org.apache.iceberg.data.Record; import org.apache.iceberg.deletes.PositionDelete; @@ -217,7 +217,7 @@ private void testDanglingDelete(String partitionCol, int numDataFiles) throws Ex SparkActions.get(spark) .rewriteDataFiles(table) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .execute(); // write dangling delete files for 'old data files' @@ -230,7 +230,7 @@ private void testDanglingDelete(String partitionCol, int numDataFiles) throws Ex Result result = SparkActions.get(spark) .rewritePositionDeletes(table) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .execute(); List newDeleteFiles = deleteFiles(table); diff --git a/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java b/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java index 95bebc7caed4..88ab82bd600a 100644 --- a/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java +++ b/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java @@ -35,7 +35,7 @@ import org.apache.iceberg.SortDirection; import org.apache.iceberg.SortOrder; import org.apache.iceberg.Table; -import org.apache.iceberg.actions.SizeBasedFileRewriter; +import org.apache.iceberg.actions.SizeBasedFileRewritePlanner; import org.apache.iceberg.relocated.com.google.common.io.Files; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSchemaUtil; @@ -105,7 +105,7 @@ public void cleanUpIteration() throws IOException { public void sortInt() { SparkActions.get() .rewriteDataFiles(table()) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .sort( SortOrder.builderFor(table().schema()) .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) @@ -118,7 +118,7 @@ public void sortInt() { public void sortInt2() { SparkActions.get() .rewriteDataFiles(table()) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .sort( SortOrder.builderFor(table().schema()) .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) @@ -132,7 +132,7 @@ public void sortInt2() { public void sortInt3() { SparkActions.get() .rewriteDataFiles(table()) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .sort( SortOrder.builderFor(table().schema()) .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) @@ -148,7 +148,7 @@ public void sortInt3() { public void sortInt4() { SparkActions.get() .rewriteDataFiles(table()) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .sort( SortOrder.builderFor(table().schema()) .sortBy("intCol", SortDirection.ASC, NullOrder.NULLS_FIRST) @@ -164,7 +164,7 @@ public void sortInt4() { public void sortString() { SparkActions.get() .rewriteDataFiles(table()) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .sort( SortOrder.builderFor(table().schema()) .sortBy("stringCol", SortDirection.ASC, 
NullOrder.NULLS_FIRST) @@ -177,7 +177,7 @@ public void sortString() { public void sortFourColumns() { SparkActions.get() .rewriteDataFiles(table()) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .sort( SortOrder.builderFor(table().schema()) .sortBy("stringCol", SortDirection.ASC, NullOrder.NULLS_FIRST) @@ -193,7 +193,7 @@ public void sortFourColumns() { public void sortSixColumns() { SparkActions.get() .rewriteDataFiles(table()) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .sort( SortOrder.builderFor(table().schema()) .sortBy("stringCol", SortDirection.ASC, NullOrder.NULLS_FIRST) @@ -211,7 +211,7 @@ public void sortSixColumns() { public void zSortInt() { SparkActions.get() .rewriteDataFiles(table()) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .zOrder("intCol") .execute(); } @@ -221,7 +221,7 @@ public void zSortInt() { public void zSortInt2() { SparkActions.get() .rewriteDataFiles(table()) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .zOrder("intCol", "intCol2") .execute(); } @@ -231,7 +231,7 @@ public void zSortInt2() { public void zSortInt3() { SparkActions.get() .rewriteDataFiles(table()) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .zOrder("intCol", "intCol2", "intCol3") .execute(); } @@ -241,7 +241,7 @@ public void zSortInt3() { public void zSortInt4() { SparkActions.get() .rewriteDataFiles(table()) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .zOrder("intCol", "intCol2", "intCol3", "intCol4") .execute(); } @@ -251,7 +251,7 @@ public void zSortInt4() { public void zSortString() { SparkActions.get() .rewriteDataFiles(table()) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .zOrder("stringCol") .execute(); } @@ -261,7 +261,7 @@ public void zSortString() { public void zSortFourColumns() { SparkActions.get() .rewriteDataFiles(table()) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .zOrder("stringCol", "intCol", "dateCol", "doubleCol") .execute(); } @@ -271,7 +271,7 @@ public void zSortFourColumns() { public void zSortSixColumns() { SparkActions.get() .rewriteDataFiles(table()) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .zOrder("stringCol", "intCol", "dateCol", "timestampCol", "doubleCol", "longCol") .execute(); } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java index 442ebc09670f..8cf189ee8b79 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java @@ -30,18 +30,17 @@ import java.util.stream.Collectors; import org.apache.iceberg.DataFile; import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.RewriteJobOrder; import org.apache.iceberg.SortOrder; import org.apache.iceberg.StructLike; import org.apache.iceberg.Table; -import org.apache.iceberg.actions.FileRewriter; +import 
org.apache.iceberg.actions.FileRewriteExecutor; +import org.apache.iceberg.actions.FileRewritePlan; import org.apache.iceberg.actions.ImmutableRewriteDataFiles; import org.apache.iceberg.actions.ImmutableRewriteDataFiles.Result.Builder; import org.apache.iceberg.actions.RewriteDataFiles; import org.apache.iceberg.actions.RewriteDataFilesCommitManager; import org.apache.iceberg.actions.RewriteFileGroup; import org.apache.iceberg.actions.RewriteFileGroupPlanner; -import org.apache.iceberg.actions.RewriteFileGroupPlanner.RewritePlan; import org.apache.iceberg.exceptions.CommitFailedException; import org.apache.iceberg.exceptions.ValidationException; import org.apache.iceberg.expressions.Expression; @@ -93,9 +92,10 @@ public class RewriteDataFilesSparkAction private boolean partialProgressEnabled; private boolean removeDanglingDeletes; private boolean useStartingSequenceNumber; - private RewriteJobOrder rewriteJobOrder; - private FileRewriter rewriter = null; private boolean caseSensitive; + private RewriteFileGroupPlanner planner = null; + private FileRewriteExecutor rewriter = + null; RewriteDataFilesSparkAction(SparkSession spark, Table table) { super(spark.cloneSession()); @@ -114,7 +114,7 @@ protected RewriteDataFilesSparkAction self() { public RewriteDataFilesSparkAction binPack() { Preconditions.checkArgument( rewriter == null, "Must use only one rewriter type (bin-pack, sort, zorder)"); - this.rewriter = new SparkBinPackDataRewriter(spark(), table); + this.rewriter = new SparkBinPackDataRewriteExecutor(spark(), table); return this; } @@ -122,7 +122,7 @@ public RewriteDataFilesSparkAction binPack() { public RewriteDataFilesSparkAction sort(SortOrder sortOrder) { Preconditions.checkArgument( rewriter == null, "Must use only one rewriter type (bin-pack, sort, zorder)"); - this.rewriter = new SparkSortDataRewriter(spark(), table, sortOrder); + this.rewriter = new SparkSortDataRewriteExecutor(spark(), table, sortOrder); return this; } @@ -130,7 +130,7 @@ public RewriteDataFilesSparkAction sort(SortOrder sortOrder) { public RewriteDataFilesSparkAction sort() { Preconditions.checkArgument( rewriter == null, "Must use only one rewriter type (bin-pack, sort, zorder)"); - this.rewriter = new SparkSortDataRewriter(spark(), table); + this.rewriter = new SparkSortDataRewriteExecutor(spark(), table); return this; } @@ -138,7 +138,7 @@ public RewriteDataFilesSparkAction sort() { public RewriteDataFilesSparkAction zOrder(String... 
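// Editor's sketch: the strategy setters above now construct Spark*DataRewriteExecutor
// instances instead of Spark*DataRewriter ones, but still enforce a single strategy per
// action. Assuming `spark` and `table`, a second strategy call keeps failing:
try {
  SparkActions.get(spark)
      .rewriteDataFiles(table)
      .binPack() // installs SparkBinPackDataRewriteExecutor
      .sort();   // rejected by the Preconditions check in the setter
} catch (IllegalArgumentException e) {
  // "Must use only one rewriter type (bin-pack, sort, zorder)"
}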
columnNames) { Preconditions.checkArgument( rewriter == null, "Must use only one rewriter type (bin-pack, sort, zorder)"); - this.rewriter = new SparkZOrderDataRewriter(spark(), table, Arrays.asList(columnNames)); + this.rewriter = new SparkZOrderDataRewriteExecutor(spark(), table, Arrays.asList(columnNames)); return this; } @@ -156,14 +156,10 @@ public RewriteDataFiles.Result execute() { long startingSnapshotId = table.currentSnapshot().snapshotId(); - // Default to BinPack if no strategy selected - if (this.rewriter == null) { - this.rewriter = new SparkBinPackDataRewriter(spark(), table); - } - - validateAndInitOptions(); + init(startingSnapshotId); - RewritePlan plan = plan(startingSnapshotId); + FileRewritePlan plan = plan(); + rewriter.initPlan(plan); if (plan.totalGroupCount() == 0) { LOG.info("Nothing found to rewrite in {}", table.name()); @@ -185,18 +181,32 @@ public RewriteDataFiles.Result execute() { return resultBuilder.build(); } - RewritePlan plan(long startingSnapshotId) { - return new RewriteFileGroupPlanner(rewriter, rewriteJobOrder) - .plan(table, filter, startingSnapshotId, caseSensitive); + @VisibleForTesting + FileRewritePlan plan() { + return planner.plan(); } @VisibleForTesting - RewriteFileGroup rewriteFiles(RewritePlan plan, RewriteFileGroup fileGroup) { + void init(long startingSnapshotId) { + + this.planner = new RewriteFileGroupPlanner(table, filter, startingSnapshotId, caseSensitive); + + // Default to BinPack if no strategy selected + if (this.rewriter == null) { + this.rewriter = new SparkBinPackDataRewriteExecutor(spark(), table); + } + + validateAndInitOptions(); + } + + @VisibleForTesting + RewriteFileGroup rewriteFiles( + FileRewritePlan plan, + RewriteFileGroup fileGroup) { String desc = jobDesc(fileGroup, plan); Set addedFiles = withJobGroupInfo( - newJobGroupInfo("REWRITE-DATA-FILES", desc), - () -> rewriter.rewrite(fileGroup.fileScans())); + newJobGroupInfo("REWRITE-DATA-FILES", desc), () -> rewriter.rewrite(fileGroup)); fileGroup.setOutputFiles(addedFiles); LOG.info("Rewrite Files Ready to be Committed - {}", desc); @@ -217,7 +227,9 @@ RewriteDataFilesCommitManager commitManager(long startingSnapshotId) { table, startingSnapshotId, useStartingSequenceNumber, commitSummary()); } - private Builder doExecute(RewritePlan plan, RewriteDataFilesCommitManager commitManager) { + private Builder doExecute( + FileRewritePlan plan, + RewriteDataFilesCommitManager commitManager) { ExecutorService rewriteService = rewriteService(); ConcurrentLinkedQueue rewrittenGroups = Queues.newConcurrentLinkedQueue(); @@ -277,7 +289,8 @@ private Builder doExecute(RewritePlan plan, RewriteDataFilesCommitManager commit } private Builder doExecuteWithPartialProgress( - RewritePlan plan, RewriteDataFilesCommitManager commitManager) { + FileRewritePlan plan, + RewriteDataFilesCommitManager commitManager) { ExecutorService rewriteService = rewriteService(); // start commit service @@ -298,7 +311,7 @@ private Builder doExecuteWithPartialProgress( rewriteFailures.add( ImmutableRewriteDataFiles.FileGroupFailureResult.builder() .info(fileGroup.info()) - .dataFilesCount(fileGroup.numFiles()) + .dataFilesCount(fileGroup.numInputFiles()) .build()); }) .run(fileGroup -> commitService.offer(rewriteFiles(plan, fileGroup))); @@ -341,6 +354,7 @@ private Iterable toRewriteResults(List void validateAndInitOptions() { Set validOptions = Sets.newHashSet(rewriter.validOptions()); validOptions.addAll(VALID_OPTIONS); + validOptions.addAll(planner.validOptions()); Set invalidKeys = 
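// Editor's sketch of the reworked execute() flow above (generic parameters elided for
// readability; all names are from the diff): planning is delegated to the new
// RewriteFileGroupPlanner and the executor is primed with the resulting plan.
long startingSnapshotId = table.currentSnapshot().snapshotId();
init(startingSnapshotId);      // builds the planner; defaults the executor to bin-pack
FileRewritePlan plan = plan(); // planner.plan()
rewriter.initPlan(plan);       // executor reads writeMaxFileSize / outputSpecId from the plan
if (plan.totalGroupCount() == 0) {
  LOG.info("Nothing found to rewrite in {}", table.name());
}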
Sets.newHashSet(options().keySet()); invalidKeys.removeAll(validOptions); @@ -351,6 +365,7 @@ void validateAndInitOptions() { invalidKeys, rewriter.description()); + planner.init(options()); rewriter.init(options()); maxConcurrentFileGroupRewrites = @@ -378,10 +393,6 @@ void validateAndInitOptions() { PropertyUtil.propertyAsBoolean( options(), REMOVE_DANGLING_DELETES, REMOVE_DANGLING_DELETES_DEFAULT); - rewriteJobOrder = - RewriteJobOrder.fromName( - PropertyUtil.propertyAsString(options(), REWRITE_JOB_ORDER, REWRITE_JOB_ORDER_DEFAULT)); - Preconditions.checkArgument( maxConcurrentFileGroupRewrites >= 1, "Cannot set %s to %s, the value must be positive.", @@ -396,7 +407,9 @@ void validateAndInitOptions() { PARTIAL_PROGRESS_ENABLED); } - private String jobDesc(RewriteFileGroup group, RewritePlan plan) { + private String jobDesc( + RewriteFileGroup group, + FileRewritePlan plan) { StructLike partition = group.info().partition(); if (partition.size() > 0) { return String.format( diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewritePositionDeleteFilesSparkAction.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewritePositionDeleteFilesSparkAction.java index 2562c74eafcc..e237f46a163f 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewritePositionDeleteFilesSparkAction.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewritePositionDeleteFilesSparkAction.java @@ -18,52 +18,39 @@ */ package org.apache.iceberg.spark.actions; -import java.io.IOException; import java.math.RoundingMode; import java.util.List; -import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.ThreadPoolExecutor; -import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; -import java.util.stream.Stream; import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.MetadataTableType; -import org.apache.iceberg.MetadataTableUtils; -import org.apache.iceberg.Partitioning; import org.apache.iceberg.PositionDeletesScanTask; -import org.apache.iceberg.PositionDeletesTable.PositionDeletesBatchScan; -import org.apache.iceberg.RewriteJobOrder; import org.apache.iceberg.StructLike; import org.apache.iceberg.Table; +import org.apache.iceberg.actions.FileRewritePlan; import org.apache.iceberg.actions.ImmutableRewritePositionDeleteFiles; import org.apache.iceberg.actions.RewritePositionDeleteFiles; import org.apache.iceberg.actions.RewritePositionDeletesCommitManager; import org.apache.iceberg.actions.RewritePositionDeletesCommitManager.CommitService; import org.apache.iceberg.actions.RewritePositionDeletesGroup; +import org.apache.iceberg.actions.RewritePositionDeletesGroupPlanner; import org.apache.iceberg.exceptions.CommitFailedException; import org.apache.iceberg.exceptions.ValidationException; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import 
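// Editor's note: option handling above is now split across two components. The accepted
// keys are the union of executor and planner options, and both are initialized from the
// same map (condensed from the hunk; REWRITE_JOB_ORDER parsing moved into the planner):
Set<String> validOptions = Sets.newHashSet(rewriter.validOptions());
validOptions.addAll(VALID_OPTIONS);
validOptions.addAll(planner.validOptions());
Set<String> invalidKeys = Sets.newHashSet(options().keySet());
invalidKeys.removeAll(validOptions);
// invalidKeys must be empty, otherwise the action fails fast
planner.init(options());
rewriter.init(options());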
org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.relocated.com.google.common.collect.Queues; import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.relocated.com.google.common.math.IntMath; import org.apache.iceberg.relocated.com.google.common.util.concurrent.MoreExecutors; import org.apache.iceberg.relocated.com.google.common.util.concurrent.ThreadFactoryBuilder; import org.apache.iceberg.spark.SparkUtil; -import org.apache.iceberg.types.Types.StructType; -import org.apache.iceberg.util.PartitionUtil; import org.apache.iceberg.util.PropertyUtil; -import org.apache.iceberg.util.StructLikeMap; import org.apache.iceberg.util.Tasks; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; @@ -86,20 +73,20 @@ public class RewritePositionDeleteFilesSparkAction ImmutableRewritePositionDeleteFiles.Result.builder().build(); private final Table table; - private final SparkBinPackPositionDeletesRewriter rewriter; + private RewritePositionDeletesGroupPlanner planner; + private final SparkBinPackPositionDeletesRewriteExecutor rewriter; private Expression filter = Expressions.alwaysTrue(); private int maxConcurrentFileGroupRewrites; private int maxCommits; private boolean partialProgressEnabled; - private RewriteJobOrder rewriteJobOrder; private boolean caseSensitive; RewritePositionDeleteFilesSparkAction(SparkSession spark, Table table) { super(spark); this.table = table; - this.rewriter = new SparkBinPackPositionDeletesRewriter(spark(), table); this.caseSensitive = SparkUtil.caseSensitive(spark); + this.rewriter = new SparkBinPackPositionDeletesRewriteExecutor(spark(), table); } @Override @@ -120,86 +107,41 @@ public RewritePositionDeleteFiles.Result execute() { return EMPTY_RESULT; } + this.planner = new RewritePositionDeletesGroupPlanner(table, filter, caseSensitive); + validateAndInitOptions(); - StructLikeMap>> fileGroupsByPartition = planFileGroups(); - RewriteExecutionContext ctx = new RewriteExecutionContext(fileGroupsByPartition); + FileRewritePlan + plan = plan(); + rewriter.initPlan(plan); - if (ctx.totalGroupCount() == 0) { + if (plan.totalGroupCount() == 0) { LOG.info("Nothing found to rewrite in {}", table.name()); return EMPTY_RESULT; } - Stream groupStream = toGroupStream(ctx, fileGroupsByPartition); - if (partialProgressEnabled) { - return doExecuteWithPartialProgress(ctx, groupStream, commitManager()); + return doExecuteWithPartialProgress(plan, commitManager()); } else { - return doExecute(ctx, groupStream, commitManager()); - } - } - - private StructLikeMap>> planFileGroups() { - Table deletesTable = - MetadataTableUtils.createMetadataTableInstance(table, MetadataTableType.POSITION_DELETES); - CloseableIterable fileTasks = planFiles(deletesTable); - - try { - StructType partitionType = Partitioning.partitionType(deletesTable); - StructLikeMap> fileTasksByPartition = - groupByPartition(partitionType, fileTasks); - return fileGroupsByPartition(fileTasksByPartition); - } finally { - try { - fileTasks.close(); - } catch (IOException io) { - LOG.error("Cannot properly close file iterable while planning for rewrite", io); - } - } - } - - private CloseableIterable planFiles(Table deletesTable) { - PositionDeletesBatchScan scan = (PositionDeletesBatchScan) deletesTable.newBatchScan(); - return CloseableIterable.transform( - scan.baseTableFilter(filter).caseSensitive(caseSensitive).ignoreResiduals().planFiles(), - task -> (PositionDeletesScanTask) task); - } - - private StructLikeMap> groupByPartition( - 
StructType partitionType, Iterable tasks) { - StructLikeMap> filesByPartition = - StructLikeMap.create(partitionType); - - for (PositionDeletesScanTask task : tasks) { - StructLike coerced = coercePartition(task, partitionType); - - List partitionTasks = filesByPartition.get(coerced); - if (partitionTasks == null) { - partitionTasks = Lists.newArrayList(); - } - partitionTasks.add(task); - filesByPartition.put(coerced, partitionTasks); + return doExecute(plan, commitManager()); } - - return filesByPartition; - } - - private StructLikeMap>> fileGroupsByPartition( - StructLikeMap> filesByPartition) { - return filesByPartition.transformValues(this::planFileGroups); } - private List> planFileGroups(List tasks) { - return ImmutableList.copyOf(rewriter.planFileGroups(tasks)); + @VisibleForTesting + FileRewritePlan + plan() { + return planner.plan(); } private RewritePositionDeletesGroup rewriteDeleteFiles( - RewriteExecutionContext ctx, RewritePositionDeletesGroup fileGroup) { - String desc = jobDesc(fileGroup, ctx); + FileRewritePlan< + FileGroupInfo, PositionDeletesScanTask, DeleteFile, RewritePositionDeletesGroup> + plan, + RewritePositionDeletesGroup fileGroup) { + String desc = jobDesc(fileGroup, plan); Set addedFiles = withJobGroupInfo( - newJobGroupInfo("REWRITE-POSITION-DELETES", desc), - () -> rewriter.rewrite(fileGroup.tasks())); + newJobGroupInfo("REWRITE-POSITION-DELETES", desc), () -> rewriter.rewrite(fileGroup)); fileGroup.setOutputFiles(addedFiles); LOG.info("Rewrite position deletes ready to be committed - {}", desc); @@ -221,8 +163,9 @@ private RewritePositionDeletesCommitManager commitManager() { } private Result doExecute( - RewriteExecutionContext ctx, - Stream groupStream, + FileRewritePlan< + FileGroupInfo, PositionDeletesScanTask, DeleteFile, RewritePositionDeletesGroup> + plan, RewritePositionDeletesCommitManager commitManager) { ExecutorService rewriteService = rewriteService(); @@ -230,7 +173,7 @@ private Result doExecute( Queues.newConcurrentLinkedQueue(); Tasks.Builder rewriteTaskBuilder = - Tasks.foreach(groupStream) + Tasks.foreach(plan.groups()) .executeWith(rewriteService) .stopOnFailure() .noRetry() @@ -242,7 +185,7 @@ private Result doExecute( exception)); try { - rewriteTaskBuilder.run(fileGroup -> rewrittenGroups.add(rewriteDeleteFiles(ctx, fileGroup))); + rewriteTaskBuilder.run(fileGroup -> rewrittenGroups.add(rewriteDeleteFiles(plan, fileGroup))); } catch (Exception e) { // At least one rewrite group failed, clean up all completed rewrites LOG.error( @@ -288,25 +231,26 @@ private Result doExecute( } private Result doExecuteWithPartialProgress( - RewriteExecutionContext ctx, - Stream groupStream, + FileRewritePlan< + FileGroupInfo, PositionDeletesScanTask, DeleteFile, RewritePositionDeletesGroup> + plan, RewritePositionDeletesCommitManager commitManager) { ExecutorService rewriteService = rewriteService(); // start commit service - int groupsPerCommit = IntMath.divide(ctx.totalGroupCount(), maxCommits, RoundingMode.CEILING); + int groupsPerCommit = IntMath.divide(plan.totalGroupCount(), maxCommits, RoundingMode.CEILING); CommitService commitService = commitManager.service(groupsPerCommit); commitService.start(); // start rewrite tasks - Tasks.foreach(groupStream) + Tasks.foreach(plan.groups()) .suppressFailureWhenFinished() .executeWith(rewriteService) .noRetry() .onFailure( (fileGroup, exception) -> LOG.error("Failure during rewrite group {}", fileGroup.info(), exception)) - .run(fileGroup -> commitService.offer(rewriteDeleteFiles(ctx, fileGroup))); + 
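// Editor's sketch: after this refactor the position-delete action mirrors the data-file
// action — a dedicated planner produces the plan, and the executor consumes it (generic
// parameters elided; see the hunk above for the full FileRewritePlan<...> types):
this.planner = new RewritePositionDeletesGroupPlanner(table, filter, caseSensitive);
validateAndInitOptions();
FileRewritePlan plan = plan(); // planner.plan()
rewriter.initPlan(plan);
return plan.totalGroupCount() == 0
    ? EMPTY_RESULT
    : partialProgressEnabled
        ? doExecuteWithPartialProgress(plan, commitManager())
        : doExecute(plan, commitManager());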
.run(fileGroup -> commitService.offer(rewriteDeleteFiles(plan, fileGroup))); rewriteService.shutdown(); // stop commit service @@ -330,36 +274,10 @@ private Result doExecuteWithPartialProgress( .build(); } - private Stream toGroupStream( - RewriteExecutionContext ctx, - Map>> groupsByPartition) { - return groupsByPartition.entrySet().stream() - .filter(e -> !e.getValue().isEmpty()) - .flatMap( - e -> { - StructLike partition = e.getKey(); - List> scanGroups = e.getValue(); - return scanGroups.stream().map(tasks -> newRewriteGroup(ctx, partition, tasks)); - }) - .sorted(RewritePositionDeletesGroup.comparator(rewriteJobOrder)); - } - - private RewritePositionDeletesGroup newRewriteGroup( - RewriteExecutionContext ctx, StructLike partition, List tasks) { - int globalIndex = ctx.currentGlobalIndex(); - int partitionIndex = ctx.currentPartitionIndex(partition); - FileGroupInfo info = - ImmutableRewritePositionDeleteFiles.FileGroupInfo.builder() - .globalIndex(globalIndex) - .partitionIndex(partitionIndex) - .partition(partition) - .build(); - return new RewritePositionDeletesGroup(info, tasks); - } - private void validateAndInitOptions() { Set validOptions = Sets.newHashSet(rewriter.validOptions()); validOptions.addAll(VALID_OPTIONS); + validOptions.addAll(planner.validOptions()); Set invalidKeys = Sets.newHashSet(options().keySet()); invalidKeys.removeAll(validOptions); @@ -370,6 +288,7 @@ private void validateAndInitOptions() { invalidKeys, rewriter.description()); + planner.init(options()); rewriter.init(options()); this.maxConcurrentFileGroupRewrites = @@ -386,10 +305,6 @@ private void validateAndInitOptions() { PropertyUtil.propertyAsBoolean( options(), PARTIAL_PROGRESS_ENABLED, PARTIAL_PROGRESS_ENABLED_DEFAULT); - this.rewriteJobOrder = - RewriteJobOrder.fromName( - PropertyUtil.propertyAsString(options(), REWRITE_JOB_ORDER, REWRITE_JOB_ORDER_DEFAULT)); - Preconditions.checkArgument( maxConcurrentFileGroupRewrites >= 1, "Cannot set %s to %s, the value must be positive.", @@ -404,7 +319,11 @@ private void validateAndInitOptions() { PARTIAL_PROGRESS_ENABLED); } - private String jobDesc(RewritePositionDeletesGroup group, RewriteExecutionContext ctx) { + private String jobDesc( + RewritePositionDeletesGroup group, + FileRewritePlan< + FileGroupInfo, PositionDeletesScanTask, DeleteFile, RewritePositionDeletesGroup> + plan) { StructLike partition = group.info().partition(); if (partition.size() > 0) { return String.format( @@ -412,10 +331,10 @@ private String jobDesc(RewritePositionDeletesGroup group, RewriteExecutionContex group.rewrittenDeleteFiles().size(), rewriter.description(), group.info().globalIndex(), - ctx.totalGroupCount(), + plan.totalGroupCount(), partition, group.info().partitionIndex(), - ctx.groupsInPartition(partition), + plan.groupsInPartition(partition), table.name()); } else { return String.format( @@ -423,43 +342,8 @@ private String jobDesc(RewritePositionDeletesGroup group, RewriteExecutionContex group.rewrittenDeleteFiles().size(), rewriter.description(), group.info().globalIndex(), - ctx.totalGroupCount(), + plan.totalGroupCount(), table.name()); } } - - static class RewriteExecutionContext { - private final StructLikeMap numGroupsByPartition; - private final int totalGroupCount; - private final Map partitionIndexMap; - private final AtomicInteger groupIndex; - - RewriteExecutionContext( - StructLikeMap>> fileTasksByPartition) { - this.numGroupsByPartition = fileTasksByPartition.transformValues(List::size); - this.totalGroupCount = 
numGroupsByPartition.values().stream().reduce(Integer::sum).orElse(0); - this.partitionIndexMap = Maps.newConcurrentMap(); - this.groupIndex = new AtomicInteger(1); - } - - public int currentGlobalIndex() { - return groupIndex.getAndIncrement(); - } - - public int currentPartitionIndex(StructLike partition) { - return partitionIndexMap.merge(partition, 1, Integer::sum); - } - - public int groupsInPartition(StructLike partition) { - return numGroupsByPartition.get(partition); - } - - public int totalGroupCount() { - return totalGroupCount; - } - } - - private StructLike coercePartition(PositionDeletesScanTask task, StructType partitionType) { - return PartitionUtil.coercePartition(partitionType, task.spec(), task.partition()); - } } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkBinPackDataRewriter.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkBinPackDataRewriteExecutor.java similarity index 88% rename from spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkBinPackDataRewriter.java rename to spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkBinPackDataRewriteExecutor.java index d256bf2794e2..d1c70ee289c6 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkBinPackDataRewriter.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkBinPackDataRewriteExecutor.java @@ -28,9 +28,9 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; -class SparkBinPackDataRewriter extends SparkSizeBasedDataRewriter { +class SparkBinPackDataRewriteExecutor extends SparkSizeBasedDataRewriteExecutor { - SparkBinPackDataRewriter(SparkSession spark, Table table) { + SparkBinPackDataRewriteExecutor(SparkSession spark, Table table) { super(spark, table); } @@ -40,14 +40,15 @@ public String description() { } @Override - protected void doRewrite(String groupId, List group) { + protected void doRewrite( + String groupId, List group, long splitSize, int expectedOutputFiles) { // read the files packing them into splits of the required size Dataset scanDF = spark() .read() .format("iceberg") .option(SparkReadOptions.SCAN_TASK_SET_ID, groupId) - .option(SparkReadOptions.SPLIT_SIZE, splitSize(inputSize(group))) + .option(SparkReadOptions.SPLIT_SIZE, splitSize) .option(SparkReadOptions.FILE_OPEN_COST, "0") .load(groupId); diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkBinPackPositionDeletesRewriter.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkBinPackPositionDeletesRewriteExecutor.java similarity index 88% rename from spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkBinPackPositionDeletesRewriter.java rename to spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkBinPackPositionDeletesRewriteExecutor.java index 5afd724aad88..fb8b73f17463 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkBinPackPositionDeletesRewriter.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkBinPackPositionDeletesRewriteExecutor.java @@ -34,7 +34,8 @@ import org.apache.iceberg.PositionDeletesScanTask; import org.apache.iceberg.StructLike; import org.apache.iceberg.Table; -import org.apache.iceberg.actions.SizeBasedPositionDeletesRewriter; +import org.apache.iceberg.actions.RewritePositionDeleteFiles.FileGroupInfo; +import org.apache.iceberg.actions.RewritePositionDeletesGroup; import 
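// Editor's note: the RewriteExecutionContext deleted above existed to hand out a global
// group index, a per-partition index, and group counts. Indexes are now assigned when
// groups are planned, and the counts live on the plan itself, so a job description can
// be derived from the group and the plan alone (hypothetical condensed sketch):
int global = group.info().globalIndex();          // was ctx.currentGlobalIndex()
int inPartition = group.info().partitionIndex();  // was ctx.currentPartitionIndex(partition)
int total = plan.totalGroupCount();               // was ctx.totalGroupCount()
int partitionTotal = plan.groupsInPartition(group.info().partition());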
org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.spark.PositionDeletesRewriteCoordinator; import org.apache.iceberg.spark.ScanTaskSetManager; @@ -51,7 +52,9 @@ import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; -class SparkBinPackPositionDeletesRewriter extends SizeBasedPositionDeletesRewriter { +class SparkBinPackPositionDeletesRewriteExecutor + extends SparkRewriteExecutor< + FileGroupInfo, PositionDeletesScanTask, DeleteFile, RewritePositionDeletesGroup> { private final SparkSession spark; private final SparkTableCache tableCache = SparkTableCache.get(); @@ -59,7 +62,7 @@ class SparkBinPackPositionDeletesRewriter extends SizeBasedPositionDeletesRewrit private final PositionDeletesRewriteCoordinator coordinator = PositionDeletesRewriteCoordinator.get(); - SparkBinPackPositionDeletesRewriter(SparkSession spark, Table table) { + SparkBinPackPositionDeletesRewriteExecutor(SparkSession spark, Table table) { super(table); // Disable Adaptive Query Execution as this may change the output partitioning of our write this.spark = spark.cloneSession(); @@ -72,14 +75,14 @@ public String description() { } @Override - public Set rewrite(List group) { + public Set rewrite(RewritePositionDeletesGroup group) { String groupId = UUID.randomUUID().toString(); Table deletesTable = MetadataTableUtils.createMetadataTableInstance(table(), POSITION_DELETES); try { tableCache.add(groupId, deletesTable); - taskSetManager.stageTasks(deletesTable, groupId, group); + taskSetManager.stageTasks(deletesTable, groupId, group.fileScans()); - doRewrite(groupId, group); + doRewrite(groupId, group.fileScans(), group.splitSize()); return coordinator.fetchNewFiles(deletesTable, groupId); } finally { @@ -89,7 +92,7 @@ public Set rewrite(List group) { } } - protected void doRewrite(String groupId, List group) { + protected void doRewrite(String groupId, List group, long splitSize) { // all position deletes are of the same partition, because they are in same file group Preconditions.checkArgument(!group.isEmpty(), "Empty group"); Types.StructType partitionType = group.get(0).spec().partitionType(); @@ -101,7 +104,7 @@ protected void doRewrite(String groupId, List group) { .read() .format("iceberg") .option(SparkReadOptions.SCAN_TASK_SET_ID, groupId) - .option(SparkReadOptions.SPLIT_SIZE, splitSize(inputSize(group))) + .option(SparkReadOptions.SPLIT_SIZE, splitSize) .option(SparkReadOptions.FILE_OPEN_COST, "0") .load(groupId); diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkRewriteExecutor.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkRewriteExecutor.java new file mode 100644 index 000000000000..f723be7d633d --- /dev/null +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkRewriteExecutor.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.actions; + +import java.util.Map; +import java.util.Set; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.ContentScanTask; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Table; +import org.apache.iceberg.actions.FileRewriteExecutor; +import org.apache.iceberg.actions.FileRewriteGroup; +import org.apache.iceberg.actions.FileRewritePlan; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; + +/** + * Common parent for data and positional delete rewrite executors. + * + * @param the Java type of the plan info + * @param the Java type of the tasks to read content files + * @param the Java type of the content files + * @param the Java type of the planned groups + */ +abstract class SparkRewriteExecutor< + I, + T extends ContentScanTask, + F extends ContentFile, + G extends FileRewriteGroup> + implements FileRewriteExecutor { + private final Table table; + private long writeMaxFileSize; + private int outputSpecId; + + SparkRewriteExecutor(Table table) { + this.table = table; + } + + Table table() { + return table; + } + + long writeMaxFileSize() { + return writeMaxFileSize; + } + + int outputSpecId() { + return outputSpecId; + } + + PartitionSpec outputSpec() { + return table.specs().get(outputSpecId); + } + + @Override + public void initPlan(FileRewritePlan plan) { + this.writeMaxFileSize = plan.writeMaxFileSize(); + this.outputSpecId = plan.outputSpecId(); + } + + @Override + public Set validOptions() { + return ImmutableSet.of(); + } + + @Override + public void init(Map options) {} +} diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingDataRewriter.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingDataRewriteExecutor.java similarity index 93% rename from spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingDataRewriter.java rename to spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingDataRewriteExecutor.java index ce572c6486cc..e5090a68bff2 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingDataRewriter.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingDataRewriteExecutor.java @@ -48,7 +48,7 @@ import org.apache.spark.sql.execution.datasources.v2.DistributionAndOrderingUtils$; import scala.Option; -abstract class SparkShufflingDataRewriter extends SparkSizeBasedDataRewriter { +abstract class SparkShufflingDataRewriteExecutor extends SparkSizeBasedDataRewriteExecutor { /** * The number of shuffle partitions and consequently the number of output files created by the @@ -82,7 +82,7 @@ abstract class SparkShufflingDataRewriter extends SparkSizeBasedDataRewriter { private double compressionFactor; private int numShufflePartitionsPerFile; - protected SparkShufflingDataRewriter(SparkSession spark, Table table) { + protected SparkShufflingDataRewriteExecutor(SparkSession spark, Table table) { super(spark, table); } @@ -118,7 +118,8 @@ public void init(Map options) { } @Override - 
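// Editor's sketch: a minimal, hypothetical concrete executor, to show what the new
// SparkRewriteExecutor base class above leaves to subclasses. initPlan() in the base
// class already caches writeMaxFileSize and outputSpecId from the plan; description()
// and rewrite(G) are assumed here to come from the FileRewriteExecutor interface.
class NoopDataRewriteExecutor
    extends SparkRewriteExecutor<FileGroupInfo, FileScanTask, DataFile, RewriteFileGroup> {

  NoopDataRewriteExecutor(Table table) {
    super(table);
  }

  @Override
  public String description() {
    return "NOOP";
  }

  @Override
  public Set<DataFile> rewrite(RewriteFileGroup group) {
    // a real executor stages group.fileScans(), runs a Spark job bounded by
    // writeMaxFileSize() and outputSpec(), and returns the files it wrote
    return ImmutableSet.of();
  }
}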
public void doRewrite(String groupId, List group) { + public void doRewrite( + String groupId, List group, long splitSize, int expectedOutputFiles) { Dataset scanDF = spark() .read() @@ -126,7 +127,7 @@ public void doRewrite(String groupId, List group) { .option(SparkReadOptions.SCAN_TASK_SET_ID, groupId) .load(groupId); - Dataset sortedDF = sortedDF(scanDF, sortFunction(group)); + Dataset sortedDF = sortedDF(scanDF, sortFunction(group, expectedOutputFiles)); sortedDF .write() @@ -139,9 +140,10 @@ public void doRewrite(String groupId, List group) { .save(groupId); } - private Function, Dataset> sortFunction(List group) { + private Function, Dataset> sortFunction( + List group, int expectedOutputFiles) { SortOrder[] ordering = Spark3Util.toOrdering(outputSortOrder(group)); - int numShufflePartitions = numShufflePartitions(group); + int numShufflePartitions = Math.max(1, expectedOutputFiles * numShufflePartitionsPerFile); return (df) -> transformPlan(df, plan -> sortPlan(plan, ordering, numShufflePartitions)); } @@ -176,11 +178,6 @@ private org.apache.iceberg.SortOrder outputSortOrder(List group) { } } - private int numShufflePartitions(List group) { - int numOutputFiles = (int) numOutputFiles((long) (inputSize(group) * compressionFactor)); - return Math.max(1, numOutputFiles * numShufflePartitionsPerFile); - } - private double compressionFactor(Map options) { double value = PropertyUtil.propertyAsDouble(options, COMPRESSION_FACTOR, COMPRESSION_FACTOR_DEFAULT); diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSizeBasedDataRewriter.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSizeBasedDataRewriteExecutor.java similarity index 74% rename from spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSizeBasedDataRewriter.java rename to spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSizeBasedDataRewriteExecutor.java index ae0e0d20dd4e..068979d8e5db 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSizeBasedDataRewriter.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSizeBasedDataRewriteExecutor.java @@ -24,38 +24,41 @@ import org.apache.iceberg.DataFile; import org.apache.iceberg.FileScanTask; import org.apache.iceberg.Table; -import org.apache.iceberg.actions.SizeBasedDataRewriter; +import org.apache.iceberg.actions.RewriteDataFiles.FileGroupInfo; +import org.apache.iceberg.actions.RewriteFileGroup; import org.apache.iceberg.spark.FileRewriteCoordinator; import org.apache.iceberg.spark.ScanTaskSetManager; import org.apache.iceberg.spark.SparkTableCache; import org.apache.spark.sql.SparkSession; -abstract class SparkSizeBasedDataRewriter extends SizeBasedDataRewriter { +abstract class SparkSizeBasedDataRewriteExecutor + extends SparkRewriteExecutor { private final SparkSession spark; private final SparkTableCache tableCache = SparkTableCache.get(); private final ScanTaskSetManager taskSetManager = ScanTaskSetManager.get(); private final FileRewriteCoordinator coordinator = FileRewriteCoordinator.get(); - SparkSizeBasedDataRewriter(SparkSession spark, Table table) { + SparkSizeBasedDataRewriteExecutor(SparkSession spark, Table table) { super(table); this.spark = spark; } - protected abstract void doRewrite(String groupId, List group); + protected abstract void doRewrite( + String groupId, List group, long splitSize, int expectedOutputFiles); protected SparkSession spark() { return spark; } @Override - public Set rewrite(List group) { + 
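// Editor's note: shuffle sizing above no longer computes output-file counts from input
// size inside the executor; the planner supplies expectedOutputFiles per group and the
// executor merely scales it. Worked example of the surviving arithmetic:
int expectedOutputFiles = 4;          // provided by the plan for this file group
int numShufflePartitionsPerFile = 2;  // SHUFFLE_PARTITIONS_PER_FILE option
int numShufflePartitions = Math.max(1, expectedOutputFiles * numShufflePartitionsPerFile); // = 8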
public Set rewrite(RewriteFileGroup group) { String groupId = UUID.randomUUID().toString(); try { tableCache.add(groupId, table()); - taskSetManager.stageTasks(table(), groupId, group); + taskSetManager.stageTasks(table(), groupId, group.fileScans()); - doRewrite(groupId, group); + doRewrite(groupId, group.fileScans(), group.splitSize(), group.expectedOutputFiles()); return coordinator.fetchNewFiles(table(), groupId); } finally { diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSortDataRewriter.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSortDataRewriteExecutor.java similarity index 89% rename from spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSortDataRewriter.java rename to spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSortDataRewriteExecutor.java index 1f70d4d7ca9d..a1d4c57894cc 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSortDataRewriter.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSortDataRewriteExecutor.java @@ -26,11 +26,11 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; -class SparkSortDataRewriter extends SparkShufflingDataRewriter { +class SparkSortDataRewriteExecutor extends SparkShufflingDataRewriteExecutor { private final SortOrder sortOrder; - SparkSortDataRewriter(SparkSession spark, Table table) { + SparkSortDataRewriteExecutor(SparkSession spark, Table table) { super(spark, table); Preconditions.checkArgument( table.sortOrder().isSorted(), @@ -39,7 +39,7 @@ class SparkSortDataRewriter extends SparkShufflingDataRewriter { this.sortOrder = table.sortOrder(); } - SparkSortDataRewriter(SparkSession spark, Table table, SortOrder sortOrder) { + SparkSortDataRewriteExecutor(SparkSession spark, Table table, SortOrder sortOrder) { super(spark, table); Preconditions.checkArgument( sortOrder != null && sortOrder.isSorted(), diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderDataRewriter.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderDataRewriteExecutor.java similarity index 97% rename from spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderDataRewriter.java rename to spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderDataRewriteExecutor.java index cc4fb78ebd18..d4dc5affb4b6 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderDataRewriter.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderDataRewriteExecutor.java @@ -44,9 +44,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -class SparkZOrderDataRewriter extends SparkShufflingDataRewriter { +class SparkZOrderDataRewriteExecutor extends SparkShufflingDataRewriteExecutor { - private static final Logger LOG = LoggerFactory.getLogger(SparkZOrderDataRewriter.class); + private static final Logger LOG = LoggerFactory.getLogger(SparkZOrderDataRewriteExecutor.class); private static final String Z_COLUMN = "ICEZVALUE"; private static final Schema Z_SCHEMA = @@ -78,7 +78,7 @@ class SparkZOrderDataRewriter extends SparkShufflingDataRewriter { private int maxOutputSize; private int varLengthContribution; - SparkZOrderDataRewriter(SparkSession spark, Table table, List zOrderColNames) { + SparkZOrderDataRewriteExecutor(SparkSession spark, Table table, List zOrderColNames) { super(spark, table); this.zOrderColNames = 
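// Editor's sketch of the executor lifecycle kept by the hunk above: stage the group's
// scan tasks under a fresh groupId, rewrite, then collect the coordinator's output.
// The group object now carries what used to be recomputed from the raw task list.
String groupId = UUID.randomUUID().toString();
try {
  tableCache.add(groupId, table());
  taskSetManager.stageTasks(table(), groupId, group.fileScans()); // was: a bare List<FileScanTask>
  doRewrite(groupId, group.fileScans(), group.splitSize(), group.expectedOutputFiles());
  return coordinator.fetchNewFiles(table(), groupId);
} finally {
  // cache / task-set / coordinator cleanup is unchanged and elided from the hunk
}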
validZOrderColNames(spark, table, zOrderColNames); } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java index 2127b20aa9b1..980a1e71bef9 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java @@ -76,8 +76,8 @@ import org.apache.iceberg.actions.RewriteDataFiles.Result; import org.apache.iceberg.actions.RewriteDataFilesCommitManager; import org.apache.iceberg.actions.RewriteFileGroup; -import org.apache.iceberg.actions.SizeBasedDataRewriter; -import org.apache.iceberg.actions.SizeBasedFileRewriter; +import org.apache.iceberg.actions.RewriteFileGroupPlanner; +import org.apache.iceberg.actions.SizeBasedFileRewritePlanner; import org.apache.iceberg.data.GenericAppenderFactory; import org.apache.iceberg.data.GenericRecord; import org.apache.iceberg.data.Record; @@ -168,7 +168,9 @@ public void setupTableLocation() throws Exception { private RewriteDataFilesSparkAction basicRewrite(Table table) { // Always compact regardless of input files table.refresh(); - return actions().rewriteDataFiles(table).option(SizeBasedFileRewriter.MIN_INPUT_FILES, "1"); + return actions() + .rewriteDataFiles(table) + .option(SizeBasedFileRewritePlanner.MIN_INPUT_FILES, "1"); } @TestTemplate @@ -289,9 +291,9 @@ public void testBinPackAfterPartitionChange() { RewriteDataFiles.Result result = basicRewrite(table) - .option(SizeBasedFileRewriter.MIN_INPUT_FILES, "1") + .option(SizeBasedFileRewritePlanner.MIN_INPUT_FILES, "1") .option( - SizeBasedFileRewriter.MIN_FILE_SIZE_BYTES, + SizeBasedFileRewritePlanner.MIN_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table) + 1000)) .option( RewriteDataFiles.TARGET_FILE_SIZE_BYTES, @@ -356,11 +358,12 @@ public void testBinPackWithDeletes() throws IOException { actions() .rewriteDataFiles(table) // do not include any file based on bin pack file size configs - .option(SizeBasedFileRewriter.MIN_FILE_SIZE_BYTES, "0") + .option(SizeBasedFileRewritePlanner.MIN_FILE_SIZE_BYTES, "0") .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE - 1)) - .option(SizeBasedFileRewriter.MAX_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE)) + .option( + SizeBasedFileRewritePlanner.MAX_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE)) // set DELETE_FILE_THRESHOLD to 1 since DVs only produce one delete file per data file - .option(SizeBasedDataRewriter.DELETE_FILE_THRESHOLD, "1") + .option(RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, "1") .execute(); assertThat(result.rewrittenDataFilesCount()) .as("Action should rewrite 5 data files") @@ -371,10 +374,11 @@ public void testBinPackWithDeletes() throws IOException { actions() .rewriteDataFiles(table) // do not include any file based on bin pack file size configs - .option(SizeBasedFileRewriter.MIN_FILE_SIZE_BYTES, "0") + .option(SizeBasedFileRewritePlanner.MIN_FILE_SIZE_BYTES, "0") .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE - 1)) - .option(SizeBasedFileRewriter.MAX_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE)) - .option(SizeBasedDataRewriter.DELETE_FILE_THRESHOLD, "2") + .option( + SizeBasedFileRewritePlanner.MAX_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE)) + .option(RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, "2") .execute(); assertThat(result.rewrittenDataFilesCount()) .as("Action 
should rewrite 2 data files") @@ -432,7 +436,7 @@ public void testRemoveDangledEqualityDeletesPartitionEvolution() { RewriteDataFiles.Result result = basicRewrite(table) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .filter(Expressions.equal("c1", 1)) .option(RewriteDataFiles.REMOVE_DANGLING_DELETES, "true") .execute(); @@ -492,7 +496,7 @@ public void testRemoveDangledPositionDeletesPartitionEvolution() throws IOExcept actions() .rewriteDataFiles(table) .filter(Expressions.equal("c1", 1)) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .option(RewriteDataFiles.REMOVE_DANGLING_DELETES, "true") .execute(); @@ -538,7 +542,7 @@ public void testBinPackWithDeleteAllData() throws IOException { Result result = actions() .rewriteDataFiles(table) - .option(SizeBasedDataRewriter.DELETE_FILE_THRESHOLD, "1") + .option(RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, "1") .execute(); assertThat(result.rewrittenDataFilesCount()).as("Action should rewrite 1 data files").isOne(); assertThat(result.rewrittenBytesCount()).isEqualTo(dataSizeBefore); @@ -689,7 +693,9 @@ public void testBinPackSplitLargeFile() { Result result = basicRewrite(table) .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(targetSize)) - .option(SizeBasedFileRewriter.MAX_FILE_SIZE_BYTES, Long.toString(targetSize * 2 - 2000)) + .option( + SizeBasedFileRewritePlanner.MAX_FILE_SIZE_BYTES, + Long.toString(targetSize * 2 - 2000)) .execute(); assertThat(result.rewrittenDataFilesCount()).as("Action should delete 1 data files").isOne(); @@ -720,8 +726,12 @@ public void testBinPackCombineMixedFiles() { Result result = basicRewrite(table) .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(targetSize + 1000)) - .option(SizeBasedFileRewriter.MAX_FILE_SIZE_BYTES, Integer.toString(targetSize + 80000)) - .option(SizeBasedFileRewriter.MIN_FILE_SIZE_BYTES, Integer.toString(targetSize - 1000)) + .option( + SizeBasedFileRewritePlanner.MAX_FILE_SIZE_BYTES, + Integer.toString(targetSize + 80000)) + .option( + SizeBasedFileRewritePlanner.MIN_FILE_SIZE_BYTES, + Integer.toString(targetSize - 1000)) .execute(); assertThat(result.rewrittenDataFilesCount()) @@ -752,10 +762,10 @@ public void testBinPackCombineMediumFiles() { basicRewrite(table) .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(targetSize)) .option( - SizeBasedFileRewriter.MAX_FILE_SIZE_BYTES, + SizeBasedFileRewritePlanner.MAX_FILE_SIZE_BYTES, Integer.toString((int) (targetSize * 1.8))) .option( - SizeBasedFileRewriter.MIN_FILE_SIZE_BYTES, + SizeBasedFileRewritePlanner.MIN_FILE_SIZE_BYTES, Integer.toString(targetSize - 100)) // All files too small .execute(); @@ -815,7 +825,7 @@ public void testMultipleGroups() { basicRewrite(table) .option( RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) - .option(SizeBasedFileRewriter.MIN_INPUT_FILES, "1") + .option(SizeBasedFileRewritePlanner.MIN_INPUT_FILES, "1") .execute(); assertThat(result.rewriteResults()).as("Should have 10 fileGroups").hasSize(10); @@ -1206,7 +1216,7 @@ public void testInvalidOptions() { () -> basicRewrite(table) .sort(SortOrder.builderFor(table.schema()).asc("c2").build()) - .option(SparkShufflingDataRewriter.SHUFFLE_PARTITIONS_PER_FILE, "5") + .option(SparkShufflingDataRewriteExecutor.SHUFFLE_PARTITIONS_PER_FILE, "5") .execute()) .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining("requires enabling Iceberg Spark 
session extensions"); @@ -1227,7 +1237,7 @@ public void testSortMultipleGroups() { RewriteDataFiles.Result result = basicRewrite(table) .sort() - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .option( RewriteDataFiles.MAX_FILE_GROUP_SIZE_BYTES, Integer.toString(fileSize * 2 + 1000)) .execute(); @@ -1257,8 +1267,8 @@ public void testSimpleSort() { RewriteDataFiles.Result result = basicRewrite(table) .sort() - .option(SizeBasedFileRewriter.MIN_INPUT_FILES, "1") - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.MIN_INPUT_FILES, "1") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .option( RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) .execute(); @@ -1291,8 +1301,8 @@ public void testSortAfterPartitionChange() { RewriteDataFiles.Result result = basicRewrite(table) .sort() - .option(SizeBasedFileRewriter.MIN_INPUT_FILES, "1") - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.MIN_INPUT_FILES, "1") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .option( RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) .execute(); @@ -1325,7 +1335,7 @@ public void testSortCustomSortOrder() { RewriteDataFiles.Result result = basicRewrite(table) .sort(SortOrder.builderFor(table.schema()).asc("c2").build()) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .option( RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table))) .execute(); @@ -1363,7 +1373,7 @@ public void testSortCustomSortOrderRequiresRepartition() { RewriteDataFiles.Result result = basicRewrite(table) .sort(SortOrder.builderFor(table.schema()).asc("c3").build()) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .option( RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table) / partitions)) @@ -1397,13 +1407,13 @@ public void testAutoSortShuffleOutput() { basicRewrite(table) .sort(SortOrder.builderFor(table.schema()).asc("c2").build()) .option( - SizeBasedFileRewriter.MAX_FILE_SIZE_BYTES, + SizeBasedFileRewritePlanner.MAX_FILE_SIZE_BYTES, Integer.toString((averageFileSize(table) / 2) + 2)) // Divide files in 2 .option( RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table) / 2)) - .option(SizeBasedFileRewriter.MIN_INPUT_FILES, "1") + .option(SizeBasedFileRewritePlanner.MIN_INPUT_FILES, "1") .execute(); assertThat(result.rewriteResults()).as("Should have 1 fileGroups").hasSize(1); @@ -1477,13 +1487,13 @@ public void testZOrderSort() { basicRewrite(table) .zOrder("c2", "c3") .option( - SizeBasedFileRewriter.MAX_FILE_SIZE_BYTES, + SizeBasedFileRewritePlanner.MAX_FILE_SIZE_BYTES, Integer.toString((averageFileSize(table) / 2) + 2)) // Divide files in 2 .option( RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Integer.toString(averageFileSize(table) / 2)) - .option(SizeBasedFileRewriter.MIN_INPUT_FILES, "1") + .option(SizeBasedFileRewritePlanner.MIN_INPUT_FILES, "1") .execute(); assertThat(result.rewriteResults()).as("Should have 1 fileGroups").hasSize(1); @@ -1539,8 +1549,8 @@ public void testZOrderAllTypesSort() { "stringCol", "binaryCol", "booleanCol") - .option(SizeBasedFileRewriter.MIN_INPUT_FILES, "1") - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.MIN_INPUT_FILES, "1") + 
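// Editor's note: as the test updates above show, the size bounds and the delete-file
// threshold now hang off the planner classes. A typical "rewrite only files with
// deletes" configuration reads as (assuming `spark` and `table`):
Result result =
    SparkActions.get(spark)
        .rewriteDataFiles(table)
        .option(SizeBasedFileRewritePlanner.MIN_FILE_SIZE_BYTES, "0")
        .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE - 1))
        .option(SizeBasedFileRewritePlanner.MAX_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE))
        .option(RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, "1") // moved off SizeBasedDataRewriter
        .execute();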
.option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .execute(); assertThat(result.rewriteResults()).as("Should have 1 fileGroups").hasSize(1); @@ -1655,7 +1665,7 @@ public void testRewriteJobOrderFilesAsc() { RewriteDataFilesSparkAction basicRewrite = basicRewrite(table).binPack(); List expected = toGroupStream(table, basicRewrite) - .mapToLong(RewriteFileGroup::numFiles) + .mapToLong(RewriteFileGroup::numInputFiles) .boxed() .collect(Collectors.toList()); @@ -1665,7 +1675,7 @@ public void testRewriteJobOrderFilesAsc() { .binPack(); List actual = toGroupStream(table, jobOrderRewrite) - .mapToLong(RewriteFileGroup::numFiles) + .mapToLong(RewriteFileGroup::numInputFiles) .boxed() .collect(Collectors.toList()); @@ -1687,7 +1697,7 @@ public void testRewriteJobOrderFilesDesc() { RewriteDataFilesSparkAction basicRewrite = basicRewrite(table).binPack(); List expected = toGroupStream(table, basicRewrite) - .mapToLong(RewriteFileGroup::numFiles) + .mapToLong(RewriteFileGroup::numInputFiles) .boxed() .collect(Collectors.toList()); @@ -1697,7 +1707,7 @@ public void testRewriteJobOrderFilesDesc() { .binPack(); List actual = toGroupStream(table, jobOrderRewrite) - .mapToLong(RewriteFileGroup::numFiles) + .mapToLong(RewriteFileGroup::numInputFiles) .boxed() .collect(Collectors.toList()); @@ -1737,7 +1747,7 @@ public void testBinPackRewriterWithSpecificUnparitionedOutputSpec() { RewriteDataFiles.Result result = basicRewrite(table) .option(RewriteDataFiles.OUTPUT_SPEC_ID, String.valueOf(outputSpecId)) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .binPack() .execute(); @@ -1760,7 +1770,7 @@ public void testBinPackRewriterWithSpecificOutputSpec() { RewriteDataFiles.Result result = basicRewrite(table) .option(RewriteDataFiles.OUTPUT_SPEC_ID, String.valueOf(outputSpecId)) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .binPack() .execute(); @@ -1799,7 +1809,7 @@ public void testSortRewriterWithSpecificOutputSpecId() { RewriteDataFiles.Result result = basicRewrite(table) .option(RewriteDataFiles.OUTPUT_SPEC_ID, String.valueOf(outputSpecId)) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .sort(SortOrder.builderFor(table.schema()).asc("c2").asc("c3").build()) .execute(); @@ -1822,7 +1832,7 @@ public void testZOrderRewriteWithSpecificOutputSpecId() { RewriteDataFiles.Result result = basicRewrite(table) .option(RewriteDataFiles.OUTPUT_SPEC_ID, String.valueOf(outputSpecId)) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .zOrder("c2", "c3") .execute(); @@ -1849,9 +1859,9 @@ protected List currentDataFiles(Table table) { } private Stream toGroupStream(Table table, RewriteDataFilesSparkAction rewrite) { - rewrite.validateAndInitOptions(); + rewrite.init(table.currentSnapshot().snapshotId()); - return rewrite.plan(table.currentSnapshot().snapshotId()).groups(); + return rewrite.plan().groups(); } protected List currentData() { diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewritePositionDeleteFilesAction.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewritePositionDeleteFilesAction.java index 12b104fca27c..0fa8ccee9903 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewritePositionDeleteFilesAction.java +++ 
b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewritePositionDeleteFilesAction.java @@ -57,7 +57,7 @@ import org.apache.iceberg.TableProperties; import org.apache.iceberg.actions.RewritePositionDeleteFiles.FileGroupRewriteResult; import org.apache.iceberg.actions.RewritePositionDeleteFiles.Result; -import org.apache.iceberg.actions.SizeBasedFileRewriter; +import org.apache.iceberg.actions.SizeBasedFileRewritePlanner; import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.data.FileHelpers; import org.apache.iceberg.deletes.DeleteGranularity; @@ -166,7 +166,7 @@ private void checkDeleteGranularity(DeleteGranularity deleteGranularity) throws Result result = SparkActions.get(spark) .rewritePositionDeletes(table) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .execute(); int expectedDeleteFilesCount = deleteGranularity == DeleteGranularity.FILE ? 2 : 1; @@ -191,7 +191,7 @@ public void testUnpartitioned() throws Exception { Result result = SparkActions.get(spark) .rewritePositionDeletes(table) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .execute(); List newDeleteFiles = deleteFiles(table); assertThat(newDeleteFiles).as("Expected 1 new delete file").hasSize(1); @@ -225,8 +225,10 @@ public void testRewriteAll() throws Exception { Result result = SparkActions.get(spark) .rewritePositionDeletes(table) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") - .option(SizeBasedFileRewriter.TARGET_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE - 1)) + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") + .option( + SizeBasedFileRewritePlanner.TARGET_FILE_SIZE_BYTES, + Long.toString(Long.MAX_VALUE - 1)) .execute(); List newDeleteFiles = deleteFiles(table); @@ -270,8 +272,10 @@ public void testRewriteFilter() throws Exception { SparkActions.get(spark) .rewritePositionDeletes(table) .filter(filter) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") - .option(SizeBasedFileRewriter.TARGET_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE - 1)) + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") + .option( + SizeBasedFileRewritePlanner.TARGET_FILE_SIZE_BYTES, + Long.toString(Long.MAX_VALUE - 1)) .execute(); List newDeleteFiles = except(deleteFiles(table), deleteFiles); @@ -322,8 +326,8 @@ public void testRewriteToSmallerTarget() throws Exception { Result result = SparkActions.get(spark) .rewritePositionDeletes(table) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") - .option(SizeBasedFileRewriter.TARGET_FILE_SIZE_BYTES, String.valueOf(avgSize / 2)) + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.TARGET_FILE_SIZE_BYTES, String.valueOf(avgSize / 2)) .execute(); List newDeleteFiles = deleteFiles(table); assertThat(newDeleteFiles).as("Should have 8 new delete files").hasSize(8); @@ -362,13 +366,13 @@ public void testRemoveDanglingDeletes() throws Exception { SparkActions.get(spark) .rewriteDataFiles(table) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .execute(); Result result = SparkActions.get(spark) .rewritePositionDeletes(table) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .execute(); List newDeleteFiles = deleteFiles(table); assertThat(newDeleteFiles).as("Should have 0 new delete files").hasSize(0); @@ -404,13 +408,13 @@ public void 
testSomePartitionsDanglingDeletes() throws Exception { SparkActions.get(spark) .rewriteDataFiles(table) .filter(filter) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .execute(); Result result = SparkActions.get(spark) .rewritePositionDeletes(table) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .execute(); List newDeleteFiles = deleteFiles(table); assertThat(newDeleteFiles).as("Should have 2 new delete files").hasSize(2); @@ -456,7 +460,7 @@ public void testRewriteFilterRemoveDangling() throws Exception { SparkActions.get(spark) .rewriteDataFiles(table) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .execute(); Expression filter = Expressions.or(Expressions.equal("c1", 0), Expressions.equal("c1", 1)); @@ -464,8 +468,10 @@ public void testRewriteFilterRemoveDangling() throws Exception { SparkActions.get(spark) .rewritePositionDeletes(table) .filter(filter) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") - .option(SizeBasedFileRewriter.TARGET_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE - 1)) + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") + .option( + SizeBasedFileRewritePlanner.TARGET_FILE_SIZE_BYTES, + Long.toString(Long.MAX_VALUE - 1)) .execute(); List newDeleteFiles = except(deleteFiles(table), deleteFiles); @@ -517,7 +523,7 @@ public void testPartitionEvolutionAdd() throws Exception { Result result = SparkActions.get(spark) .rewritePositionDeletes(table) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .execute(); List rewrittenDeleteFiles = @@ -568,7 +574,7 @@ public void testPartitionEvolutionRemove() throws Exception { Result result = SparkActions.get(spark) .rewritePositionDeletes(table) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .execute(); List newDeleteFiles = deleteFiles(table); assertThat(newDeleteFiles).as("Should have 3 new delete files").hasSize(3); @@ -615,7 +621,7 @@ public void testSchemaEvolution() throws Exception { Result result = SparkActions.get(spark) .rewritePositionDeletes(table) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .execute(); List rewrittenDeleteFiles = @@ -646,7 +652,7 @@ public void testSnapshotProperty() throws Exception { SparkActions.get(spark) .rewritePositionDeletes(table) .snapshotProperty("key", "value") - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .execute(); assertThat(table.currentSnapshot().summary()) .containsAllEntriesOf(ImmutableMap.of("key", "value")); @@ -711,8 +717,10 @@ public void testRewriteManyColumns() throws Exception { Result result = SparkActions.get(spark) .rewritePositionDeletes(table) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") - .option(SizeBasedFileRewriter.TARGET_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE - 1)) + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") + .option( + SizeBasedFileRewritePlanner.TARGET_FILE_SIZE_BYTES, + Long.toString(Long.MAX_VALUE - 1)) .execute(); List newDeleteFiles = deleteFiles(table); diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewriter.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewriteExecutor.java similarity index 
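// Editor's sketch of the pattern the renamed test below is built on: file groups are
// planned directly with RewriteFileGroupPlanner, whose constructor per the diff takes
// (table, filter, startingSnapshotId, caseSensitive) and whose thresholds are its own
// constants. `tasks` stands in for a List<FileScanTask> of mock scan tasks:
RewriteFileGroupPlanner planner =
    new RewriteFileGroupPlanner(table, Expressions.alwaysTrue(), 1, false);
planner.init(
    ImmutableMap.of(
        RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, "250",
        RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, "500",
        RewriteFileGroupPlanner.MAX_FILE_SIZE_BYTES, "750"));
Iterable<List<FileScanTask>> groups = planner.planFileGroups(tasks); // size-based selection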
57% rename from spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewriter.java rename to spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewriteExecutor.java index e223d2e16411..bce2bf11209c 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewriter.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewriteExecutor.java @@ -29,9 +29,11 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.SortOrder; import org.apache.iceberg.Table; -import org.apache.iceberg.actions.SizeBasedDataRewriter; -import org.apache.iceberg.actions.SizeBasedFileRewriter; +import org.apache.iceberg.actions.RewriteDataFiles; +import org.apache.iceberg.actions.RewriteFileGroupPlanner; +import org.apache.iceberg.actions.SizeBasedFileRewritePlanner; import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; @@ -43,7 +45,7 @@ import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Test; -public class TestSparkFileRewriter extends TestBase { +public class TestSparkFileRewriteExecutor extends TestBase { private static final TableIdentifier TABLE_IDENT = TableIdentifier.of("default", "tbl"); private static final Schema SCHEMA = @@ -62,7 +64,8 @@ public void removeTable() { @Test public void testBinPackDataSelectFiles() { Table table = catalog.createTable(TABLE_IDENT, SCHEMA); - SparkBinPackDataRewriter rewriter = new SparkBinPackDataRewriter(spark, table); + RewriteFileGroupPlanner rewriter = + new RewriteFileGroupPlanner(table, Expressions.alwaysTrue(), 1, false); checkDataFileSizeFiltering(rewriter); checkDataFilesDeleteThreshold(rewriter); @@ -71,32 +74,7 @@ public void testBinPackDataSelectFiles() { checkDataFileGroupWithTooMuchData(rewriter); } - @Test - public void testSortDataSelectFiles() { - Table table = catalog.createTable(TABLE_IDENT, SCHEMA); - SparkSortDataRewriter rewriter = new SparkSortDataRewriter(spark, table, SORT_ORDER); - - checkDataFileSizeFiltering(rewriter); - checkDataFilesDeleteThreshold(rewriter); - checkDataFileGroupWithEnoughFiles(rewriter); - checkDataFileGroupWithEnoughData(rewriter); - checkDataFileGroupWithTooMuchData(rewriter); - } - - @Test - public void testZOrderDataSelectFiles() { - Table table = catalog.createTable(TABLE_IDENT, SCHEMA); - ImmutableList zOrderCols = ImmutableList.of("id"); - SparkZOrderDataRewriter rewriter = new SparkZOrderDataRewriter(spark, table, zOrderCols); - - checkDataFileSizeFiltering(rewriter); - checkDataFilesDeleteThreshold(rewriter); - checkDataFileGroupWithEnoughFiles(rewriter); - checkDataFileGroupWithEnoughData(rewriter); - checkDataFileGroupWithTooMuchData(rewriter); - } - - private void checkDataFileSizeFiltering(SizeBasedDataRewriter rewriter) { + private void checkDataFileSizeFiltering(RewriteFileGroupPlanner rewriter) { FileScanTask tooSmallTask = new MockFileScanTask(100L); FileScanTask optimal = new MockFileScanTask(450); FileScanTask tooBigTask = new MockFileScanTask(1000L); @@ -104,10 +82,10 @@ private void checkDataFileSizeFiltering(SizeBasedDataRewriter rewriter) { Map options = ImmutableMap.of( - SizeBasedDataRewriter.MIN_FILE_SIZE_BYTES, "250", - SizeBasedDataRewriter.TARGET_FILE_SIZE_BYTES, "500", - 
SizeBasedDataRewriter.MAX_FILE_SIZE_BYTES, "750", - SizeBasedDataRewriter.DELETE_FILE_THRESHOLD, String.valueOf(Integer.MAX_VALUE)); + RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, "250", + RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, "500", + RewriteFileGroupPlanner.MAX_FILE_SIZE_BYTES, "750", + RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, String.valueOf(Integer.MAX_VALUE)); rewriter.init(options); Iterable> groups = rewriter.planFileGroups(tasks); @@ -116,17 +94,17 @@ private void checkDataFileSizeFiltering(SizeBasedDataRewriter rewriter) { assertThat(group).as("Must rewrite 2 files").hasSize(2); } - private void checkDataFilesDeleteThreshold(SizeBasedDataRewriter rewriter) { + private void checkDataFilesDeleteThreshold(RewriteFileGroupPlanner rewriter) { FileScanTask tooManyDeletesTask = MockFileScanTask.mockTaskWithDeletes(1000L, 3); FileScanTask optimalTask = MockFileScanTask.mockTaskWithDeletes(1000L, 1); List tasks = ImmutableList.of(tooManyDeletesTask, optimalTask); Map options = ImmutableMap.of( - SizeBasedDataRewriter.MIN_FILE_SIZE_BYTES, "1", - SizeBasedDataRewriter.TARGET_FILE_SIZE_BYTES, "2000", - SizeBasedDataRewriter.MAX_FILE_SIZE_BYTES, "5000", - SizeBasedDataRewriter.DELETE_FILE_THRESHOLD, "2"); + RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, "1", + RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, "2000", + RewriteFileGroupPlanner.MAX_FILE_SIZE_BYTES, "5000", + RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, "2"); rewriter.init(options); Iterable> groups = rewriter.planFileGroups(tasks); @@ -135,7 +113,7 @@ private void checkDataFilesDeleteThreshold(SizeBasedDataRewriter rewriter) { assertThat(group).as("Must rewrite 1 file").hasSize(1); } - private void checkDataFileGroupWithEnoughFiles(SizeBasedDataRewriter rewriter) { + private void checkDataFileGroupWithEnoughFiles(RewriteFileGroupPlanner rewriter) { List tasks = ImmutableList.of( new MockFileScanTask(100L), @@ -145,11 +123,11 @@ private void checkDataFileGroupWithEnoughFiles(SizeBasedDataRewriter rewriter) { Map options = ImmutableMap.of( - SizeBasedDataRewriter.MIN_INPUT_FILES, "3", - SizeBasedDataRewriter.MIN_FILE_SIZE_BYTES, "150", - SizeBasedDataRewriter.TARGET_FILE_SIZE_BYTES, "1000", - SizeBasedDataRewriter.MAX_FILE_SIZE_BYTES, "5000", - SizeBasedDataRewriter.DELETE_FILE_THRESHOLD, String.valueOf(Integer.MAX_VALUE)); + RewriteFileGroupPlanner.MIN_INPUT_FILES, "3", + RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, "150", + RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, "1000", + RewriteFileGroupPlanner.MAX_FILE_SIZE_BYTES, "5000", + RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, String.valueOf(Integer.MAX_VALUE)); rewriter.init(options); Iterable> groups = rewriter.planFileGroups(tasks); @@ -158,18 +136,18 @@ private void checkDataFileGroupWithEnoughFiles(SizeBasedDataRewriter rewriter) { assertThat(group).as("Must rewrite 4 files").hasSize(4); } - private void checkDataFileGroupWithEnoughData(SizeBasedDataRewriter rewriter) { + private void checkDataFileGroupWithEnoughData(RewriteFileGroupPlanner rewriter) { List tasks = ImmutableList.of( new MockFileScanTask(100L), new MockFileScanTask(100L), new MockFileScanTask(100L)); Map options = ImmutableMap.of( - SizeBasedDataRewriter.MIN_INPUT_FILES, "5", - SizeBasedDataRewriter.MIN_FILE_SIZE_BYTES, "200", - SizeBasedDataRewriter.TARGET_FILE_SIZE_BYTES, "250", - SizeBasedDataRewriter.MAX_FILE_SIZE_BYTES, "500", - SizeBasedDataRewriter.DELETE_FILE_THRESHOLD, String.valueOf(Integer.MAX_VALUE)); + RewriteFileGroupPlanner.MIN_INPUT_FILES, "5", + 
RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, "200", + RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, "250", + RewriteFileGroupPlanner.MAX_FILE_SIZE_BYTES, "500", + RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, String.valueOf(Integer.MAX_VALUE)); rewriter.init(options); Iterable> groups = rewriter.planFileGroups(tasks); @@ -178,16 +156,16 @@ private void checkDataFileGroupWithEnoughData(SizeBasedDataRewriter rewriter) { assertThat(group).as("Must rewrite 3 files").hasSize(3); } - private void checkDataFileGroupWithTooMuchData(SizeBasedDataRewriter rewriter) { + private void checkDataFileGroupWithTooMuchData(RewriteFileGroupPlanner rewriter) { List tasks = ImmutableList.of(new MockFileScanTask(2000L)); Map options = ImmutableMap.of( - SizeBasedDataRewriter.MIN_INPUT_FILES, "5", - SizeBasedDataRewriter.MIN_FILE_SIZE_BYTES, "200", - SizeBasedDataRewriter.TARGET_FILE_SIZE_BYTES, "250", - SizeBasedDataRewriter.MAX_FILE_SIZE_BYTES, "500", - SizeBasedDataRewriter.DELETE_FILE_THRESHOLD, String.valueOf(Integer.MAX_VALUE)); + RewriteFileGroupPlanner.MIN_INPUT_FILES, "5", + RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, "200", + RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, "250", + RewriteFileGroupPlanner.MAX_FILE_SIZE_BYTES, "500", + RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, String.valueOf(Integer.MAX_VALUE)); rewriter.init(options); Iterable> groups = rewriter.planFileGroups(tasks); @@ -200,15 +178,15 @@ private void checkDataFileGroupWithTooMuchData(SizeBasedDataRewriter rewriter) { public void testInvalidConstructorUsagesSortData() { Table table = catalog.createTable(TABLE_IDENT, SCHEMA); - assertThatThrownBy(() -> new SparkSortDataRewriter(spark, table)) + assertThatThrownBy(() -> new SparkSortDataRewriteExecutor(spark, table)) .hasMessageContaining("Cannot sort data without a valid sort order") .hasMessageContaining("is unsorted and no sort order is provided"); - assertThatThrownBy(() -> new SparkSortDataRewriter(spark, table, null)) + assertThatThrownBy(() -> new SparkSortDataRewriteExecutor(spark, table, null)) .hasMessageContaining("Cannot sort data without a valid sort order") .hasMessageContaining("the provided sort order is null or empty"); - assertThatThrownBy(() -> new SparkSortDataRewriter(spark, table, SortOrder.unsorted())) + assertThatThrownBy(() -> new SparkSortDataRewriteExecutor(spark, table, SortOrder.unsorted())) .hasMessageContaining("Cannot sort data without a valid sort order") .hasMessageContaining("the provided sort order is null or empty"); } @@ -217,17 +195,19 @@ public void testInvalidConstructorUsagesSortData() { public void testInvalidConstructorUsagesZOrderData() { Table table = catalog.createTable(TABLE_IDENT, SCHEMA, SPEC); - assertThatThrownBy(() -> new SparkZOrderDataRewriter(spark, table, null)) + assertThatThrownBy(() -> new SparkZOrderDataRewriteExecutor(spark, table, null)) .hasMessageContaining("Cannot ZOrder when no columns are specified"); - assertThatThrownBy(() -> new SparkZOrderDataRewriter(spark, table, ImmutableList.of())) + assertThatThrownBy(() -> new SparkZOrderDataRewriteExecutor(spark, table, ImmutableList.of())) .hasMessageContaining("Cannot ZOrder when no columns are specified"); - assertThatThrownBy(() -> new SparkZOrderDataRewriter(spark, table, ImmutableList.of("dep"))) + assertThatThrownBy( + () -> new SparkZOrderDataRewriteExecutor(spark, table, ImmutableList.of("dep"))) .hasMessageContaining("Cannot ZOrder") .hasMessageContaining("all columns provided were identity partition columns"); - assertThatThrownBy(() -> new 
SparkZOrderDataRewriter(spark, table, ImmutableList.of("DeP"))) + assertThatThrownBy( + () -> new SparkZOrderDataRewriteExecutor(spark, table, ImmutableList.of("DeP"))) .hasMessageContaining("Cannot ZOrder") .hasMessageContaining("all columns provided were identity partition columns"); } @@ -235,91 +215,119 @@ public void testInvalidConstructorUsagesZOrderData() { @Test public void testBinPackDataValidOptions() { Table table = catalog.createTable(TABLE_IDENT, SCHEMA); - SparkBinPackDataRewriter rewriter = new SparkBinPackDataRewriter(spark, table); + SparkBinPackDataRewriteExecutor rewriter = new SparkBinPackDataRewriteExecutor(spark, table); + RewriteFileGroupPlanner planner = + new RewriteFileGroupPlanner(table, Expressions.alwaysTrue(), 1, false); assertThat(rewriter.validOptions()) .as("Rewriter must report all supported options") + .isEqualTo(ImmutableSet.of()); + + assertThat(planner.validOptions()) + .as("Planner must report all supported options") .isEqualTo( ImmutableSet.of( - SparkBinPackDataRewriter.TARGET_FILE_SIZE_BYTES, - SparkBinPackDataRewriter.MIN_FILE_SIZE_BYTES, - SparkBinPackDataRewriter.MAX_FILE_SIZE_BYTES, - SparkBinPackDataRewriter.MIN_INPUT_FILES, - SparkBinPackDataRewriter.REWRITE_ALL, - SparkBinPackDataRewriter.MAX_FILE_GROUP_SIZE_BYTES, - SparkBinPackDataRewriter.DELETE_FILE_THRESHOLD)); + RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, + RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, + RewriteFileGroupPlanner.MAX_FILE_SIZE_BYTES, + RewriteFileGroupPlanner.MIN_INPUT_FILES, + RewriteFileGroupPlanner.REWRITE_ALL, + RewriteFileGroupPlanner.MAX_FILE_GROUP_SIZE_BYTES, + RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, + RewriteDataFiles.REWRITE_JOB_ORDER)); } @Test public void testSortDataValidOptions() { Table table = catalog.createTable(TABLE_IDENT, SCHEMA); - SparkSortDataRewriter rewriter = new SparkSortDataRewriter(spark, table, SORT_ORDER); + SparkSortDataRewriteExecutor rewriter = + new SparkSortDataRewriteExecutor(spark, table, SORT_ORDER); + RewriteFileGroupPlanner planner = + new RewriteFileGroupPlanner(table, Expressions.alwaysTrue(), 1, false); assertThat(rewriter.validOptions()) .as("Rewriter must report all supported options") .isEqualTo( ImmutableSet.of( - SparkSortDataRewriter.SHUFFLE_PARTITIONS_PER_FILE, - SparkSortDataRewriter.TARGET_FILE_SIZE_BYTES, - SparkSortDataRewriter.MIN_FILE_SIZE_BYTES, - SparkSortDataRewriter.MAX_FILE_SIZE_BYTES, - SparkSortDataRewriter.MIN_INPUT_FILES, - SparkSortDataRewriter.REWRITE_ALL, - SparkSortDataRewriter.MAX_FILE_GROUP_SIZE_BYTES, - SparkSortDataRewriter.DELETE_FILE_THRESHOLD, - SparkSortDataRewriter.COMPRESSION_FACTOR)); + SparkSortDataRewriteExecutor.SHUFFLE_PARTITIONS_PER_FILE, + SparkSortDataRewriteExecutor.COMPRESSION_FACTOR)); + + assertThat(planner.validOptions()) + .as("Planner must report all supported options") + .isEqualTo( + ImmutableSet.of( + RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, + RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, + RewriteFileGroupPlanner.MAX_FILE_SIZE_BYTES, + RewriteFileGroupPlanner.MIN_INPUT_FILES, + RewriteFileGroupPlanner.REWRITE_ALL, + RewriteFileGroupPlanner.MAX_FILE_GROUP_SIZE_BYTES, + RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, + RewriteDataFiles.REWRITE_JOB_ORDER)); } @Test public void testZOrderDataValidOptions() { Table table = catalog.createTable(TABLE_IDENT, SCHEMA); ImmutableList zOrderCols = ImmutableList.of("id"); - SparkZOrderDataRewriter rewriter = new SparkZOrderDataRewriter(spark, table, zOrderCols); + SparkZOrderDataRewriteExecutor rewriter = + new 
SparkZOrderDataRewriteExecutor(spark, table, zOrderCols); + RewriteFileGroupPlanner planner = + new RewriteFileGroupPlanner(table, Expressions.alwaysTrue(), 1, false); assertThat(rewriter.validOptions()) .as("Rewriter must report all supported options") .isEqualTo( ImmutableSet.of( - SparkZOrderDataRewriter.SHUFFLE_PARTITIONS_PER_FILE, - SparkZOrderDataRewriter.TARGET_FILE_SIZE_BYTES, - SparkZOrderDataRewriter.MIN_FILE_SIZE_BYTES, - SparkZOrderDataRewriter.MAX_FILE_SIZE_BYTES, - SparkZOrderDataRewriter.MIN_INPUT_FILES, - SparkZOrderDataRewriter.REWRITE_ALL, - SparkZOrderDataRewriter.MAX_FILE_GROUP_SIZE_BYTES, - SparkZOrderDataRewriter.DELETE_FILE_THRESHOLD, - SparkZOrderDataRewriter.COMPRESSION_FACTOR, - SparkZOrderDataRewriter.MAX_OUTPUT_SIZE, - SparkZOrderDataRewriter.VAR_LENGTH_CONTRIBUTION)); + SparkZOrderDataRewriteExecutor.SHUFFLE_PARTITIONS_PER_FILE, + SparkZOrderDataRewriteExecutor.COMPRESSION_FACTOR, + SparkZOrderDataRewriteExecutor.MAX_OUTPUT_SIZE, + SparkZOrderDataRewriteExecutor.VAR_LENGTH_CONTRIBUTION)); + assertThat(planner.validOptions()) + .as("Planner must report all supported options") + .isEqualTo( + ImmutableSet.of( + RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, + RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, + RewriteFileGroupPlanner.MAX_FILE_SIZE_BYTES, + RewriteFileGroupPlanner.MIN_INPUT_FILES, + RewriteFileGroupPlanner.REWRITE_ALL, + RewriteFileGroupPlanner.MAX_FILE_GROUP_SIZE_BYTES, + RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, + RewriteDataFiles.REWRITE_JOB_ORDER)); } @Test public void testInvalidValuesForBinPackDataOptions() { Table table = catalog.createTable(TABLE_IDENT, SCHEMA); - SparkBinPackDataRewriter rewriter = new SparkBinPackDataRewriter(spark, table); + RewriteFileGroupPlanner planner = + new RewriteFileGroupPlanner(table, Expressions.alwaysTrue(), 1, false); - validateSizeBasedRewriterOptions(rewriter); + validateSizeBasedRewriterOptions(planner); Map invalidDeleteThresholdOptions = - ImmutableMap.of(SizeBasedDataRewriter.DELETE_FILE_THRESHOLD, "-1"); - assertThatThrownBy(() -> rewriter.init(invalidDeleteThresholdOptions)) + ImmutableMap.of(RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, "-1"); + assertThatThrownBy(() -> planner.init(invalidDeleteThresholdOptions)) .hasMessageContaining("'delete-file-threshold' is set to -1 but must be >= 0"); } @Test public void testInvalidValuesForSortDataOptions() { Table table = catalog.createTable(TABLE_IDENT, SCHEMA); - SparkSortDataRewriter rewriter = new SparkSortDataRewriter(spark, table, SORT_ORDER); + SparkSortDataRewriteExecutor rewriter = + new SparkSortDataRewriteExecutor(spark, table, SORT_ORDER); + RewriteFileGroupPlanner planner = + new RewriteFileGroupPlanner(table, Expressions.alwaysTrue(), 1, false); - validateSizeBasedRewriterOptions(rewriter); + validateSizeBasedRewriterOptions(planner); Map invalidDeleteThresholdOptions = - ImmutableMap.of(SizeBasedDataRewriter.DELETE_FILE_THRESHOLD, "-1"); - assertThatThrownBy(() -> rewriter.init(invalidDeleteThresholdOptions)) + ImmutableMap.of(RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, "-1"); + assertThatThrownBy(() -> planner.init(invalidDeleteThresholdOptions)) .hasMessageContaining("'delete-file-threshold' is set to -1 but must be >= 0"); Map invalidCompressionFactorOptions = - ImmutableMap.of(SparkShufflingDataRewriter.COMPRESSION_FACTOR, "0"); + ImmutableMap.of(SparkShufflingDataRewriteExecutor.COMPRESSION_FACTOR, "0"); assertThatThrownBy(() -> rewriter.init(invalidCompressionFactorOptions)) .hasMessageContaining("'compression-factor' is set to 
0.0 but must be > 0"); } @@ -328,67 +336,70 @@ public void testInvalidValuesForSortDataOptions() { public void testInvalidValuesForZOrderDataOptions() { Table table = catalog.createTable(TABLE_IDENT, SCHEMA); ImmutableList zOrderCols = ImmutableList.of("id"); - SparkZOrderDataRewriter rewriter = new SparkZOrderDataRewriter(spark, table, zOrderCols); + SparkZOrderDataRewriteExecutor rewriter = + new SparkZOrderDataRewriteExecutor(spark, table, zOrderCols); + RewriteFileGroupPlanner planner = + new RewriteFileGroupPlanner(table, Expressions.alwaysTrue(), 1, false); - validateSizeBasedRewriterOptions(rewriter); + validateSizeBasedRewriterOptions(planner); Map invalidDeleteThresholdOptions = - ImmutableMap.of(SizeBasedDataRewriter.DELETE_FILE_THRESHOLD, "-1"); - assertThatThrownBy(() -> rewriter.init(invalidDeleteThresholdOptions)) + ImmutableMap.of(RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, "-1"); + assertThatThrownBy(() -> planner.init(invalidDeleteThresholdOptions)) .hasMessageContaining("'delete-file-threshold' is set to -1 but must be >= 0"); Map invalidCompressionFactorOptions = - ImmutableMap.of(SparkShufflingDataRewriter.COMPRESSION_FACTOR, "0"); + ImmutableMap.of(SparkShufflingDataRewriteExecutor.COMPRESSION_FACTOR, "0"); assertThatThrownBy(() -> rewriter.init(invalidCompressionFactorOptions)) .hasMessageContaining("'compression-factor' is set to 0.0 but must be > 0"); Map invalidMaxOutputOptions = - ImmutableMap.of(SparkZOrderDataRewriter.MAX_OUTPUT_SIZE, "0"); + ImmutableMap.of(SparkZOrderDataRewriteExecutor.MAX_OUTPUT_SIZE, "0"); assertThatThrownBy(() -> rewriter.init(invalidMaxOutputOptions)) .hasMessageContaining("Cannot have the interleaved ZOrder value use less than 1 byte") .hasMessageContaining("'max-output-size' was set to 0"); Map invalidVarLengthContributionOptions = - ImmutableMap.of(SparkZOrderDataRewriter.VAR_LENGTH_CONTRIBUTION, "0"); + ImmutableMap.of(SparkZOrderDataRewriteExecutor.VAR_LENGTH_CONTRIBUTION, "0"); assertThatThrownBy(() -> rewriter.init(invalidVarLengthContributionOptions)) .hasMessageContaining("Cannot use less than 1 byte for variable length types with ZOrder") .hasMessageContaining("'var-length-contribution' was set to 0"); } - private void validateSizeBasedRewriterOptions(SizeBasedFileRewriter rewriter) { + private void validateSizeBasedRewriterOptions(SizeBasedFileRewritePlanner rewriter) { Map invalidTargetSizeOptions = - ImmutableMap.of(SizeBasedFileRewriter.TARGET_FILE_SIZE_BYTES, "0"); + ImmutableMap.of(SizeBasedFileRewritePlanner.TARGET_FILE_SIZE_BYTES, "0"); assertThatThrownBy(() -> rewriter.init(invalidTargetSizeOptions)) .hasMessageContaining("'target-file-size-bytes' is set to 0 but must be > 0"); Map invalidMinSizeOptions = - ImmutableMap.of(SizeBasedFileRewriter.MIN_FILE_SIZE_BYTES, "-1"); + ImmutableMap.of(SizeBasedFileRewritePlanner.MIN_FILE_SIZE_BYTES, "-1"); assertThatThrownBy(() -> rewriter.init(invalidMinSizeOptions)) .hasMessageContaining("'min-file-size-bytes' is set to -1 but must be >= 0"); Map invalidTargetMinSizeOptions = ImmutableMap.of( - SizeBasedFileRewriter.TARGET_FILE_SIZE_BYTES, "3", - SizeBasedFileRewriter.MIN_FILE_SIZE_BYTES, "5"); + SizeBasedFileRewritePlanner.TARGET_FILE_SIZE_BYTES, "3", + SizeBasedFileRewritePlanner.MIN_FILE_SIZE_BYTES, "5"); assertThatThrownBy(() -> rewriter.init(invalidTargetMinSizeOptions)) .hasMessageContaining("'target-file-size-bytes' (3) must be > 'min-file-size-bytes' (5)") .hasMessageContaining("all new files will be smaller than the min threshold"); Map invalidTargetMaxSizeOptions = 
ImmutableMap.of( - SizeBasedFileRewriter.TARGET_FILE_SIZE_BYTES, "5", - SizeBasedFileRewriter.MAX_FILE_SIZE_BYTES, "3"); + SizeBasedFileRewritePlanner.TARGET_FILE_SIZE_BYTES, "5", + SizeBasedFileRewritePlanner.MAX_FILE_SIZE_BYTES, "3"); assertThatThrownBy(() -> rewriter.init(invalidTargetMaxSizeOptions)) .hasMessageContaining("'target-file-size-bytes' (5) must be < 'max-file-size-bytes' (3)") .hasMessageContaining("all new files will be larger than the max threshold"); Map invalidMinInputFilesOptions = - ImmutableMap.of(SizeBasedFileRewriter.MIN_INPUT_FILES, "0"); + ImmutableMap.of(SizeBasedFileRewritePlanner.MIN_INPUT_FILES, "0"); assertThatThrownBy(() -> rewriter.init(invalidMinInputFilesOptions)) .hasMessageContaining("'min-input-files' is set to 0 but must be > 0"); Map invalidMaxFileGroupSizeOptions = - ImmutableMap.of(SizeBasedFileRewriter.MAX_FILE_GROUP_SIZE_BYTES, "0"); + ImmutableMap.of(SizeBasedFileRewritePlanner.MAX_FILE_GROUP_SIZE_BYTES, "0"); assertThatThrownBy(() -> rewriter.init(invalidMaxFileGroupSizeOptions)) .hasMessageContaining("'max-file-group-size-bytes' is set to 0 but must be > 0"); } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestCompressionSettings.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestCompressionSettings.java index f411920a5dcc..24a14bb64d86 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestCompressionSettings.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestCompressionSettings.java @@ -58,7 +58,7 @@ import org.apache.iceberg.Parameters; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Table; -import org.apache.iceberg.actions.SizeBasedFileRewriter; +import org.apache.iceberg.actions.SizeBasedFileRewritePlanner; import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.io.InputFile; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; @@ -230,7 +230,7 @@ public void testWriteDataWithDifferentSetting() throws Exception { SparkActions.get(spark) .rewritePositionDeletes(table) - .option(SizeBasedFileRewriter.REWRITE_ALL, "true") + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") .execute(); table.refresh(); deleteManifestFiles = table.currentSnapshot().deleteManifests(table.io()); From 0e96b10decaed6e327257a33af7308a6c0c8c1b5 Mon Sep 17 00:00:00 2001 From: Peter Vary Date: Thu, 21 Nov 2024 14:10:43 +0100 Subject: [PATCH 06/11] Revapi fix so the tests could run. Temporary removal for Spark 3.4, 3.3 tests. Disabling Spark 3.4, 3.3 compilation as well. 
--- .github/workflows/spark-ci.yml | 9 +-- .palantir/revapi.yml | 142 +++++++++++++++++++++++++++++++++ gradle.properties | 2 +- 3 files changed, 144 insertions(+), 9 deletions(-) diff --git a/.github/workflows/spark-ci.yml b/.github/workflows/spark-ci.yml index 0d7bd2d3d3e7..295943a7dab0 100644 --- a/.github/workflows/spark-ci.yml +++ b/.github/workflows/spark-ci.yml @@ -73,15 +73,8 @@ jobs: strategy: matrix: jvm: [11, 17, 21] - spark: ['3.3', '3.4', '3.5'] + spark: ['3.5'] scala: ['2.12', '2.13'] - exclude: - # Spark 3.5 is the first version not failing on Java 21 (https://issues.apache.org/jira/browse/SPARK-42369) - # Full Java 21 support is coming in Spark 4 (https://issues.apache.org/jira/browse/SPARK-43831) - - jvm: 21 - spark: '3.3' - - jvm: 21 - spark: '3.4' env: SPARK_LOCAL_IP: localhost steps: diff --git a/.palantir/revapi.yml b/.palantir/revapi.yml index fade79326a49..e689ed9803d2 100644 --- a/.palantir/revapi.yml +++ b/.palantir/revapi.yml @@ -1145,6 +1145,148 @@ acceptedBreaks: new: "method org.apache.iceberg.BaseMetastoreOperations.CommitStatus org.apache.iceberg.BaseMetastoreTableOperations::checkCommitStatus(java.lang.String,\ \ org.apache.iceberg.TableMetadata)" justification: "Removing deprecated code" + "1.7.0": + org.apache.iceberg:iceberg-core: + - code: "java.class.removed" + old: "class org.apache.iceberg.actions.SizeBasedDataRewriter" + justification: "We will decide what to do with the API changes after the new\ + \ API has been finalized" + - code: "java.class.removed" + old: "class org.apache.iceberg.actions.SizeBasedFileRewriter>, F>" + justification: "We will decide what to do with the API changes after the new\ + \ API has been finalized" + - code: "java.class.removed" + old: "class org.apache.iceberg.actions.SizeBasedPositionDeletesRewriter" + justification: "We will decide what to do with the API changes after the new\ + \ API has been finalized" + - code: "java.class.removed" + old: "interface org.apache.iceberg.actions.FileRewriter>, F>" + justification: "We will decide what to do with the API changes after the new\ + \ API has been finalized" + - code: "java.generics.elementNowParameterized" + old: "method java.util.Comparator\ + \ org.apache.iceberg.actions.RewriteFileGroup::comparator(org.apache.iceberg.RewriteJobOrder)" + new: "method >,\ + \ F extends org.apache.iceberg.ContentFile>>\ + \ java.util.Comparator>\ + \ org.apache.iceberg.actions.FileRewriteGroup>, F extends org.apache.iceberg.ContentFile>>::comparator(org.apache.iceberg.RewriteJobOrder)\ + \ @ org.apache.iceberg.actions.RewriteFileGroup" + justification: "We will decide what to do with the API changes after the new\ + \ API has been finalized" + - code: "java.generics.elementNowParameterized" + old: "method java.util.Comparator\ + \ org.apache.iceberg.actions.RewritePositionDeletesGroup::comparator(org.apache.iceberg.RewriteJobOrder)" + new: "method >,\ + \ F extends org.apache.iceberg.ContentFile>>\ + \ java.util.Comparator>\ + \ org.apache.iceberg.actions.FileRewriteGroup>, F extends org.apache.iceberg.ContentFile>>::comparator(org.apache.iceberg.RewriteJobOrder)\ + \ @ org.apache.iceberg.actions.RewritePositionDeletesGroup" + justification: "We will decide what to do with the API changes after the new\ + \ API has been finalized" + - code: "java.generics.formalTypeParameterAdded" + old: "method java.util.Comparator\ + \ org.apache.iceberg.actions.RewriteFileGroup::comparator(org.apache.iceberg.RewriteJobOrder)" + new: "method >,\ + \ F extends org.apache.iceberg.ContentFile>>\ + \ 
java.util.Comparator>\ + \ org.apache.iceberg.actions.FileRewriteGroup>, F extends org.apache.iceberg.ContentFile>>::comparator(org.apache.iceberg.RewriteJobOrder)\ + \ @ org.apache.iceberg.actions.RewriteFileGroup" + justification: "We will decide what to do with the API changes after the new\ + \ API has been finalized" + - code: "java.generics.formalTypeParameterAdded" + old: "method java.util.Comparator\ + \ org.apache.iceberg.actions.RewritePositionDeletesGroup::comparator(org.apache.iceberg.RewriteJobOrder)" + new: "method >,\ + \ F extends org.apache.iceberg.ContentFile>>\ + \ java.util.Comparator>\ + \ org.apache.iceberg.actions.FileRewriteGroup>, F extends org.apache.iceberg.ContentFile>>::comparator(org.apache.iceberg.RewriteJobOrder)\ + \ @ org.apache.iceberg.actions.RewritePositionDeletesGroup" + justification: "We will decide what to do with the API changes after the new\ + \ API has been finalized" + - code: "java.method.movedToSuperClass" + old: "method java.util.Comparator\ + \ org.apache.iceberg.actions.RewriteFileGroup::comparator(org.apache.iceberg.RewriteJobOrder)" + new: "method >,\ + \ F extends org.apache.iceberg.ContentFile>>\ + \ java.util.Comparator>\ + \ org.apache.iceberg.actions.FileRewriteGroup>, F extends org.apache.iceberg.ContentFile>>::comparator(org.apache.iceberg.RewriteJobOrder)\ + \ @ org.apache.iceberg.actions.RewriteFileGroup" + justification: "We will decide what to do with the API changes after the new\ + \ API has been finalized" + - code: "java.method.movedToSuperClass" + old: "method java.util.Comparator\ + \ org.apache.iceberg.actions.RewritePositionDeletesGroup::comparator(org.apache.iceberg.RewriteJobOrder)" + new: "method >,\ + \ F extends org.apache.iceberg.ContentFile>>\ + \ java.util.Comparator>\ + \ org.apache.iceberg.actions.FileRewriteGroup>, F extends org.apache.iceberg.ContentFile>>::comparator(org.apache.iceberg.RewriteJobOrder)\ + \ @ org.apache.iceberg.actions.RewritePositionDeletesGroup" + justification: "We will decide what to do with the API changes after the new\ + \ API has been finalized" + - code: "java.method.numberOfParametersChanged" + old: "method void org.apache.iceberg.actions.RewriteFileGroup::(org.apache.iceberg.actions.RewriteDataFiles.FileGroupInfo,\ + \ java.util.List)" + new: "method void org.apache.iceberg.actions.RewriteFileGroup::(org.apache.iceberg.actions.RewriteDataFiles.FileGroupInfo,\ + \ java.util.List, long, int)" + justification: "We will decide what to do with the API changes after the new\ + \ API has been finalized" + - code: "java.method.numberOfParametersChanged" + old: "method void org.apache.iceberg.actions.RewritePositionDeletesGroup::(org.apache.iceberg.actions.RewritePositionDeleteFiles.FileGroupInfo,\ + \ java.util.List)" + new: "method void org.apache.iceberg.actions.RewritePositionDeletesGroup::(org.apache.iceberg.actions.RewritePositionDeleteFiles.FileGroupInfo,\ + \ java.util.List, long, int)" + justification: "We will decide what to do with the API changes after the new\ + \ API has been finalized" + - code: "java.method.removed" + old: "method int org.apache.iceberg.actions.RewriteFileGroup::numFiles()" + justification: "We will decide what to do with the API changes after the new\ + \ API has been finalized" + - code: "java.method.removed" + old: "method int org.apache.iceberg.actions.RewritePositionDeletesGroup::numRewrittenDeleteFiles()" + justification: "We will decide what to do with the API changes after the new\ + \ API has been finalized" + - code: "java.method.removed" + old: 
"method java.util.List org.apache.iceberg.actions.RewritePositionDeletesGroup::tasks()" + justification: "We will decide what to do with the API changes after the new\ + \ API has been finalized" + - code: "java.method.returnTypeTypeParametersChanged" + old: "method java.util.Comparator\ + \ org.apache.iceberg.actions.RewriteFileGroup::comparator(org.apache.iceberg.RewriteJobOrder)" + new: "method >,\ + \ F extends org.apache.iceberg.ContentFile>>\ + \ java.util.Comparator>\ + \ org.apache.iceberg.actions.FileRewriteGroup>, F extends org.apache.iceberg.ContentFile>>::comparator(org.apache.iceberg.RewriteJobOrder)\ + \ @ org.apache.iceberg.actions.RewriteFileGroup" + justification: "We will decide what to do with the API changes after the new\ + \ API has been finalized" + - code: "java.method.returnTypeTypeParametersChanged" + old: "method java.util.Comparator\ + \ org.apache.iceberg.actions.RewritePositionDeletesGroup::comparator(org.apache.iceberg.RewriteJobOrder)" + new: "method >,\ + \ F extends org.apache.iceberg.ContentFile>>\ + \ java.util.Comparator>\ + \ org.apache.iceberg.actions.FileRewriteGroup>, F extends org.apache.iceberg.ContentFile>>::comparator(org.apache.iceberg.RewriteJobOrder)\ + \ @ org.apache.iceberg.actions.RewritePositionDeletesGroup" + justification: "We will decide what to do with the API changes after the new\ + \ API has been finalized" apache-iceberg-0.14.0: org.apache.iceberg:iceberg-api: - code: "java.class.defaultSerializationChanged" diff --git a/gradle.properties b/gradle.properties index dc1e1a509b01..5c62371efe35 100644 --- a/gradle.properties +++ b/gradle.properties @@ -21,7 +21,7 @@ systemProp.knownFlinkVersions=1.18,1.19,1.20 systemProp.defaultHiveVersions=2 systemProp.knownHiveVersions=2,3 systemProp.defaultSparkVersions=3.5 -systemProp.knownSparkVersions=3.3,3.4,3.5 +systemProp.knownSparkVersions=3.5 systemProp.defaultKafkaVersions=3 systemProp.knownKafkaVersions=3 systemProp.defaultScalaVersion=2.12 From fed8e69fa249e6e02cfce1735f97454a3a615361 Mon Sep 17 00:00:00 2001 From: Peter Vary Date: Wed, 27 Nov 2024 15:05:08 +0100 Subject: [PATCH 07/11] Revert API changes and use deprecation instead --- .github/workflows/spark-ci.yml | 9 +- .palantir/revapi.yml | 142 ------- .../iceberg/actions/FileRewriteGroup.java | 2 +- .../apache/iceberg/actions/FileRewriter.java | 80 ++++ .../iceberg/actions/RewriteFileGroup.java | 38 ++ .../actions/RewriteFileGroupPlanner.java | 2 +- .../actions/RewritePositionDeletesGroup.java | 48 +++ .../RewritePositionDeletesGroupPlanner.java | 2 +- .../actions/SizeBasedDataRewriter.java | 112 ++++++ .../actions/SizeBasedFileRewriter.java | 348 ++++++++++++++++++ .../SizeBasedPositionDeletesRewriter.java | 63 ++++ gradle.properties | 2 +- 12 files changed, 701 insertions(+), 147 deletions(-) create mode 100644 core/src/main/java/org/apache/iceberg/actions/FileRewriter.java create mode 100644 core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java create mode 100644 core/src/main/java/org/apache/iceberg/actions/SizeBasedFileRewriter.java create mode 100644 core/src/main/java/org/apache/iceberg/actions/SizeBasedPositionDeletesRewriter.java diff --git a/.github/workflows/spark-ci.yml b/.github/workflows/spark-ci.yml index 295943a7dab0..0d7bd2d3d3e7 100644 --- a/.github/workflows/spark-ci.yml +++ b/.github/workflows/spark-ci.yml @@ -73,8 +73,15 @@ jobs: strategy: matrix: jvm: [11, 17, 21] - spark: ['3.5'] + spark: ['3.3', '3.4', '3.5'] scala: ['2.12', '2.13'] + exclude: + # Spark 3.5 is the first version not failing 
on Java 21 (https://issues.apache.org/jira/browse/SPARK-42369) + # Full Java 21 support is coming in Spark 4 (https://issues.apache.org/jira/browse/SPARK-43831) + - jvm: 21 + spark: '3.3' + - jvm: 21 + spark: '3.4' env: SPARK_LOCAL_IP: localhost steps: diff --git a/.palantir/revapi.yml b/.palantir/revapi.yml index e689ed9803d2..fade79326a49 100644 --- a/.palantir/revapi.yml +++ b/.palantir/revapi.yml @@ -1145,148 +1145,6 @@ acceptedBreaks: new: "method org.apache.iceberg.BaseMetastoreOperations.CommitStatus org.apache.iceberg.BaseMetastoreTableOperations::checkCommitStatus(java.lang.String,\ \ org.apache.iceberg.TableMetadata)" justification: "Removing deprecated code" - "1.7.0": - org.apache.iceberg:iceberg-core: - - code: "java.class.removed" - old: "class org.apache.iceberg.actions.SizeBasedDataRewriter" - justification: "We will decide what to do with the API changes after the new\ - \ API has been finalized" - - code: "java.class.removed" - old: "class org.apache.iceberg.actions.SizeBasedFileRewriter>, F>" - justification: "We will decide what to do with the API changes after the new\ - \ API has been finalized" - - code: "java.class.removed" - old: "class org.apache.iceberg.actions.SizeBasedPositionDeletesRewriter" - justification: "We will decide what to do with the API changes after the new\ - \ API has been finalized" - - code: "java.class.removed" - old: "interface org.apache.iceberg.actions.FileRewriter>, F>" - justification: "We will decide what to do with the API changes after the new\ - \ API has been finalized" - - code: "java.generics.elementNowParameterized" - old: "method java.util.Comparator\ - \ org.apache.iceberg.actions.RewriteFileGroup::comparator(org.apache.iceberg.RewriteJobOrder)" - new: "method >,\ - \ F extends org.apache.iceberg.ContentFile>>\ - \ java.util.Comparator>\ - \ org.apache.iceberg.actions.FileRewriteGroup>, F extends org.apache.iceberg.ContentFile>>::comparator(org.apache.iceberg.RewriteJobOrder)\ - \ @ org.apache.iceberg.actions.RewriteFileGroup" - justification: "We will decide what to do with the API changes after the new\ - \ API has been finalized" - - code: "java.generics.elementNowParameterized" - old: "method java.util.Comparator\ - \ org.apache.iceberg.actions.RewritePositionDeletesGroup::comparator(org.apache.iceberg.RewriteJobOrder)" - new: "method >,\ - \ F extends org.apache.iceberg.ContentFile>>\ - \ java.util.Comparator>\ - \ org.apache.iceberg.actions.FileRewriteGroup>, F extends org.apache.iceberg.ContentFile>>::comparator(org.apache.iceberg.RewriteJobOrder)\ - \ @ org.apache.iceberg.actions.RewritePositionDeletesGroup" - justification: "We will decide what to do with the API changes after the new\ - \ API has been finalized" - - code: "java.generics.formalTypeParameterAdded" - old: "method java.util.Comparator\ - \ org.apache.iceberg.actions.RewriteFileGroup::comparator(org.apache.iceberg.RewriteJobOrder)" - new: "method >,\ - \ F extends org.apache.iceberg.ContentFile>>\ - \ java.util.Comparator>\ - \ org.apache.iceberg.actions.FileRewriteGroup>, F extends org.apache.iceberg.ContentFile>>::comparator(org.apache.iceberg.RewriteJobOrder)\ - \ @ org.apache.iceberg.actions.RewriteFileGroup" - justification: "We will decide what to do with the API changes after the new\ - \ API has been finalized" - - code: "java.generics.formalTypeParameterAdded" - old: "method java.util.Comparator\ - \ org.apache.iceberg.actions.RewritePositionDeletesGroup::comparator(org.apache.iceberg.RewriteJobOrder)" - new: "method >,\ - \ F extends 
org.apache.iceberg.ContentFile>>\ - \ java.util.Comparator>\ - \ org.apache.iceberg.actions.FileRewriteGroup>, F extends org.apache.iceberg.ContentFile>>::comparator(org.apache.iceberg.RewriteJobOrder)\ - \ @ org.apache.iceberg.actions.RewritePositionDeletesGroup" - justification: "We will decide what to do with the API changes after the new\ - \ API has been finalized" - - code: "java.method.movedToSuperClass" - old: "method java.util.Comparator\ - \ org.apache.iceberg.actions.RewriteFileGroup::comparator(org.apache.iceberg.RewriteJobOrder)" - new: "method >,\ - \ F extends org.apache.iceberg.ContentFile>>\ - \ java.util.Comparator>\ - \ org.apache.iceberg.actions.FileRewriteGroup>, F extends org.apache.iceberg.ContentFile>>::comparator(org.apache.iceberg.RewriteJobOrder)\ - \ @ org.apache.iceberg.actions.RewriteFileGroup" - justification: "We will decide what to do with the API changes after the new\ - \ API has been finalized" - - code: "java.method.movedToSuperClass" - old: "method java.util.Comparator\ - \ org.apache.iceberg.actions.RewritePositionDeletesGroup::comparator(org.apache.iceberg.RewriteJobOrder)" - new: "method >,\ - \ F extends org.apache.iceberg.ContentFile>>\ - \ java.util.Comparator>\ - \ org.apache.iceberg.actions.FileRewriteGroup>, F extends org.apache.iceberg.ContentFile>>::comparator(org.apache.iceberg.RewriteJobOrder)\ - \ @ org.apache.iceberg.actions.RewritePositionDeletesGroup" - justification: "We will decide what to do with the API changes after the new\ - \ API has been finalized" - - code: "java.method.numberOfParametersChanged" - old: "method void org.apache.iceberg.actions.RewriteFileGroup::(org.apache.iceberg.actions.RewriteDataFiles.FileGroupInfo,\ - \ java.util.List)" - new: "method void org.apache.iceberg.actions.RewriteFileGroup::(org.apache.iceberg.actions.RewriteDataFiles.FileGroupInfo,\ - \ java.util.List, long, int)" - justification: "We will decide what to do with the API changes after the new\ - \ API has been finalized" - - code: "java.method.numberOfParametersChanged" - old: "method void org.apache.iceberg.actions.RewritePositionDeletesGroup::(org.apache.iceberg.actions.RewritePositionDeleteFiles.FileGroupInfo,\ - \ java.util.List)" - new: "method void org.apache.iceberg.actions.RewritePositionDeletesGroup::(org.apache.iceberg.actions.RewritePositionDeleteFiles.FileGroupInfo,\ - \ java.util.List, long, int)" - justification: "We will decide what to do with the API changes after the new\ - \ API has been finalized" - - code: "java.method.removed" - old: "method int org.apache.iceberg.actions.RewriteFileGroup::numFiles()" - justification: "We will decide what to do with the API changes after the new\ - \ API has been finalized" - - code: "java.method.removed" - old: "method int org.apache.iceberg.actions.RewritePositionDeletesGroup::numRewrittenDeleteFiles()" - justification: "We will decide what to do with the API changes after the new\ - \ API has been finalized" - - code: "java.method.removed" - old: "method java.util.List org.apache.iceberg.actions.RewritePositionDeletesGroup::tasks()" - justification: "We will decide what to do with the API changes after the new\ - \ API has been finalized" - - code: "java.method.returnTypeTypeParametersChanged" - old: "method java.util.Comparator\ - \ org.apache.iceberg.actions.RewriteFileGroup::comparator(org.apache.iceberg.RewriteJobOrder)" - new: "method >,\ - \ F extends org.apache.iceberg.ContentFile>>\ - \ java.util.Comparator>\ - \ org.apache.iceberg.actions.FileRewriteGroup>, F extends 
org.apache.iceberg.ContentFile>>::comparator(org.apache.iceberg.RewriteJobOrder)\ - \ @ org.apache.iceberg.actions.RewriteFileGroup" - justification: "We will decide what to do with the API changes after the new\ - \ API has been finalized" - - code: "java.method.returnTypeTypeParametersChanged" - old: "method java.util.Comparator\ - \ org.apache.iceberg.actions.RewritePositionDeletesGroup::comparator(org.apache.iceberg.RewriteJobOrder)" - new: "method >,\ - \ F extends org.apache.iceberg.ContentFile>>\ - \ java.util.Comparator>\ - \ org.apache.iceberg.actions.FileRewriteGroup>, F extends org.apache.iceberg.ContentFile>>::comparator(org.apache.iceberg.RewriteJobOrder)\ - \ @ org.apache.iceberg.actions.RewritePositionDeletesGroup" - justification: "We will decide what to do with the API changes after the new\ - \ API has been finalized" apache-iceberg-0.14.0: org.apache.iceberg:iceberg-api: - code: "java.class.defaultSerializationChanged" diff --git a/core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java b/core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java index c43bf5cd85f6..c48a6d6f4a2c 100644 --- a/core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java +++ b/core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java @@ -70,7 +70,7 @@ public int numInputFiles() { } public static , F extends ContentFile> - Comparator> comparator(RewriteJobOrder rewriteJobOrder) { + Comparator> taskComparator(RewriteJobOrder rewriteJobOrder) { switch (rewriteJobOrder) { case BYTES_ASC: return Comparator.comparing(FileRewriteGroup::sizeInBytes); diff --git a/core/src/main/java/org/apache/iceberg/actions/FileRewriter.java b/core/src/main/java/org/apache/iceberg/actions/FileRewriter.java new file mode 100644 index 000000000000..f014aea0c034 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/actions/FileRewriter.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.actions; + +import java.util.List; +import java.util.Map; +import java.util.Set; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.ContentScanTask; + +/** + * A class for rewriting content files. + * + *
<p>
The entire rewrite operation is broken down into pieces based on partitioning, and size-based + * groups within a partition. These subunits of the rewrite are referred to as file groups. A file + * group will be processed by a single framework "action". For example, in Spark this means that + * each group would be rewritten in its own Spark job. + * + * @param the Java type of tasks to read content files + * @param the Java type of content files + * @deprecated since 1.8.0, will be removed in 1.9.0; use {@link FileRewritePlanner} and {@link + * FileRewriteExecutor}. + */ +@Deprecated +public interface FileRewriter, F extends ContentFile> { + + /** Returns a description for this rewriter. */ + default String description() { + return getClass().getName(); + } + + /** + * Returns a set of supported options for this rewriter. Only options specified in this list will + * be accepted at runtime. Any other options will be rejected. + */ + Set validOptions(); + + /** + * Initializes this rewriter using provided options. + * + * @param options options to initialize this rewriter + */ + void init(Map options); + + /** + * Selects files which this rewriter believes are valid targets to be rewritten based on their + * scan tasks and groups those scan tasks into file groups. The file groups are then rewritten in + * a single executable unit, such as a Spark job. + * + * @param tasks an iterable of scan task for files in a partition + * @return groups of scan tasks for files to be rewritten in a single executable unit + */ + Iterable> planFileGroups(Iterable tasks); + + /** + * Rewrite a group of files represented by the given list of scan tasks. + * + *
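+ * <p>A minimal end-to-end sketch of the plan/rewrite lifecycle (illustrative only; assumes an
+ * engine-specific implementation, a populated {@code options} map, and pre-collected
+ * {@code tasks}):
+ *
+ * <pre>{@code
+ * FileRewriter<FileScanTask, DataFile> rewriter = ...; // e.g. a Spark-based implementation
+ * rewriter.init(options); // validate and apply the rewrite options
+ * for (List<FileScanTask> group : rewriter.planFileGroups(tasks)) {
+ *   Set<DataFile> newFiles = rewriter.rewrite(group); // one executable unit per group
+ * }
+ * }</pre>
+ *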
<p>
The implementation is supposed to be engine-specific (e.g. Spark, Flink, Trino). + * + * @param group a group of scan tasks for files to be rewritten together + * @return a set of newly written files + */ + Set rewrite(List group); +} diff --git a/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroup.java b/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroup.java index b43d94a2bb8c..996e7b0f8ba2 100644 --- a/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroup.java +++ b/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroup.java @@ -18,11 +18,13 @@ */ package org.apache.iceberg.actions; +import java.util.Comparator; import java.util.List; import java.util.Set; import java.util.stream.Collectors; import org.apache.iceberg.DataFile; import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.RewriteJobOrder; import org.apache.iceberg.actions.RewriteDataFiles.FileGroupInfo; import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; @@ -35,6 +37,14 @@ public class RewriteFileGroup extends FileRewriteGroup { private DataFileSet addedFiles = DataFileSet.create(); + /** + * @deprecated since 1.8.0, will be removed in 1.9.0. + */ + @Deprecated + public RewriteFileGroup(FileGroupInfo info, List fileScanTasks) { + this(info, fileScanTasks, 0L, 0); + } + public RewriteFileGroup( FileGroupInfo info, List fileScanTasks, @@ -78,4 +88,32 @@ public String toString() { .add("numRewrittenBytes", sizeInBytes()) .toString(); } + + /** + * @deprecated since 1.8.0, will be removed in 1.9.0. Use {@link #numInputFiles()} instead. + */ + @Deprecated + public int numFiles() { + return fileScans().size(); + } + + /** + * @deprecated since 1.8.0, will be removed in 1.9.0. Use {@link + * FileRewriteGroup#taskComparator(RewriteJobOrder)} instead. 
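+ *     <p>Migration sketch (illustrative; assumes a {@code Stream<RewriteFileGroup>} named
+ *     {@code groups} and a {@code RewriteJobOrder} value {@code order}, matching how this
+ *     patch updates the planner call sites):
+ *     <pre>{@code
+ *     groups.sorted(RewriteFileGroup.comparator(order));     // before (deprecated)
+ *     groups.sorted(FileRewriteGroup.taskComparator(order)); // after
+ *     }</pre>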
+ */ + @Deprecated + public static Comparator comparator(RewriteJobOrder rewriteJobOrder) { + switch (rewriteJobOrder) { + case BYTES_ASC: + return Comparator.comparing(RewriteFileGroup::sizeInBytes); + case BYTES_DESC: + return Comparator.comparing(RewriteFileGroup::sizeInBytes, Comparator.reverseOrder()); + case FILES_ASC: + return Comparator.comparing(RewriteFileGroup::numFiles); + case FILES_DESC: + return Comparator.comparing(RewriteFileGroup::numFiles, Comparator.reverseOrder()); + default: + return (unused, unused2) -> 0; + } + } } diff --git a/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java b/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java index 38df04217d98..3fdcfba3fbbd 100644 --- a/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java +++ b/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java @@ -151,7 +151,7 @@ public FileRewritePlan numOutputFiles(inputSize)); }); }) - .sorted(RewriteFileGroup.comparator(rewriteJobOrder)); + .sorted(FileRewriteGroup.taskComparator(rewriteJobOrder)); Map groupsInPartition = plan.transformValues(List::size); int totalGroupCount = groupsInPartition.values().stream().reduce(Integer::sum).orElse(0); return new FileRewritePlan<>( diff --git a/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletesGroup.java b/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletesGroup.java index 96640bb5d9b6..c7b1f9ddaf51 100644 --- a/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletesGroup.java +++ b/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletesGroup.java @@ -18,11 +18,13 @@ */ package org.apache.iceberg.actions; +import java.util.Comparator; import java.util.List; import java.util.Set; import java.util.stream.Collectors; import org.apache.iceberg.DeleteFile; import org.apache.iceberg.PositionDeletesScanTask; +import org.apache.iceberg.RewriteJobOrder; import org.apache.iceberg.actions.RewritePositionDeleteFiles.FileGroupInfo; import org.apache.iceberg.actions.RewritePositionDeleteFiles.FileGroupRewriteResult; import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; @@ -39,6 +41,14 @@ public class RewritePositionDeletesGroup private DeleteFileSet addedDeleteFiles = DeleteFileSet.create(); + /** + * @deprecated since 1.8.0, will be removed in 1.9.0. + */ + @Deprecated + public RewritePositionDeletesGroup(FileGroupInfo info, List tasks) { + this(info, tasks, 0L, 0); + } + public RewritePositionDeletesGroup( FileGroupInfo info, List tasks, @@ -50,6 +60,14 @@ public RewritePositionDeletesGroup( tasks.stream().mapToLong(t -> t.file().dataSequenceNumber()).max().getAsLong(); } + /** + * @deprecated since 1.8.0, will be removed in 1.9.0. Use {@link #fileScans()} instead. + */ + @Deprecated + public List tasks() { + return fileScans(); + } + public void setOutputFiles(Set files) { addedDeleteFiles = DeleteFileSet.of(files); } @@ -103,4 +121,34 @@ public long rewrittenBytes() { public long addedBytes() { return addedDeleteFiles.stream().mapToLong(DeleteFile::fileSizeInBytes).sum(); } + + /** + * @deprecated since 1.8.0, will be removed in 1.9.0. Use {@link #numInputFiles()} instead. + */ + @Deprecated + public int numRewrittenDeleteFiles() { + return fileScans().size(); + } + + /** + * @deprecated since 1.8.0, will be removed in 1.9.0. Use {@link + * FileRewriteGroup#taskComparator(RewriteJobOrder)} instead. 
+ */ + @Deprecated + public static Comparator comparator(RewriteJobOrder order) { + switch (order) { + case BYTES_ASC: + return Comparator.comparing(RewritePositionDeletesGroup::rewrittenBytes); + case BYTES_DESC: + return Comparator.comparing( + RewritePositionDeletesGroup::rewrittenBytes, Comparator.reverseOrder()); + case FILES_ASC: + return Comparator.comparing(RewritePositionDeletesGroup::numRewrittenDeleteFiles); + case FILES_DESC: + return Comparator.comparing( + RewritePositionDeletesGroup::numRewrittenDeleteFiles, Comparator.reverseOrder()); + default: + return (unused, unused2) -> 0; + } + } } diff --git a/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletesGroupPlanner.java b/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletesGroupPlanner.java index d83677139a37..74109df05a1f 100644 --- a/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletesGroupPlanner.java +++ b/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletesGroupPlanner.java @@ -118,7 +118,7 @@ public void init(Map options) { numOutputFiles(inputSize)); }); }) - .sorted(RewritePositionDeletesGroup.comparator(rewriteJobOrder)); + .sorted(FileRewriteGroup.taskComparator(rewriteJobOrder)); Map groupsInPartition = plan.transformValues(List::size); int totalGroupCount = groupsInPartition.values().stream().reduce(Integer::sum).orElse(0); return new FileRewritePlan<>( diff --git a/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java b/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java new file mode 100644 index 000000000000..66b759321ac8 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.actions; + +import java.util.List; +import java.util.Map; +import java.util.Set; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.util.PropertyUtil; + +public abstract class SizeBasedDataRewriter extends SizeBasedFileRewriter { + + /** + * The minimum number of deletes that needs to be associated with a data file for it to be + * considered for rewriting. If a data file has this number of deletes or more, it will be + * rewritten regardless of its file size determined by {@link #MIN_FILE_SIZE_BYTES} and {@link + * #MAX_FILE_SIZE_BYTES}. 
If a file group contains a file that satisfies this condition, the file + * group will be rewritten regardless of the number of files in the file group determined by + * {@link #MIN_INPUT_FILES}. + * + *
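+ * <p>For example (hypothetical threshold value; {@code rewriter} is any concrete subclass
+ * instance, mirroring how the unit tests in this patch set the option):
+ *
+ * <pre>{@code
+ * // files carrying 2 or more delete files become rewrite candidates regardless of size
+ * rewriter.init(ImmutableMap.of(SizeBasedDataRewriter.DELETE_FILE_THRESHOLD, "2"));
+ * }</pre>
+ *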
<p>
Defaults to Integer.MAX_VALUE, which means this feature is not enabled by default. + * + * @deprecated since 1.8.0, will be removed in 1.9.0; use {@link RewriteFileGroupPlanner} and + * {@link FileRewriteExecutor}. + */ + @Deprecated public static final String DELETE_FILE_THRESHOLD = "delete-file-threshold"; + + public static final int DELETE_FILE_THRESHOLD_DEFAULT = Integer.MAX_VALUE; + + private int deleteFileThreshold; + + protected SizeBasedDataRewriter(Table table) { + super(table); + } + + @Override + public Set validOptions() { + return ImmutableSet.builder() + .addAll(super.validOptions()) + .add(DELETE_FILE_THRESHOLD) + .build(); + } + + @Override + public void init(Map options) { + super.init(options); + this.deleteFileThreshold = deleteFileThreshold(options); + } + + @Override + protected Iterable filterFiles(Iterable tasks) { + return Iterables.filter(tasks, task -> wronglySized(task) || tooManyDeletes(task)); + } + + private boolean tooManyDeletes(FileScanTask task) { + return task.deletes() != null && task.deletes().size() >= deleteFileThreshold; + } + + @Override + protected Iterable> filterFileGroups(List> groups) { + return Iterables.filter(groups, this::shouldRewrite); + } + + private boolean shouldRewrite(List group) { + return enoughInputFiles(group) + || enoughContent(group) + || tooMuchContent(group) + || anyTaskHasTooManyDeletes(group); + } + + private boolean anyTaskHasTooManyDeletes(List group) { + return group.stream().anyMatch(this::tooManyDeletes); + } + + @Override + protected long defaultTargetFileSize() { + return PropertyUtil.propertyAsLong( + table().properties(), + TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, + TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT); + } + + private int deleteFileThreshold(Map options) { + int value = + PropertyUtil.propertyAsInt(options, DELETE_FILE_THRESHOLD, DELETE_FILE_THRESHOLD_DEFAULT); + Preconditions.checkArgument( + value >= 0, "'%s' is set to %s but must be >= 0", DELETE_FILE_THRESHOLD, value); + return value; + } +} diff --git a/core/src/main/java/org/apache/iceberg/actions/SizeBasedFileRewriter.java b/core/src/main/java/org/apache/iceberg/actions/SizeBasedFileRewriter.java new file mode 100644 index 000000000000..319e44c4a20c --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/actions/SizeBasedFileRewriter.java @@ -0,0 +1,348 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.actions; + +import java.math.RoundingMode; +import java.util.List; +import java.util.Map; +import java.util.Set; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.ContentScanTask; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Table; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.math.LongMath; +import org.apache.iceberg.util.BinPacking; +import org.apache.iceberg.util.PropertyUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * A file rewriter that determines which files to rewrite based on their size. + * + *
<p>
If files are smaller than the {@link #MIN_FILE_SIZE_BYTES} threshold or larger than the {@link
+ * #MAX_FILE_SIZE_BYTES} threshold, they are considered targets for being rewritten.
+ *
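+ * <p>For example (editorial illustration using the default ratios defined below): with a 512 MB
+ * target, the thresholds default to 0.75 * 512 MB = 384 MB and 1.80 * 512 MB = ~922 MB, so a
+ * 100 MB file and a 2 GB file are both selected while a 600 MB file is left alone.
+ *
+ *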
<p>
Once selected, files are grouped based on the {@link BinPacking bin-packing algorithm} into
+ * groups of no more than {@link #MAX_FILE_GROUP_SIZE_BYTES}. Groups will actually be rewritten
+ * if they contain at least {@link #MIN_INPUT_FILES} files or if they would produce at least one
+ * file of {@link #TARGET_FILE_SIZE_BYTES}.
+ *
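+ * <p>A minimal planning sketch (editorial illustration for the data-file specialization;
+ * {@code rewriter} stands for any concrete subclass instance and {@code tasks} for the scan
+ * tasks of the table being compacted):
+ *
+ * <pre>{@code
+ * rewriter.init(ImmutableMap.of("min-input-files", "2"));
+ * Iterable<List<FileScanTask>> groups = rewriter.planFileGroups(tasks);
+ * // each inner list is one bin-packed group of at most max-file-group-size-bytes bytes
+ * }</pre>
+ *
+ *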
<p>
Note that implementations may add extra conditions for selecting files or filtering groups. + * + * @deprecated since 1.8.0, will be removed in 1.9.0; use {@link SizeBasedFileRewritePlanner} and + * {@link FileRewriteExecutor}. + */ +@Deprecated +public abstract class SizeBasedFileRewriter, F extends ContentFile> + implements FileRewriter { + + private static final Logger LOG = LoggerFactory.getLogger(SizeBasedFileRewriter.class); + + /** The target output file size that this file rewriter will attempt to generate. */ + public static final String TARGET_FILE_SIZE_BYTES = "target-file-size-bytes"; + + /** + * Controls which files will be considered for rewriting. Files with sizes under this threshold + * will be considered for rewriting regardless of any other criteria. + * + *
<p>
Defaults to 75% of the target file size.
+ */
+ public static final String MIN_FILE_SIZE_BYTES = "min-file-size-bytes";
+
+ public static final double MIN_FILE_SIZE_DEFAULT_RATIO = 0.75;
+
+ /**
+ * Controls which files will be considered for rewriting. Files with sizes above this threshold
+ * will be considered for rewriting regardless of any other criteria.
+ *
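+ * <p>For instance (editorial note), a 2 GB file is selected by this threshold alone; the
+ * subclasses in this patch also check {@link #tooMuchContent(List)} when filtering groups, so
+ * the group holding such a file is rewritten even if it is the only file in that group.
+ *
+ *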
<p>
Defaults to 180% of the target file size. + */ + public static final String MAX_FILE_SIZE_BYTES = "max-file-size-bytes"; + + public static final double MAX_FILE_SIZE_DEFAULT_RATIO = 1.80; + + /** + * Any file group exceeding this number of files will be rewritten regardless of other criteria. + * This config ensures file groups that contain many files are compacted even if the total size of + * that group is less than the target file size. This can also be thought of as the maximum number + * of wrongly sized files that could remain in a partition after rewriting. + */ + public static final String MIN_INPUT_FILES = "min-input-files"; + + public static final int MIN_INPUT_FILES_DEFAULT = 5; + + /** Overrides other options and forces rewriting of all provided files. */ + public static final String REWRITE_ALL = "rewrite-all"; + + public static final boolean REWRITE_ALL_DEFAULT = false; + + /** + * This option controls the largest amount of data that should be rewritten in a single file + * group. It helps with breaking down the rewriting of very large partitions which may not be + * rewritable otherwise due to the resource constraints of the cluster. For example, a sort-based + * rewrite may not scale to TB-sized partitions, and those partitions need to be worked on in + * small subsections to avoid exhaustion of resources. + */ + public static final String MAX_FILE_GROUP_SIZE_BYTES = "max-file-group-size-bytes"; + + public static final long MAX_FILE_GROUP_SIZE_BYTES_DEFAULT = 100L * 1024 * 1024 * 1024; // 100 GB + + private static final long SPLIT_OVERHEAD = 5 * 1024; + + private final Table table; + private long targetFileSize; + private long minFileSize; + private long maxFileSize; + private int minInputFiles; + private boolean rewriteAll; + private long maxGroupSize; + + private int outputSpecId; + + protected SizeBasedFileRewriter(Table table) { + this.table = table; + } + + protected abstract long defaultTargetFileSize(); + + protected abstract Iterable filterFiles(Iterable tasks); + + protected abstract Iterable> filterFileGroups(List> groups); + + protected Table table() { + return table; + } + + @Override + public Set validOptions() { + return ImmutableSet.of( + TARGET_FILE_SIZE_BYTES, + MIN_FILE_SIZE_BYTES, + MAX_FILE_SIZE_BYTES, + MIN_INPUT_FILES, + REWRITE_ALL, + MAX_FILE_GROUP_SIZE_BYTES); + } + + @Override + public void init(Map options) { + Map sizeThresholds = sizeThresholds(options); + this.targetFileSize = sizeThresholds.get(TARGET_FILE_SIZE_BYTES); + this.minFileSize = sizeThresholds.get(MIN_FILE_SIZE_BYTES); + this.maxFileSize = sizeThresholds.get(MAX_FILE_SIZE_BYTES); + + this.minInputFiles = minInputFiles(options); + this.rewriteAll = rewriteAll(options); + this.maxGroupSize = maxGroupSize(options); + this.outputSpecId = outputSpecId(options); + + if (rewriteAll) { + LOG.info("Configured to rewrite all provided files in table {}", table.name()); + } + } + + protected boolean wronglySized(T task) { + return task.length() < minFileSize || task.length() > maxFileSize; + } + + @Override + public Iterable> planFileGroups(Iterable tasks) { + Iterable filteredTasks = rewriteAll ? tasks : filterFiles(tasks); + BinPacking.ListPacker packer = new BinPacking.ListPacker<>(maxGroupSize, 1, false); + List> groups = packer.pack(filteredTasks, ContentScanTask::length); + return rewriteAll ? 
groups : filterFileGroups(groups); + } + + protected boolean enoughInputFiles(List group) { + return group.size() > 1 && group.size() >= minInputFiles; + } + + protected boolean enoughContent(List group) { + return group.size() > 1 && inputSize(group) > targetFileSize; + } + + protected boolean tooMuchContent(List group) { + return inputSize(group) > maxFileSize; + } + + protected long inputSize(List group) { + return group.stream().mapToLong(ContentScanTask::length).sum(); + } + + /** + * Calculates the split size to use in bin-packing rewrites. + * + *
<p>
This method determines the target split size as the input size divided by the desired number
+ * of output files. The final split size is adjusted to be at least as big as the target file size
+ * but less than the max write file size.
+ */
+ public long splitSize(long inputSize) {
+ long estimatedSplitSize = (inputSize / numOutputFiles(inputSize)) + SPLIT_OVERHEAD;
+ if (estimatedSplitSize < targetFileSize) {
+ return targetFileSize;
+ } else if (estimatedSplitSize > writeMaxFileSize()) {
+ return writeMaxFileSize();
+ } else {
+ return estimatedSplitSize;
+ }
+ }
+
+ /**
+ * Determines the preferable number of output files when rewriting a particular file group.
+ *
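+ * <p>A worked run of the rounding rule described below (editorial sketch, assuming a 1 GB
+ * target and the default 0.75 GB minimum):
+ *
+ * <pre>{@code
+ * // inputSize = 10.1 GB, targetFileSize = 1 GB
+ * // with remainder: ceil(10.1 / 1) = 11 files; without: floor(10.1 / 1) = 10 files
+ * // remainder 0.1 GB <= 0.75 GB minimum, and 10.1 / 10 = 1.01 GB < 1.1 * 1 GB
+ * // -> round down and produce 10 output files of ~1.01 GB each
+ * }</pre>
+ *
+ *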
<p>
If the rewriter is handling 10.1 GB of data with a target file size of 1 GB, it could
+ * produce 11 files, one of which would only have 0.1 GB. This would most likely be less
+ * preferable to 10 files with 1.01 GB each. So this method decides whether to round up or round
+ * down based on what the estimated average file size will be if the remainder (0.1 GB) is
+ * distributed amongst other files. If the new average file size is no more than 10% greater than
+ * the target file size, then this method will round down when determining the number of output
+ * files. Otherwise, the remainder will be written into a separate file.
+ *
+ * @param inputSize a total input size for a file group
+ * @return the number of files this rewriter should create
+ */
+ protected long numOutputFiles(long inputSize) {
+ if (inputSize < targetFileSize) {
+ return 1;
+ }
+
+ long numFilesWithRemainder = LongMath.divide(inputSize, targetFileSize, RoundingMode.CEILING);
+ long numFilesWithoutRemainder = LongMath.divide(inputSize, targetFileSize, RoundingMode.FLOOR);
+ long avgFileSizeWithoutRemainder = inputSize / numFilesWithoutRemainder;
+
+ if (LongMath.mod(inputSize, targetFileSize) > minFileSize) {
+ // the remainder file is of a valid size for this rewrite so keep it
+ return numFilesWithRemainder;
+
+ } else if (avgFileSizeWithoutRemainder
+ < Math.min(1.1 * targetFileSize, (double) writeMaxFileSize())) {
+ // if the remainder is distributed amongst other files,
+ // the average file size will be no more than 10% bigger than the target file size
+ // so round down and distribute remainder amongst other files
+ return numFilesWithoutRemainder;
+
+ } else {
+ // keep the remainder file as it is not OK to distribute it amongst other files
+ return numFilesWithRemainder;
+ }
+ }
+
+ /**
+ * Estimates a larger max target file size than the target size used in task creation to avoid
+ * creating tiny remainder files.
+ *
+ *
<p>
While we create tasks that should all be smaller than our target size, there is a chance
+ * that the actual data will end up being larger than our target size due to factors, such as
+ * compression and serialization, that are outside our control. If this occurs, instead of making
+ * a single file that is close in size to our target, we would end up producing one file of the
+ * target size, and then a small extra file with the remaining data.
+ *
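+ * <p>In code form (editorial restatement of the {@code @return} contract below):
+ *
+ * <pre>{@code
+ * // writeMaxFileSize = target + 0.5 * (max - target)
+ * // e.g. target = 512 MB and max = ~922 MB give a write size of ~717 MB
+ * }</pre>
+ *
+ *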
<p>
For example, if our target is 512 MB, we may generate a rewrite task that should be 500 MB. + * When we write the data we may find we actually have to write out 530 MB. If we use the target + * size while writing, we would produce a 512 MB file and an 18 MB file. If instead we use a + * larger size estimated by this method, then we end up writing a single file. + * + * @return the target size plus one half of the distance between max and target + */ + protected long writeMaxFileSize() { + return (long) (targetFileSize + ((maxFileSize - targetFileSize) * 0.5)); + } + + protected PartitionSpec outputSpec() { + return table.specs().get(outputSpecId); + } + + protected int outputSpecId() { + return outputSpecId; + } + + private int outputSpecId(Map options) { + int specId = + PropertyUtil.propertyAsInt(options, RewriteDataFiles.OUTPUT_SPEC_ID, table.spec().specId()); + Preconditions.checkArgument( + table.specs().containsKey(specId), + "Cannot use output spec id %s because the table does not contain a reference to this spec-id.", + specId); + return specId; + } + + private Map sizeThresholds(Map options) { + long target = + PropertyUtil.propertyAsLong(options, TARGET_FILE_SIZE_BYTES, defaultTargetFileSize()); + + long defaultMin = (long) (target * MIN_FILE_SIZE_DEFAULT_RATIO); + long min = PropertyUtil.propertyAsLong(options, MIN_FILE_SIZE_BYTES, defaultMin); + + long defaultMax = (long) (target * MAX_FILE_SIZE_DEFAULT_RATIO); + long max = PropertyUtil.propertyAsLong(options, MAX_FILE_SIZE_BYTES, defaultMax); + + Preconditions.checkArgument( + target > 0, "'%s' is set to %s but must be > 0", TARGET_FILE_SIZE_BYTES, target); + + Preconditions.checkArgument( + min >= 0, "'%s' is set to %s but must be >= 0", MIN_FILE_SIZE_BYTES, min); + + Preconditions.checkArgument( + target > min, + "'%s' (%s) must be > '%s' (%s), all new files will be smaller than the min threshold", + TARGET_FILE_SIZE_BYTES, + target, + MIN_FILE_SIZE_BYTES, + min); + + Preconditions.checkArgument( + target < max, + "'%s' (%s) must be < '%s' (%s), all new files will be larger than the max threshold", + TARGET_FILE_SIZE_BYTES, + target, + MAX_FILE_SIZE_BYTES, + max); + + Map values = Maps.newHashMap(); + + values.put(TARGET_FILE_SIZE_BYTES, target); + values.put(MIN_FILE_SIZE_BYTES, min); + values.put(MAX_FILE_SIZE_BYTES, max); + + return values; + } + + private int minInputFiles(Map options) { + int value = PropertyUtil.propertyAsInt(options, MIN_INPUT_FILES, MIN_INPUT_FILES_DEFAULT); + Preconditions.checkArgument( + value > 0, "'%s' is set to %s but must be > 0", MIN_INPUT_FILES, value); + return value; + } + + private long maxGroupSize(Map options) { + long value = + PropertyUtil.propertyAsLong( + options, MAX_FILE_GROUP_SIZE_BYTES, MAX_FILE_GROUP_SIZE_BYTES_DEFAULT); + Preconditions.checkArgument( + value > 0, "'%s' is set to %s but must be > 0", MAX_FILE_GROUP_SIZE_BYTES, value); + return value; + } + + private boolean rewriteAll(Map options) { + return PropertyUtil.propertyAsBoolean(options, REWRITE_ALL, REWRITE_ALL_DEFAULT); + } +} diff --git a/core/src/main/java/org/apache/iceberg/actions/SizeBasedPositionDeletesRewriter.java b/core/src/main/java/org/apache/iceberg/actions/SizeBasedPositionDeletesRewriter.java new file mode 100644 index 000000000000..60f37b79d24c --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/actions/SizeBasedPositionDeletesRewriter.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.actions; + +import java.util.List; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.PositionDeletesScanTask; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.util.PropertyUtil; + +/** + * @deprecated since 1.8.0, will be removed in 1.9.0; use {@link RewritePositionDeletesGroupPlanner} + * and {@link FileRewriteExecutor}. + */ +@Deprecated +public abstract class SizeBasedPositionDeletesRewriter + extends SizeBasedFileRewriter { + + protected SizeBasedPositionDeletesRewriter(Table table) { + super(table); + } + + @Override + protected Iterable filterFiles(Iterable tasks) { + return Iterables.filter(tasks, this::wronglySized); + } + + @Override + protected Iterable> filterFileGroups( + List> groups) { + return Iterables.filter(groups, this::shouldRewrite); + } + + private boolean shouldRewrite(List group) { + return enoughInputFiles(group) || enoughContent(group) || tooMuchContent(group); + } + + @Override + protected long defaultTargetFileSize() { + return PropertyUtil.propertyAsLong( + table().properties(), + TableProperties.DELETE_TARGET_FILE_SIZE_BYTES, + TableProperties.DELETE_TARGET_FILE_SIZE_BYTES_DEFAULT); + } +} diff --git a/gradle.properties b/gradle.properties index 5c62371efe35..dc1e1a509b01 100644 --- a/gradle.properties +++ b/gradle.properties @@ -21,7 +21,7 @@ systemProp.knownFlinkVersions=1.18,1.19,1.20 systemProp.defaultHiveVersions=2 systemProp.knownHiveVersions=2,3 systemProp.defaultSparkVersions=3.5 -systemProp.knownSparkVersions=3.5 +systemProp.knownSparkVersions=3.3,3.4,3.5 systemProp.defaultKafkaVersions=3 systemProp.knownKafkaVersions=3 systemProp.defaultScalaVersion=2.12 From 440618df19fe1724a4600dff469cd8a4dad485e9 Mon Sep 17 00:00:00 2001 From: Peter Vary Date: Thu, 28 Nov 2024 14:46:32 +0100 Subject: [PATCH 08/11] Flashing out tests for RewriteFileGroupPlanner --- .../actions/RewriteFileGroupPlanner.java | 33 +++- .../actions/TestRewriteFileGroupPlanner.java | 174 +++++++++++++++--- .../actions/TestSparkFileRewriteExecutor.java | 22 +-- 3 files changed, 182 insertions(+), 47 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java b/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java index 3fdcfba3fbbd..0fd786f99a99 100644 --- a/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java +++ b/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java @@ -30,9 +30,11 @@ import org.apache.iceberg.StructLike; import org.apache.iceberg.Table; import org.apache.iceberg.TableProperties; +import org.apache.iceberg.TableScan; import org.apache.iceberg.actions.RewriteDataFiles.FileGroupInfo; import 
org.apache.iceberg.data.GenericRecord; import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; @@ -70,14 +72,26 @@ public class RewriteFileGroupPlanner private static final Logger LOG = LoggerFactory.getLogger(RewriteFileGroupPlanner.class); private final Expression filter; - private final long snapshotId; + private final Long snapshotId; private final boolean caseSensitive; private int deleteFileThreshold; private RewriteJobOrder rewriteJobOrder; + public RewriteFileGroupPlanner(Table table) { + this(table, Expressions.alwaysTrue()); + } + + public RewriteFileGroupPlanner(Table table, Expression filter) { + this( + table, + filter, + table.currentSnapshot() != null ? table.currentSnapshot().snapshotId() : null, + false); + } + public RewriteFileGroupPlanner( - Table table, Expression filter, long snapshotId, boolean caseSensitive) { + Table table, Expression filter, Long snapshotId, boolean caseSensitive) { super(table); this.filter = filter; this.snapshotId = snapshotId; @@ -160,13 +174,14 @@ public FileRewritePlan @VisibleForTesting CloseableIterable tasks() { - return table() - .newScan() - .useSnapshot(snapshotId) - .caseSensitive(caseSensitive) - .filter(filter) - .ignoreResiduals() - .planFiles(); + TableScan scan = + table().newScan().filter(filter).caseSensitive(caseSensitive).ignoreResiduals(); + + if (snapshotId != null) { + scan = scan.useSnapshot(snapshotId); + } + + return scan.planFiles(); } private int deleteFileThreshold(Map options) { diff --git a/core/src/test/java/org/apache/iceberg/actions/TestRewriteFileGroupPlanner.java b/core/src/test/java/org/apache/iceberg/actions/TestRewriteFileGroupPlanner.java index 903e7b27313c..746395d57bca 100644 --- a/core/src/test/java/org/apache/iceberg/actions/TestRewriteFileGroupPlanner.java +++ b/core/src/test/java/org/apache/iceberg/actions/TestRewriteFileGroupPlanner.java @@ -19,7 +19,9 @@ package org.apache.iceberg.actions; import static org.apache.iceberg.actions.RewriteDataFiles.REWRITE_JOB_ORDER; +import static org.apache.iceberg.actions.RewriteFileGroupPlanner.MAX_FILE_SIZE_DEFAULT_RATIO; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.io.File; import java.util.List; @@ -37,14 +39,19 @@ import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.EnumSource; +import org.junit.jupiter.params.provider.ValueSource; class TestRewriteFileGroupPlanner { + private static final Map REWRITE_ALL = + ImmutableMap.of(RewriteFileGroupPlanner.REWRITE_ALL, "true"); + private static final DataFile FILE_1 = newDataFile("data_bucket=0", 10); private static final DataFile FILE_2 = newDataFile("data_bucket=0", 10); private static final DataFile FILE_3 = newDataFile("data_bucket=0", 10); @@ -80,19 +87,9 @@ public void cleanupTables() { @EnumSource( value = 
RewriteJobOrder.class, names = {"FILES_DESC", "FILES_ASC", "BYTES_DESC", "BYTES_ASC"}) - void testGroups(RewriteJobOrder order) { - table - .newAppend() - .appendFile(FILE_1) - .appendFile(FILE_2) - .appendFile(FILE_3) - .appendFile(FILE_4) - .appendFile(FILE_5) - .appendFile(FILE_6) - .commit(); - RewriteFileGroupPlanner planner = - new RewriteFileGroupPlanner( - table, Expressions.alwaysTrue(), table.currentSnapshot().snapshotId(), false); + void testJobOrder(RewriteJobOrder order) { + addFiles(); + RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(table); planner.init( ImmutableMap.of( RewriteFileGroupPlanner.REWRITE_ALL, "true", REWRITE_JOB_ORDER, order.name())); @@ -106,19 +103,33 @@ void testGroups(RewriteJobOrder order) { } @Test - void testContext() { + void testUnpartitionedTable() { + table.updateSpec().removeField("data_bucket").commit(); + table.refresh(); + table .newAppend() - .appendFile(FILE_1) - .appendFile(FILE_2) - .appendFile(FILE_3) - .appendFile(FILE_4) - .appendFile(FILE_5) - .appendFile(FILE_6) + .appendFile(newDataFile("", 10)) + .appendFile(newDataFile("", 20)) + .appendFile(newDataFile("", 30)) .commit(); - RewriteFileGroupPlanner planner = - new RewriteFileGroupPlanner( - table, Expressions.alwaysTrue(), table.currentSnapshot().snapshotId(), false); + RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(table); + planner.init( + ImmutableMap.of( + RewriteFileGroupPlanner.MIN_INPUT_FILES, + "1", + RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, + "30")); + FileRewritePlan result = + planner.plan(); + assertThat(result.totalGroupCount()).isEqualTo(1); + assertThat(result.groups().iterator().next().numInputFiles()).isEqualTo(2); + } + + @Test + void testMaxGroupSize() { + addFiles(); + RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(table); planner.init( ImmutableMap.of( RewriteFileGroupPlanner.REWRITE_ALL, @@ -133,6 +144,123 @@ void testContext() { assertThat(result.groupsInPartition(FILE_6.partition())).isEqualTo(1); } + @Test + void testEmptyTable() { + RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(table); + + planner.init(REWRITE_ALL); + + FileRewritePlan result = + planner.plan(); + + assertThat(table.currentSnapshot()).as("Table must be empty").isNull(); + assertThat(result.totalGroupCount()).isZero(); + } + + @Test + void testFilter() { + addFiles(); + RewriteFileGroupPlanner planner = + new RewriteFileGroupPlanner( + table, + Expressions.or( + Expressions.equal(Expressions.bucket("data", 16), 0), + Expressions.equal(Expressions.bucket("data", 16), 2))); + planner.init(REWRITE_ALL); + FileRewritePlan plan = + planner.plan(); + List groups = plan.groups().collect(Collectors.toList()); + + assertThat(plan.totalGroupCount()).isEqualTo(2); + assertThat(groups).hasSize(2); + assertThat(groups.stream().mapToLong(FileRewriteGroup::numInputFiles).sum()).isEqualTo(4); + } + + @Test + void testWriteMaxFileSize() { + int targetFileSize = 10; + addFiles(); + + RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(table); + planner.init( + ImmutableMap.of( + RewriteFileGroupPlanner.REWRITE_ALL, + "true", + RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, + String.valueOf(targetFileSize))); + FileRewritePlan plan = + planner.plan(); + assertThat(plan.writeMaxFileSize()) + .isGreaterThan(targetFileSize) + .isLessThan((long) (targetFileSize * MAX_FILE_SIZE_DEFAULT_RATIO)); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + void testOutputSpec(boolean specific) { + addFiles(); + + int oldSpecId = 
table.spec().specId(); + table.updateSpec().removeField("data_bucket").commit(); + table.newAppend().appendFile(newDataFile("", 10)).commit(); + table.refresh(); + int newSpecId = table.spec().specId(); + + RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(table); + + Map options = Maps.newHashMap(REWRITE_ALL); + if (specific) { + options.put(RewriteDataFiles.OUTPUT_SPEC_ID, String.valueOf(oldSpecId)); + } + + planner.init(options); + + FileRewritePlan plan = + planner.plan(); + assertThat(plan.outputSpecId()).isEqualTo(specific ? oldSpecId : newSpecId); + } + + @Test + public void testInvalidOption() { + addFiles(); + + assertThatThrownBy( + () -> { + RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(table); + + planner.init(ImmutableMap.of(RewriteDataFiles.REWRITE_JOB_ORDER, "foo")); + }) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid rewrite job order name: foo"); + + assertThatThrownBy( + () -> { + RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(table); + + planner.init( + ImmutableMap.of( + RewriteFileGroupPlanner.REWRITE_ALL, + "true", + RewriteDataFiles.OUTPUT_SPEC_ID, + String.valueOf(1234))); + }) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage( + "Cannot use output spec id 1234 because the table does not contain a reference to this spec-id."); + } + + private void addFiles() { + table + .newAppend() + .appendFile(FILE_1) + .appendFile(FILE_2) + .appendFile(FILE_3) + .appendFile(FILE_4) + .appendFile(FILE_5) + .appendFile(FILE_6) + .commit(); + } + private static DataFile newDataFile(String partitionPath, long fileSize) { return DataFiles.builder(TestBase.SPEC) .withPath("/path/to/data-" + UUID.randomUUID() + ".parquet") diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewriteExecutor.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewriteExecutor.java index bce2bf11209c..444bbf458f17 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewriteExecutor.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewriteExecutor.java @@ -33,7 +33,6 @@ import org.apache.iceberg.actions.RewriteFileGroupPlanner; import org.apache.iceberg.actions.SizeBasedFileRewritePlanner; import org.apache.iceberg.catalog.TableIdentifier; -import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; @@ -64,8 +63,7 @@ public void removeTable() { @Test public void testBinPackDataSelectFiles() { Table table = catalog.createTable(TABLE_IDENT, SCHEMA); - RewriteFileGroupPlanner rewriter = - new RewriteFileGroupPlanner(table, Expressions.alwaysTrue(), 1, false); + RewriteFileGroupPlanner rewriter = new RewriteFileGroupPlanner(table); checkDataFileSizeFiltering(rewriter); checkDataFilesDeleteThreshold(rewriter); @@ -216,8 +214,7 @@ public void testInvalidConstructorUsagesZOrderData() { public void testBinPackDataValidOptions() { Table table = catalog.createTable(TABLE_IDENT, SCHEMA); SparkBinPackDataRewriteExecutor rewriter = new SparkBinPackDataRewriteExecutor(spark, table); - RewriteFileGroupPlanner planner = - new RewriteFileGroupPlanner(table, Expressions.alwaysTrue(), 1, false); + RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(table); 
assertThat(rewriter.validOptions()) .as("Rewriter must report all supported options") @@ -242,8 +239,7 @@ public void testSortDataValidOptions() { Table table = catalog.createTable(TABLE_IDENT, SCHEMA); SparkSortDataRewriteExecutor rewriter = new SparkSortDataRewriteExecutor(spark, table, SORT_ORDER); - RewriteFileGroupPlanner planner = - new RewriteFileGroupPlanner(table, Expressions.alwaysTrue(), 1, false); + RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(table); assertThat(rewriter.validOptions()) .as("Rewriter must report all supported options") @@ -272,8 +268,7 @@ public void testZOrderDataValidOptions() { ImmutableList zOrderCols = ImmutableList.of("id"); SparkZOrderDataRewriteExecutor rewriter = new SparkZOrderDataRewriteExecutor(spark, table, zOrderCols); - RewriteFileGroupPlanner planner = - new RewriteFileGroupPlanner(table, Expressions.alwaysTrue(), 1, false); + RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(table); assertThat(rewriter.validOptions()) .as("Rewriter must report all supported options") @@ -300,8 +295,7 @@ public void testZOrderDataValidOptions() { @Test public void testInvalidValuesForBinPackDataOptions() { Table table = catalog.createTable(TABLE_IDENT, SCHEMA); - RewriteFileGroupPlanner planner = - new RewriteFileGroupPlanner(table, Expressions.alwaysTrue(), 1, false); + RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(table); validateSizeBasedRewriterOptions(planner); @@ -316,8 +310,7 @@ public void testInvalidValuesForSortDataOptions() { Table table = catalog.createTable(TABLE_IDENT, SCHEMA); SparkSortDataRewriteExecutor rewriter = new SparkSortDataRewriteExecutor(spark, table, SORT_ORDER); - RewriteFileGroupPlanner planner = - new RewriteFileGroupPlanner(table, Expressions.alwaysTrue(), 1, false); + RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(table); validateSizeBasedRewriterOptions(planner); @@ -338,8 +331,7 @@ public void testInvalidValuesForZOrderDataOptions() { ImmutableList zOrderCols = ImmutableList.of("id"); SparkZOrderDataRewriteExecutor rewriter = new SparkZOrderDataRewriteExecutor(spark, table, zOrderCols); - RewriteFileGroupPlanner planner = - new RewriteFileGroupPlanner(table, Expressions.alwaysTrue(), 1, false); + RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(table); validateSizeBasedRewriterOptions(planner); From c76e9c14c7ffcaf1d7d780e4f17629cfd87d2201 Mon Sep 17 00:00:00 2001 From: Peter Vary Date: Mon, 2 Dec 2024 14:11:30 +0100 Subject: [PATCH 09/11] RewritePositionDeletesGroupPlanner tests --- .../iceberg/actions/FileRewriteExecutor.java | 6 +- .../iceberg/actions/FileRewritePlan.java | 12 +- .../actions/RewriteFileGroupPlanner.java | 8 +- .../iceberg/actions/RewriteFilePlan.java | 47 ++++ .../actions/RewritePositionDeletePlan.java | 41 +++ .../RewritePositionDeletesGroupPlanner.java | 13 +- .../actions/TestRewriteFileGroupPlanner.java | 25 +- ...estRewritePositionDeletesGroupPlanner.java | 250 ++++++++++++++++++ ...a => TestSizeBasedFileRewritePlanner.java} | 73 +++-- .../actions/RewriteDataFilesSparkAction.java | 26 +- ...RewritePositionDeleteFilesSparkAction.java | 34 +-- ...BinPackPositionDeletesRewriteExecutor.java | 7 +- .../spark/actions/SparkRewriteExecutor.java | 19 +- .../SparkSizeBasedDataRewriteExecutor.java | 20 +- 14 files changed, 455 insertions(+), 126 deletions(-) create mode 100644 core/src/main/java/org/apache/iceberg/actions/RewriteFilePlan.java create mode 100644 core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletePlan.java create 
mode 100644 core/src/test/java/org/apache/iceberg/actions/TestRewritePositionDeletesGroupPlanner.java rename core/src/test/java/org/apache/iceberg/actions/{TestSizeBasedRewriter.java => TestSizeBasedFileRewritePlanner.java} (62%) diff --git a/core/src/main/java/org/apache/iceberg/actions/FileRewriteExecutor.java b/core/src/main/java/org/apache/iceberg/actions/FileRewriteExecutor.java index c024cb42877a..bbe84915fa63 100644 --- a/core/src/main/java/org/apache/iceberg/actions/FileRewriteExecutor.java +++ b/core/src/main/java/org/apache/iceberg/actions/FileRewriteExecutor.java @@ -30,12 +30,14 @@ * @param the Java type of the tasks to read content files * @param the Java type of the content files * @param the Java type of the planned groups + * @param
<P>
the Java type of the plan to execute */ public interface FileRewriteExecutor< I, T extends ContentScanTask, F extends ContentFile, - G extends FileRewriteGroup> { + G extends FileRewriteGroup, + P extends FileRewritePlan> { /** Returns a description for this rewriter. */ default String description() { @@ -60,7 +62,7 @@ default String description() { * * @param plan containing the configuration data */ - void initPlan(FileRewritePlan plan); + void initPlan(P plan); /** * Rewrite a group of files represented by the given list of scan tasks. diff --git a/core/src/main/java/org/apache/iceberg/actions/FileRewritePlan.java b/core/src/main/java/org/apache/iceberg/actions/FileRewritePlan.java index ad6349de2f80..2cab4d7d12b4 100644 --- a/core/src/main/java/org/apache/iceberg/actions/FileRewritePlan.java +++ b/core/src/main/java/org/apache/iceberg/actions/FileRewritePlan.java @@ -35,7 +35,7 @@ * @param the Java type of the content files * @param the Java type of the planned groups */ -public class FileRewritePlan< +public abstract class FileRewritePlan< I, T extends ContentScanTask, F extends ContentFile, @@ -44,19 +44,16 @@ public class FileRewritePlan< private final int totalGroupCount; private final Map groupsInPartition; private final long writeMaxFileSize; - private final int outputSpecId; protected FileRewritePlan( Stream groups, int totalGroupCount, Map groupsInPartition, - long writeMaxFileSize, - int outputSpecId) { + long writeMaxFileSize) { this.groups = groups; this.totalGroupCount = totalGroupCount; this.groupsInPartition = groupsInPartition; this.writeMaxFileSize = writeMaxFileSize; - this.outputSpecId = outputSpecId; } /** The stream of the generated {@link RewriteFileGroup}s. */ @@ -78,9 +75,4 @@ public int totalGroupCount() { public long writeMaxFileSize() { return writeMaxFileSize; } - - /** Partition specification id for the target files */ - public int outputSpecId() { - return outputSpecId; - } } diff --git a/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java b/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java index 0fd786f99a99..ba499f392411 100644 --- a/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java +++ b/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java @@ -36,7 +36,6 @@ import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.io.CloseableIterable; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; @@ -143,7 +142,7 @@ protected long defaultTargetFileSize() { * @return the generated plan which could be executed during the compaction */ @Override - public FileRewritePlan plan() { + public RewriteFilePlan plan() { StructLikeMap>> plan = planFileGroups(); RewriteExecutionContext ctx = new RewriteExecutionContext(); Stream groups = @@ -168,12 +167,11 @@ public FileRewritePlan .sorted(FileRewriteGroup.taskComparator(rewriteJobOrder)); Map groupsInPartition = plan.transformValues(List::size); int totalGroupCount = groupsInPartition.values().stream().reduce(Integer::sum).orElse(0); - return new FileRewritePlan<>( + return new RewriteFilePlan( groups, totalGroupCount, groupsInPartition, writeMaxFileSize(), outputSpecId()); } - @VisibleForTesting - 
CloseableIterable tasks() { + private CloseableIterable tasks() { TableScan scan = table().newScan().filter(filter).caseSensitive(caseSensitive).ignoreResiduals(); diff --git a/core/src/main/java/org/apache/iceberg/actions/RewriteFilePlan.java b/core/src/main/java/org/apache/iceberg/actions/RewriteFilePlan.java new file mode 100644 index 000000000000..b6d9a35ebd9f --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/actions/RewriteFilePlan.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.actions; + +import java.util.Map; +import java.util.stream.Stream; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.StructLike; + +/** Result of the data file rewrite planning. */ +public class RewriteFilePlan + extends FileRewritePlan< + RewriteDataFiles.FileGroupInfo, FileScanTask, DataFile, RewriteFileGroup> { + private final int outputSpecId; + + public RewriteFilePlan( + Stream groups, + int totalGroupCount, + Map groupsInPartition, + long writeMaxFileSize, + int outputSpecId) { + super(groups, totalGroupCount, groupsInPartition, writeMaxFileSize); + this.outputSpecId = outputSpecId; + } + + /** Partition specification id for the target files */ + public int outputSpecId() { + return outputSpecId; + } +} diff --git a/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletePlan.java b/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletePlan.java new file mode 100644 index 000000000000..15ee241ad99e --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletePlan.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.actions; + +import java.util.Map; +import java.util.stream.Stream; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.PositionDeletesScanTask; +import org.apache.iceberg.StructLike; + +/** Result of the positional delete file rewrite planning. 
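+ * <p>A consumption sketch (editorial; {@code planner} is assumed to be an initialized
+ * {@link RewritePositionDeletesGroupPlanner} and the per-group handling is elided):
+ *
+ * <pre>{@code
+ * RewritePositionDeletePlan plan = planner.plan();
+ * long maxBytes = plan.writeMaxFileSize();
+ * plan.groups().forEach(group -> { ... });
+ * }</pre>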
*/ +public class RewritePositionDeletePlan + extends FileRewritePlan< + RewritePositionDeleteFiles.FileGroupInfo, + PositionDeletesScanTask, + DeleteFile, + RewritePositionDeletesGroup> { + public RewritePositionDeletePlan( + Stream groups, + int totalGroupCount, + Map groupsInPartition, + long writeMaxFileSize) { + super(groups, totalGroupCount, groupsInPartition, writeMaxFileSize); + } +} diff --git a/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletesGroupPlanner.java b/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletesGroupPlanner.java index 74109df05a1f..b1017ffcad14 100644 --- a/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletesGroupPlanner.java +++ b/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletesGroupPlanner.java @@ -36,6 +36,7 @@ import org.apache.iceberg.TableProperties; import org.apache.iceberg.actions.RewritePositionDeleteFiles.FileGroupInfo; import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; @@ -63,6 +64,10 @@ public class RewritePositionDeletesGroupPlanner private final boolean caseSensitive; private RewriteJobOrder rewriteJobOrder; + public RewritePositionDeletesGroupPlanner(Table table) { + this(table, Expressions.alwaysTrue(), false); + } + public RewritePositionDeletesGroupPlanner(Table table, Expression filter, boolean caseSensitive) { super(table); this.caseSensitive = caseSensitive; @@ -94,9 +99,7 @@ public void init(Map options) { * @return the generated plan which could be executed during the compaction */ @Override - public FileRewritePlan< - FileGroupInfo, PositionDeletesScanTask, DeleteFile, RewritePositionDeletesGroup> - plan() { + public RewritePositionDeletePlan plan() { StructLikeMap>> plan = planFileGroups(); RewriteExecutionContext ctx = new RewriteExecutionContext(); Stream groups = @@ -121,8 +124,8 @@ public void init(Map options) { .sorted(FileRewriteGroup.taskComparator(rewriteJobOrder)); Map groupsInPartition = plan.transformValues(List::size); int totalGroupCount = groupsInPartition.values().stream().reduce(Integer::sum).orElse(0); - return new FileRewritePlan<>( - groups, totalGroupCount, groupsInPartition, writeMaxFileSize(), outputSpecId()); + return new RewritePositionDeletePlan( + groups, totalGroupCount, groupsInPartition, writeMaxFileSize()); } private StructLikeMap>> planFileGroups() { diff --git a/core/src/test/java/org/apache/iceberg/actions/TestRewriteFileGroupPlanner.java b/core/src/test/java/org/apache/iceberg/actions/TestRewriteFileGroupPlanner.java index 746395d57bca..e10019a9f547 100644 --- a/core/src/test/java/org/apache/iceberg/actions/TestRewriteFileGroupPlanner.java +++ b/core/src/test/java/org/apache/iceberg/actions/TestRewriteFileGroupPlanner.java @@ -30,12 +30,10 @@ import java.util.stream.Collectors; import org.apache.iceberg.DataFile; import org.apache.iceberg.DataFiles; -import org.apache.iceberg.FileScanTask; import org.apache.iceberg.RewriteJobOrder; import org.apache.iceberg.StructLike; import org.apache.iceberg.TestBase; import org.apache.iceberg.TestTables; -import org.apache.iceberg.actions.RewriteDataFiles.FileGroupInfo; import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import 
org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; @@ -93,8 +91,7 @@ void testJobOrder(RewriteJobOrder order) { planner.init( ImmutableMap.of( RewriteFileGroupPlanner.REWRITE_ALL, "true", REWRITE_JOB_ORDER, order.name())); - FileRewritePlan result = - planner.plan(); + RewriteFilePlan result = planner.plan(); List groups = result.groups().collect(Collectors.toList()); assertThat(groups.stream().map(group -> group.info().partition()).collect(Collectors.toList())) .isEqualTo(EXPECTED.get(order)); @@ -120,8 +117,7 @@ void testUnpartitionedTable() { "1", RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, "30")); - FileRewritePlan result = - planner.plan(); + RewriteFilePlan result = planner.plan(); assertThat(result.totalGroupCount()).isEqualTo(1); assertThat(result.groups().iterator().next().numInputFiles()).isEqualTo(2); } @@ -136,8 +132,7 @@ void testMaxGroupSize() { "true", RewriteFileGroupPlanner.MAX_FILE_GROUP_SIZE_BYTES, "10")); - FileRewritePlan result = - planner.plan(); + RewriteFilePlan result = planner.plan(); assertThat(result.totalGroupCount()).isEqualTo(6); assertThat(result.groupsInPartition(FILE_1.partition())).isEqualTo(3); assertThat(result.groupsInPartition(FILE_4.partition())).isEqualTo(2); @@ -150,8 +145,7 @@ void testEmptyTable() { planner.init(REWRITE_ALL); - FileRewritePlan result = - planner.plan(); + RewriteFilePlan result = planner.plan(); assertThat(table.currentSnapshot()).as("Table must be empty").isNull(); assertThat(result.totalGroupCount()).isZero(); @@ -167,8 +161,7 @@ void testFilter() { Expressions.equal(Expressions.bucket("data", 16), 0), Expressions.equal(Expressions.bucket("data", 16), 2))); planner.init(REWRITE_ALL); - FileRewritePlan plan = - planner.plan(); + RewriteFilePlan plan = planner.plan(); List groups = plan.groups().collect(Collectors.toList()); assertThat(plan.totalGroupCount()).isEqualTo(2); @@ -188,8 +181,7 @@ void testWriteMaxFileSize() { "true", RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, String.valueOf(targetFileSize))); - FileRewritePlan plan = - planner.plan(); + RewriteFilePlan plan = planner.plan(); assertThat(plan.writeMaxFileSize()) .isGreaterThan(targetFileSize) .isLessThan((long) (targetFileSize * MAX_FILE_SIZE_DEFAULT_RATIO)); @@ -215,13 +207,12 @@ void testOutputSpec(boolean specific) { planner.init(options); - FileRewritePlan plan = - planner.plan(); + RewriteFilePlan plan = planner.plan(); assertThat(plan.outputSpecId()).isEqualTo(specific ? oldSpecId : newSpecId); } @Test - public void testInvalidOption() { + void testInvalidOption() { addFiles(); assertThatThrownBy( diff --git a/core/src/test/java/org/apache/iceberg/actions/TestRewritePositionDeletesGroupPlanner.java b/core/src/test/java/org/apache/iceberg/actions/TestRewritePositionDeletesGroupPlanner.java new file mode 100644 index 000000000000..2c4520d96d99 --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/actions/TestRewritePositionDeletesGroupPlanner.java @@ -0,0 +1,250 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.actions; + +import static org.apache.iceberg.actions.RewritePositionDeleteFiles.REWRITE_JOB_ORDER; +import static org.apache.iceberg.actions.RewritePositionDeletesGroupPlanner.MAX_FILE_SIZE_DEFAULT_RATIO; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.File; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.stream.Collectors; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DataFiles; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.FileMetadata; +import org.apache.iceberg.PartitionData; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.RewriteJobOrder; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.TestBase; +import org.apache.iceberg.TestTables; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +class TestRewritePositionDeletesGroupPlanner { + private static final Map REWRITE_ALL = + ImmutableMap.of(RewritePositionDeletesGroupPlanner.REWRITE_ALL, "true"); + + private static final DataFile FILE_1 = newDataFile("data_bucket=0"); + private static final DataFile FILE_2 = newDataFile("data_bucket=1"); + private static final DataFile FILE_3 = newDataFile("data_bucket=2"); + private static final Map> EXPECTED = + ImmutableMap.of( + RewriteJobOrder.FILES_DESC, + ImmutableList.of(FILE_1.partition(), FILE_2.partition(), FILE_3.partition()), + RewriteJobOrder.FILES_ASC, + ImmutableList.of(FILE_3.partition(), FILE_2.partition(), FILE_1.partition()), + RewriteJobOrder.BYTES_DESC, + ImmutableList.of(FILE_3.partition(), FILE_1.partition(), FILE_2.partition()), + RewriteJobOrder.BYTES_ASC, + ImmutableList.of(FILE_2.partition(), FILE_1.partition(), FILE_3.partition())); + + @TempDir private File tableDir = null; + private TestTables.TestTable table = null; + + @BeforeEach + public void setupTable() throws Exception { + this.table = TestTables.create(tableDir, "test", TestBase.SCHEMA, TestBase.SPEC, 2); + } + + @AfterEach + public void cleanupTables() { + TestTables.clearTables(); + } + + @ParameterizedTest + @EnumSource( + value = RewriteJobOrder.class, + names = {"FILES_DESC", "FILES_ASC", "BYTES_DESC", "BYTES_ASC"}) + void testJobOrder(RewriteJobOrder order) { + addFiles(); + RewritePositionDeletesGroupPlanner planner = new RewritePositionDeletesGroupPlanner(table); + planner.init( + ImmutableMap.of( + RewriteFileGroupPlanner.REWRITE_ALL, "true", REWRITE_JOB_ORDER, order.name())); + RewritePositionDeletePlan result = planner.plan(); + List groups = result.groups().collect(Collectors.toList()); + assertThat( + groups.stream() + .map( + 
group -> + new PartitionData(TestBase.SPEC.partitionType()) + .copyFor(group.info().partition())) + .collect(Collectors.toList())) + .isEqualTo(EXPECTED.get(order)); + assertThat(result.totalGroupCount()).isEqualTo(3); + EXPECTED.get(order).forEach(s -> assertThat(result.groupsInPartition(s)).isEqualTo(1)); + } + + @Test + void testUnpartitionedTable() { + table.updateSpec().removeField("data_bucket").commit(); + table.refresh(); + + table + .newRowDelta() + .addRows(newDataFile("")) + .addDeletes(newDeleteFile(10)) + .addDeletes(newDeleteFile(20)) + .addDeletes(newDeleteFile(30)) + .commit(); + + RewritePositionDeletesGroupPlanner planner = new RewritePositionDeletesGroupPlanner(table); + planner.init( + ImmutableMap.of( + RewriteFileGroupPlanner.MIN_INPUT_FILES, + "1", + RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, + "30")); + RewritePositionDeletePlan result = planner.plan(); + assertThat(result.totalGroupCount()).isEqualTo(1); + assertThat(result.groups().iterator().next().numInputFiles()).isEqualTo(2); + } + + @Test + void testMaxGroupSize() { + addFiles(); + RewritePositionDeletesGroupPlanner planner = new RewritePositionDeletesGroupPlanner(table); + planner.init( + ImmutableMap.of( + RewritePositionDeletesGroupPlanner.REWRITE_ALL, + "true", + RewritePositionDeletesGroupPlanner.MAX_FILE_GROUP_SIZE_BYTES, + "10")); + RewritePositionDeletePlan result = planner.plan(); + assertThat(result.totalGroupCount()).isEqualTo(6); + assertThat(result.groupsInPartition(FILE_1.partition())).isEqualTo(3); + assertThat(result.groupsInPartition(FILE_2.partition())).isEqualTo(2); + assertThat(result.groupsInPartition(FILE_3.partition())).isEqualTo(1); + } + + @Test + void testEmptyTable() { + RewritePositionDeletesGroupPlanner planner = new RewritePositionDeletesGroupPlanner(table); + + planner.init(REWRITE_ALL); + + RewritePositionDeletePlan result = planner.plan(); + + assertThat(table.currentSnapshot()).as("Table must be empty").isNull(); + assertThat(result.totalGroupCount()).isZero(); + } + + @Test + void testFilter() { + addFiles(); + RewritePositionDeletesGroupPlanner planner = + new RewritePositionDeletesGroupPlanner( + table, + Expressions.or( + Expressions.equal(Expressions.bucket("data", 16), 0), + Expressions.equal(Expressions.bucket("data", 16), 2)), + false); + planner.init(REWRITE_ALL); + RewritePositionDeletePlan plan = planner.plan(); + List groups = plan.groups().collect(Collectors.toList()); + + assertThat(plan.totalGroupCount()).isEqualTo(2); + assertThat(groups).hasSize(2); + assertThat(groups.stream().mapToLong(FileRewriteGroup::numInputFiles).sum()).isEqualTo(4); + } + + @Test + void testWriteMaxFileSize() { + int targetFileSize = 10; + addFiles(); + + RewritePositionDeletesGroupPlanner planner = new RewritePositionDeletesGroupPlanner(table); + planner.init( + ImmutableMap.of( + RewritePositionDeletesGroupPlanner.REWRITE_ALL, + "true", + RewritePositionDeletesGroupPlanner.TARGET_FILE_SIZE_BYTES, + String.valueOf(targetFileSize))); + RewritePositionDeletePlan plan = planner.plan(); + assertThat(plan.writeMaxFileSize()) + .isGreaterThan(targetFileSize) + .isLessThan((long) (targetFileSize * MAX_FILE_SIZE_DEFAULT_RATIO)); + } + + @Test + void testInvalidOption() { + addFiles(); + + assertThatThrownBy( + () -> { + RewritePositionDeletesGroupPlanner planner = + new RewritePositionDeletesGroupPlanner(table); + + planner.init(ImmutableMap.of(RewritePositionDeleteFiles.REWRITE_JOB_ORDER, "foo")); + }) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid rewrite job 
order name: foo"); + } + + private void addFiles() { + table + .newRowDelta() + .addRows(FILE_1) + .addDeletes(newDeleteFile(FILE_1.partition(), 10)) + .addDeletes(newDeleteFile(FILE_1.partition(), 10)) + .addDeletes(newDeleteFile(FILE_1.partition(), 10)) + .addRows(FILE_2) + .addDeletes(newDeleteFile(FILE_2.partition(), 11)) + .addDeletes(newDeleteFile(FILE_2.partition(), 11)) + .addRows(FILE_3) + .addDeletes(newDeleteFile(FILE_3.partition(), 50)) + .commit(); + } + + private static DataFile newDataFile(String partitionPath) { + return DataFiles.builder(TestBase.SPEC) + .withPath("/path/to/data-" + UUID.randomUUID() + ".parquet") + .withFileSizeInBytes(10) + .withPartitionPath(partitionPath) + .withRecordCount(1) + .build(); + } + + private static DeleteFile newDeleteFile(long fileSize) { + return newDeleteFile( + new PartitionData(PartitionSpec.unpartitioned().partitionType()), fileSize); + } + + private static DeleteFile newDeleteFile(StructLike partition, long fileSize) { + return FileMetadata.deleteFileBuilder(TestBase.SPEC) + .ofPositionDeletes() + .withPath("/path/to/delete-" + UUID.randomUUID() + ".parquet") + .withFileSizeInBytes(fileSize) + .withPartition(partition) + .withRecordCount(1) + .build(); + } +} diff --git a/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedRewriter.java b/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedFileRewritePlanner.java similarity index 62% rename from core/src/test/java/org/apache/iceberg/actions/TestSizeBasedRewriter.java rename to core/src/test/java/org/apache/iceberg/actions/TestSizeBasedFileRewritePlanner.java index 82286d250574..013be9cb94b8 100644 --- a/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedRewriter.java +++ b/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedFileRewritePlanner.java @@ -21,35 +21,40 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.Mockito.when; -import java.util.Arrays; +import java.io.File; import java.util.List; import java.util.Map; import org.apache.iceberg.DataFile; import org.apache.iceberg.FileScanTask; import org.apache.iceberg.MockFileScanTask; -import org.apache.iceberg.ParameterizedTestExtension; -import org.apache.iceberg.Parameters; import org.apache.iceberg.StructLike; import org.apache.iceberg.Table; import org.apache.iceberg.TestBase; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.TestTables; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.junit.jupiter.api.TestTemplate; -import org.junit.jupiter.api.extension.ExtendWith; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; import org.mockito.Mockito; -@ExtendWith(ParameterizedTestExtension.class) -class TestSizeBasedRewriter extends TestBase { +class TestSizeBasedFileRewritePlanner { + @TempDir private File tableDir = null; + private TestTables.TestTable table = null; - @Parameters(name = "formatVersion = {0}") - protected static List parameters() { - return Arrays.asList(1, 2, 3); + @BeforeEach + public void setupTable() throws Exception { + this.table = TestTables.create(tableDir, "test", TestBase.SCHEMA, TestBase.SPEC, 3); } - @TestTemplate + 
@AfterEach + public void cleanupTables() { + TestTables.clearTables(); + } + + @Test void testSplitSizeLowerBound() { FileScanTask task1 = new MockFileScanTask(mockDataFile()); FileScanTask task2 = new MockFileScanTask(mockDataFile()); @@ -57,7 +62,7 @@ void testSplitSizeLowerBound() { FileScanTask task4 = new MockFileScanTask(mockDataFile()); List tasks = ImmutableList.of(task1, task2, task3, task4); - RewriteFileGroupPlanner planner = new TestingPlanner(table, Expressions.alwaysTrue(), 1, tasks); + TestingPlanner planner = new TestingPlanner(table); long minFileSize = 256L * 1024 * 1024; long targetFileSize = 512L * 1024 * 1024; @@ -72,28 +77,42 @@ void testSplitSizeLowerBound() { // the total task size is 580 MB and the target file size is 512 MB // the remainder must be written into a separate file as it exceeds 10% + List> groups = Lists.newArrayList(planner.planFileGroups(tasks).iterator()); - RewriteFileGroup group = planner.plan().groups().iterator().next(); - - assertThat(group.expectedOutputFiles()).isEqualTo(2); + assertThat(groups).hasSize(1); + List group = groups.get(0); // the split size must be >= targetFileSize and < maxFileSize - long splitSize = group.sizeInBytes(); + long splitSize = group.stream().mapToLong(FileScanTask::sizeBytes).sum(); assertThat(splitSize).isGreaterThanOrEqualTo(targetFileSize).isLessThan(maxFileSize); } - private static class TestingPlanner extends RewriteFileGroupPlanner { - private final List tasks; + private static class TestingPlanner + extends SizeBasedFileRewritePlanner< + RewriteDataFiles.FileGroupInfo, FileScanTask, DataFile, RewriteFileGroup> { + protected TestingPlanner(Table table) { + super(table); + } - private TestingPlanner( - Table table, Expression filter, long snapshotId, List tasks) { - super(table, filter, snapshotId, false); - this.tasks = tasks; + @Override + protected long defaultTargetFileSize() { + return 0; + } + + @Override + protected Iterable filterFiles(Iterable tasks) { + return tasks; + } + + @Override + protected Iterable> filterFileGroups(List> groups) { + return groups; } @Override - CloseableIterable tasks() { - return CloseableIterable.withNoopClose(tasks); + public FileRewritePlan + plan() { + throw new UnsupportedOperationException("Not supported"); } } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java index 8cf189ee8b79..f28bcd90ea9c 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java @@ -34,13 +34,13 @@ import org.apache.iceberg.StructLike; import org.apache.iceberg.Table; import org.apache.iceberg.actions.FileRewriteExecutor; -import org.apache.iceberg.actions.FileRewritePlan; import org.apache.iceberg.actions.ImmutableRewriteDataFiles; import org.apache.iceberg.actions.ImmutableRewriteDataFiles.Result.Builder; import org.apache.iceberg.actions.RewriteDataFiles; import org.apache.iceberg.actions.RewriteDataFilesCommitManager; import org.apache.iceberg.actions.RewriteFileGroup; import org.apache.iceberg.actions.RewriteFileGroupPlanner; +import org.apache.iceberg.actions.RewriteFilePlan; import org.apache.iceberg.exceptions.CommitFailedException; import org.apache.iceberg.exceptions.ValidationException; import org.apache.iceberg.expressions.Expression; @@ -94,8 +94,9 @@ public class 
RewriteDataFilesSparkAction private boolean useStartingSequenceNumber; private boolean caseSensitive; private RewriteFileGroupPlanner planner = null; - private FileRewriteExecutor rewriter = - null; + private FileRewriteExecutor< + FileGroupInfo, FileScanTask, DataFile, RewriteFileGroup, RewriteFilePlan> + rewriter = null; RewriteDataFilesSparkAction(SparkSession spark, Table table) { super(spark.cloneSession()); @@ -158,7 +159,7 @@ public RewriteDataFiles.Result execute() { init(startingSnapshotId); - FileRewritePlan plan = plan(); + RewriteFilePlan plan = plan(); rewriter.initPlan(plan); if (plan.totalGroupCount() == 0) { @@ -182,7 +183,7 @@ public RewriteDataFiles.Result execute() { } @VisibleForTesting - FileRewritePlan plan() { + RewriteFilePlan plan() { return planner.plan(); } @@ -200,9 +201,7 @@ void init(long startingSnapshotId) { } @VisibleForTesting - RewriteFileGroup rewriteFiles( - FileRewritePlan plan, - RewriteFileGroup fileGroup) { + RewriteFileGroup rewriteFiles(RewriteFilePlan plan, RewriteFileGroup fileGroup) { String desc = jobDesc(fileGroup, plan); Set addedFiles = withJobGroupInfo( @@ -227,9 +226,7 @@ RewriteDataFilesCommitManager commitManager(long startingSnapshotId) { table, startingSnapshotId, useStartingSequenceNumber, commitSummary()); } - private Builder doExecute( - FileRewritePlan plan, - RewriteDataFilesCommitManager commitManager) { + private Builder doExecute(RewriteFilePlan plan, RewriteDataFilesCommitManager commitManager) { ExecutorService rewriteService = rewriteService(); ConcurrentLinkedQueue rewrittenGroups = Queues.newConcurrentLinkedQueue(); @@ -289,8 +286,7 @@ private Builder doExecute( } private Builder doExecuteWithPartialProgress( - FileRewritePlan plan, - RewriteDataFilesCommitManager commitManager) { + RewriteFilePlan plan, RewriteDataFilesCommitManager commitManager) { ExecutorService rewriteService = rewriteService(); // start commit service @@ -407,9 +403,7 @@ void validateAndInitOptions() { PARTIAL_PROGRESS_ENABLED); } - private String jobDesc( - RewriteFileGroup group, - FileRewritePlan plan) { + private String jobDesc(RewriteFileGroup group, RewriteFilePlan plan) { StructLike partition = group.info().partition(); if (partition.size() > 0) { return String.format( diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewritePositionDeleteFilesSparkAction.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewritePositionDeleteFilesSparkAction.java index e237f46a163f..4fc7934f783b 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewritePositionDeleteFilesSparkAction.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewritePositionDeleteFilesSparkAction.java @@ -27,12 +27,11 @@ import java.util.concurrent.ThreadPoolExecutor; import java.util.stream.Collectors; import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.PositionDeletesScanTask; import org.apache.iceberg.StructLike; import org.apache.iceberg.Table; -import org.apache.iceberg.actions.FileRewritePlan; import org.apache.iceberg.actions.ImmutableRewritePositionDeleteFiles; import org.apache.iceberg.actions.RewritePositionDeleteFiles; +import org.apache.iceberg.actions.RewritePositionDeletePlan; import org.apache.iceberg.actions.RewritePositionDeletesCommitManager; import org.apache.iceberg.actions.RewritePositionDeletesCommitManager.CommitService; import org.apache.iceberg.actions.RewritePositionDeletesGroup; @@ -41,7 +40,6 @@ import 
org.apache.iceberg.exceptions.ValidationException; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.expressions.Expressions; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; import org.apache.iceberg.relocated.com.google.common.collect.Queues; @@ -111,8 +109,7 @@ public RewritePositionDeleteFiles.Result execute() { validateAndInitOptions(); - FileRewritePlan - plan = plan(); + RewritePositionDeletePlan plan = planner.plan(); rewriter.initPlan(plan); if (plan.totalGroupCount() == 0) { @@ -127,17 +124,8 @@ public RewritePositionDeleteFiles.Result execute() { } } - @VisibleForTesting - FileRewritePlan - plan() { - return planner.plan(); - } - private RewritePositionDeletesGroup rewriteDeleteFiles( - FileRewritePlan< - FileGroupInfo, PositionDeletesScanTask, DeleteFile, RewritePositionDeletesGroup> - plan, - RewritePositionDeletesGroup fileGroup) { + RewritePositionDeletePlan plan, RewritePositionDeletesGroup fileGroup) { String desc = jobDesc(fileGroup, plan); Set addedFiles = withJobGroupInfo( @@ -163,10 +151,7 @@ private RewritePositionDeletesCommitManager commitManager() { } private Result doExecute( - FileRewritePlan< - FileGroupInfo, PositionDeletesScanTask, DeleteFile, RewritePositionDeletesGroup> - plan, - RewritePositionDeletesCommitManager commitManager) { + RewritePositionDeletePlan plan, RewritePositionDeletesCommitManager commitManager) { ExecutorService rewriteService = rewriteService(); ConcurrentLinkedQueue rewrittenGroups = @@ -231,10 +216,7 @@ private Result doExecute( } private Result doExecuteWithPartialProgress( - FileRewritePlan< - FileGroupInfo, PositionDeletesScanTask, DeleteFile, RewritePositionDeletesGroup> - plan, - RewritePositionDeletesCommitManager commitManager) { + RewritePositionDeletePlan plan, RewritePositionDeletesCommitManager commitManager) { ExecutorService rewriteService = rewriteService(); // start commit service @@ -319,11 +301,7 @@ private void validateAndInitOptions() { PARTIAL_PROGRESS_ENABLED); } - private String jobDesc( - RewritePositionDeletesGroup group, - FileRewritePlan< - FileGroupInfo, PositionDeletesScanTask, DeleteFile, RewritePositionDeletesGroup> - plan) { + private String jobDesc(RewritePositionDeletesGroup group, RewritePositionDeletePlan plan) { StructLike partition = group.info().partition(); if (partition.size() > 0) { return String.format( diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkBinPackPositionDeletesRewriteExecutor.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkBinPackPositionDeletesRewriteExecutor.java index fb8b73f17463..cfc939cf0347 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkBinPackPositionDeletesRewriteExecutor.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkBinPackPositionDeletesRewriteExecutor.java @@ -35,6 +35,7 @@ import org.apache.iceberg.StructLike; import org.apache.iceberg.Table; import org.apache.iceberg.actions.RewritePositionDeleteFiles.FileGroupInfo; +import org.apache.iceberg.actions.RewritePositionDeletePlan; import org.apache.iceberg.actions.RewritePositionDeletesGroup; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.spark.PositionDeletesRewriteCoordinator; @@ -54,7 +55,11 @@ class 
SparkBinPackPositionDeletesRewriteExecutor extends SparkRewriteExecutor< - FileGroupInfo, PositionDeletesScanTask, DeleteFile, RewritePositionDeletesGroup> { + FileGroupInfo, + PositionDeletesScanTask, + DeleteFile, + RewritePositionDeletesGroup, + RewritePositionDeletePlan> { private final SparkSession spark; private final SparkTableCache tableCache = SparkTableCache.get(); diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkRewriteExecutor.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkRewriteExecutor.java index f723be7d633d..99c2f88a9b0a 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkRewriteExecutor.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkRewriteExecutor.java @@ -22,7 +22,6 @@ import java.util.Set; import org.apache.iceberg.ContentFile; import org.apache.iceberg.ContentScanTask; -import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Table; import org.apache.iceberg.actions.FileRewriteExecutor; import org.apache.iceberg.actions.FileRewriteGroup; @@ -36,16 +35,17 @@ * @param the Java type of the tasks to read content files * @param the Java type of the content files * @param the Java type of the planned groups + * @param
<P>
the Java type of the plan to execute */ abstract class SparkRewriteExecutor< I, T extends ContentScanTask, F extends ContentFile, - G extends FileRewriteGroup> - implements FileRewriteExecutor { + G extends FileRewriteGroup, + P extends FileRewritePlan> + implements FileRewriteExecutor { private final Table table; private long writeMaxFileSize; - private int outputSpecId; SparkRewriteExecutor(Table table) { this.table = table; @@ -59,18 +59,9 @@ long writeMaxFileSize() { return writeMaxFileSize; } - int outputSpecId() { - return outputSpecId; - } - - PartitionSpec outputSpec() { - return table.specs().get(outputSpecId); - } - @Override - public void initPlan(FileRewritePlan plan) { + public void initPlan(P plan) { this.writeMaxFileSize = plan.writeMaxFileSize(); - this.outputSpecId = plan.outputSpecId(); } @Override diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSizeBasedDataRewriteExecutor.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSizeBasedDataRewriteExecutor.java index 068979d8e5db..9979beacd777 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSizeBasedDataRewriteExecutor.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSizeBasedDataRewriteExecutor.java @@ -23,21 +23,25 @@ import java.util.UUID; import org.apache.iceberg.DataFile; import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Table; import org.apache.iceberg.actions.RewriteDataFiles.FileGroupInfo; import org.apache.iceberg.actions.RewriteFileGroup; +import org.apache.iceberg.actions.RewriteFilePlan; import org.apache.iceberg.spark.FileRewriteCoordinator; import org.apache.iceberg.spark.ScanTaskSetManager; import org.apache.iceberg.spark.SparkTableCache; import org.apache.spark.sql.SparkSession; abstract class SparkSizeBasedDataRewriteExecutor - extends SparkRewriteExecutor { + extends SparkRewriteExecutor< + FileGroupInfo, FileScanTask, DataFile, RewriteFileGroup, RewriteFilePlan> { private final SparkSession spark; private final SparkTableCache tableCache = SparkTableCache.get(); private final ScanTaskSetManager taskSetManager = ScanTaskSetManager.get(); private final FileRewriteCoordinator coordinator = FileRewriteCoordinator.get(); + private int outputSpecId; SparkSizeBasedDataRewriteExecutor(SparkSession spark, Table table) { super(table); @@ -67,4 +71,18 @@ public Set rewrite(RewriteFileGroup group) { coordinator.clearRewrite(table(), groupId); } } + + @Override + public void initPlan(RewriteFilePlan plan) { + super.initPlan(plan); + this.outputSpecId = plan.outputSpecId(); + } + + int outputSpecId() { + return outputSpecId; + } + + PartitionSpec outputSpec() { + return table().specs().get(outputSpecId); + } } From 8e23a3601fe64bbf09495ba581a9b2d49f3dbcb6 Mon Sep 17 00:00:00 2001 From: Peter Vary Date: Tue, 3 Dec 2024 12:39:17 +0100 Subject: [PATCH 10/11] Some more test refactor and moved COMPRESSION_FACTOR to SparkShufflingDataRewritePlanner --- .../iceberg/actions/FileRewriteExecutor.java | 2 +- .../iceberg/actions/FileRewriteGroup.java | 10 +- .../iceberg/actions/FileRewritePlan.java | 2 +- .../actions/RewriteFileGroupPlanner.java | 17 +- .../RewritePositionDeletesGroupPlanner.java | 63 +++-- .../actions/SizeBasedFileRewritePlanner.java | 18 +- .../actions/SizeBasedFileRewriter.java | 2 +- .../actions/TestRewriteFileGroupPlanner.java | 163 +++++++++-- ...estRewritePositionDeletesGroupPlanner.java | 30 +- 
.../TestSizeBasedFileRewritePlanner.java | 62 +++- .../actions/RewriteDataFilesSparkAction.java | 10 +- .../SparkShufflingDataRewriteExecutor.java | 24 -- .../SparkShufflingDataRewritePlanner.java | 85 ++++++ .../SparkSizeBasedDataRewriteExecutor.java | 16 +- .../actions/TestRewriteDataFilesAction.java | 11 +- .../actions/TestSparkFileRewriteExecutor.java | 264 +----------------- .../actions/TestSparkFileRewritePlanner.java | 104 +++++++ 17 files changed, 513 insertions(+), 370 deletions(-) create mode 100644 spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingDataRewritePlanner.java create mode 100644 spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewritePlanner.java diff --git a/core/src/main/java/org/apache/iceberg/actions/FileRewriteExecutor.java b/core/src/main/java/org/apache/iceberg/actions/FileRewriteExecutor.java index bbe84915fa63..bc4102a25de4 100644 --- a/core/src/main/java/org/apache/iceberg/actions/FileRewriteExecutor.java +++ b/core/src/main/java/org/apache/iceberg/actions/FileRewriteExecutor.java @@ -69,7 +69,7 @@ default String description() { * *
<p>
The implementation is supposed to be engine-specific (e.g. Spark, Flink, Trino). * - * @param group a group of scan tasks for files to be rewritten together + * @param group of scan tasks for files to be rewritten together * @return a set of newly written files */ Set rewrite(G group); diff --git a/core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java b/core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java index c48a6d6f4a2c..08f6e050b163 100644 --- a/core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java +++ b/core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java @@ -37,38 +37,44 @@ public abstract class FileRewriteGroup, F extend private final long splitSize; private final int expectedOutputFiles; - protected FileRewriteGroup( - I info, List fileScanTasks, long splitSize, int expectedOutputFiles) { + FileRewriteGroup(I info, List fileScanTasks, long splitSize, int expectedOutputFiles) { this.info = info; this.fileScanTasks = fileScanTasks; this.splitSize = splitSize; this.expectedOutputFiles = expectedOutputFiles; } + /** Identifiers and partition information about the group. */ public I info() { return info; } + /** Input of the group. {@link ContentScanTask}s to read. */ public List fileScans() { return fileScanTasks; } + /** Expected split size for the output files. */ public long splitSize() { return splitSize; } + /** Expected number of the output files. */ public int expectedOutputFiles() { return expectedOutputFiles; } + /** Accumulated size for the input files. */ public long sizeInBytes() { return fileScanTasks.stream().mapToLong(T::length).sum(); } + /** Number of the input files. */ public int numInputFiles() { return fileScanTasks.size(); } + /** Comparator to order the FileRewriteGroups based on a provided {@link RewriteJobOrder}. */ public static , F extends ContentFile> Comparator> taskComparator(RewriteJobOrder rewriteJobOrder) { switch (rewriteJobOrder) { diff --git a/core/src/main/java/org/apache/iceberg/actions/FileRewritePlan.java b/core/src/main/java/org/apache/iceberg/actions/FileRewritePlan.java index 2cab4d7d12b4..dc4cc9a6d57a 100644 --- a/core/src/main/java/org/apache/iceberg/actions/FileRewritePlan.java +++ b/core/src/main/java/org/apache/iceberg/actions/FileRewritePlan.java @@ -56,7 +56,7 @@ protected FileRewritePlan( this.writeMaxFileSize = writeMaxFileSize; } - /** The stream of the generated {@link RewriteFileGroup}s. */ + /** The stream of the generated {@link FileRewriteGroup}s. */ public Stream groups() { return groups; } diff --git a/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java b/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java index ba499f392411..2ba632fcf061 100644 --- a/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java +++ b/core/src/main/java/org/apache/iceberg/actions/RewriteFileGroupPlanner.java @@ -50,7 +50,8 @@ /** * Groups specified files in the {@link Table} by {@link RewriteFileGroup}s. These will be grouped - * by partitions. + * by partitions. Extends {@link SizeBasedFileRewritePlanner} with delete file threshold and job + * {@link RewriteDataFiles#REWRITE_JOB_ORDER} handling. */ public class RewriteFileGroupPlanner extends SizeBasedFileRewritePlanner { @@ -89,6 +90,15 @@ public RewriteFileGroupPlanner(Table table, Expression filter) { false); } + /** + * Creates the planner for the given table. 
+ * + * @param table to plan for + * @param filter used to remove files from the plan + * @param snapshotId used as a basis for planning - should be used as starting snapshot id at + * commit time when replacing the files + * @param caseSensitive property used for scanning + */ public RewriteFileGroupPlanner( Table table, Expression filter, Long snapshotId, boolean caseSensitive) { super(table); @@ -136,11 +146,6 @@ protected long defaultTargetFileSize() { TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT); } - /** - * Generates the plan for the current table. - * - * @return the generated plan which could be executed during the compaction - */ @Override public RewriteFilePlan plan() { StructLikeMap>> plan = planFileGroups(); diff --git a/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletesGroupPlanner.java b/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletesGroupPlanner.java index b1017ffcad14..14bd3cabefcc 100644 --- a/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletesGroupPlanner.java +++ b/core/src/main/java/org/apache/iceberg/actions/RewritePositionDeletesGroupPlanner.java @@ -52,7 +52,8 @@ /** * Groups specified files in the {@link Table} by {@link RewriteFileGroup}s. These will be grouped - * by partitions. + * by partitions. Extends the {@link SizeBasedFileRewritePlanner} with {@link + * RewritePositionDeleteFiles#REWRITE_JOB_ORDER} handling. */ public class RewritePositionDeletesGroupPlanner extends SizeBasedFileRewritePlanner< @@ -68,6 +69,13 @@ public RewritePositionDeletesGroupPlanner(Table table) { this(table, Expressions.alwaysTrue(), false); } + /** + * Creates the planner for the given table. + * + * @param table to plan for + * @param filter used to remove files from the plan + * @param caseSensitive property used for scanning + */ public RewritePositionDeletesGroupPlanner(Table table, Expression filter, boolean caseSensitive) { super(table); this.caseSensitive = caseSensitive; @@ -78,7 +86,7 @@ public RewritePositionDeletesGroupPlanner(Table table, Expression filter, boolea public Set validOptions() { return ImmutableSet.builder() .addAll(super.validOptions()) - .add(RewriteDataFiles.REWRITE_JOB_ORDER) + .add(RewritePositionDeleteFiles.REWRITE_JOB_ORDER) .build(); } @@ -93,11 +101,6 @@ public void init(Map options) { RewritePositionDeleteFiles.REWRITE_JOB_ORDER_DEFAULT)); } - /** - * Generates the plan for the current table. 
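As a usage sketch for the refactored planner: only the constructor, options, and accessors come from this patch; the startingSnapshotId variable and the "bytes-desc" job-order value are illustrative assumptions.

// Plan rewrite groups for a table, largest groups first (illustrative values).
RewriteFileGroupPlanner planner =
    new RewriteFileGroupPlanner(table, Expressions.alwaysTrue(), startingSnapshotId, true);
planner.init(
    ImmutableMap.of(
        SizeBasedFileRewritePlanner.TARGET_FILE_SIZE_BYTES, String.valueOf(512L * 1024 * 1024),
        RewriteDataFiles.REWRITE_JOB_ORDER, "bytes-desc"));
RewriteFilePlan plan = planner.plan();
plan.groups()
    .forEach(
        group ->
            System.out.printf(
                "group %s: %d input files, %d expected output files%n",
                group.info(), group.numInputFiles(), group.expectedOutputFiles()));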
- * - * @return the generated plan which could be executed during the compaction - */ @Override public RewritePositionDeletePlan plan() { StructLikeMap>> plan = planFileGroups(); @@ -128,6 +131,25 @@ public RewritePositionDeletePlan plan() { groups, totalGroupCount, groupsInPartition, writeMaxFileSize()); } + @Override + protected Iterable filterFiles(Iterable tasks) { + return Iterables.filter(tasks, this::wronglySized); + } + + @Override + protected Iterable> filterFileGroups( + List> groups) { + return Iterables.filter(groups, this::shouldRewrite); + } + + @Override + protected long defaultTargetFileSize() { + return PropertyUtil.propertyAsLong( + table().properties(), + TableProperties.DELETE_TARGET_FILE_SIZE_BYTES, + TableProperties.DELETE_TARGET_FILE_SIZE_BYTES_DEFAULT); + } + private StructLikeMap>> planFileGroups() { Table deletesTable = MetadataTableUtils.createMetadataTableInstance(table(), MetadataTableType.POSITION_DELETES); @@ -148,29 +170,6 @@ private StructLikeMap>> planFileGroups() { } } - @Override - protected Iterable filterFiles(Iterable tasks) { - return Iterables.filter(tasks, this::wronglySized); - } - - @Override - protected Iterable> filterFileGroups( - List> groups) { - return Iterables.filter(groups, this::shouldRewrite); - } - - private boolean shouldRewrite(List group) { - return enoughInputFiles(group) || enoughContent(group) || tooMuchContent(group); - } - - @Override - protected long defaultTargetFileSize() { - return PropertyUtil.propertyAsLong( - table().properties(), - TableProperties.DELETE_TARGET_FILE_SIZE_BYTES, - TableProperties.DELETE_TARGET_FILE_SIZE_BYTES_DEFAULT); - } - private CloseableIterable planFiles(Table deletesTable) { PositionDeletesTable.PositionDeletesBatchScan scan = (PositionDeletesTable.PositionDeletesBatchScan) deletesTable.newBatchScan(); @@ -214,6 +213,10 @@ private RewritePositionDeletesGroup newRewriteGroup( info, Lists.newArrayList(tasks), splitSize, numOutputSize); } + private boolean shouldRewrite(List group) { + return enoughInputFiles(group) || enoughContent(group) || tooMuchContent(group); + } + private static class RewriteExecutionContext { private final Map partitionIndexMap; private final AtomicInteger groupIndex; diff --git a/core/src/main/java/org/apache/iceberg/actions/SizeBasedFileRewritePlanner.java b/core/src/main/java/org/apache/iceberg/actions/SizeBasedFileRewritePlanner.java index f743c689da35..edaec5af0f27 100644 --- a/core/src/main/java/org/apache/iceberg/actions/SizeBasedFileRewritePlanner.java +++ b/core/src/main/java/org/apache/iceberg/actions/SizeBasedFileRewritePlanner.java @@ -114,23 +114,21 @@ public abstract class SizeBasedFileRewritePlanner< private int minInputFiles; private boolean rewriteAll; private long maxGroupSize; - private int outputSpecId; protected SizeBasedFileRewritePlanner(Table table) { this.table = table; } + /** Expected target file size before configuration. */ protected abstract long defaultTargetFileSize(); + /** Additional filter for tasks before grouping. */ protected abstract Iterable filterFiles(Iterable tasks); + /** Additional filter for groups. 
*/ protected abstract Iterable> filterFileGroups(List> groups); - protected Table table() { - return table; - } - @Override public Set validOptions() { return ImmutableSet.of( @@ -158,11 +156,15 @@ public void init(Map options) { } } + protected Table table() { + return table; + } + protected boolean wronglySized(T task) { return task.length() < minFileSize || task.length() > maxFileSize; } - public Iterable> planFileGroups(Iterable tasks) { + protected Iterable> planFileGroups(Iterable tasks) { Iterable filteredTasks = rewriteAll ? tasks : filterFiles(tasks); BinPacking.ListPacker packer = new BinPacking.ListPacker<>(maxGroupSize, 1, false); List> groups = packer.pack(filteredTasks, ContentScanTask::length); @@ -257,11 +259,11 @@ protected int numOutputFiles(long inputSize) { * * @return the target size plus one half of the distance between max and target */ - public long writeMaxFileSize() { + protected long writeMaxFileSize() { return (long) (targetFileSize + ((maxFileSize - targetFileSize) * 0.5)); } - public int outputSpecId() { + protected int outputSpecId() { return outputSpecId; } diff --git a/core/src/main/java/org/apache/iceberg/actions/SizeBasedFileRewriter.java b/core/src/main/java/org/apache/iceberg/actions/SizeBasedFileRewriter.java index 319e44c4a20c..00ef0b6694de 100644 --- a/core/src/main/java/org/apache/iceberg/actions/SizeBasedFileRewriter.java +++ b/core/src/main/java/org/apache/iceberg/actions/SizeBasedFileRewriter.java @@ -195,7 +195,7 @@ protected long inputSize(List group) { * of output files. The final split size is adjusted to be at least as big as the target file size * but less than the max write file size. */ - public long splitSize(long inputSize) { + protected long splitSize(long inputSize) { long estimatedSplitSize = (inputSize / numOutputFiles(inputSize)) + SPLIT_OVERHEAD; if (estimatedSplitSize < targetFileSize) { return targetFileSize; diff --git a/core/src/test/java/org/apache/iceberg/actions/TestRewriteFileGroupPlanner.java b/core/src/test/java/org/apache/iceberg/actions/TestRewriteFileGroupPlanner.java index e10019a9f547..48991cd601b7 100644 --- a/core/src/test/java/org/apache/iceberg/actions/TestRewriteFileGroupPlanner.java +++ b/core/src/test/java/org/apache/iceberg/actions/TestRewriteFileGroupPlanner.java @@ -30,6 +30,8 @@ import java.util.stream.Collectors; import org.apache.iceberg.DataFile; import org.apache.iceberg.DataFiles; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.MockFileScanTask; import org.apache.iceberg.RewriteJobOrder; import org.apache.iceberg.StructLike; import org.apache.iceberg.TestBase; @@ -37,6 +39,8 @@ import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; @@ -212,32 +216,155 @@ void testOutputSpec(boolean specific) { } @Test - void testInvalidOption() { - addFiles(); + void testValidOptions() { + RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(table); - assertThatThrownBy( - () -> { - RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(table); + assertThat(planner.validOptions()) + .as("Planner must report all supported options") + 
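To make the writeMaxFileSize() formula from the SizeBasedFileRewritePlanner hunk above concrete (illustrative sizes only):

// writeMaxFileSize() = target + (max - target) * 0.5
long targetFileSize = 512L * 1024 * 1024; // 512 MB target
long maxFileSize = 768L * 1024 * 1024;    // 768 MB max (hypothetical setting)
long writeMax = (long) (targetFileSize + ((maxFileSize - targetFileSize) * 0.5)); // 640 MB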
.isEqualTo( + ImmutableSet.of( + RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, + RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, + RewriteFileGroupPlanner.MAX_FILE_SIZE_BYTES, + RewriteFileGroupPlanner.MIN_INPUT_FILES, + RewriteFileGroupPlanner.REWRITE_ALL, + RewriteFileGroupPlanner.MAX_FILE_GROUP_SIZE_BYTES, + RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, + RewriteDataFiles.REWRITE_JOB_ORDER)); + } - planner.init(ImmutableMap.of(RewriteDataFiles.REWRITE_JOB_ORDER, "foo")); - }) + @Test + void testInvalidOption() { + RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(table); + + Map invalidRewriteJobOrderOptions = + ImmutableMap.of(RewriteDataFiles.REWRITE_JOB_ORDER, "foo"); + assertThatThrownBy(() -> planner.init(invalidRewriteJobOrderOptions)) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Invalid rewrite job order name: foo"); - assertThatThrownBy( - () -> { - RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(table); - - planner.init( - ImmutableMap.of( - RewriteFileGroupPlanner.REWRITE_ALL, - "true", - RewriteDataFiles.OUTPUT_SPEC_ID, - String.valueOf(1234))); - }) + Map invalidOutputSpecIdOptions = + ImmutableMap.of(RewriteDataFiles.OUTPUT_SPEC_ID, String.valueOf(1234)); + assertThatThrownBy(() -> planner.init(invalidOutputSpecIdOptions)) .isInstanceOf(IllegalArgumentException.class) .hasMessage( "Cannot use output spec id 1234 because the table does not contain a reference to this spec-id."); + + Map invalidDeleteFileThresholdOptions = + ImmutableMap.of(RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, "-1"); + assertThatThrownBy(() -> planner.init(invalidDeleteFileThresholdOptions)) + .hasMessageContaining("'delete-file-threshold' is set to -1 but must be >= 0"); + } + + @Test + void testBinPackDataSelectFiles() { + RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(table); + + checkDataFileSizeFiltering(planner); + checkDataFilesDeleteThreshold(planner); + checkDataFileGroupWithEnoughFiles(planner); + checkDataFileGroupWithEnoughData(planner); + checkDataFileGroupWithTooMuchData(planner); + } + + private void checkDataFileSizeFiltering(RewriteFileGroupPlanner planner) { + FileScanTask tooSmallTask = new MockFileScanTask(100L); + FileScanTask optimal = new MockFileScanTask(450); + FileScanTask tooBigTask = new MockFileScanTask(1000L); + List tasks = ImmutableList.of(tooSmallTask, optimal, tooBigTask); + + Map options = + ImmutableMap.of( + RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, "250", + RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, "500", + RewriteFileGroupPlanner.MAX_FILE_SIZE_BYTES, "750", + RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, String.valueOf(Integer.MAX_VALUE)); + planner.init(options); + + Iterable> groups = planner.planFileGroups(tasks); + assertThat(groups).as("Must have 1 group").hasSize(1); + List group = Iterables.getOnlyElement(groups); + assertThat(group).as("Must rewrite 2 files").hasSize(2); + } + + private void checkDataFilesDeleteThreshold(RewriteFileGroupPlanner planner) { + FileScanTask tooManyDeletesTask = MockFileScanTask.mockTaskWithDeletes(1000L, 3); + FileScanTask optimalTask = MockFileScanTask.mockTaskWithDeletes(1000L, 1); + List tasks = ImmutableList.of(tooManyDeletesTask, optimalTask); + + Map options = + ImmutableMap.of( + RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, "1", + RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, "2000", + RewriteFileGroupPlanner.MAX_FILE_SIZE_BYTES, "5000", + RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, "2"); + planner.init(options); + + Iterable> groups = 
planner.planFileGroups(tasks); + assertThat(groups).as("Must have 1 group").hasSize(1); + List group = Iterables.getOnlyElement(groups); + assertThat(group).as("Must rewrite 1 file").hasSize(1); + } + + private void checkDataFileGroupWithEnoughFiles(RewriteFileGroupPlanner planner) { + List tasks = + ImmutableList.of( + new MockFileScanTask(100L), + new MockFileScanTask(100L), + new MockFileScanTask(100L), + new MockFileScanTask(100L)); + + Map options = + ImmutableMap.of( + RewriteFileGroupPlanner.MIN_INPUT_FILES, "3", + RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, "150", + RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, "1000", + RewriteFileGroupPlanner.MAX_FILE_SIZE_BYTES, "5000", + RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, String.valueOf(Integer.MAX_VALUE)); + planner.init(options); + + Iterable> groups = planner.planFileGroups(tasks); + assertThat(groups).as("Must have 1 group").hasSize(1); + List group = Iterables.getOnlyElement(groups); + assertThat(group).as("Must rewrite 4 files").hasSize(4); + } + + private void checkDataFileGroupWithEnoughData(RewriteFileGroupPlanner planner) { + List tasks = + ImmutableList.of( + new MockFileScanTask(100L), new MockFileScanTask(100L), new MockFileScanTask(100L)); + + Map options = + ImmutableMap.of( + RewriteFileGroupPlanner.MIN_INPUT_FILES, "5", + RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, "200", + RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, "250", + RewriteFileGroupPlanner.MAX_FILE_SIZE_BYTES, "500", + RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, String.valueOf(Integer.MAX_VALUE)); + planner.init(options); + + Iterable> groups = planner.planFileGroups(tasks); + assertThat(groups).as("Must have 1 group").hasSize(1); + List group = Iterables.getOnlyElement(groups); + assertThat(group).as("Must rewrite 3 files").hasSize(3); + } + + private void checkDataFileGroupWithTooMuchData(RewriteFileGroupPlanner planner) { + List tasks = ImmutableList.of(new MockFileScanTask(2000L)); + + Map options = + ImmutableMap.of( + RewriteFileGroupPlanner.MIN_INPUT_FILES, "5", + RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, "200", + RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, "250", + RewriteFileGroupPlanner.MAX_FILE_SIZE_BYTES, "500", + RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, String.valueOf(Integer.MAX_VALUE)); + planner.init(options); + + Iterable> groups = planner.planFileGroups(tasks); + assertThat(groups).as("Must have 1 group").hasSize(1); + List group = Iterables.getOnlyElement(groups); + assertThat(group).as("Must rewrite big file").hasSize(1); } private void addFiles() { diff --git a/core/src/test/java/org/apache/iceberg/actions/TestRewritePositionDeletesGroupPlanner.java b/core/src/test/java/org/apache/iceberg/actions/TestRewritePositionDeletesGroupPlanner.java index 2c4520d96d99..07858706d9ee 100644 --- a/core/src/test/java/org/apache/iceberg/actions/TestRewritePositionDeletesGroupPlanner.java +++ b/core/src/test/java/org/apache/iceberg/actions/TestRewritePositionDeletesGroupPlanner.java @@ -41,6 +41,7 @@ import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -195,16 +196,29 @@ void testWriteMaxFileSize() { } @Test - void testInvalidOption() { - addFiles(); + void testValidOptions() { + 
RewritePositionDeletesGroupPlanner planner = new RewritePositionDeletesGroupPlanner(table); + + assertThat(planner.validOptions()) + .as("Planner must report all supported options") + .isEqualTo( + ImmutableSet.of( + RewritePositionDeletesGroupPlanner.TARGET_FILE_SIZE_BYTES, + RewritePositionDeletesGroupPlanner.MIN_FILE_SIZE_BYTES, + RewritePositionDeletesGroupPlanner.MAX_FILE_SIZE_BYTES, + RewritePositionDeletesGroupPlanner.MIN_INPUT_FILES, + RewritePositionDeletesGroupPlanner.REWRITE_ALL, + RewritePositionDeletesGroupPlanner.MAX_FILE_GROUP_SIZE_BYTES, + RewriteDataFiles.REWRITE_JOB_ORDER)); + } - assertThatThrownBy( - () -> { - RewritePositionDeletesGroupPlanner planner = - new RewritePositionDeletesGroupPlanner(table); + @Test + void testInvalidOption() { + RewritePositionDeletesGroupPlanner planner = new RewritePositionDeletesGroupPlanner(table); - planner.init(ImmutableMap.of(RewritePositionDeleteFiles.REWRITE_JOB_ORDER, "foo")); - }) + Map invalidRewriteJobOrderOptions = + ImmutableMap.of(RewritePositionDeleteFiles.REWRITE_JOB_ORDER, "foo"); + assertThatThrownBy(() -> planner.init(invalidRewriteJobOrderOptions)) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Invalid rewrite job order name: foo"); } diff --git a/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedFileRewritePlanner.java b/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedFileRewritePlanner.java index 013be9cb94b8..43ea307e11ff 100644 --- a/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedFileRewritePlanner.java +++ b/core/src/test/java/org/apache/iceberg/actions/TestSizeBasedFileRewritePlanner.java @@ -19,6 +19,7 @@ package org.apache.iceberg.actions; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.mockito.Mockito.when; import java.io.File; @@ -29,10 +30,12 @@ import org.apache.iceberg.MockFileScanTask; import org.apache.iceberg.StructLike; import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; import org.apache.iceberg.TestBase; import org.apache.iceberg.TestTables; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; @@ -87,6 +90,63 @@ void testSplitSizeLowerBound() { assertThat(splitSize).isGreaterThanOrEqualTo(targetFileSize).isLessThan(maxFileSize); } + @Test + void testValidOptions() { + TestingPlanner planner = new TestingPlanner(table); + + assertThat(planner.validOptions()) + .as("Planner must report all supported options") + .isEqualTo( + ImmutableSet.of( + RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, + RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, + RewriteFileGroupPlanner.MAX_FILE_SIZE_BYTES, + RewriteFileGroupPlanner.MIN_INPUT_FILES, + RewriteFileGroupPlanner.REWRITE_ALL, + RewriteFileGroupPlanner.MAX_FILE_GROUP_SIZE_BYTES)); + } + + @Test + void testInvalidOption() { + TestingPlanner planner = new TestingPlanner(table); + + Map invalidTargetSizeOptions = + ImmutableMap.of(SizeBasedFileRewritePlanner.TARGET_FILE_SIZE_BYTES, "0"); + assertThatThrownBy(() -> planner.init(invalidTargetSizeOptions)) + .hasMessageContaining("'target-file-size-bytes' is set to 0 but must be > 0"); + + Map invalidMinSizeOptions = + 
ImmutableMap.of(SizeBasedFileRewritePlanner.MIN_FILE_SIZE_BYTES, "-1"); + assertThatThrownBy(() -> planner.init(invalidMinSizeOptions)) + .hasMessageContaining("'min-file-size-bytes' is set to -1 but must be >= 0"); + + Map invalidTargetMinSizeOptions = + ImmutableMap.of( + SizeBasedFileRewritePlanner.TARGET_FILE_SIZE_BYTES, "3", + SizeBasedFileRewritePlanner.MIN_FILE_SIZE_BYTES, "5"); + assertThatThrownBy(() -> planner.init(invalidTargetMinSizeOptions)) + .hasMessageContaining("'target-file-size-bytes' (3) must be > 'min-file-size-bytes' (5)") + .hasMessageContaining("all new files will be smaller than the min threshold"); + + Map invalidTargetMaxSizeOptions = + ImmutableMap.of( + SizeBasedFileRewritePlanner.TARGET_FILE_SIZE_BYTES, "5", + SizeBasedFileRewritePlanner.MAX_FILE_SIZE_BYTES, "3"); + assertThatThrownBy(() -> planner.init(invalidTargetMaxSizeOptions)) + .hasMessageContaining("'target-file-size-bytes' (5) must be < 'max-file-size-bytes' (3)") + .hasMessageContaining("all new files will be larger than the max threshold"); + + Map invalidMinInputFilesOptions = + ImmutableMap.of(SizeBasedFileRewritePlanner.MIN_INPUT_FILES, "0"); + assertThatThrownBy(() -> planner.init(invalidMinInputFilesOptions)) + .hasMessageContaining("'min-input-files' is set to 0 but must be > 0"); + + Map invalidMaxFileGroupSizeOptions = + ImmutableMap.of(SizeBasedFileRewritePlanner.MAX_FILE_GROUP_SIZE_BYTES, "0"); + assertThatThrownBy(() -> planner.init(invalidMaxFileGroupSizeOptions)) + .hasMessageContaining("'max-file-group-size-bytes' is set to 0 but must be > 0"); + } + private static class TestingPlanner extends SizeBasedFileRewritePlanner< RewriteDataFiles.FileGroupInfo, FileScanTask, DataFile, RewriteFileGroup> { @@ -96,7 +156,7 @@ protected TestingPlanner(Table table) { @Override protected long defaultTargetFileSize() { - return 0; + return TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT; } @Override diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java index f28bcd90ea9c..29585a85ac59 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteDataFilesSparkAction.java @@ -97,6 +97,7 @@ public class RewriteDataFilesSparkAction private FileRewriteExecutor< FileGroupInfo, FileScanTask, DataFile, RewriteFileGroup, RewriteFilePlan> rewriter = null; + private boolean shufflingPlanner = false; RewriteDataFilesSparkAction(SparkSession spark, Table table) { super(spark.cloneSession()); @@ -116,6 +117,7 @@ public RewriteDataFilesSparkAction binPack() { Preconditions.checkArgument( rewriter == null, "Must use only one rewriter type (bin-pack, sort, zorder)"); this.rewriter = new SparkBinPackDataRewriteExecutor(spark(), table); + this.shufflingPlanner = false; return this; } @@ -124,6 +126,7 @@ public RewriteDataFilesSparkAction sort(SortOrder sortOrder) { Preconditions.checkArgument( rewriter == null, "Must use only one rewriter type (bin-pack, sort, zorder)"); this.rewriter = new SparkSortDataRewriteExecutor(spark(), table, sortOrder); + this.shufflingPlanner = true; return this; } @@ -132,6 +135,7 @@ public RewriteDataFilesSparkAction sort() { Preconditions.checkArgument( rewriter == null, "Must use only one rewriter type (bin-pack, sort, zorder)"); this.rewriter = new SparkSortDataRewriteExecutor(spark(), table); + 
this.shufflingPlanner = true; return this; } @@ -140,6 +144,7 @@ public RewriteDataFilesSparkAction zOrder(String... columnNames) { Preconditions.checkArgument( rewriter == null, "Must use only one rewriter type (bin-pack, sort, zorder)"); this.rewriter = new SparkZOrderDataRewriteExecutor(spark(), table, Arrays.asList(columnNames)); + this.shufflingPlanner = true; return this; } @@ -190,7 +195,10 @@ RewriteFilePlan plan() { @VisibleForTesting void init(long startingSnapshotId) { - this.planner = new RewriteFileGroupPlanner(table, filter, startingSnapshotId, caseSensitive); + this.planner = + shufflingPlanner + ? new SparkShufflingDataRewritePlanner(table, filter, startingSnapshotId, caseSensitive) + : new RewriteFileGroupPlanner(table, filter, startingSnapshotId, caseSensitive); // Default to BinPack if no strategy selected if (this.rewriter == null) { diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingDataRewriteExecutor.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingDataRewriteExecutor.java index e5090a68bff2..87b9326e3b6c 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingDataRewriteExecutor.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingDataRewriteExecutor.java @@ -50,19 +50,6 @@ abstract class SparkShufflingDataRewriteExecutor extends SparkSizeBasedDataRewriteExecutor { - /** - * The number of shuffle partitions and consequently the number of output files created by the - * Spark sort is based on the size of the input data files used in this file rewriter. Due to - * compression, the disk file sizes may not accurately represent the size of files in the output. - * This parameter lets the user adjust the file size used for estimating actual output data size. - * A factor greater than 1.0 would generate more files than we would expect based on the on-disk - * file size. A value less than 1.0 would create fewer files than we would expect based on the - * on-disk size. - */ - public static final String COMPRESSION_FACTOR = "compression-factor"; - - public static final double COMPRESSION_FACTOR_DEFAULT = 1.0; - /** * The number of shuffle partitions to use for each output file. By default, this file rewriter * assumes each shuffle partition would become a separate output file. 
Attempting to generate @@ -79,7 +66,6 @@ abstract class SparkShufflingDataRewriteExecutor extends SparkSizeBasedDataRewri public static final int SHUFFLE_PARTITIONS_PER_FILE_DEFAULT = 1; - private double compressionFactor; private int numShufflePartitionsPerFile; protected SparkShufflingDataRewriteExecutor(SparkSession spark, Table table) { @@ -105,7 +91,6 @@ protected abstract Dataset sortedDF( public Set validOptions() { return ImmutableSet.builder() .addAll(super.validOptions()) - .add(COMPRESSION_FACTOR) .add(SHUFFLE_PARTITIONS_PER_FILE) .build(); } @@ -113,7 +98,6 @@ public Set validOptions() { @Override public void init(Map options) { super.init(options); - this.compressionFactor = compressionFactor(options); this.numShufflePartitionsPerFile = numShufflePartitionsPerFile(options); } @@ -178,14 +162,6 @@ private org.apache.iceberg.SortOrder outputSortOrder(List group) { } } - private double compressionFactor(Map options) { - double value = - PropertyUtil.propertyAsDouble(options, COMPRESSION_FACTOR, COMPRESSION_FACTOR_DEFAULT); - Preconditions.checkArgument( - value > 0, "'%s' is set to %s but must be > 0", COMPRESSION_FACTOR, value); - return value; - } - private int numShufflePartitionsPerFile(Map options) { int value = PropertyUtil.propertyAsInt( diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingDataRewritePlanner.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingDataRewritePlanner.java new file mode 100644 index 000000000000..16410946bd7d --- /dev/null +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingDataRewritePlanner.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.actions; + +import java.util.Map; +import java.util.Set; +import org.apache.iceberg.Table; +import org.apache.iceberg.actions.RewriteFileGroupPlanner; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.util.PropertyUtil; + +/** + * Extends the {@link RewriteFileGroupPlanner} with the possibility to set the expected compression + * factor. + */ +public class SparkShufflingDataRewritePlanner extends RewriteFileGroupPlanner { + /** + * The number of shuffle partitions and consequently the number of output files created by the + * Spark sort is based on the size of the input data files used in this file rewriter. Due to + * compression, the disk file sizes may not accurately represent the size of files in the output. + * This parameter lets the user adjust the file size used for estimating actual output data size. 
+ * A factor greater than 1.0 would generate more files than we would expect based on the on-disk + * file size. A value less than 1.0 would create fewer files than we would expect based on the + * on-disk size. + */ + public static final String COMPRESSION_FACTOR = "compression-factor"; + + public static final double COMPRESSION_FACTOR_DEFAULT = 1.0; + + private double compressionFactor; + + public SparkShufflingDataRewritePlanner(Table table) { + super(table); + } + + public SparkShufflingDataRewritePlanner( + Table table, Expression filter, Long snapshotId, boolean caseSensitive) { + super(table, filter, snapshotId, caseSensitive); + } + + @Override + public Set validOptions() { + return ImmutableSet.builder() + .addAll(super.validOptions()) + .add(COMPRESSION_FACTOR) + .build(); + } + + @Override + public void init(Map options) { + super.init(options); + this.compressionFactor = compressionFactor(options); + } + + @Override + protected int numOutputFiles(long inputSize) { + return Math.max(1, super.numOutputFiles((long) (inputSize * compressionFactor))); + } + + private double compressionFactor(Map options) { + double value = + PropertyUtil.propertyAsDouble(options, COMPRESSION_FACTOR, COMPRESSION_FACTOR_DEFAULT); + Preconditions.checkArgument( + value > 0, "'%s' is set to %s but must be > 0", COMPRESSION_FACTOR, value); + return value; + } +} diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSizeBasedDataRewriteExecutor.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSizeBasedDataRewriteExecutor.java index 9979beacd777..800882cc31e5 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSizeBasedDataRewriteExecutor.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkSizeBasedDataRewriteExecutor.java @@ -55,6 +55,14 @@ protected SparkSession spark() { return spark; } + protected int outputSpecId() { + return outputSpecId; + } + + protected PartitionSpec outputSpec() { + return table().specs().get(outputSpecId); + } + @Override public Set rewrite(RewriteFileGroup group) { String groupId = UUID.randomUUID().toString(); @@ -77,12 +85,4 @@ public void initPlan(RewriteFilePlan plan) { super.initPlan(plan); this.outputSpecId = plan.outputSpecId(); } - - int outputSpecId() { - return outputSpecId; - } - - PartitionSpec outputSpec() { - return table().specs().get(outputSpecId); - } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java index 980a1e71bef9..d25710e7cd13 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java @@ -77,6 +77,7 @@ import org.apache.iceberg.actions.RewriteDataFilesCommitManager; import org.apache.iceberg.actions.RewriteFileGroup; import org.apache.iceberg.actions.RewriteFileGroupPlanner; +import org.apache.iceberg.actions.RewriteFilePlan; import org.apache.iceberg.actions.SizeBasedFileRewritePlanner; import org.apache.iceberg.data.GenericAppenderFactory; import org.apache.iceberg.data.GenericRecord; @@ -887,7 +888,7 @@ public void testSingleCommitWithRewriteFailure() { GroupInfoMatcher failGroup = new GroupInfoMatcher(1, 3, 7); doThrow(new RuntimeException("Rewrite Failed")) .when(spyRewrite) - .rewriteFiles(any(), argThat(failGroup)); + 
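To illustrate the compression-factor scaling introduced in SparkShufflingDataRewritePlanner above; the ceiling division below is a simplified stand-in for super.numOutputFiles(), which applies extra remainder handling:

// With a 2.0 compression factor, a 400 MB on-disk group is sized as 800 MB of
// estimated output data, so two ~512 MB-target files are planned instead of one.
long inputSize = 400L * 1024 * 1024;      // compressed size of the group on disk
double compressionFactor = 2.0;           // shuffled output expected to be ~2x larger
long scaled = (long) (inputSize * compressionFactor);
long targetFileSize = 512L * 1024 * 1024;
long numOutputFiles = Math.max(1, (scaled + targetFileSize - 1) / targetFileSize); // 2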
.rewriteFiles(any(RewriteFilePlan.class), argThat(failGroup)); assertThatThrownBy(spyRewrite::execute) .isInstanceOf(RuntimeException.class) @@ -990,7 +991,7 @@ public void testParallelSingleCommitWithRewriteFailure() { GroupInfoMatcher failGroup = new GroupInfoMatcher(1, 3, 7); doThrow(new CommitFailedException("Rewrite Failed")) .when(spyRewrite) - .rewriteFiles(any(), argThat(failGroup)); + .rewriteFiles(any(RewriteFilePlan.class), argThat(failGroup)); assertThatThrownBy(spyRewrite::execute) .isInstanceOf(CommitFailedException.class) @@ -1027,7 +1028,7 @@ public void testPartialProgressWithRewriteFailure() { GroupInfoMatcher failGroup = new GroupInfoMatcher(1, 3, 7); doThrow(new RuntimeException("Rewrite Failed")) .when(spyRewrite) - .rewriteFiles(any(), argThat(failGroup)); + .rewriteFiles(any(RewriteFilePlan.class), argThat(failGroup)); RewriteDataFiles.Result result = spyRewrite.execute(); @@ -1070,7 +1071,7 @@ public void testParallelPartialProgressWithRewriteFailure() { GroupInfoMatcher failGroup = new GroupInfoMatcher(1, 3, 7); doThrow(new RuntimeException("Rewrite Failed")) .when(spyRewrite) - .rewriteFiles(any(), argThat(failGroup)); + .rewriteFiles(any(RewriteFilePlan.class), argThat(failGroup)); RewriteDataFiles.Result result = spyRewrite.execute(); @@ -1158,7 +1159,7 @@ public void testParallelPartialProgressWithMaxFailedCommits() { GroupInfoMatcher failGroup = new GroupInfoMatcher(1, 3, 7); doThrow(new RuntimeException("Rewrite Failed")) .when(spyRewrite) - .rewriteFiles(any(), argThat(failGroup)); + .rewriteFiles(any(RewriteFilePlan.class), argThat(failGroup)); assertThatThrownBy(() -> spyRewrite.execute()) .isInstanceOf(RuntimeException.class) diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewriteExecutor.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewriteExecutor.java index 444bbf458f17..32d1816e56e4 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewriteExecutor.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewriteExecutor.java @@ -21,22 +21,15 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; -import java.util.List; import java.util.Map; -import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.MockFileScanTask; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.SortOrder; import org.apache.iceberg.Table; -import org.apache.iceberg.actions.RewriteDataFiles; -import org.apache.iceberg.actions.RewriteFileGroupPlanner; -import org.apache.iceberg.actions.SizeBasedFileRewritePlanner; import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types.IntegerType; import org.apache.iceberg.types.Types.NestedField; @@ -44,7 +37,7 @@ import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Test; -public class TestSparkFileRewriteExecutor extends TestBase { +class TestSparkFileRewriteExecutor extends TestBase { private static final TableIdentifier TABLE_IDENT = TableIdentifier.of("default", "tbl"); private 
static final Schema SCHEMA = @@ -61,119 +54,7 @@ public void removeTable() { } @Test - public void testBinPackDataSelectFiles() { - Table table = catalog.createTable(TABLE_IDENT, SCHEMA); - RewriteFileGroupPlanner rewriter = new RewriteFileGroupPlanner(table); - - checkDataFileSizeFiltering(rewriter); - checkDataFilesDeleteThreshold(rewriter); - checkDataFileGroupWithEnoughFiles(rewriter); - checkDataFileGroupWithEnoughData(rewriter); - checkDataFileGroupWithTooMuchData(rewriter); - } - - private void checkDataFileSizeFiltering(RewriteFileGroupPlanner rewriter) { - FileScanTask tooSmallTask = new MockFileScanTask(100L); - FileScanTask optimal = new MockFileScanTask(450); - FileScanTask tooBigTask = new MockFileScanTask(1000L); - List tasks = ImmutableList.of(tooSmallTask, optimal, tooBigTask); - - Map options = - ImmutableMap.of( - RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, "250", - RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, "500", - RewriteFileGroupPlanner.MAX_FILE_SIZE_BYTES, "750", - RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, String.valueOf(Integer.MAX_VALUE)); - rewriter.init(options); - - Iterable> groups = rewriter.planFileGroups(tasks); - assertThat(groups).as("Must have 1 group").hasSize(1); - List group = Iterables.getOnlyElement(groups); - assertThat(group).as("Must rewrite 2 files").hasSize(2); - } - - private void checkDataFilesDeleteThreshold(RewriteFileGroupPlanner rewriter) { - FileScanTask tooManyDeletesTask = MockFileScanTask.mockTaskWithDeletes(1000L, 3); - FileScanTask optimalTask = MockFileScanTask.mockTaskWithDeletes(1000L, 1); - List tasks = ImmutableList.of(tooManyDeletesTask, optimalTask); - - Map options = - ImmutableMap.of( - RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, "1", - RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, "2000", - RewriteFileGroupPlanner.MAX_FILE_SIZE_BYTES, "5000", - RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, "2"); - rewriter.init(options); - - Iterable> groups = rewriter.planFileGroups(tasks); - assertThat(groups).as("Must have 1 group").hasSize(1); - List group = Iterables.getOnlyElement(groups); - assertThat(group).as("Must rewrite 1 file").hasSize(1); - } - - private void checkDataFileGroupWithEnoughFiles(RewriteFileGroupPlanner rewriter) { - List tasks = - ImmutableList.of( - new MockFileScanTask(100L), - new MockFileScanTask(100L), - new MockFileScanTask(100L), - new MockFileScanTask(100L)); - - Map options = - ImmutableMap.of( - RewriteFileGroupPlanner.MIN_INPUT_FILES, "3", - RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, "150", - RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, "1000", - RewriteFileGroupPlanner.MAX_FILE_SIZE_BYTES, "5000", - RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, String.valueOf(Integer.MAX_VALUE)); - rewriter.init(options); - - Iterable> groups = rewriter.planFileGroups(tasks); - assertThat(groups).as("Must have 1 group").hasSize(1); - List group = Iterables.getOnlyElement(groups); - assertThat(group).as("Must rewrite 4 files").hasSize(4); - } - - private void checkDataFileGroupWithEnoughData(RewriteFileGroupPlanner rewriter) { - List tasks = - ImmutableList.of( - new MockFileScanTask(100L), new MockFileScanTask(100L), new MockFileScanTask(100L)); - - Map options = - ImmutableMap.of( - RewriteFileGroupPlanner.MIN_INPUT_FILES, "5", - RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, "200", - RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, "250", - RewriteFileGroupPlanner.MAX_FILE_SIZE_BYTES, "500", - RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, String.valueOf(Integer.MAX_VALUE)); - 
rewriter.init(options); - - Iterable> groups = rewriter.planFileGroups(tasks); - assertThat(groups).as("Must have 1 group").hasSize(1); - List group = Iterables.getOnlyElement(groups); - assertThat(group).as("Must rewrite 3 files").hasSize(3); - } - - private void checkDataFileGroupWithTooMuchData(RewriteFileGroupPlanner rewriter) { - List tasks = ImmutableList.of(new MockFileScanTask(2000L)); - - Map options = - ImmutableMap.of( - RewriteFileGroupPlanner.MIN_INPUT_FILES, "5", - RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, "200", - RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, "250", - RewriteFileGroupPlanner.MAX_FILE_SIZE_BYTES, "500", - RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, String.valueOf(Integer.MAX_VALUE)); - rewriter.init(options); - - Iterable> groups = rewriter.planFileGroups(tasks); - assertThat(groups).as("Must have 1 group").hasSize(1); - List group = Iterables.getOnlyElement(groups); - assertThat(group).as("Must rewrite big file").hasSize(1); - } - - @Test - public void testInvalidConstructorUsagesSortData() { + void testInvalidConstructorUsagesSortData() { Table table = catalog.createTable(TABLE_IDENT, SCHEMA); assertThatThrownBy(() -> new SparkSortDataRewriteExecutor(spark, table)) @@ -190,7 +71,7 @@ public void testInvalidConstructorUsagesSortData() { } @Test - public void testInvalidConstructorUsagesZOrderData() { + void testInvalidConstructorUsagesZOrderData() { Table table = catalog.createTable(TABLE_IDENT, SCHEMA, SPEC); assertThatThrownBy(() -> new SparkZOrderDataRewriteExecutor(spark, table, null)) @@ -211,139 +92,48 @@ public void testInvalidConstructorUsagesZOrderData() { } @Test - public void testBinPackDataValidOptions() { + void testBinPackDataValidOptions() { Table table = catalog.createTable(TABLE_IDENT, SCHEMA); SparkBinPackDataRewriteExecutor rewriter = new SparkBinPackDataRewriteExecutor(spark, table); - RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(table); assertThat(rewriter.validOptions()) .as("Rewriter must report all supported options") .isEqualTo(ImmutableSet.of()); - - assertThat(planner.validOptions()) - .as("Planner must report all supported options") - .isEqualTo( - ImmutableSet.of( - RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, - RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, - RewriteFileGroupPlanner.MAX_FILE_SIZE_BYTES, - RewriteFileGroupPlanner.MIN_INPUT_FILES, - RewriteFileGroupPlanner.REWRITE_ALL, - RewriteFileGroupPlanner.MAX_FILE_GROUP_SIZE_BYTES, - RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, - RewriteDataFiles.REWRITE_JOB_ORDER)); } @Test - public void testSortDataValidOptions() { + void testSortDataValidOptions() { Table table = catalog.createTable(TABLE_IDENT, SCHEMA); SparkSortDataRewriteExecutor rewriter = new SparkSortDataRewriteExecutor(spark, table, SORT_ORDER); - RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(table); assertThat(rewriter.validOptions()) .as("Rewriter must report all supported options") - .isEqualTo( - ImmutableSet.of( - SparkSortDataRewriteExecutor.SHUFFLE_PARTITIONS_PER_FILE, - SparkSortDataRewriteExecutor.COMPRESSION_FACTOR)); - - assertThat(planner.validOptions()) - .as("Planner must report all supported options") - .isEqualTo( - ImmutableSet.of( - RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, - RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, - RewriteFileGroupPlanner.MAX_FILE_SIZE_BYTES, - RewriteFileGroupPlanner.MIN_INPUT_FILES, - RewriteFileGroupPlanner.REWRITE_ALL, - RewriteFileGroupPlanner.MAX_FILE_GROUP_SIZE_BYTES, - RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, - 
RewriteDataFiles.REWRITE_JOB_ORDER)); + .isEqualTo(ImmutableSet.of(SparkSortDataRewriteExecutor.SHUFFLE_PARTITIONS_PER_FILE)); } @Test - public void testZOrderDataValidOptions() { + void testZOrderDataValidOptions() { Table table = catalog.createTable(TABLE_IDENT, SCHEMA); ImmutableList zOrderCols = ImmutableList.of("id"); SparkZOrderDataRewriteExecutor rewriter = new SparkZOrderDataRewriteExecutor(spark, table, zOrderCols); - RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(table); assertThat(rewriter.validOptions()) .as("Rewriter must report all supported options") .isEqualTo( ImmutableSet.of( SparkZOrderDataRewriteExecutor.SHUFFLE_PARTITIONS_PER_FILE, - SparkZOrderDataRewriteExecutor.COMPRESSION_FACTOR, SparkZOrderDataRewriteExecutor.MAX_OUTPUT_SIZE, SparkZOrderDataRewriteExecutor.VAR_LENGTH_CONTRIBUTION)); - assertThat(planner.validOptions()) - .as("Planner must report all supported options") - .isEqualTo( - ImmutableSet.of( - RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, - RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, - RewriteFileGroupPlanner.MAX_FILE_SIZE_BYTES, - RewriteFileGroupPlanner.MIN_INPUT_FILES, - RewriteFileGroupPlanner.REWRITE_ALL, - RewriteFileGroupPlanner.MAX_FILE_GROUP_SIZE_BYTES, - RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, - RewriteDataFiles.REWRITE_JOB_ORDER)); - } - - @Test - public void testInvalidValuesForBinPackDataOptions() { - Table table = catalog.createTable(TABLE_IDENT, SCHEMA); - RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(table); - - validateSizeBasedRewriterOptions(planner); - - Map invalidDeleteThresholdOptions = - ImmutableMap.of(RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, "-1"); - assertThatThrownBy(() -> planner.init(invalidDeleteThresholdOptions)) - .hasMessageContaining("'delete-file-threshold' is set to -1 but must be >= 0"); - } - - @Test - public void testInvalidValuesForSortDataOptions() { - Table table = catalog.createTable(TABLE_IDENT, SCHEMA); - SparkSortDataRewriteExecutor rewriter = - new SparkSortDataRewriteExecutor(spark, table, SORT_ORDER); - RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(table); - - validateSizeBasedRewriterOptions(planner); - - Map invalidDeleteThresholdOptions = - ImmutableMap.of(RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, "-1"); - assertThatThrownBy(() -> planner.init(invalidDeleteThresholdOptions)) - .hasMessageContaining("'delete-file-threshold' is set to -1 but must be >= 0"); - - Map invalidCompressionFactorOptions = - ImmutableMap.of(SparkShufflingDataRewriteExecutor.COMPRESSION_FACTOR, "0"); - assertThatThrownBy(() -> rewriter.init(invalidCompressionFactorOptions)) - .hasMessageContaining("'compression-factor' is set to 0.0 but must be > 0"); } @Test - public void testInvalidValuesForZOrderDataOptions() { + void testInvalidValuesForZOrderDataOptions() { Table table = catalog.createTable(TABLE_IDENT, SCHEMA); ImmutableList zOrderCols = ImmutableList.of("id"); SparkZOrderDataRewriteExecutor rewriter = new SparkZOrderDataRewriteExecutor(spark, table, zOrderCols); - RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(table); - - validateSizeBasedRewriterOptions(planner); - - Map invalidDeleteThresholdOptions = - ImmutableMap.of(RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, "-1"); - assertThatThrownBy(() -> planner.init(invalidDeleteThresholdOptions)) - .hasMessageContaining("'delete-file-threshold' is set to -1 but must be >= 0"); - - Map invalidCompressionFactorOptions = - ImmutableMap.of(SparkShufflingDataRewriteExecutor.COMPRESSION_FACTOR, 
"0"); - assertThatThrownBy(() -> rewriter.init(invalidCompressionFactorOptions)) - .hasMessageContaining("'compression-factor' is set to 0.0 but must be > 0"); Map invalidMaxOutputOptions = ImmutableMap.of(SparkZOrderDataRewriteExecutor.MAX_OUTPUT_SIZE, "0"); @@ -357,42 +147,4 @@ public void testInvalidValuesForZOrderDataOptions() { .hasMessageContaining("Cannot use less than 1 byte for variable length types with ZOrder") .hasMessageContaining("'var-length-contribution' was set to 0"); } - - private void validateSizeBasedRewriterOptions(SizeBasedFileRewritePlanner rewriter) { - Map invalidTargetSizeOptions = - ImmutableMap.of(SizeBasedFileRewritePlanner.TARGET_FILE_SIZE_BYTES, "0"); - assertThatThrownBy(() -> rewriter.init(invalidTargetSizeOptions)) - .hasMessageContaining("'target-file-size-bytes' is set to 0 but must be > 0"); - - Map invalidMinSizeOptions = - ImmutableMap.of(SizeBasedFileRewritePlanner.MIN_FILE_SIZE_BYTES, "-1"); - assertThatThrownBy(() -> rewriter.init(invalidMinSizeOptions)) - .hasMessageContaining("'min-file-size-bytes' is set to -1 but must be >= 0"); - - Map invalidTargetMinSizeOptions = - ImmutableMap.of( - SizeBasedFileRewritePlanner.TARGET_FILE_SIZE_BYTES, "3", - SizeBasedFileRewritePlanner.MIN_FILE_SIZE_BYTES, "5"); - assertThatThrownBy(() -> rewriter.init(invalidTargetMinSizeOptions)) - .hasMessageContaining("'target-file-size-bytes' (3) must be > 'min-file-size-bytes' (5)") - .hasMessageContaining("all new files will be smaller than the min threshold"); - - Map invalidTargetMaxSizeOptions = - ImmutableMap.of( - SizeBasedFileRewritePlanner.TARGET_FILE_SIZE_BYTES, "5", - SizeBasedFileRewritePlanner.MAX_FILE_SIZE_BYTES, "3"); - assertThatThrownBy(() -> rewriter.init(invalidTargetMaxSizeOptions)) - .hasMessageContaining("'target-file-size-bytes' (5) must be < 'max-file-size-bytes' (3)") - .hasMessageContaining("all new files will be larger than the max threshold"); - - Map invalidMinInputFilesOptions = - ImmutableMap.of(SizeBasedFileRewritePlanner.MIN_INPUT_FILES, "0"); - assertThatThrownBy(() -> rewriter.init(invalidMinInputFilesOptions)) - .hasMessageContaining("'min-input-files' is set to 0 but must be > 0"); - - Map invalidMaxFileGroupSizeOptions = - ImmutableMap.of(SizeBasedFileRewritePlanner.MAX_FILE_GROUP_SIZE_BYTES, "0"); - assertThatThrownBy(() -> rewriter.init(invalidMaxFileGroupSizeOptions)) - .hasMessageContaining("'max-file-group-size-bytes' is set to 0 but must be > 0"); - } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewritePlanner.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewritePlanner.java new file mode 100644 index 000000000000..3426a6a71adb --- /dev/null +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestSparkFileRewritePlanner.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.actions; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.util.Map; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.actions.RewriteDataFiles; +import org.apache.iceberg.actions.RewriteFileGroupPlanner; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.spark.TestBase; +import org.apache.iceberg.types.Types.IntegerType; +import org.apache.iceberg.types.Types.NestedField; +import org.apache.iceberg.types.Types.StringType; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Test; + +class TestSparkFileRewritePlanner extends TestBase { + + private static final TableIdentifier TABLE_IDENT = TableIdentifier.of("default", "tbl"); + private static final Schema SCHEMA = + new Schema( + NestedField.required(1, "id", IntegerType.get()), + NestedField.required(2, "dep", StringType.get())); + private static final PartitionSpec SPEC = + PartitionSpec.builderFor(SCHEMA).identity("dep").build(); + + @AfterEach + public void removeTable() { + catalog.dropTable(TABLE_IDENT); + } + + @Test + void testRewriteFileGroupPlannerValidOptions() { + Table table = catalog.createTable(TABLE_IDENT, SCHEMA); + RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(table); + + assertThat(planner.validOptions()) + .as("Planner must report all supported options") + .isEqualTo( + ImmutableSet.of( + RewriteFileGroupPlanner.TARGET_FILE_SIZE_BYTES, + RewriteFileGroupPlanner.MIN_FILE_SIZE_BYTES, + RewriteFileGroupPlanner.MAX_FILE_SIZE_BYTES, + RewriteFileGroupPlanner.MIN_INPUT_FILES, + RewriteFileGroupPlanner.REWRITE_ALL, + RewriteFileGroupPlanner.MAX_FILE_GROUP_SIZE_BYTES, + RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, + RewriteDataFiles.REWRITE_JOB_ORDER)); + } + + @Test + void testSparkShufflingDataRewritePlannerValidOptions() { + Table table = catalog.createTable(TABLE_IDENT, SCHEMA); + SparkShufflingDataRewritePlanner planner = new SparkShufflingDataRewritePlanner(table); + + assertThat(planner.validOptions()) + .as("Planner must report all supported options") + .isEqualTo( + ImmutableSet.of( + SparkShufflingDataRewritePlanner.COMPRESSION_FACTOR, + SparkShufflingDataRewritePlanner.TARGET_FILE_SIZE_BYTES, + SparkShufflingDataRewritePlanner.MIN_FILE_SIZE_BYTES, + SparkShufflingDataRewritePlanner.MAX_FILE_SIZE_BYTES, + SparkShufflingDataRewritePlanner.MIN_INPUT_FILES, + SparkShufflingDataRewritePlanner.REWRITE_ALL, + SparkShufflingDataRewritePlanner.MAX_FILE_GROUP_SIZE_BYTES, + SparkShufflingDataRewritePlanner.DELETE_FILE_THRESHOLD, + RewriteDataFiles.REWRITE_JOB_ORDER)); + } + + @Test + void testInvalidValuesSparkShufflingDataRewritePlannerOptions() { + Table table = catalog.createTable(TABLE_IDENT, SCHEMA); + SparkShufflingDataRewritePlanner planner = new 
SparkShufflingDataRewritePlanner(table); + + Map invalidCompressionFactorOptions = + ImmutableMap.of(SparkShufflingDataRewritePlanner.COMPRESSION_FACTOR, "0"); + assertThatThrownBy(() -> planner.init(invalidCompressionFactorOptions)) + .hasMessageContaining("'compression-factor' is set to 0.0 but must be > 0"); + } +} From ab5ba4177fb76defda807f50b95b16cef8b245aa Mon Sep 17 00:00:00 2001 From: Peter Vary Date: Mon, 9 Dec 2024 20:35:32 +0100 Subject: [PATCH 11/11] Russell's comments --- .../iceberg/actions/FileRewriteExecutor.java | 14 +++++++++++++- .../apache/iceberg/actions/FileRewritePlan.java | 13 +++++++++---- .../iceberg/actions/FileRewritePlanner.java | 15 +++++++++++---- .../iceberg/actions/SizeBasedDataRewriter.java | 10 +++++++--- 4 files changed, 40 insertions(+), 12 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/actions/FileRewriteExecutor.java b/core/src/main/java/org/apache/iceberg/actions/FileRewriteExecutor.java index bc4102a25de4..5d589c6931c5 100644 --- a/core/src/main/java/org/apache/iceberg/actions/FileRewriteExecutor.java +++ b/core/src/main/java/org/apache/iceberg/actions/FileRewriteExecutor.java @@ -24,7 +24,19 @@ import org.apache.iceberg.ContentScanTask; /** - * A class for rewriting content file groups ({@link FileRewriteGroup}). + * A class for rewriting content file groups ({@link FileRewriteGroup}). The lifecycle for the + * executor looks like the following: + * + *

+ * <ul>
+ *   <li>{@link #init(Map)} initializes the executor with the configuration parameters
+ *   <li>{@link #initPlan(FileRewritePlan)} initializes the executor with the configuration
+ *       calculated during planning ({@link FileRewritePlan#writeMaxFileSize()}, {@link
+ *       RewriteFilePlan#outputSpecId()})
+ *   <li>{@link #rewrite(FileRewriteGroup)} called for every group in the plan to do the actual
+ *       rewrite of the files, and returns the generated new files
+ * </ul>
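To make the lifecycle concrete, a minimal caller sketch follows. Raw types are used for brevity, and the plan's groups() accessor is an assumption for illustration; only init(Map), initPlan(FileRewritePlan), and rewrite(FileRewriteGroup) come from the contract described above.

    import java.util.Map;

    class ExecutorLifecycleSketch {
      // Drives the three lifecycle steps; a single executor handles every group of the plan.
      @SuppressWarnings({"unchecked", "rawtypes"})
      static void rewriteAll(
          FileRewriteExecutor executor, FileRewritePlan plan, Map<String, String> options) {
        executor.init(options); // (1) static configuration parameters
        executor.initPlan(plan); // (2) values calculated during planning, e.g. writeMaxFileSize()
        // (3) rewrite every planned group; groups() as a stream accessor is assumed here
        plan.groups().forEach(group -> executor.rewrite((FileRewriteGroup) group));
      }
    }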
+ *
+ * <p>A single executor may be used to rewrite multiple groups from the same plan.
 *
 * @param <I> the Java type of the plan info
 * @param <T> the Java type of the tasks to read content files
diff --git a/core/src/main/java/org/apache/iceberg/actions/FileRewritePlan.java b/core/src/main/java/org/apache/iceberg/actions/FileRewritePlan.java
index dc4cc9a6d57a..f313fd1b070d 100644
--- a/core/src/main/java/org/apache/iceberg/actions/FileRewritePlan.java
+++ b/core/src/main/java/org/apache/iceberg/actions/FileRewritePlan.java
@@ -25,10 +25,15 @@ import org.apache.iceberg.StructLike;
 /**
- * Result of the file rewrite planning.
+ * Result of the file rewrite planning as generated by {@link FileRewritePlanner#plan()}.
  *
- * <p>Contains the planned groups, calculated values required by the {@link FileRewriteExecutor}s
- * and statistics.
+ * <p>The plan contains the stream of the planned groups and statistics about them, such as the
+ * total number of groups and the number of groups in each partition. The plan also contains
+ * values required by the {@link FileRewriteExecutor}s which are calculated from the input data
+ * and the planning parameters.
+ *
+ * <p>Groups in a plan can be processed independently. For example, in Spark this means that each
+ * group would be rewritten in its own Spark job.
 *
 * @param <I> the Java type of the plan info
 * @param <T> the Java type of the tasks to read content files
@@ -71,7 +76,7 @@ public int totalGroupCount() {
     return totalGroupCount;
   }

-  /** Calculated maximum file size for the target files */
+  /** Calculated maximum file size based on the planner's target file size configuration */
   public long writeMaxFileSize() {
     return writeMaxFileSize;
   }
diff --git a/core/src/main/java/org/apache/iceberg/actions/FileRewritePlanner.java b/core/src/main/java/org/apache/iceberg/actions/FileRewritePlanner.java
index ff770874a9a3..0d242bbe6bb0 100644
--- a/core/src/main/java/org/apache/iceberg/actions/FileRewritePlanner.java
+++ b/core/src/main/java/org/apache/iceberg/actions/FileRewritePlanner.java
@@ -26,10 +26,17 @@
 /**
  * A class for planning content file rewrites.
  *

- * <p>The entire rewrite operation is broken down into pieces based on partitioning, and size-based
- * groups within a partition. These subunits of the rewrite are referred to as file groups. A file
- * group will be processed by a {@link FileRewriteExecutor} in a single framework "action". For
- * example, in Spark this means that each group would be rewritten in its own Spark job.
+ * <p>The entire rewrite operation is broken down into pieces. The grouping is based on
+ * partitioning, and the planning could create multiple groups within a partition. The result is a
+ * {@link FileRewritePlan} which contains the data needed by the {@link FileRewriteExecutor}s that
+ * execute the actual file rewrite.
+ *
+ * <p>The lifecycle of the planner is:
+ *
+ * <ul>
+ *   <li>{@link #init(Map)} initializes the planner with the configuration parameters
+ *   <li>{@link #plan()} generates the plan for the given configuration
+ * </ul>
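The planner side can be sketched the same way. The constructor and the option keys below appear in the tests earlier in this patch, and plan(), totalGroupCount(), and writeMaxFileSize() appear in the diffs above; the option values and the raw FileRewritePlan type are illustrative only.

    import java.util.Map;
    import org.apache.iceberg.Table;
    import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;

    class PlannerLifecycleSketch {
      @SuppressWarnings("rawtypes")
      static FileRewritePlan planCompaction(Table table) {
        RewriteFileGroupPlanner planner = new RewriteFileGroupPlanner(table);
        Map<String, String> options =
            ImmutableMap.of(
                RewriteFileGroupPlanner.MIN_INPUT_FILES, "2",
                RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, "3");
        planner.init(options); // (1) initialize the planner with the configuration parameters
        FileRewritePlan plan = planner.plan(); // (2) generate the plan for the given configuration
        // statistics and executor inputs computed during planning
        System.out.printf(
            "planned %d groups, max output file size %d bytes%n",
            plan.totalGroupCount(), plan.writeMaxFileSize());
        return plan;
      }
    }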
* * @param the Java type of the plan info * @param the Java type of the tasks to read content files diff --git a/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java b/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java index 66b759321ac8..5c9e2321fd82 100644 --- a/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java +++ b/core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java @@ -42,12 +42,16 @@ public abstract class SizeBasedDataRewriter extends SizeBasedFileRewriterDefaults to Integer.MAX_VALUE, which means this feature is not enabled by default. * - * @deprecated since 1.8.0, will be removed in 1.9.0; use {@link RewriteFileGroupPlanner} and - * {@link FileRewriteExecutor}. + * @deprecated since 1.8.0, will be removed in 1.9.0; use {@link + * RewriteFileGroupPlanner#DELETE_FILE_THRESHOLD}. */ @Deprecated public static final String DELETE_FILE_THRESHOLD = "delete-file-threshold"; - public static final int DELETE_FILE_THRESHOLD_DEFAULT = Integer.MAX_VALUE; + /** + * @deprecated since 1.8.0, will be removed in 1.9.0; use {@link + * RewriteFileGroupPlanner#DELETE_FILE_THRESHOLD_DEFAULT}. + */ + @Deprecated public static final int DELETE_FILE_THRESHOLD_DEFAULT = Integer.MAX_VALUE; private int deleteFileThreshold;
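The deprecations above keep the option key string stable; only the constant's owning class moves from the rewriter to the planner. A sketch of the migration, with an arbitrary threshold value:

    import java.util.Map;
    import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;

    class DeleteThresholdMigration {
      // before, deprecated since 1.8.0 and slated for removal in 1.9.0:
      static final Map<String, String> LEGACY =
          ImmutableMap.of(SizeBasedDataRewriter.DELETE_FILE_THRESHOLD, "2");

      // after, for use with RewriteFileGroupPlanner#init(Map); the key is still
      // "delete-file-threshold", so existing option maps keep working:
      static final Map<String, String> REPLACEMENT =
          ImmutableMap.of(RewriteFileGroupPlanner.DELETE_FILE_THRESHOLD, "2");
    }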