core/src/main/java/org/apache/iceberg/actions/FileRewriteExecutor.java
@@ -0,0 +1,88 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.actions;

import java.util.Map;
import java.util.Set;
import org.apache.iceberg.ContentFile;
import org.apache.iceberg.ContentScanTask;

/**
* A class for rewriting content file groups ({@link FileRewriteGroup}). The lifecycle of the
* executor looks like the following:
*
* <ul>
* <li>{@link #init(Map)} initializes the executor with the configuration parameters
* <li>{@link #initPlan(FileRewritePlan)} initializes the executor with the configuration
* calculated during planning ({@link FileRewritePlan#writeMaxFileSize()}, {@link
* RewriteFilePlan#outputSpecId()})
* <li>{@link #rewrite(FileRewriteGroup)} is called for every group in the plan to do the actual
* rewrite of the files, and returns the newly generated files
* </ul>
*
* <p>A single executor may be used to rewrite multiple groups of the same plan.
*
* @param <I> the Java type of the plan info
* @param <T> the Java type of the tasks to read content files
* @param <F> the Java type of the content files
* @param <G> the Java type of the planned groups
* @param <P> the Java type of the plan to execute
*/
public interface FileRewriteExecutor<
I,
T extends ContentScanTask<F>,
F extends ContentFile<F>,
G extends FileRewriteGroup<I, T, F>,
P extends FileRewritePlan<I, T, F, G>> {

/** Returns a description for this executor. */
default String description() {
return getClass().getName();
}

/**
* Returns the set of supported options for this executor. Only options in this set are accepted
* at runtime; any other options are rejected.
*/
Set<String> validOptions();

/**
* Initializes this executor using the provided options.
*
* @param options options to initialize this executor
*/
void init(Map<String, String> options);
Member:

It is not clear from the definition here why initPlan and init are both required. It is also not clear when they are called, or whether they are both called, etc.

Contributor Author:

Will add javadoc.

  • init is called when the executor is initialized
  • initPlan is called when the plan has been generated and the parameters needed for the executor are calculated (writeMaxFileSize, outputSpecId)

Contributor Author:

Refactored the javadoc.

Contributor:

I also found it confusing. Can init and initPlan be combined into one method?

Alternatively, can the plan be passed in via the rewrite method?

Contributor Author:

Without modifying the current Spark behavior we cannot merge init and initPlan. init is called when the executor is created and is based only on the user-provided parameters; initPlan is used to set parameters calculated during the planning phase.

I have posted an email to the dev list: https://lists.apache.org/thread/6lj2jn3dbvqjscc96w0mc32bhxq0qfqv
There I asked a question about the "Data organization". If we decide on the 2nd answer ("the group should contain all the information"), then we don't need initPlan anymore.


/**
* Initializes the executor using the information generated during planning.
*
* @param plan the plan containing the configuration data
*/
void initPlan(P plan);

/**
* Rewrites a group of files represented by the given group of scan tasks.
*
* <p>The implementation is expected to be engine-specific (e.g. Spark, Flink, Trino).
*
* @param group the group of scan tasks for files to be rewritten together
* @return the set of newly written files
*/
Set<F> rewrite(G group);
}
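A minimal driver-side sketch of the lifecycle above (not part of the PR): the generic helper runRewrite and the commit step are hypothetical, and the imports are the same ones the interface already uses.

  // Sketch only: runRewrite and the commit step are hypothetical.
  static <
          I,
          T extends ContentScanTask<F>,
          F extends ContentFile<F>,
          G extends FileRewriteGroup<I, T, F>,
          P extends FileRewritePlan<I, T, F, G>>
      void runRewrite(
          FileRewriteExecutor<I, T, F, G, P> executor, P plan, Map<String, String> options) {
    executor.init(options); // user-provided options, checked against validOptions()
    executor.initPlan(plan); // planner-calculated values, e.g. writeMaxFileSize
    plan.groups()
        .forEach(
            group -> {
              Set<F> newFiles = executor.rewrite(group); // typically one engine job per group
              // engine-specific commit of newFiles omitted here
            });
  }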
core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java
@@ -0,0 +1,93 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.actions;

import java.util.Comparator;
import java.util.List;
import org.apache.iceberg.ContentFile;
import org.apache.iceberg.ContentScanTask;
import org.apache.iceberg.RewriteJobOrder;

/**
* Container class representing a set of files to be rewritten by a {@link FileRewriteExecutor}.
Member:

This doesn't seem to be the case, since it also has

  /** Expected split size for the output files. */
  public long splitSize() {
    return splitSize;
  }

  /** Expected number of the output files. */
  public int expectedOutputFiles() {
    return expectedOutputFiles;
  }

The first one, splitSize, feels like it is global to a lot of FileRewriteGroups and not a particular property of one RewriteGroup, so maybe it belongs in the Plan.

The second one, estimatedOutputFiles, could live here, but maybe it just makes sense as estimatedOutputFiles(splitSize), where the split size is passed in from elsewhere?

Contributor Author:

> The first one, splitSize, feels like it is global to a lot of FileRewriteGroups and not a particular property of one RewriteGroup, so maybe it belongs in the Plan.

By my understanding this could currently be different for different groups in the same plan. splitSize(long inputSize) and numOutputFiles(long inputSize) depend on the input size, and a big part of that depends on the configuration of the planner (see: #9069).

Contributor Author:

The current code makes sure that the exact same compaction result is generated as with the previous algorithm.

In the old code we had:

  • Plan level
    • writeMaxFileSize - which governs the target file size when rewriting (SparkWriteOptions.TARGET_FILE_SIZE_BYTES)
  • Group level
    • splitSize - which governs the input split size when rewriting (SparkReadOptions.SPLIT_SIZE, splitSize)
    • expectedOutputFiles - which governs the number of shuffle partitions for shuffling rewriters

In the new code I have mirrored this.

I think the new code would be much cleaner if we put all of this information at the group level (we could get rid of the initPlan step).

Member:

I think basically anything having to do with "output" should be part of "Plan" or "Execute", depending. In this case we are basically just using these properties because they make it easy to control Spark's behavior.

Contributor Author:

Let me apply this to the current situation, so we can check whether I understand correctly:

  • We should keep splitSize in the group, as this drives the reading
  • We should move expectedOutputFiles to the plan, as it drives the output

Is this the change that you are suggesting?

I see 3 possible separations of concerns:

  1. The plan is the 'result' - everything below it is organized only based on the multiplicity of the data. If some value applies to every group, then that value belongs to the 'global' plan variables; if a value differs for every group, then that value belongs to the group (current code).
  2. The plan is the write config, the group is the read config. If I understand correctly, this is what you are suggesting. IMHO this is a bit awkward, as the groups are currently part of the plan. Maybe we could have readConfig and writeConfig maps in the plan instead of adding extra attributes to the plan and to the groups. This comes at the cost of extra parsing for the configs, but allows us more flexibility (fewer classes).
  3. The group should contain all the information required for a single job. The job (executor) then only receives a single group, and every other bit of information is global. The drawback is that some information is duplicated, but it is cleaner on the executor side.

Your thoughts?

*
* @param <I> the Java type of the plan info
Member:

This is very ambiguous to me, and it is not clear what I should be.

Contributor Author:

These are the RewriteDataFiles.FileGroupInfo and RewritePositionDeleteFiles.FileGroupInfo.

* @param <T> the Java type of the tasks to read content files
* @param <F> the Java type of the content files
*/
public abstract class FileRewriteGroup<I, T extends ContentScanTask<F>, F extends ContentFile<F>> {
private final I info;
private final List<T> fileScanTasks;
private final long splitSize;
private final int expectedOutputFiles;

FileRewriteGroup(I info, List<T> fileScanTasks, long splitSize, int expectedOutputFiles) {
this.info = info;
this.fileScanTasks = fileScanTasks;
this.splitSize = splitSize;
this.expectedOutputFiles = expectedOutputFiles;
}

/** Identifiers and partition information about the group. */
Member:

Why isn't this just a StructLike? What other class types might we be passing through here?

Contributor Author:

This is RewriteDataFiles.FileGroupInfo and RewritePositionDeleteFiles.FileGroupInfo.
These are inherited from the old code.

public I info() {
return info;
}

/** Input of the group. {@link ContentScanTask}s to read. */
public List<T> fileScans() {
return fileScanTasks;
}

/** Expected split size for reading the input files. */
public long splitSize() {
return splitSize;
}

/** Expected number of the output files. */
public int expectedOutputFiles() {
return expectedOutputFiles;
}

/** Accumulated size of the input files. */
public long sizeInBytes() {
return fileScanTasks.stream().mapToLong(T::length).sum();
}

/** Number of input files. */
public int numInputFiles() {
return fileScanTasks.size();
}

/** Comparator to order the FileRewriteGroups based on a provided {@link RewriteJobOrder}. */
public static <I, T extends ContentScanTask<F>, F extends ContentFile<F>>
Comparator<FileRewriteGroup<I, T, F>> taskComparator(RewriteJobOrder rewriteJobOrder) {
switch (rewriteJobOrder) {
case BYTES_ASC:
return Comparator.comparing(FileRewriteGroup::sizeInBytes);
case BYTES_DESC:
return Comparator.comparing(FileRewriteGroup::sizeInBytes, Comparator.reverseOrder());
case FILES_ASC:
return Comparator.comparing(FileRewriteGroup::numInputFiles);
case FILES_DESC:
return Comparator.comparing(FileRewriteGroup::numInputFiles, Comparator.reverseOrder());
default:
return (unused, unused2) -> 0;
}
}
}
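A short sketch of how the comparator might be used; the plan variable and the generic context (I, T, F, G) are assumed, as in the hypothetical runRewrite helper shown after FileRewriteExecutor, plus a java.util.stream.Collectors import.

  // Hypothetical usage: order the planned groups so that the largest groups
  // (by accumulated input bytes) are rewritten first.
  Comparator<FileRewriteGroup<I, T, F>> byBytesDesc =
      FileRewriteGroup.taskComparator(RewriteJobOrder.BYTES_DESC);
  List<G> ordered = plan.groups().sorted(byBytesDesc).collect(Collectors.toList());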
core/src/main/java/org/apache/iceberg/actions/FileRewritePlan.java
@@ -0,0 +1,83 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.actions;

import java.util.Map;
import java.util.stream.Stream;
import org.apache.iceberg.ContentFile;
import org.apache.iceberg.ContentScanTask;
import org.apache.iceberg.StructLike;

/**
* Result of the file rewrite planning, as generated by {@link FileRewritePlanner#plan()}.
*
* <p>The plan contains the stream of the planned groups and statistics about the generated groups,
* such as the total number of groups and the number of groups per partition. The plan also
* contains calculated values required by the {@link FileRewriteExecutor}s, which are based on the
* input data and the planning parameters.
*
* <p>Groups in a plan can be processed independently. For example, in Spark this means that each
* group is rewritten in its own Spark job.
*
* @param <I> the Java type of the plan info
* @param <T> the Java type of the tasks to read content files
* @param <F> the Java type of the content files
* @param <G> the Java type of the planned groups
*/
public abstract class FileRewritePlan<
I,
T extends ContentScanTask<F>,
F extends ContentFile<F>,
G extends FileRewriteGroup<I, T, F>> {
private final Stream<G> groups;
Contributor Author:

Currently the groups are calculated in advance to allow sorting, so instead of a stream we could provide a Collection or a Set.
I kept it as a Stream because the previous iteration of the code used a stream, but I would prefer a Collection.

private final int totalGroupCount;
private final Map<StructLike, Integer> groupsInPartition;
private final long writeMaxFileSize;

protected FileRewritePlan(
Stream<G> groups,
int totalGroupCount,
Map<StructLike, Integer> groupsInPartition,
long writeMaxFileSize) {
this.groups = groups;
this.totalGroupCount = totalGroupCount;
this.groupsInPartition = groupsInPartition;
this.writeMaxFileSize = writeMaxFileSize;
}

/** The stream of the generated {@link FileRewriteGroup}s. */
public Stream<G> groups() {
return groups;
}

/** The number of groups generated in the given partition. */
public int groupsInPartition(StructLike partition) {
return groupsInPartition.get(partition);
}

/** The total number of groups generated by this plan. */
public int totalGroupCount() {
return totalGroupCount;
}

/** Calculated maximum file size, based on the planner's target file size configuration. */
public long writeMaxFileSize() {
return writeMaxFileSize;
}
}
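A small sketch of how an engine might inspect the plan before execution; the plan variable and the SLF4J-style LOG are assumed. Note that groups() returns a Stream, so it can be consumed only once.

  // Hypothetical inspection of a plan before executing its groups.
  int total = plan.totalGroupCount();
  long maxFileSize = plan.writeMaxFileSize(); // upper bound for the size of the rewritten files
  plan.groups()
      .forEach(
          group ->
              LOG.info(
                  "Group {} rewrites {} files ({} bytes) into ~{} output files",
                  group.info(),
                  group.numInputFiles(),
                  group.sizeInBytes(),
                  group.expectedOutputFiles()));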
core/src/main/java/org/apache/iceberg/actions/FileRewritePlanner.java
@@ -0,0 +1,76 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg.actions;

import java.util.Map;
import java.util.Set;
import org.apache.iceberg.ContentFile;
import org.apache.iceberg.ContentScanTask;

/**
* A class for planning content file rewrites.
*
* <p>The entire rewrite operation is broken down into pieces. The grouping is based on
* partitioning, and the planning could create multiple groups within a partition. The result is a
* {@link FileRewritePlan} which contains the data needed by the {@link FileRewriteExecutor}s that
* execute the actual file rewrite.
*
* <p>The lifecycle of the planner is:
*
* <ul>
* <li>{@link #init(Map)} initializes the planner with the configuration parameters
* <li>{@link #plan()} generates the plan for the given configuration
* </ul>
*
* @param <I> the Java type of the plan info
* @param <T> the Java type of the tasks to read content files
* @param <F> the Java type of the content files
* @param <G> the Java type of the planned groups
*/
public interface FileRewritePlanner<
I,
T extends ContentScanTask<F>,
F extends ContentFile<F>,
G extends FileRewriteGroup<I, T, F>> {

/** Returns a description for this planner. */
default String description() {
return getClass().getName();
}

/**
* Returns the set of supported options for this planner. Only options in this set are accepted
* at runtime; any other options are rejected.
*/
Set<String> validOptions();

/**
* Initializes this planner using the provided options.
*
* @param options options to initialize this planner
*/
void init(Map<String, String> options);

/**
* Generates the plan for the rewrite.
*
* @return the generated plan, which can be executed during the compaction
*/
FileRewritePlan<I, T, F, G> plan();
}
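Putting the planner and the executor together, an end-to-end flow might look like the sketch below; MyPlanner and MyExecutor are hypothetical implementations, and plannerOptions/executorOptions are assumed option maps.

  // Hypothetical end-to-end flow: plan first, then execute each group.
  FileRewritePlanner<I, T, F, G> planner = new MyPlanner<>(); // hypothetical planner
  planner.init(plannerOptions);
  FileRewritePlan<I, T, F, G> plan = planner.plan();

  FileRewriteExecutor<I, T, F, G, FileRewritePlan<I, T, F, G>> executor =
      new MyExecutor<>(); // hypothetical engine-specific executor
  executor.init(executorOptions);
  executor.initPlan(plan); // hand the planner-calculated values to the executor
  plan.groups().forEach(executor::rewrite);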
core/src/main/java/org/apache/iceberg/actions/FileRewriter.java
@@ -34,7 +34,10 @@
*
* @param <T> the Java type of tasks to read content files
* @param <F> the Java type of content files
* @deprecated since 1.8.0, will be removed in 1.9.0; use {@link FileRewritePlanner} and {@link
* FileRewriteExecutor}.
*/
@Deprecated
public interface FileRewriter<T extends ContentScanTask<F>, F extends ContentFile<F>> {

/** Returns a description for this rewriter. */