3 changes: 2 additions & 1 deletion .github/trigger_files/beam_PostCommit_Java_DataflowV1.json
@@ -1,3 +1,4 @@
 {
-  "comment": "Modify this file in a trivial way to cause this test suite to run"
+  "comment": "Modify this file in a trivial way to cause this test suite to run",
+  "modification": 1
 }

2 changes: 2 additions & 0 deletions CHANGES.md
@@ -64,6 +64,8 @@
 
 * Support gcs-connector 3.x+ in GcsUtil ([#33368](https://github.com/apache/beam/pull/33368))
 * Support for X source added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)).
+* Introduced the `--groupFilesFileLoad` pipeline option to mitigate side-input-related issues in BigQueryIO
+  batch FILE_LOAD on certain runners (including Dataflow Runner V2) (Java) ([#33587](https://github.com/apache/beam/pull/33587)).
 
 ## New Features / Improvements
 
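As a quick illustration of the new flag (a minimal sketch, not part of the PR; the class name is hypothetical and it assumes the beam-sdks-java-io-google-cloud-platform module is on the classpath), it parses like any other Beam pipeline option:

import org.apache.beam.sdk.io.gcp.bigquery.BigQueryOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class GroupFilesFlagSketch {
  public static void main(String[] args) {
    // Passing --groupFilesFileLoad=true (or the bare flag --groupFilesFileLoad)
    // turns the option on; it defaults to false.
    BigQueryOptions options =
        PipelineOptionsFactory.fromArgs("--groupFilesFileLoad=true").as(BigQueryOptions.class);
    System.out.println(options.getGroupFilesFileLoad()); // prints: true
  }
}
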
@@ -480,12 +480,21 @@ public WriteResult expandUntriggered(PCollection<KV<DestinationT, ElementT>> input)
     TupleTag<KV<ShardedKey<DestinationT>, WritePartition.Result>> singlePartitionTag =
         new TupleTag<KV<ShardedKey<DestinationT>, WritePartition.Result>>("singlePartitionTag") {};
 
+    PTransform<
+            PCollection<WriteBundlesToFiles.Result<DestinationT>>,
+            PCollection<Iterable<WriteBundlesToFiles.Result<DestinationT>>>>
+        reifyTransform;
+    if (p.getOptions().as(BigQueryOptions.class).getGroupFilesFileLoad()) {
+      reifyTransform = new CombineAsIterable<>();
+    } else {
+      reifyTransform = new ReifyAsIterable<>();
+    }
     // This transform will look at the set of files written for each table, and if any table has
     // too many files or bytes, will partition that table's files into multiple partitions for
     // loading.
     PCollectionTuple partitions =
         results
-            .apply("ReifyResults", new ReifyAsIterable<>())
+            .apply("ReifyResults", reifyTransform)
             .setCoder(IterableCoder.of(WriteBundlesToFiles.ResultCoder.of(destinationCoder)))
             .apply(
                 "WritePartitionUntriggered",

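For contrast with `CombineAsIterable` (the new transform added below), here is a sketch of the side-input route, modeled on, but not guaranteed to match, Beam's `ReifyAsIterable`: the input is materialized as an Iterable-valued view and re-emitted from a single-element driver collection. That materialized side input is what causes trouble on some runners when the file list grows large, and it is the step the new option routes around. The class name is hypothetical:

import org.apache.beam.sdk.coders.VoidCoder;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;

// Hypothetical name; a sketch of the side-input approach, not a verbatim copy of Beam's code.
class SideInputAsIterable<T> extends PTransform<PCollection<T>, PCollection<Iterable<T>>> {
  @Override
  public PCollection<Iterable<T>> expand(PCollection<T> input) {
    // Materialize the entire input as an Iterable-valued side input...
    final PCollectionView<Iterable<T>> view = input.apply(View.asIterable());
    // ...then emit it once from a single-element driver collection.
    return input
        .getPipeline()
        .apply(Create.of((Void) null).withCoder(VoidCoder.of()))
        .apply(
            ParDo.of(
                    new DoFn<Void, Iterable<T>>() {
                      @ProcessElement
                      public void processElement(ProcessContext c) {
                        c.output(c.sideInput(view));
                      }
                    })
                .withSideInputs(view));
  }
}
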
@@ -229,4 +229,20 @@ public interface BigQueryOptions
   String getBigQueryEndpoint();
 
   void setBigQueryEndpoint(String value);
+
+  /**
+   * Choose to use a GroupByKey when gathering the list of files in batch FILE_LOAD.
+   *
+   * <p>The purpose of this option is to accommodate runner compatibility: some runners have known
+   * issues with large side inputs, and turning this option on avoids those side-input-related
+   * issues.
+   *
+   * <p>This is an experimental pipeline option; no backward compatibility is guaranteed.
+   */
+  @Hidden
+  @Description("Whether to use a GroupByKey instead of a side input when gathering the list of files in batch FILE_LOAD.")
+  @Default.Boolean(false)
+  Boolean getGroupFilesFileLoad();
+
+  void setGroupFilesFileLoad(Boolean value);
 }

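The option can also be flipped programmatically before the pipeline is built; a minimal sketch under the same assumptions as above (class name hypothetical):

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class EnableGroupFilesSketch {
  public static void main(String[] args) {
    BigQueryOptions options =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(BigQueryOptions.class);
    // Opt in to GroupByKey-based file gathering for batch FILE_LOAD writes.
    options.setGroupFilesFileLoad(true);
    Pipeline p = Pipeline.create(options);
    // ... attach a BigQueryIO.write() using Method.FILE_LOADS here ...
    p.run().waitUntilFinish();
  }
}
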
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.sdk.io.gcp.bigquery;
+
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.GroupByKey;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.transforms.Values;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PCollection;
+
+/**
+ * Gathers an entire {@link PCollection} into a single {@link Iterable} by keying every element to
+ * the same key and applying a {@link GroupByKey}, avoiding the use of a side input.
+ */
+public class CombineAsIterable<T> extends PTransform<PCollection<T>, PCollection<Iterable<T>>> {
+  @Override
+  public PCollection<Iterable<T>> expand(PCollection<T> input) {
+    return input
+        .apply(
+            "assign single key",
+            ParDo.of(
+                new DoFn<T, KV<Void, T>>() {
+                  @ProcessElement
+                  public void processElement(@Element T element, OutputReceiver<KV<Void, T>> o) {
+                    o.output(KV.of(null, element));
+                  }
+                }))
+        .apply(GroupByKey.create())
+        .apply(Values.create());
+  }
+}

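A usage sketch for the new transform (illustrative only, not from the PR): gather a whole PCollection into a single Iterable without a side input. The design trade-off is visible in the implementation above: keying every element to the same null key funnels the entire collection through one GroupByKey group, so the result is assembled and consumed in a single bundle rather than held in a side input.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.gcp.bigquery.CombineAsIterable;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;

public class CombineAsIterableSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
    PCollection<Iterable<String>> gathered =
        p.apply(Create.of("red", "green", "blue")).apply("GatherAll", new CombineAsIterable<>());
    gathered.apply(
        "PrintAll",
        ParDo.of(
            new DoFn<Iterable<String>, Void>() {
              @ProcessElement
              public void processElement(@Element Iterable<String> elements) {
                // All three elements arrive together as one iterable.
                System.out.println(elements);
              }
            }));
    p.run().waitUntilFinish();
  }
}
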
@@ -316,11 +316,17 @@ abstract static class StringLongDestinations extends DynamicDestinations<String, Long>
   public void testWriteEmptyPCollection() throws Exception {
     assumeTrue(!useStreaming);
     assumeTrue(!useStorageApi);
+    writeEmptyPCollection();
+    checkNotNull(
+        fakeDatasetService.getTable(
+            BigQueryHelpers.parseTableSpec("project-id:dataset-id.table-id")));
+  }
 
+  void writeEmptyPCollection() {
     TableSchema schema =
         new TableSchema()
             .setFields(
                 ImmutableList.of(new TableFieldSchema().setName("number").setType("INTEGER")));
 
     p.apply(Create.empty(TableRowJsonCoder.of()))
         .apply(
             BigQueryIO.writeTableRows()
@@ -331,8 +337,14 @@ public void testWriteEmptyPCollection() throws Exception {
                 .withSchema(schema)
                 .withoutValidation());
     p.run();
+  }
 
-    checkNotNull(
+  @Test
+  public void testWriteEmptyPCollectionGroupFilesFileLoad() throws Exception {
+    assumeFalse(useStorageApi || useStorageApiApproximate || useStreaming);
+    p.getOptions().as(BigQueryOptions.class).setGroupFilesFileLoad(true);
+    writeEmptyPCollection();
+    assertNull(
         fakeDatasetService.getTable(
             BigQueryHelpers.parseTableSpec("project-id:dataset-id.table-id")));
   }
@@ -342,6 +354,13 @@ public void testWriteDynamicDestinations() throws Exception {
     writeDynamicDestinations(false, false);
   }
 
+  @Test
+  public void testWriteDynamicDestinationsGroupFilesFileLoad() throws Exception {
+    assumeFalse(useStorageApi || useStorageApiApproximate || useStreaming);
+    p.getOptions().as(BigQueryOptions.class).setGroupFilesFileLoad(true);
+    writeDynamicDestinations(false, false);
+  }
+
   @Test
   public void testWriteDynamicDestinationsStreamingWithAutoSharding() throws Exception {
     assumeTrue(useStreaming);