From 8d0cc7e8319035101ed84dc040c862ff1af6c8d7 Mon Sep 17 00:00:00 2001
From: twosom <two_somang@icloud.com>
Date: Mon, 24 Feb 2025 23:29:13 +0900
Subject: [PATCH 1/7] feat : add delete checkpoint dir option in
 TestSparkPipelineOptions

---
 .../beam/runners/spark/TestSparkPipelineOptions.java     | 9 +++++++--
 .../org/apache/beam/runners/spark/TestSparkRunner.java   | 6 ++++--
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/TestSparkPipelineOptions.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/TestSparkPipelineOptions.java
index 4ad7dd120693..121ff3dfdb9e 100644
--- a/runners/spark/src/main/java/org/apache/beam/runners/spark/TestSparkPipelineOptions.java
+++ b/runners/spark/src/main/java/org/apache/beam/runners/spark/TestSparkPipelineOptions.java
@@ -35,8 +35,7 @@ public interface TestSparkPipelineOptions extends SparkPipelineOptions, TestPipe
   void setForceStreaming(boolean forceStreaming);
 
   @Description("A hard-coded expected number of assertions for this test pipeline.")
-  @Nullable
-  Integer getExpectedAssertions();
+  @Nullable Integer getExpectedAssertions();
 
   void setExpectedAssertions(Integer expectedAssertions);
 
@@ -48,6 +47,12 @@ public interface TestSparkPipelineOptions extends SparkPipelineOptions, TestPipe
 
   void setStopPipelineWatermark(Long stopPipelineWatermark);
 
+  @Description("Whether to delete the checkpoint directory after the pipeline execution.")
+  @Default.Boolean(true)
+  boolean isDeleteCheckpointDir();
+
+  void setDeleteCheckpointDir(boolean deleteCheckpointDir);
+
   /**
    * A factory to provide the default watermark to stop a pipeline that reads from an unbounded
    * source.
diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/TestSparkRunner.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/TestSparkRunner.java
index e46b6c5f5e0b..22e25e5272a2 100644
--- a/runners/spark/src/main/java/org/apache/beam/runners/spark/TestSparkRunner.java
+++ b/runners/spark/src/main/java/org/apache/beam/runners/spark/TestSparkRunner.java
@@ -106,8 +106,10 @@ public SparkPipelineResult run(Pipeline pipeline) {
             isOneOf(PipelineResult.State.STOPPED, PipelineResult.State.DONE));
       } finally {
         try {
-          // cleanup checkpoint dir.
-          FileUtils.deleteDirectory(new File(testSparkOptions.getCheckpointDir()));
+          if (testSparkOptions.isDeleteCheckpointDir()) {
+            // cleanup checkpoint dir.
+            FileUtils.deleteDirectory(new File(testSparkOptions.getCheckpointDir()));
+          }
         } catch (IOException e) {
           throw new RuntimeException("Failed to clear checkpoint tmp dir.", e);
         }

From 3df96636812a2656363b1a2369300c8ed704ea9a Mon Sep 17 00:00:00 2001
From: twosom <two_somang@icloud.com>
Date: Tue, 25 Feb 2025 22:33:22 +0900
Subject: [PATCH 2/7] feat : implementation SingleEmitInputDStream

---
 .../translation/SingleEmitInputDStream.java   |  70 ++++++++
 .../SingleEmitInputDStreamTest.java           | 163 ++++++++++++++++++
 2 files changed, 233 insertions(+)
 create mode 100644 runners/spark/src/main/java/org/apache/beam/runners/spark/translation/SingleEmitInputDStream.java
 create mode 100644 runners/spark/src/test/java/org/apache/beam/runners/spark/translation/SingleEmitInputDStreamTest.java

diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/translation/SingleEmitInputDStream.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/translation/SingleEmitInputDStream.java
new file mode 100644
index 000000000000..b4180d793315
--- /dev/null
+++ b/runners/spark/src/main/java/org/apache/beam/runners/spark/translation/SingleEmitInputDStream.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.spark.translation;
+
+import org.apache.spark.api.java.JavaSparkContext$;
+import org.apache.spark.rdd.RDD;
+import org.apache.spark.streaming.StreamingContext;
+import org.apache.spark.streaming.Time;
+import org.apache.spark.streaming.dstream.ConstantInputDStream;
+import org.apache.spark.streaming.dstream.QueueInputDStream;
+import org.joda.time.Instant;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import scala.Option;
+
+/**
+ * A specialized {@link ConstantInputDStream} that emits its RDD exactly once. Alternative to {@link
+ * QueueInputDStream} when checkpointing is required.
+ *
+ * <p>Features:
+ *
+ * <ul>
+ *   <li>Supports checkpointing
+ *   <li>Guarantees single emission of data
+ *   <li>Returns empty RDD after first emission
+ * </ul>
+ *
+ * @param <T> The type of elements in the RDD
+ */
+public class SingleEmitInputDStream<T> extends ConstantInputDStream<T> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(SingleEmitInputDStream.class);
+
+  private boolean emitted = false;
+
+  public SingleEmitInputDStream(StreamingContext ssc, RDD<T> rdd) {
+    super(ssc, rdd, JavaSparkContext$.MODULE$.fakeClassTag());
+  }
+
+  @Override
+  public Option<RDD<T>> compute(Time validTime) {
+    if (this.emitted) {
+      LOG.info("{} already emitted.", getClass().getSimpleName());
+      return Option.apply(this.emptyRDD());
+    } else {
+      this.emitted = true;
+      LOG.info("{} emitted at1 {}", getClass().getSimpleName(), Instant.now());
+      return super.compute(validTime);
+    }
+  }
+
+  private RDD<T> emptyRDD() {
+    return this.context().sparkContext().emptyRDD(JavaSparkContext$.MODULE$.fakeClassTag());
+  }
+}
diff --git a/runners/spark/src/test/java/org/apache/beam/runners/spark/translation/SingleEmitInputDStreamTest.java b/runners/spark/src/test/java/org/apache/beam/runners/spark/translation/SingleEmitInputDStreamTest.java
new file mode 100644
index 000000000000..b13ca1b46c95
--- /dev/null
+++ b/runners/spark/src/test/java/org/apache/beam/runners/spark/translation/SingleEmitInputDStreamTest.java
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.spark.translation;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.empty;
+import static org.hamcrest.Matchers.hasItems;
+import static org.junit.Assert.assertNotNull;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.List;
+import org.apache.beam.runners.spark.SparkContextRule;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.JavaSparkContext$;
+import org.apache.spark.rdd.RDD;
+import org.apache.spark.streaming.Duration;
+import org.apache.spark.streaming.api.java.JavaStreamingContext;
+import org.junit.Before;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import scala.Option;
+
+/**
+ * Tests for {@link SingleEmitInputDStream} class which ensures data is emitted exactly once in
+ * Spark streaming context.
+ */
+public class SingleEmitInputDStreamTest implements Serializable {
+  @ClassRule public static SparkContextRule sparkContext = new SparkContextRule();
+  @Rule public transient TemporaryFolder temporaryFolder = new TemporaryFolder();
+  private final Duration checkpointDuration = new Duration(500L);
+
+  /** Creates a temporary directory for storing checkpoints before each test execution. */
+  @Before
+  public void init() {
+    try {
+      temporaryFolder.create();
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Test
+  public void singleEmitInputDStreamShouldBeEmitOnlyOnce() {
+    // Initialize Spark contexts
+    JavaSparkContext jsc = sparkContext.getSparkContext();
+    JavaStreamingContext jssc = new JavaStreamingContext(jsc, checkpointDuration);
+
+    // Create test data and wrap it in SingleEmitInputDStream
+    final SingleEmitInputDStream<String> singleEmitInputDStream = singleEmitInputDSTream(jsc, jssc);
+    singleEmitInputDStream.checkpoint(checkpointDuration);
+
+    // First computation: should return the original data
+    Option<RDD<String>> rddOption = singleEmitInputDStream.compute(null);
+    RDD<String> rdd = rddOption.get();
+    JavaRDD<String> javaRDD = JavaRDD.fromRDD(rdd, JavaSparkContext$.MODULE$.fakeClassTag());
+    List<String> collect = javaRDD.collect();
+    assertNotNull(collect);
+    assertThat(collect, hasItems("foo", "bar"));
+
+    // Second computation: should return empty RDD since data was already emitted
+    rddOption = singleEmitInputDStream.compute(null);
+    rdd = rddOption.get();
+    javaRDD = JavaRDD.fromRDD(rdd, JavaSparkContext$.MODULE$.fakeClassTag());
+    collect = javaRDD.collect();
+    assertNotNull(collect);
+    assertThat(collect, empty());
+  }
+
+  private SingleEmitInputDStream<String> singleEmitInputDSTream(
+      JavaSparkContext jsc, JavaStreamingContext jssc) {
+    final JavaRDD<String> stringRDD = jsc.parallelize(Lists.newArrayList("foo", "bar"));
+    final SingleEmitInputDStream<String> singleEmitInputDStream =
+        new SingleEmitInputDStream<>(jssc.ssc(), stringRDD.rdd());
+    return singleEmitInputDStream;
+  }
+
+  @Test
+  public void singleEmitInputDStreamShouldBeEmptyAfterCheckpointRecovery()
+      throws InterruptedException {
+    // Set up checkpoint directory
+    String checkpointPath = temporaryFolder.getRoot().getPath();
+
+    // Initialize Spark contexts with checkpoint configuration
+    JavaSparkContext jsc = sparkContext.getSparkContext();
+    jsc.setCheckpointDir(checkpointPath);
+    JavaStreamingContext jssc = new JavaStreamingContext(jsc, checkpointDuration);
+    jssc.checkpoint(checkpointPath);
+
+    // Create test data and configure SingleEmitInputDStream
+    final SingleEmitInputDStream<String> singleEmitInputDStream = singleEmitInputDSTream(jsc, jssc);
+    singleEmitInputDStream.checkpoint(checkpointDuration);
+
+    // Register output operation required by Spark Streaming
+    singleEmitInputDStream.print();
+
+    // Compute initial RDD and verify original data is present
+    Option<RDD<String>> rddOption = singleEmitInputDStream.compute(null);
+    RDD<String> rdd = rddOption.get();
+
+    JavaRDD<String> javaRDD = JavaRDD.fromRDD(rdd, JavaSparkContext$.MODULE$.fakeClassTag());
+    List<String> collect = javaRDD.collect();
+    assertNotNull(collect);
+    assertThat(collect, hasItems("foo", "bar"));
+
+    // Start streaming context to create checkpoint data
+    jssc.start();
+    // Wait for checkpoint to be created and written
+    jssc.awaitTerminationOrTimeout(1000);
+    // Ensure clean shutdown and checkpoint writing
+    jssc.stop(true, true);
+
+    // Recover streaming context from checkpoint
+    JavaStreamingContext recoveredJssc =
+        JavaStreamingContext.getOrCreate(
+            checkpointPath,
+            () -> {
+              throw new RuntimeException(
+                  "Should not create new context, should recover from checkpoint");
+            });
+
+    try {
+      // Extract recovered DStream from the restored context
+      @SuppressWarnings("unchecked")
+      SingleEmitInputDStream<String> recoveredDStream =
+          (SingleEmitInputDStream<String>) recoveredJssc.ssc().graph().getInputStreams()[0];
+
+      // Compute RDD from recovered DStream and verify it's empty
+      Option<RDD<String>> recoveredRddOption = recoveredDStream.compute(null);
+      RDD<String> recoveredRdd = recoveredRddOption.get();
+      JavaRDD<String> recoveredJavaRdd =
+          JavaRDD.fromRDD(recoveredRdd, JavaSparkContext$.MODULE$.fakeClassTag());
+      List<String> recoveredCollect = recoveredJavaRdd.collect();
+
+      // Verify that recovered DStream produces empty results
+      assertNotNull(recoveredCollect);
+      assertThat(recoveredCollect, empty());
+
+    } finally {
+      // Ensure recovered context is properly cleaned up
+      recoveredJssc.stop(true, true);
+    }
+  }
+}

From c396ae55c31ad28e74c6d1b8c5bc3cd1b3e0a3cf Mon Sep 17 00:00:00 2001
From: twosom <two_somang@icloud.com>
Date: Tue, 25 Feb 2025 22:35:06 +0900
Subject: [PATCH 3/7] chore : spotlessApply

---
 .../apache/beam/runners/spark/TestSparkPipelineOptions.java    | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/TestSparkPipelineOptions.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/TestSparkPipelineOptions.java
index 121ff3dfdb9e..e9c358282132 100644
--- a/runners/spark/src/main/java/org/apache/beam/runners/spark/TestSparkPipelineOptions.java
+++ b/runners/spark/src/main/java/org/apache/beam/runners/spark/TestSparkPipelineOptions.java
@@ -35,7 +35,8 @@ public interface TestSparkPipelineOptions extends SparkPipelineOptions, TestPipe
   void setForceStreaming(boolean forceStreaming);
 
   @Description("A hard-coded expected number of assertions for this test pipeline.")
-  @Nullable Integer getExpectedAssertions();
+  @Nullable
+  Integer getExpectedAssertions();
 
   void setExpectedAssertions(Integer expectedAssertions);
 

From 37738378f670cb828175a6a446079d59cdc4e3bf Mon Sep 17 00:00:00 2001
From: twosom <two_somang@icloud.com>
Date: Wed, 26 Feb 2025 23:40:35 +0900
Subject: [PATCH 4/7] feat : Replace queueStream with SingleEmitInputDStream
 for Spark streaming

---
 ...rkStreamingPortablePipelineTranslator.java |  43 ++--
 .../StreamingTransformTranslator.java         |  19 +-
 .../StreamingTransformTranslatorTest.java     | 214 ++++++++++++++++++
 3 files changed, 251 insertions(+), 25 deletions(-)
 create mode 100644 runners/spark/src/test/java/org/apache/beam/runners/spark/translation/streaming/StreamingTransformTranslatorTest.java

diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/translation/SparkStreamingPortablePipelineTranslator.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/translation/SparkStreamingPortablePipelineTranslator.java
index 523dcbad0823..505a91e03b53 100644
--- a/runners/spark/src/main/java/org/apache/beam/runners/spark/translation/SparkStreamingPortablePipelineTranslator.java
+++ b/runners/spark/src/main/java/org/apache/beam/runners/spark/translation/SparkStreamingPortablePipelineTranslator.java
@@ -30,9 +30,7 @@
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
-import java.util.Queue;
 import java.util.Set;
-import java.util.concurrent.LinkedBlockingQueue;
 import org.apache.beam.model.pipeline.v1.RunnerApi;
 import org.apache.beam.runners.fnexecution.provisioning.JobInfo;
 import org.apache.beam.runners.spark.SparkPipelineOptions;
@@ -61,10 +59,11 @@
 import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.JavaSparkContext$;
 import org.apache.spark.broadcast.Broadcast;
 import org.apache.spark.storage.StorageLevel;
 import org.apache.spark.streaming.api.java.JavaDStream;
-import org.apache.spark.streaming.api.java.JavaInputDStream;
+import org.apache.spark.streaming.dstream.ConstantInputDStream;
 import scala.Tuple2;
 import scala.collection.JavaConverters;
 
@@ -158,15 +157,17 @@ private static void translateImpulse(
             .parallelize(CoderHelpers.toByteArrays(windowedValues, windowCoder))
             .map(CoderHelpers.fromByteFunction(windowCoder));
 
-    Queue<JavaRDD<WindowedValue<byte[]>>> rddQueue = new LinkedBlockingQueue<>();
-    rddQueue.offer(emptyByteArrayRDD);
-    JavaInputDStream<WindowedValue<byte[]>> emptyByteArrayStream =
-        context.getStreamingContext().queueStream(rddQueue, true /* oneAtATime */);
+    final ConstantInputDStream<WindowedValue<byte[]>> inputDStream =
+        new ConstantInputDStream<>(
+            context.getStreamingContext().ssc(),
+            emptyByteArrayRDD.rdd(),
+            JavaSparkContext$.MODULE$.fakeClassTag());
+
+    final JavaDStream<WindowedValue<byte[]>> stream =
+        JavaDStream.fromDStream(inputDStream, JavaSparkContext$.MODULE$.fakeClassTag());
 
     UnboundedDataset<byte[]> output =
-        new UnboundedDataset<>(
-            emptyByteArrayStream,
-            Collections.singletonList(emptyByteArrayStream.inputDStream().id()));
+        new UnboundedDataset<>(stream, Collections.singletonList(inputDStream.id()));
 
     // Add watermark to holder and advance to infinity to ensure future watermarks can be updated
     GlobalWatermarkHolder.SparkWatermarks sparkWatermark =
@@ -175,7 +176,6 @@ private static void translateImpulse(
             BoundedWindow.TIMESTAMP_MAX_VALUE,
             context.getFirstTimestamp());
     GlobalWatermarkHolder.add(output.getStreamSources().get(0), sparkWatermark);
-
     context.pushDataset(getOutputId(transformNode), output);
   }
 
@@ -297,6 +297,7 @@ public void setName(String name) {
     }
   }
 
+  @SuppressWarnings("unchecked")
   private static <T> void translateFlatten(
       PTransformNode transformNode,
       RunnerApi.Pipeline pipeline,
@@ -306,9 +307,11 @@ private static <T> void translateFlatten(
     List<Integer> streamSources = new ArrayList<>();
 
     if (inputsMap.isEmpty()) {
-      Queue<JavaRDD<WindowedValue<T>>> q = new LinkedBlockingQueue<>();
-      q.offer(context.getSparkContext().emptyRDD());
-      unifiedStreams = context.getStreamingContext().queueStream(q);
+      final JavaRDD<WindowedValue<T>> emptyRDD = context.getSparkContext().emptyRDD();
+      final SingleEmitInputDStream<WindowedValue<T>> singleEmitInputDStream =
+          new SingleEmitInputDStream<>(context.getStreamingContext().ssc(), emptyRDD.rdd());
+      unifiedStreams =
+          JavaDStream.fromDStream(singleEmitInputDStream, JavaSparkContext$.MODULE$.fakeClassTag());
     } else {
       List<JavaDStream<WindowedValue<T>>> dStreams = new ArrayList<>();
       for (String inputId : inputsMap.values()) {
@@ -319,11 +322,13 @@ private static <T> void translateFlatten(
           dStreams.add(unboundedDataset.getDStream());
         } else {
           // create a single RDD stream.
-          Queue<JavaRDD<WindowedValue<T>>> q = new LinkedBlockingQueue<>();
-          q.offer(((BoundedDataset) dataset).getRDD());
-          // TODO (https://github.com/apache/beam/issues/20426): this is not recoverable from
-          // checkpoint!
-          JavaDStream<WindowedValue<T>> dStream = context.getStreamingContext().queueStream(q);
+          final SingleEmitInputDStream<WindowedValue<T>> singleEmitInputDStream =
+              new SingleEmitInputDStream<WindowedValue<T>>(
+                  context.getStreamingContext().ssc(), ((BoundedDataset) dataset).getRDD().rdd());
+          final JavaDStream<WindowedValue<T>> dStream =
+              JavaDStream.fromDStream(
+                  singleEmitInputDStream, JavaSparkContext$.MODULE$.fakeClassTag());
+
           dStreams.add(dStream);
         }
       }
diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/translation/streaming/StreamingTransformTranslator.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/translation/streaming/StreamingTransformTranslator.java
index 539f8ff3efe6..e06ef79e483f 100644
--- a/runners/spark/src/main/java/org/apache/beam/runners/spark/translation/streaming/StreamingTransformTranslator.java
+++ b/runners/spark/src/main/java/org/apache/beam/runners/spark/translation/streaming/StreamingTransformTranslator.java
@@ -43,6 +43,7 @@
 import org.apache.beam.runners.spark.translation.EvaluationContext;
 import org.apache.beam.runners.spark.translation.GroupCombineFunctions;
 import org.apache.beam.runners.spark.translation.MultiDoFnFunction;
+import org.apache.beam.runners.spark.translation.SingleEmitInputDStream;
 import org.apache.beam.runners.spark.translation.SparkAssignWindowFn;
 import org.apache.beam.runners.spark.translation.SparkCombineFn;
 import org.apache.beam.runners.spark.translation.SparkPCollectionView;
@@ -88,6 +89,7 @@
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.JavaSparkContext$;
+import org.apache.spark.streaming.StreamingContext;
 import org.apache.spark.streaming.api.java.JavaDStream;
 import org.apache.spark.streaming.api.java.JavaInputDStream;
 import org.apache.spark.streaming.api.java.JavaPairDStream;
@@ -291,12 +293,8 @@ public void evaluate(Flatten.PCollections<T> transform, EvaluationContext contex
             dStreams.add(unboundedDataset.getDStream());
           } else {
             // create a single RDD stream.
-            Queue<JavaRDD<WindowedValue<T>>> q = new LinkedBlockingQueue<>();
-            q.offer(((BoundedDataset) dataset).getRDD());
-            // TODO (https://github.com/apache/beam/issues/20426): this is not recoverable from
-            // checkpoint!
-            JavaDStream<WindowedValue<T>> dStream = context.getStreamingContext().queueStream(q);
-            dStreams.add(dStream);
+            dStreams.add(
+                this.buildDStream(context.getStreamingContext().ssc(), (BoundedDataset) dataset));
           }
         }
         // start by unifying streams into a single stream.
@@ -305,6 +303,15 @@ public void evaluate(Flatten.PCollections<T> transform, EvaluationContext contex
         context.putDataset(transform, new UnboundedDataset<>(unifiedStreams, streamingSources));
       }
 
+      private JavaDStream<WindowedValue<T>> buildDStream(
+          final StreamingContext ssc, final BoundedDataset<T> dataset) {
+
+        final SingleEmitInputDStream<WindowedValue<T>> singleEmitDStream =
+            new SingleEmitInputDStream<>(ssc, dataset.getRDD().rdd());
+
+        return JavaDStream.fromDStream(singleEmitDStream, JavaSparkContext$.MODULE$.fakeClassTag());
+      }
+
       @Override
       public String toNativeString() {
         return "streamingContext.union(...)";
diff --git a/runners/spark/src/test/java/org/apache/beam/runners/spark/translation/streaming/StreamingTransformTranslatorTest.java b/runners/spark/src/test/java/org/apache/beam/runners/spark/translation/streaming/StreamingTransformTranslatorTest.java
new file mode 100644
index 000000000000..b324c209ad6d
--- /dev/null
+++ b/runners/spark/src/test/java/org/apache/beam/runners/spark/translation/streaming/StreamingTransformTranslatorTest.java
@@ -0,0 +1,214 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.beam.runners.spark.translation.streaming;
+
+import static org.apache.beam.sdk.metrics.MetricResultsMatchers.attemptedMetricsResult;
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.hasItem;
+
+import java.io.IOException;
+import java.io.Serializable;
+import org.apache.beam.runners.spark.StreamingTest;
+import org.apache.beam.runners.spark.TestSparkPipelineOptions;
+import org.apache.beam.runners.spark.TestSparkRunner;
+import org.apache.beam.runners.spark.UsesCheckpointRecovery;
+import org.apache.beam.runners.spark.io.MicrobatchSource;
+import org.apache.beam.runners.spark.metrics.MetricsAccumulator;
+import org.apache.beam.runners.spark.util.GlobalWatermarkHolder;
+import org.apache.beam.sdk.Pipeline;
+import org.apache.beam.sdk.PipelineResult;
+import org.apache.beam.sdk.io.GenerateSequence;
+import org.apache.beam.sdk.metrics.Distribution;
+import org.apache.beam.sdk.metrics.DistributionResult;
+import org.apache.beam.sdk.metrics.MetricNameFilter;
+import org.apache.beam.sdk.metrics.Metrics;
+import org.apache.beam.sdk.metrics.MetricsFilter;
+import org.apache.beam.sdk.options.PipelineOptionsFactory;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.Flatten;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.transforms.WithTimestamps;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Optional;
+import org.joda.time.Duration;
+import org.joda.time.Instant;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.rules.TemporaryFolder;
+
+/** Test suite for {@link StreamingTransformTranslator}. */
+public class StreamingTransformTranslatorTest implements Serializable {
+
+  @Rule public transient TemporaryFolder temporaryFolder = new TemporaryFolder();
+  public transient Pipeline p;
+
+  /** Creates a temporary directory for storing checkpoints before each test execution. */
+  @Before
+  public void init() {
+    try {
+      temporaryFolder.create();
+      System.out.println("Checkpoint Directory = " + temporaryFolder.getRoot().getPath());
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * Tests that Flatten transform of Bounded and Unbounded PCollections correctly recovers from
+   * checkpoint.
+   *
+   * <p>Test scenario:
+   *
+   * <ol>
+   *   <li>First run:
+   *       <ul>
+   *         <li>Flattens Bounded PCollection(0-9) with Unbounded PCollection
+   *         <li>Stops pipeline after 400ms
+   *         <li>Validates metrics results
+   *       </ul>
+   *   <li>Second run (recovery from checkpoint):
+   *       <ul>
+   *         <li>Recovers from previous state and continues execution
+   *         <li>Stops pipeline after 1 second
+   *         <li>Validates accumulated metrics results
+   *       </ul>
+   * </ol>
+   */
+  @Category({UsesCheckpointRecovery.class, StreamingTest.class})
+  @Test
+  public void testFlattenPCollResumeFromCheckpoint() {
+    final MetricsFilter metricsFilter =
+        MetricsFilter.builder()
+            .addNameFilter(MetricNameFilter.inNamespace(PAssertFn.class))
+            .build();
+
+    PipelineResult res = run(Optional.of(new Instant(400)), false);
+
+    // Verify metrics for Bounded PCollection (sum of 0-9 = 45, count = 10)
+    assertThat(
+        res.metrics().queryMetrics(metricsFilter).getDistributions(),
+        hasItem(
+            attemptedMetricsResult(
+                PAssertFn.class.getName(),
+                "distribution",
+                "BoundedAssert",
+                DistributionResult.create(45, 10, 0L, 9L))));
+
+    // Verify metrics for Flattened result after first run
+    assertThat(
+        res.metrics().queryMetrics(metricsFilter).getDistributions(),
+        hasItem(
+            attemptedMetricsResult(
+                PAssertFn.class.getName(),
+                "distribution",
+                "FlattenedAssert",
+                DistributionResult.create(45, 10, 0L, 9L))));
+
+    // Clean up state
+    clean();
+
+    // Second run: recover from checkpoint
+    res = runAgain();
+
+    // Verify Bounded PCollection metrics remain the same
+    assertThat(
+        res.metrics().queryMetrics(metricsFilter).getDistributions(),
+        hasItem(
+            attemptedMetricsResult(
+                PAssertFn.class.getName(),
+                "distribution",
+                "BoundedAssert",
+                DistributionResult.create(45, 10, 0L, 9L))));
+
+    // Verify Flattened results show accumulated values from both runs
+    assertThat(
+        res.metrics().queryMetrics(metricsFilter).getDistributions(),
+        hasItem(
+            attemptedMetricsResult(
+                PAssertFn.class.getName(),
+                "distribution",
+                "FlattenedAssert",
+                DistributionResult.create(91, 14, 0, 13))));
+  }
+
+  /** Restarts the pipeline from checkpoint. Sets pipeline to stop after 1 second. */
+  private PipelineResult runAgain() {
+    return run(Optional.of(Instant.ofEpochMilli(Duration.standardSeconds(1L).getMillis())), true);
+  }
+
+  /**
+   * Sets up and runs the test pipeline.
+   *
+   * @param stopWatermarkOption Watermark at which to stop the pipeline
+   * @param deleteCheckpointDir Whether to delete checkpoint directory after completion
+   */
+  private PipelineResult run(Optional<Instant> stopWatermarkOption, boolean deleteCheckpointDir) {
+    TestSparkPipelineOptions options =
+        PipelineOptionsFactory.create().as(TestSparkPipelineOptions.class);
+    options.setSparkMaster("local[*]");
+    options.setRunner(TestSparkRunner.class);
+    options.setCheckpointDir(temporaryFolder.getRoot().getPath());
+    if (stopWatermarkOption.isPresent()) {
+      options.setStopPipelineWatermark(stopWatermarkOption.get().getMillis());
+    }
+    options.setDeleteCheckpointDir(deleteCheckpointDir);
+
+    p = Pipeline.create(options);
+
+    final PCollection<Long> bounded =
+        p.apply("Bounded", GenerateSequence.from(0).to(10))
+            .apply("BoundedAssert", ParDo.of(new PAssertFn()));
+
+    final PCollection<Long> unbounded =
+        p.apply("Unbounded", GenerateSequence.from(10).withRate(3, Duration.standardSeconds(1)))
+            .apply(WithTimestamps.of(e -> Instant.now()));
+
+    final PCollection<Long> flattened = bounded.apply(Flatten.with(unbounded));
+
+    flattened.apply("FlattenedAssert", ParDo.of(new PAssertFn()));
+    return p.run();
+  }
+
+  /**
+   * Cleans up accumulated state between test runs. Clears metrics, watermarks, and microbatch
+   * source cache.
+   */
+  @After
+  public void clean() {
+    MetricsAccumulator.clear();
+    GlobalWatermarkHolder.clear();
+    MicrobatchSource.clearCache();
+  }
+
+  /**
+   * DoFn that tracks element distribution through metrics. Used to verify correct processing of
+   * elements in both bounded and unbounded streams.
+   */
+  private static class PAssertFn extends DoFn<Long, Long> {
+    private final Distribution distribution = Metrics.distribution(PAssertFn.class, "distribution");
+
+    @ProcessElement
+    public void process(@Element Long element, OutputReceiver<Long> output) {
+      distribution.update(element);
+      output.output(element);
+    }
+  }
+}

From 8d68bbe8723289529363235ee56b11bdf40288a0 Mon Sep 17 00:00:00 2001
From: twosom <two_somang@icloud.com>
Date: Wed, 26 Feb 2025 23:49:57 +0900
Subject: [PATCH 5/7] chore : remove logging in SingleEmitInputDStream

---
 .../runners/spark/translation/SingleEmitInputDStream.java  | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/translation/SingleEmitInputDStream.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/translation/SingleEmitInputDStream.java
index b4180d793315..3cde41d2812e 100644
--- a/runners/spark/src/main/java/org/apache/beam/runners/spark/translation/SingleEmitInputDStream.java
+++ b/runners/spark/src/main/java/org/apache/beam/runners/spark/translation/SingleEmitInputDStream.java
@@ -23,9 +23,6 @@
 import org.apache.spark.streaming.Time;
 import org.apache.spark.streaming.dstream.ConstantInputDStream;
 import org.apache.spark.streaming.dstream.QueueInputDStream;
-import org.joda.time.Instant;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 import scala.Option;
 
 /**
@@ -44,8 +41,6 @@
  */
 public class SingleEmitInputDStream<T> extends ConstantInputDStream<T> {
 
-  private static final Logger LOG = LoggerFactory.getLogger(SingleEmitInputDStream.class);
-
   private boolean emitted = false;
 
   public SingleEmitInputDStream(StreamingContext ssc, RDD<T> rdd) {
@@ -55,11 +50,9 @@ public SingleEmitInputDStream(StreamingContext ssc, RDD<T> rdd) {
   @Override
   public Option<RDD<T>> compute(Time validTime) {
     if (this.emitted) {
-      LOG.info("{} already emitted.", getClass().getSimpleName());
       return Option.apply(this.emptyRDD());
     } else {
       this.emitted = true;
-      LOG.info("{} emitted at1 {}", getClass().getSimpleName(), Instant.now());
       return super.compute(validTime);
     }
   }

From 24118f71f49a75665a1bcac7993d3ee88f2c2f76 Mon Sep 17 00:00:00 2001
From: twosom <two_somang@icloud.com>
Date: Wed, 26 Feb 2025 23:55:23 +0900
Subject: [PATCH 6/7] Touch trigger files

---
 .../beam_PostCommit_Java_ValidatesRunner_Spark.json            | 3 ++-
 ...stCommit_Java_ValidatesRunner_SparkStructuredStreaming.json | 3 ++-
 .../beam_PostCommit_Java_ValidatesRunner_Spark_Java11.json     | 3 ++-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_Spark.json b/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_Spark.json
index dd2bf3aeb361..e3fd31aabce2 100644
--- a/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_Spark.json
+++ b/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_Spark.json
@@ -4,5 +4,6 @@
   "https://github.com/apache/beam/pull/31798": "noting that PR #31798 should run this test",
   "https://github.com/apache/beam/pull/32546": "noting that PR #32546 should run this test",
   "https://github.com/apache/beam/pull/33267": "noting that PR #33267 should run this test",
-  "https://github.com/apache/beam/pull/33322": "noting that PR #33322 should run this test"
+  "https://github.com/apache/beam/pull/33322": "noting that PR #33322 should run this test",
+  "https://github.com/apache/beam/pull/34080": "noting that PR #34080 should run this test"
 }
diff --git a/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_SparkStructuredStreaming.json b/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_SparkStructuredStreaming.json
index 74f4220571e5..4f32700a039a 100644
--- a/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_SparkStructuredStreaming.json
+++ b/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_SparkStructuredStreaming.json
@@ -3,5 +3,6 @@
   "https://github.com/apache/beam/pull/31156": "noting that PR #31156 should run this test",
   "https://github.com/apache/beam/pull/31798": "noting that PR #31798 should run this test",
   "https://github.com/apache/beam/pull/32546": "noting that PR #32546 should run this test",
-  "https://github.com/apache/beam/pull/33267": "noting that PR #33267 should run this test"
+  "https://github.com/apache/beam/pull/33267": "noting that PR #33267 should run this test",
+  "https://github.com/apache/beam/pull/34080": "noting that PR #34080 should run this test"
 }
diff --git a/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_Spark_Java11.json b/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_Spark_Java11.json
index dd2bf3aeb361..e3fd31aabce2 100644
--- a/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_Spark_Java11.json
+++ b/.github/trigger_files/beam_PostCommit_Java_ValidatesRunner_Spark_Java11.json
@@ -4,5 +4,6 @@
   "https://github.com/apache/beam/pull/31798": "noting that PR #31798 should run this test",
   "https://github.com/apache/beam/pull/32546": "noting that PR #32546 should run this test",
   "https://github.com/apache/beam/pull/33267": "noting that PR #33267 should run this test",
-  "https://github.com/apache/beam/pull/33322": "noting that PR #33322 should run this test"
+  "https://github.com/apache/beam/pull/33322": "noting that PR #33322 should run this test",
+  "https://github.com/apache/beam/pull/34080": "noting that PR #34080 should run this test"
 }

From 573f9d74188fbcd492fce5c01edc73ef89578082 Mon Sep 17 00:00:00 2001
From: twosom <two_somang@icloud.com>
Date: Thu, 27 Feb 2025 12:03:01 +0900
Subject: [PATCH 7/7] test : fix flaky StreamingTransformTranslatorTest

---
 .../StreamingTransformTranslatorTest.java     | 41 +++++++++++++++----
 1 file changed, 34 insertions(+), 7 deletions(-)

diff --git a/runners/spark/src/test/java/org/apache/beam/runners/spark/translation/streaming/StreamingTransformTranslatorTest.java b/runners/spark/src/test/java/org/apache/beam/runners/spark/translation/streaming/StreamingTransformTranslatorTest.java
index b324c209ad6d..e61f530748c7 100644
--- a/runners/spark/src/test/java/org/apache/beam/runners/spark/translation/streaming/StreamingTransformTranslatorTest.java
+++ b/runners/spark/src/test/java/org/apache/beam/runners/spark/translation/streaming/StreamingTransformTranslatorTest.java
@@ -19,6 +19,7 @@
 
 import static org.apache.beam.sdk.metrics.MetricResultsMatchers.attemptedMetricsResult;
 import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.anyOf;
 import static org.hamcrest.Matchers.hasItem;
 
 import java.io.IOException;
@@ -65,7 +66,6 @@ public class StreamingTransformTranslatorTest implements Serializable {
   public void init() {
     try {
       temporaryFolder.create();
-      System.out.println("Checkpoint Directory = " + temporaryFolder.getRoot().getPath());
     } catch (IOException e) {
       throw new RuntimeException(e);
     }
@@ -139,19 +139,37 @@ public void testFlattenPCollResumeFromCheckpoint() {
                 DistributionResult.create(45, 10, 0L, 9L))));
 
     // Verify Flattened results show accumulated values from both runs
+    // We use anyOf matcher because the unbounded source may emit either 2 or 3 elements during the
+    // test window:
+    // Case 1 (3 elements): sum=78 (45 from bounded + 33 from unbounded), count=13 (10 bounded + 3
+    // unbounded)
+    // Case 2 (2 elements): sum=66 (45 from bounded + 21 from unbounded), count=12 (10 bounded + 2
+    // unbounded)
+    // This variation occurs because the unbounded source's withRate(3, Duration.standardSeconds(1))
+    // timing may be affected by test environment conditions
     assertThat(
         res.metrics().queryMetrics(metricsFilter).getDistributions(),
         hasItem(
-            attemptedMetricsResult(
-                PAssertFn.class.getName(),
-                "distribution",
-                "FlattenedAssert",
-                DistributionResult.create(91, 14, 0, 13))));
+            anyOf(
+                attemptedMetricsResult(
+                    PAssertFn.class.getName(),
+                    "distribution",
+                    "FlattenedAssert",
+                    DistributionResult.create(78, 13, 0, 12)),
+                attemptedMetricsResult(
+                    PAssertFn.class.getName(),
+                    "distribution",
+                    "FlattenedAssert",
+                    DistributionResult.create(66, 12, 0, 11)))));
   }
 
   /** Restarts the pipeline from checkpoint. Sets pipeline to stop after 1 second. */
   private PipelineResult runAgain() {
-    return run(Optional.of(Instant.ofEpochMilli(Duration.standardSeconds(1L).getMillis())), true);
+    return run(
+        Optional.of(
+            Instant.ofEpochMilli(
+                Duration.standardSeconds(1L).plus(Duration.millis(50L)).getMillis())),
+        true);
   }
 
   /**
@@ -207,6 +225,15 @@ private static class PAssertFn extends DoFn<Long, Long> {
 
     @ProcessElement
     public void process(@Element Long element, OutputReceiver<Long> output) {
+      // For the unbounded source (starting from 10), we expect only 3 elements (10, 11, 12)
+      // to be emitted during the 1-second test window.
+      // However, different execution environments might emit more elements than expected
+      // despite the withRate(3, Duration.standardSeconds(1)) setting.
+      // Therefore, we filter out elements >= 13 to ensure consistent test behavior
+      // across all environments.
+      if (element >= 13L) {
+        return;
+      }
       distribution.update(element);
       output.output(element);
     }