From 7306870eb487a7499f916e4b0387b956ba59a034 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Fri, 14 Jun 2024 22:00:12 -0400 Subject: [PATCH 01/43] support writing pubsub messages with ordering key --- .../beam/sdk/io/gcp/pubsub/PubsubIO.java | 27 +++++- .../gcp/pubsub/PubsubMessageSchemaCoder.java | 90 +++++++++++++++++++ .../pubsub/PubsubMessageSchemaCoderTest.java | 63 +++++++++++++ .../beam/sdk/io/gcp/pubsub/PubsubWriteIT.java | 60 ++++++++++++- 4 files changed, 237 insertions(+), 3 deletions(-) create mode 100644 sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java create mode 100644 sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoderTest.java diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java index 01848d92d928..66a1c912cda4 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java @@ -735,6 +735,20 @@ public static Write writeMessages() { .build(); } + /** + * Returns A {@link PTransform} that writes {@link PubsubMessage}s, along with the {@link + * PubsubMessage#getMessageId() messageId} and {@link PubsubMessage#getOrderingKey()}, to a Google + * Cloud Pub/Sub stream. + */ + public static Write writeMessagesWithOrderingKey() { + return Write.newBuilder() + .setTopicProvider(null) + .setTopicFunction(null) + .setDynamicDestinations(false) + .setNeedsOrderingKey(true) + .build(); + } + /** * Enables dynamic destination topics. The {@link PubsubMessage} elements are each expected to * contain a destination topic, which can be set using {@link PubsubMessage#withTopic}. If {@link @@ -1288,6 +1302,8 @@ public abstract static class Write extends PTransform, PDone> abstract @Nullable String getPubsubRootUrl(); + abstract boolean getNeedsOrderingKey(); + abstract BadRecordRouter getBadRecordRouter(); abstract ErrorHandler getBadRecordErrorHandler(); @@ -1301,6 +1317,7 @@ static Builder newBuilder( builder.setFormatFn(formatFn); builder.setBadRecordRouter(BadRecordRouter.THROWING_ROUTER); builder.setBadRecordErrorHandler(new DefaultErrorHandler<>()); + builder.setNeedsOrderingKey(false); return builder; } @@ -1332,6 +1349,8 @@ abstract Builder setFormatFn( abstract Builder setPubsubRootUrl(String pubsubRootUrl); + abstract Builder setNeedsOrderingKey(boolean needsOrderingKey); + abstract Builder setBadRecordRouter(BadRecordRouter badRecordRouter); abstract Builder setBadRecordErrorHandler( @@ -1487,8 +1506,12 @@ public PDone expand(PCollection input) { pubsubMessageTuple .get(BAD_RECORD_TAG) .setCoder(BadRecord.getCoder(input.getPipeline()))); - PCollection pubsubMessages = - pubsubMessageTuple.get(pubsubMessageTupleTag).setCoder(new PubsubMessageWithTopicCoder()); + PCollection pubsubMessages = pubsubMessageTuple.get(pubsubMessageTupleTag); + if (getNeedsOrderingKey()) { + pubsubMessages.setCoder(PubsubMessageSchemaCoder.getSchemaCoder()); + } else { + pubsubMessages.setCoder(new PubsubMessageWithTopicCoder()); + } switch (input.isBounded()) { case BOUNDED: pubsubMessages.apply( diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java new file mode 100644 index 000000000000..72eb32d87668 --- /dev/null +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.gcp.pubsub; + +import java.util.HashMap; +import java.util.Map; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.schemas.SchemaCoder; +import org.apache.beam.sdk.transforms.SerializableFunction; +import org.apache.beam.sdk.values.Row; +import org.apache.beam.sdk.values.TypeDescriptor; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; + +/** + * Provides a {@link SchemaCoder} for {@link PubsubMessage}, including the topic and all fields of a + * PubSub message from server. + * + *

{@link SchemaCoder} is used so that fields can be added in the future without breaking update + * compatibility. + */ +public class PubsubMessageSchemaCoder { + private static final Schema PUBSUB_MESSAGE_SCHEMA = + Schema.builder() + .addByteArrayField("payload") + .addNullableStringField("topic") + .addNullableMapField("attributes", Schema.FieldType.STRING, Schema.FieldType.STRING) + .addNullableStringField("message_id") + .addNullableStringField("ordering_key") + .build(); + + private static final SerializableFunction TO_ROW = + (PubsubMessage message) -> { + Map fieldValues = new HashMap<>(); + fieldValues.put("payload", message.getPayload()); + + String topic = message.getTopic(); + if (topic != null) { + fieldValues.put("topic", topic); + } + Map attributeMap = message.getAttributeMap(); + if (attributeMap != null) { + fieldValues.put("attributes", attributeMap); + } + String messageId = message.getMessageId(); + if (messageId != null) { + fieldValues.put("message_id", messageId); + } + String orderingKey = message.getOrderingKey(); + if (orderingKey != null) { + fieldValues.put("ordering_key", orderingKey); + } + return Row.withSchema(PUBSUB_MESSAGE_SCHEMA).withFieldValues(fieldValues).build(); + }; + + private static final SerializableFunction FROM_ROW = + (Row row) -> { + PubsubMessage message = + new PubsubMessage( + Preconditions.checkNotNull(row.getBytes("payload")), + row.getMap("attributes"), + row.getString("message_id"), + row.getString("ordering_key")); + + String topic = row.getString("topic"); + if (topic != null) { + message = message.withTopic(topic); + } + return message; + }; + + public static SchemaCoder getSchemaCoder() { + return SchemaCoder.of( + PUBSUB_MESSAGE_SCHEMA, TypeDescriptor.of(PubsubMessage.class), TO_ROW, FROM_ROW); + } +} diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoderTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoderTest.java new file mode 100644 index 000000000000..0776b1cb100f --- /dev/null +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoderTest.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.gcp.pubsub; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.equalTo; + +import java.nio.charset.StandardCharsets; +import java.util.Map; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.testing.CoderProperties; +import org.apache.beam.sdk.util.SerializableUtils; +import org.apache.beam.sdk.values.TypeDescriptor; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; +import org.junit.Test; + +public class PubsubMessageSchemaCoderTest { + private static final String DATA = "testData"; + private static final String TOPIC = "testTopic"; + private static final String MESSAGE_ID = "testMessageId"; + private static final Map ATTRIBUTES = + new ImmutableMap.Builder().put("1", "hello").build(); + private static final String ORDERING_KEY = "key123"; + private static final Coder TEST_CODER = PubsubMessageSchemaCoder.getSchemaCoder(); + private static final PubsubMessage TEST_VALUE = + new PubsubMessage(DATA.getBytes(StandardCharsets.UTF_8), ATTRIBUTES, MESSAGE_ID, ORDERING_KEY) + .withTopic(TOPIC); + private static final PubsubMessage TEST_MINIMAL_VALUE = + new PubsubMessage(DATA.getBytes(StandardCharsets.UTF_8), null); + + @Test + public void testValueEncodable() { + SerializableUtils.ensureSerializableByCoder(TEST_CODER, TEST_VALUE, "error"); + SerializableUtils.ensureSerializableByCoder(TEST_CODER, TEST_MINIMAL_VALUE, "error"); + } + + @Test + public void testCoderDecodeEncodeEqual() throws Exception { + CoderProperties.structuralValueDecodeEncodeEqual(TEST_CODER, TEST_VALUE); + CoderProperties.structuralValueDecodeEncodeEqual(TEST_CODER, TEST_MINIMAL_VALUE); + } + + @Test + public void testEncodedTypeDescriptor() { + TypeDescriptor typeDescriptor = new TypeDescriptor() {}; + assertThat(TEST_CODER.getEncodedTypeDescriptor(), equalTo(typeDescriptor)); + } +} diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteIT.java index 1898c4ae4af0..02e37b633525 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteIT.java @@ -17,11 +17,17 @@ */ package org.apache.beam.sdk.io.gcp.pubsub; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + import java.io.IOException; import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.List; import java.util.Map; import java.util.Objects; import org.apache.beam.sdk.io.GenerateSequence; +import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.SubscriptionPath; import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.TopicPath; import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.transforms.Create; @@ -46,11 +52,12 @@ public class PubsubWriteIT { private PubsubClient pubsubClient; private TopicPath testTopic; + private String project; @Before public void setup() throws IOException { PubsubOptions options = TestPipeline.testingPipelineOptions().as(PubsubOptions.class); - String project = options.getProject(); + project = options.getProject(); pubsubClient = PubsubGrpcClient.FACTORY.newClient(null, null, options); testTopic = PubsubClient.topicPathFromName(project, "pubsub-write-" + Instant.now().getMillis()); @@ -102,4 +109,55 @@ public void testBoundedWriteMessageWithAttributes() { .apply(PubsubIO.writeMessages().to(testTopic.getPath())); pipeline.run(); } + + @Test + public void testBoundedWriteMessageWithAttributesAndMessageIdAndOrderingKey() throws IOException { + TopicPath testTopicPath = + PubsubClient.topicPathFromName( + project, "pubsub-write-ordering-key-" + Instant.now().getMillis()); + pubsubClient.createTopic(testTopicPath); + SubscriptionPath testSubscriptionPath = + pubsubClient.createRandomSubscription( + PubsubClient.projectPathFromId(project), testTopicPath, 10); + + byte[] payload = RandomStringUtils.randomAscii(1_000_000).getBytes(StandardCharsets.UTF_8); + Map attributes = + ImmutableMap.builder() + .put("id", "1") + .put("description", RandomStringUtils.randomAscii(100)) + .build(); + + PubsubMessage outgoingMessage = + new PubsubMessage(payload, attributes, "test_message", "111222"); + + pipeline + .apply(Create.of(outgoingMessage).withCoder(PubsubMessageSchemaCoder.getSchemaCoder())) + .apply(PubsubIO.writeMessagesWithOrderingKey().to(testTopicPath.getPath())); + pipeline.run().waitUntilFinish(); + + List incomingMessages = + pubsubClient.pull(Instant.now().getMillis(), testSubscriptionPath, 1, true); + + // sometimes the first pull comes up short. try 4 pulls to avoid flaky false-negatives + int numPulls = 1; + while (incomingMessages.isEmpty()) { + if (numPulls >= 4) { + throw new RuntimeException( + String.format("Pulled %s times from PubSub but retrieved no elements.", numPulls)); + } + incomingMessages = + pubsubClient.pull(Instant.now().getMillis(), testSubscriptionPath, 1, true); + numPulls++; + } + assertEquals(1, incomingMessages.size()); + + com.google.pubsub.v1.PubsubMessage incomingMessage = incomingMessages.get(0).message(); + assertTrue( + Arrays.equals(outgoingMessage.getPayload(), incomingMessage.getData().toByteArray())); + assertEquals(outgoingMessage.getAttributeMap(), incomingMessage.getAttributesMap()); + assertEquals(outgoingMessage.getOrderingKey(), incomingMessage.getOrderingKey()); + + pubsubClient.deleteSubscription(testSubscriptionPath); + pubsubClient.deleteTopic(testTopicPath); + } } From 4bcd3b43f3b25e5504e094f5b550f7e4084d2a66 Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Tue, 18 Jun 2024 22:53:11 +0000 Subject: [PATCH 02/43] Add ordering key size validation to validatePubsubMessageSize --- .../io/gcp/pubsub/PreparePubsubWriteDoFn.java | 14 +++++++ .../pubsub/PreparePubsubWriteDoFnTest.java | 39 +++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java index 521e65b934b9..1d4900f701b9 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java @@ -66,6 +66,20 @@ static int validatePubsubMessageSize(PubsubMessage message, int maxPublishBatchS } int totalSize = payloadSize; + @Nullable String orderingKey = message.getOrderingKey(); + if (orderingKey != null) { + int orderingKeySize = orderingKey.getBytes(StandardCharsets.UTF_8).length; + if (orderingKeySize > PUBSUB_MESSAGE_ATTRIBUTE_MAX_VALUE_BYTES) { + throw new SizeLimitExceededException( + "Pubsub message ordering key of length " + + orderingKeySize + + " exceeds maximum of " + + PUBSUB_MESSAGE_ATTRIBUTE_MAX_VALUE_BYTES + + " bytes. See https://cloud.google.com/pubsub/quotas#resource_limits"); + } + totalSize += orderingKeySize; + } + @Nullable Map attributes = message.getAttributeMap(); if (attributes != null) { if (attributes.size() > PUBSUB_MESSAGE_MAX_ATTRIBUTES) { diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFnTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFnTest.java index 494189d43f36..a125a7b67e69 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFnTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFnTest.java @@ -44,6 +44,19 @@ public void testValidatePubsubMessageSizeOnlyPayload() throws SizeLimitExceededE assertEquals(data.length, messageSize); } + @Test + public void testValidatePubsubMessageSizePayloadAndOrderingKey() + throws SizeLimitExceededException { + byte[] data = new byte[1024]; + String orderingKey = "key"; + PubsubMessage message = new PubsubMessage(data, null, null, orderingKey); + + int messageSize = + PreparePubsubWriteDoFn.validatePubsubMessageSize(message, PUBSUB_MESSAGE_MAX_TOTAL_SIZE); + + assertEquals(data.length + orderingKey.getBytes(StandardCharsets.UTF_8).length, messageSize); + } + @Test public void testValidatePubsubMessageSizePayloadAndAttributes() throws SizeLimitExceededException { @@ -76,6 +89,19 @@ public void testValidatePubsubMessageSizePayloadTooLarge() { message, PUBSUB_MESSAGE_MAX_TOTAL_SIZE)); } + @Test + public void testValidatePubsubMessageSizePayloadPlusOrderingKeyTooLarge() { + byte[] data = new byte[(10 << 20)]; + String orderingKey = "key"; + PubsubMessage message = new PubsubMessage(data, null, null, orderingKey); + + assertThrows( + SizeLimitExceededException.class, + () -> + PreparePubsubWriteDoFn.validatePubsubMessageSize( + message, PUBSUB_MESSAGE_MAX_TOTAL_SIZE)); + } + @Test public void testValidatePubsubMessageSizePayloadPlusAttributesTooLarge() { byte[] data = new byte[(10 << 20)]; @@ -121,6 +147,19 @@ public void testValidatePubsubMessageSizeAttributeValueTooLarge() { message, PUBSUB_MESSAGE_MAX_TOTAL_SIZE)); } + @Test + public void testValidatePubsubMessageSizeOrderingKeyTooLarge() { + byte[] data = new byte[1024]; + String orderingKey = RandomStringUtils.randomAscii(1025); + PubsubMessage message = new PubsubMessage(data, null, null, orderingKey); + + assertThrows( + SizeLimitExceededException.class, + () -> + PreparePubsubWriteDoFn.validatePubsubMessageSize( + message, PUBSUB_MESSAGE_MAX_TOTAL_SIZE)); + } + @Test public void testValidatePubsubMessagePayloadTooLarge() { byte[] data = new byte[(10 << 20) + 1]; From 9627cbeceeedfcfcd07ffa3a456085ed0d6e7b94 Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Tue, 18 Jun 2024 01:30:40 +0200 Subject: [PATCH 03/43] Refactor writeMessagesWithOrderingKey into withOrderingKey --- .../beam/sdk/io/gcp/pubsub/PubsubIO.java | 24 ++++++++----------- .../beam/sdk/io/gcp/pubsub/PubsubWriteIT.java | 2 +- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java index 6b767587312b..942cbee1b9f7 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java @@ -741,20 +741,6 @@ public static Write writeMessages() { .build(); } - /** - * Returns A {@link PTransform} that writes {@link PubsubMessage}s, along with the {@link - * PubsubMessage#getMessageId() messageId} and {@link PubsubMessage#getOrderingKey()}, to a Google - * Cloud Pub/Sub stream. - */ - public static Write writeMessagesWithOrderingKey() { - return Write.newBuilder() - .setTopicProvider(null) - .setTopicFunction(null) - .setDynamicDestinations(false) - .setNeedsOrderingKey(true) - .build(); - } - /** * Enables dynamic destination topics. The {@link PubsubMessage} elements are each expected to * contain a destination topic, which can be set using {@link PubsubMessage#withTopic}. If {@link @@ -1474,6 +1460,16 @@ public Write withMaxBatchBytesSize(int maxBatchBytesSize) { return toBuilder().setMaxBatchBytesSize(maxBatchBytesSize).build(); } + /** + * Writes to Pub/Sub with each record's ordering key. A subscription with message ordering + * enabled will receive messages published in the same region with the same ordering key in the + * order in which they were received by the service. Note that the order in which Beam publishes + * records to the service remains unspecified. + */ + public Write withOrderingKey() { + return toBuilder().setNeedsOrderingKey(true).build(); + } + /** * Writes to Pub/Sub and adds each record's timestamp to the published messages in an attribute * with the specified name. The value of the attribute will be a number representing the number diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteIT.java index 02e37b633525..fe36e31e69bb 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteIT.java @@ -132,7 +132,7 @@ public void testBoundedWriteMessageWithAttributesAndMessageIdAndOrderingKey() th pipeline .apply(Create.of(outgoingMessage).withCoder(PubsubMessageSchemaCoder.getSchemaCoder())) - .apply(PubsubIO.writeMessagesWithOrderingKey().to(testTopicPath.getPath())); + .apply(PubsubIO.writeMessages().withOrderingKey().to(testTopicPath.getPath())); pipeline.run().waitUntilFinish(); List incomingMessages = From ddd916f050dcb9436e4a5abdc5be2bd6d20f18c3 Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Wed, 19 Jun 2024 12:09:20 +0200 Subject: [PATCH 04/43] Route to bad records if key is defined, but would be dropped silently --- .../sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java | 11 +++++++++++ .../org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java | 1 + 2 files changed, 12 insertions(+) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java index 1d4900f701b9..7c7e78d72f12 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java @@ -40,6 +40,7 @@ public class PreparePubsubWriteDoFn extends DoFn private static final int PUBSUB_MESSAGE_ATTRIBUTE_MAX_VALUE_BYTES = 1024; // The amount of bytes that each attribute entry adds up to the request private static final int PUBSUB_MESSAGE_ATTRIBUTE_ENCODE_ADDITIONAL_BYTES = 6; + private boolean allowOrderingKey; private int maxPublishBatchSize; private SerializableFunction, PubsubMessage> formatFunction; @@ -139,12 +140,14 @@ static int validatePubsubMessageSize(PubsubMessage message, int maxPublishBatchS SerializableFunction, PubsubMessage> formatFunction, @Nullable SerializableFunction, PubsubIO.PubsubTopic> topicFunction, + boolean allowOrderingKey, int maxPublishBatchSize, BadRecordRouter badRecordRouter, Coder inputCoder, TupleTag outputTag) { this.formatFunction = formatFunction; this.topicFunction = topicFunction; + this.allowOrderingKey = allowOrderingKey; this.maxPublishBatchSize = maxPublishBatchSize; this.badRecordRouter = badRecordRouter; this.inputCoder = inputCoder; @@ -189,6 +192,14 @@ public void process( .add("pubsub", "topic", PubsubClient.topicPathFromPath(topic).getDataCatalogSegments()); reportedLineage = topic; } + if (!allowOrderingKey && message.getOrderingKey() != null) { + badRecordRouter.route( + o, + element, + inputCoder, + null, + "The transform was not configured to publish messages with ordering keys"); + } try { validatePubsubMessageSize(message, maxPublishBatchSize); } catch (SizeLimitExceededException e) { diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java index 942cbee1b9f7..40d4157b29ef 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java @@ -1541,6 +1541,7 @@ public PDone expand(PCollection input) { new PreparePubsubWriteDoFn<>( getFormatFn(), topicFunction, + getNeedsOrderingKey(), maxMessageSize, getBadRecordRouter(), input.getCoder(), From 4791fca1a241bbce999faf8113c01a52d3978a91 Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Wed, 19 Jun 2024 12:13:52 +0200 Subject: [PATCH 05/43] Add publishBatchWithOrderingKey to PubsubUnboundedSink --- .../beam/sdk/io/gcp/pubsub/PubsubIO.java | 1 + .../io/gcp/pubsub/PubsubUnboundedSink.java | 19 ++++++++++++++++++- .../PubSubWritePayloadTranslationTest.java | 3 +++ .../gcp/pubsub/PubsubUnboundedSinkTest.java | 5 +++++ 4 files changed, 27 insertions(+), 1 deletion(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java index 40d4157b29ef..5d14809fa1a9 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java @@ -1578,6 +1578,7 @@ public PDone expand(PCollection input) { getTimestampAttribute(), getIdAttribute(), 100 /* numShards */, + getNeedsOrderingKey(), MoreObjects.firstNonNull( getMaxBatchSize(), PubsubUnboundedSink.DEFAULT_PUBLISH_BATCH_SIZE), MoreObjects.firstNonNull( diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java index aa8e3a411486..66e58b67a1da 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java @@ -378,6 +378,9 @@ public void populateDisplayData(DisplayData.Builder builder) { */ private final int numShards; + /** Publish messages with an ordering key. Currently unsupported in DataflowRunner. */ + private final boolean publishBatchWithOrderingKey; + /** Maximum number of messages per publish. */ private final int publishBatchSize; @@ -402,6 +405,7 @@ public void populateDisplayData(DisplayData.Builder builder) { String timestampAttribute, String idAttribute, int numShards, + boolean publishBatchWithOrderingKey, int publishBatchSize, int publishBatchBytes, Duration maxLatency, @@ -412,6 +416,7 @@ public void populateDisplayData(DisplayData.Builder builder) { this.timestampAttribute = timestampAttribute; this.idAttribute = idAttribute; this.numShards = numShards; + this.publishBatchWithOrderingKey = publishBatchWithOrderingKey; this.publishBatchSize = publishBatchSize; this.publishBatchBytes = publishBatchBytes; this.maxLatency = maxLatency; @@ -424,13 +429,15 @@ public PubsubUnboundedSink( ValueProvider topic, String timestampAttribute, String idAttribute, - int numShards) { + int numShards, + boolean publishBatchWithOrderingKey) { this( pubsubFactory, topic, timestampAttribute, idAttribute, numShards, + publishBatchWithOrderingKey, DEFAULT_PUBLISH_BATCH_SIZE, DEFAULT_PUBLISH_BATCH_BYTES, DEFAULT_MAX_LATENCY, @@ -444,6 +451,7 @@ public PubsubUnboundedSink( String timestampAttribute, String idAttribute, int numShards, + boolean publishBatchWithOrderingKey, String pubsubRootUrl) { this( pubsubFactory, @@ -451,6 +459,7 @@ public PubsubUnboundedSink( timestampAttribute, idAttribute, numShards, + publishBatchWithOrderingKey, DEFAULT_PUBLISH_BATCH_SIZE, DEFAULT_PUBLISH_BATCH_BYTES, DEFAULT_MAX_LATENCY, @@ -464,6 +473,7 @@ public PubsubUnboundedSink( String timestampAttribute, String idAttribute, int numShards, + boolean publishBatchWithOrderingKey, int publishBatchSize, int publishBatchBytes) { this( @@ -472,6 +482,7 @@ public PubsubUnboundedSink( timestampAttribute, idAttribute, numShards, + publishBatchWithOrderingKey, publishBatchSize, publishBatchBytes, DEFAULT_MAX_LATENCY, @@ -485,6 +496,7 @@ public PubsubUnboundedSink( String timestampAttribute, String idAttribute, int numShards, + boolean publishBatchWithOrderingKey, int publishBatchSize, int publishBatchBytes, String pubsubRootUrl) { @@ -494,6 +506,7 @@ public PubsubUnboundedSink( timestampAttribute, idAttribute, numShards, + publishBatchWithOrderingKey, publishBatchSize, publishBatchBytes, DEFAULT_MAX_LATENCY, @@ -520,6 +533,10 @@ public PubsubUnboundedSink( return idAttribute; } + public boolean getPublishBatchWithOrderingKey() { + return publishBatchWithOrderingKey; + } + @Override public PDone expand(PCollection input) { if (topic != null) { diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubSubWritePayloadTranslationTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubSubWritePayloadTranslationTest.java index 45fbab0576fb..75a6173591a1 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubSubWritePayloadTranslationTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubSubWritePayloadTranslationTest.java @@ -67,6 +67,7 @@ public void testTranslateSinkWithTopic() throws Exception { TIMESTAMP_ATTRIBUTE, ID_ATTRIBUTE, 0, + false, 0, 0, Duration.ZERO, @@ -104,6 +105,7 @@ public void testTranslateDynamicSink() throws Exception { TIMESTAMP_ATTRIBUTE, ID_ATTRIBUTE, 0, + false, 0, 0, Duration.ZERO, @@ -143,6 +145,7 @@ public void testTranslateSinkWithTopicOverridden() throws Exception { TIMESTAMP_ATTRIBUTE, ID_ATTRIBUTE, 0, + false, 0, 0, Duration.ZERO, diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSinkTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSinkTest.java index be68083bb28c..5296f4850ef1 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSinkTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSinkTest.java @@ -120,6 +120,7 @@ public void sendOneMessage() throws IOException { TIMESTAMP_ATTRIBUTE, ID_ATTRIBUTE, NUM_SHARDS, + false, batchSize, batchBytes, Duration.standardSeconds(2), @@ -152,6 +153,7 @@ public void sendOneMessageWithoutAttributes() throws IOException { TIMESTAMP_ATTRIBUTE, ID_ATTRIBUTE, NUM_SHARDS, + false, 1 /* batchSize */, 1 /* batchBytes */, Duration.standardSeconds(2), @@ -207,6 +209,7 @@ public void testDynamicTopics() throws IOException { TIMESTAMP_ATTRIBUTE, ID_ATTRIBUTE, NUM_SHARDS, + false, 1 /* batchSize */, 1 /* batchBytes */, Duration.standardSeconds(2), @@ -258,6 +261,7 @@ public void sendMoreThanOneBatchByNumMessages() throws IOException { TIMESTAMP_ATTRIBUTE, ID_ATTRIBUTE, NUM_SHARDS, + false, batchSize, batchBytes, Duration.standardSeconds(2), @@ -303,6 +307,7 @@ public void sendMoreThanOneBatchByByteSize() throws IOException { TIMESTAMP_ATTRIBUTE, ID_ATTRIBUTE, NUM_SHARDS, + false, batchSize, batchBytes, Duration.standardSeconds(2), From 73b07c10a3dd84821bae274b075b1a3fe45e21a9 Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Wed, 19 Jun 2024 12:15:58 +0200 Subject: [PATCH 06/43] Abort override if PubsubUnboundedSink set publishBatchWithOrderingKey --- .../apache/beam/runners/dataflow/DataflowRunner.java | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java index abe7d0d364d3..75a31c37b232 100644 --- a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java +++ b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java @@ -20,6 +20,7 @@ import static java.nio.charset.StandardCharsets.UTF_8; import static org.apache.beam.sdk.io.gcp.pubsub.PubsubIO.ENABLE_CUSTOM_PUBSUB_SINK; import static org.apache.beam.sdk.io.gcp.pubsub.PubsubIO.ENABLE_CUSTOM_PUBSUB_SOURCE; +import static org.apache.beam.sdk.options.ExperimentalOptions.hasExperiment; import static org.apache.beam.sdk.util.CoderUtils.encodeToByteArray; import static org.apache.beam.sdk.util.SerializableUtils.serializeToByteArray; import static org.apache.beam.sdk.util.StringUtils.byteArrayToJsonString; @@ -2033,6 +2034,16 @@ private static void translate( PubsubUnboundedSink overriddenTransform, StepTranslationContext stepContext, PCollection input) { + if (overriddenTransform.getPublishBatchWithOrderingKey()) { + throw new UnsupportedOperationException( + String.format( + "%s does not currently support publishing with ordering keys. " + + "%s is required to support publishing with ordering keys. " + + "Set the pipeline option --experiments=%s to use this PTransform.", + StreamingPubsubIOWrite.class.getSimpleName(), + PubsubUnboundedSink.class.getSimpleName(), + ENABLE_CUSTOM_PUBSUB_SINK)); + } stepContext.addInput(PropertyNames.FORMAT, "pubsub"); if (overriddenTransform.getTopicProvider() != null) { if (overriddenTransform.getTopicProvider().isAccessible()) { From 986c2a5fb22943c76d6eb750528cf037dc07c3ec Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Wed, 19 Jun 2024 12:19:14 +0200 Subject: [PATCH 07/43] Add support for ordering keys in PubsubBoundedWriter --- .../beam/sdk/io/gcp/pubsub/PubsubIO.java | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java index 5d14809fa1a9..268a164c55c4 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java @@ -1611,7 +1611,7 @@ private class OutgoingData { } } - private transient Map output; + private transient Map, OutgoingData> output; private transient PubsubClient pubsubClient; @@ -1653,19 +1653,21 @@ public void processElement(@Element PubsubMessage message, @Timestamp Instant ti pubsubTopic = PubsubTopic.fromPath(Preconditions.checkArgumentNotNull(message.getTopic())); } + String orderingKey = message.getOrderingKey(); + // Checking before adding the message stops us from violating max batch size or bytes - OutgoingData currentTopicOutput = - output.computeIfAbsent(pubsubTopic, t -> new OutgoingData()); - if (currentTopicOutput.messages.size() >= maxPublishBatchSize - || (!currentTopicOutput.messages.isEmpty() - && (currentTopicOutput.bytes + messageSize) >= maxPublishBatchByteSize)) { - publish(pubsubTopic, currentTopicOutput.messages); - currentTopicOutput.messages.clear(); - currentTopicOutput.bytes = 0; + OutgoingData currentTopicAndOrderingKeyOutput = + output.computeIfAbsent(KV.of(pubsubTopic, orderingKey), t -> new OutgoingData()); + if (currentTopicAndOrderingKeyOutput.messages.size() >= maxPublishBatchSize + || (!currentTopicAndOrderingKeyOutput.messages.isEmpty() + && (currentTopicAndOrderingKeyOutput.bytes + messageSize) + >= maxPublishBatchByteSize)) { + publish(pubsubTopic, currentTopicAndOrderingKeyOutput.messages); + currentTopicAndOrderingKeyOutput.messages.clear(); + currentTopicAndOrderingKeyOutput.bytes = 0; } Map attributes = message.getAttributeMap(); - String orderingKey = message.getOrderingKey(); com.google.pubsub.v1.PubsubMessage.Builder msgBuilder = com.google.pubsub.v1.PubsubMessage.newBuilder() @@ -1677,16 +1679,16 @@ public void processElement(@Element PubsubMessage message, @Timestamp Instant ti } // NOTE: The record id is always null. - currentTopicOutput.messages.add( + currentTopicAndOrderingKeyOutput.messages.add( OutgoingMessage.of( msgBuilder.build(), timestamp.getMillis(), null, message.getTopic())); - currentTopicOutput.bytes += messageSize; + currentTopicAndOrderingKeyOutput.bytes += messageSize; } @FinishBundle public void finishBundle() throws IOException { - for (Map.Entry entry : output.entrySet()) { - publish(entry.getKey(), entry.getValue().messages); + for (Map.Entry, OutgoingData> entry : output.entrySet()) { + publish(entry.getKey().getKey(), entry.getValue().messages); } output = null; pubsubClient.close(); From f5f8b57ad75e9913217a6029e713b4690a6f18ef Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Thu, 20 Jun 2024 01:23:46 +0200 Subject: [PATCH 08/43] Add support for ordering keys in PubsubUnboundedSink --- .../io/gcp/pubsub/PubsubUnboundedSink.java | 81 ++++++++++++++----- 1 file changed, 63 insertions(+), 18 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java index 66e58b67a1da..4105f3e79a00 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java @@ -23,8 +23,12 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; +import java.util.Objects; import java.util.UUID; import java.util.concurrent.ThreadLocalRandom; import org.apache.beam.sdk.coders.AtomicCoder; @@ -202,7 +206,12 @@ public void processElement( } @Nullable String topic = dynamicTopicFn.apply(element); - K key = keyFunction.apply(ThreadLocalRandom.current().nextInt(numShards), topic); + @Nullable String orderingKey = message.getOrderingKey(); + int shard = + orderingKey == null + ? ThreadLocalRandom.current().nextInt(numShards) + : Hashing.murmur3_32_fixed().hashString(orderingKey, StandardCharsets.UTF_8).asInt(); + K key = keyFunction.apply(shard, topic); o.output(KV.of(key, OutgoingMessage.of(message, timestampMsSinceEpoch, recordId, topic))); } @@ -219,6 +228,16 @@ public void populateDisplayData(DisplayData.Builder builder) { /** Publish messages to Pubsub in batches. */ private static class WriterFn extends DoFn, Void> { + private class OutgoingData { + int bytes; + List messages; + + OutgoingData() { + this.bytes = 0; + this.messages = new ArrayList<>(publishBatchSize); + } + } + private final PubsubClientFactory pubsubFactory; private final @Nullable ValueProvider topic; private final String timestampAttribute; @@ -305,27 +324,53 @@ public void startBundle(StartBundleContext c) throws Exception { } @ProcessElement + @SuppressWarnings("ReferenceEquality") public void processElement(ProcessContext c) throws Exception { - List pubsubMessages = new ArrayList<>(publishBatchSize); - int bytes = 0; + // TODO(sjvanrossum): Refactor the write transform so this map can be indexed with topic + + // ordering key and have bundle scoped lifetime. + // NB: A larger, breaking refactor could make this irrelevant with a GBK on topic + ordering + // key (or GroupIntoBatches with configurable shard for ShardedKey?) and unify + // bounded/unbounded writes and static/dynamic destinations. + Map<@Nullable String, OutgoingData> orderingKeyBatches = new HashMap<>(); + @Nullable String currentOrderingKey = null; + @Nullable OutgoingData currentBatch = null; for (OutgoingMessage message : c.element()) { - if (!pubsubMessages.isEmpty() - && bytes + message.getMessage().getData().size() > publishBatchBytes) { - // Break large (in bytes) batches into smaller. - // (We've already broken by batch size using the trigger below, though that may - // run slightly over the actual PUBLISH_BATCH_SIZE. We'll consider that ok since - // the hard limit from Pubsub is by bytes rather than number of messages.) - // BLOCKS until published. - publishBatch(pubsubMessages, bytes); - pubsubMessages.clear(); - bytes = 0; + // If currentBatch is null set currentOrderingKey before entering the then clause. If + // currentBatch is not null and currentOrderingKey does not equal messageOrderingKey set + // currentOrderingKey and currentBatch before entering the then or else clause and only + // enter the then clause if currentBatch is null. This ensures currentBatch is initialized + // and contains at least one element before entering the else clause if currentOrderingKey + // is equal to messageOrderingKey. + @Nullable String messageOrderingKey = message.getMessage().getOrderingKey(); + if ((currentBatch == null + && (currentOrderingKey = messageOrderingKey) == messageOrderingKey) + || (!Objects.equals(currentOrderingKey, messageOrderingKey) + && (currentBatch = orderingKeyBatches.get(currentOrderingKey = messageOrderingKey)) + == null)) { + currentBatch = new OutgoingData(); + currentBatch.messages.add(message); + currentBatch.bytes += message.getMessage().getData().size(); + orderingKeyBatches.put(currentOrderingKey, currentBatch); + } else { + if (currentBatch.bytes + message.getMessage().getData().size() > publishBatchBytes) { + // Break large (in bytes) batches into smaller. + // (We've already broken by batch size using the trigger below, though that may + // run slightly over the actual PUBLISH_BATCH_SIZE. We'll consider that ok since + // the hard limit from Pubsub is by bytes rather than number of messages.) + // BLOCKS until published. + publishBatch(currentBatch.messages, currentBatch.bytes); + currentBatch.messages.clear(); + currentBatch.bytes = 0; + } + currentBatch.messages.add(message); + currentBatch.bytes += message.getMessage().getData().size(); } - pubsubMessages.add(message); - bytes += message.getMessage().getData().size(); } - if (!pubsubMessages.isEmpty()) { - // BLOCKS until published. - publishBatch(pubsubMessages, bytes); + for (OutgoingData batch : orderingKeyBatches.values()) { + if (!batch.messages.isEmpty()) { + // BLOCKS until published. + publishBatch(batch.messages, batch.bytes); + } } } From 21e8e8e0ba1146c6f3a5b0a100edb9bf44b1cee7 Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Wed, 10 Jul 2024 12:14:59 +0200 Subject: [PATCH 09/43] Remove nullable ordering keys, null and empty are equivalent --- .../sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java | 3 ++- .../org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java | 10 +++++++--- .../beam/sdk/io/gcp/pubsub/PubsubJsonClient.java | 6 +++--- .../beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java | 11 +++++++---- 4 files changed, 19 insertions(+), 11 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java index 7c7e78d72f12..491fd37fbaef 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java @@ -29,6 +29,7 @@ import org.apache.beam.sdk.transforms.windowing.PaneInfo; import org.apache.beam.sdk.values.TupleTag; import org.apache.beam.sdk.values.ValueInSingleWindow; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; import org.checkerframework.checker.nullness.qual.Nullable; import org.joda.time.Instant; @@ -192,7 +193,7 @@ public void process( .add("pubsub", "topic", PubsubClient.topicPathFromPath(topic).getDataCatalogSegments()); reportedLineage = topic; } - if (!allowOrderingKey && message.getOrderingKey() != null) { + if (!allowOrderingKey && !Strings.isNullOrEmpty(message.getOrderingKey())) { badRecordRouter.route( o, element, diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java index 268a164c55c4..352642cffccd 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java @@ -85,6 +85,7 @@ import org.apache.beam.sdk.values.ValueInSingleWindow; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.MoreObjects; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; @@ -1653,11 +1654,14 @@ public void processElement(@Element PubsubMessage message, @Timestamp Instant ti pubsubTopic = PubsubTopic.fromPath(Preconditions.checkArgumentNotNull(message.getTopic())); } - String orderingKey = message.getOrderingKey(); + + // NOTE: Protobuf strings are non-null, null and empty keys are equivalent. + @Nullable String orderingKey = message.getOrderingKey(); // Checking before adding the message stops us from violating max batch size or bytes OutgoingData currentTopicAndOrderingKeyOutput = - output.computeIfAbsent(KV.of(pubsubTopic, orderingKey), t -> new OutgoingData()); + output.computeIfAbsent( + KV.of(pubsubTopic, Strings.emptyIfNull(orderingKey)), t -> new OutgoingData()); if (currentTopicAndOrderingKeyOutput.messages.size() >= maxPublishBatchSize || (!currentTopicAndOrderingKeyOutput.messages.isEmpty() && (currentTopicAndOrderingKeyOutput.bytes + messageSize) @@ -1687,7 +1691,7 @@ public void processElement(@Element PubsubMessage message, @Timestamp Instant ti @FinishBundle public void finishBundle() throws IOException { - for (Map.Entry, OutgoingData> entry : output.entrySet()) { + for (Map.Entry, OutgoingData> entry : output.entrySet()) { publish(entry.getKey().getKey(), entry.getValue().messages); } output = null; diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubJsonClient.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubJsonClient.java index 386febcf005b..15a1649a1d99 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubJsonClient.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubJsonClient.java @@ -230,11 +230,11 @@ public List pull( com.google.pubsub.v1.PubsubMessage.newBuilder(); protoMessage.setData(ByteString.copyFrom(elementBytes)); protoMessage.putAllAttributes(attributes); - // PubsubMessage uses `null` to represent no ordering key where we want a default of "". + // {@link PubsubMessage} uses `null` or empty string to represent no ordering key. + // {@link com.google.pubsub.v1.PubsubMessage} does not track string field presence and uses + // empty string as a default. if (pubsubMessage.getOrderingKey() != null) { protoMessage.setOrderingKey(pubsubMessage.getOrderingKey()); - } else { - protoMessage.setOrderingKey(""); } incomingMessages.add( IncomingMessage.of( diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java index 4105f3e79a00..6751d6429870 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java @@ -73,7 +73,9 @@ import org.apache.beam.sdk.values.TypeDescriptors; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.hash.Hashing; +import org.checkerframework.checker.nullness.qual.MonotonicNonNull; import org.checkerframework.checker.nullness.qual.Nullable; import org.joda.time.Duration; import org.joda.time.Instant; @@ -205,10 +207,11 @@ public void processElement( break; } + // NOTE: Null and empty ordering keys are treated as equivalent. @Nullable String topic = dynamicTopicFn.apply(element); @Nullable String orderingKey = message.getOrderingKey(); int shard = - orderingKey == null + Strings.isNullOrEmpty(orderingKey) ? ThreadLocalRandom.current().nextInt(numShards) : Hashing.murmur3_32_fixed().hashString(orderingKey, StandardCharsets.UTF_8).asInt(); K key = keyFunction.apply(shard, topic); @@ -331,8 +334,8 @@ public void processElement(ProcessContext c) throws Exception { // NB: A larger, breaking refactor could make this irrelevant with a GBK on topic + ordering // key (or GroupIntoBatches with configurable shard for ShardedKey?) and unify // bounded/unbounded writes and static/dynamic destinations. - Map<@Nullable String, OutgoingData> orderingKeyBatches = new HashMap<>(); - @Nullable String currentOrderingKey = null; + Map orderingKeyBatches = new HashMap<>(); + @MonotonicNonNull String currentOrderingKey = null; @Nullable OutgoingData currentBatch = null; for (OutgoingMessage message : c.element()) { // If currentBatch is null set currentOrderingKey before entering the then clause. If @@ -341,7 +344,7 @@ public void processElement(ProcessContext c) throws Exception { // enter the then clause if currentBatch is null. This ensures currentBatch is initialized // and contains at least one element before entering the else clause if currentOrderingKey // is equal to messageOrderingKey. - @Nullable String messageOrderingKey = message.getMessage().getOrderingKey(); + String messageOrderingKey = message.getMessage().getOrderingKey(); if ((currentBatch == null && (currentOrderingKey = messageOrderingKey) == messageOrderingKey) || (!Objects.equals(currentOrderingKey, messageOrderingKey) From 42bbb77788d1c64ccbe1b498a754947ff4aec00d Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Wed, 10 Jul 2024 14:32:38 +0200 Subject: [PATCH 10/43] Construct OutgoingMessage with Beam PubsubMessage to reduce repetition --- .../beam/sdk/io/gcp/pubsub/PubsubIO.java | 36 +++++-------------- 1 file changed, 9 insertions(+), 27 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java index 352642cffccd..c9305cda0c43 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java @@ -22,7 +22,6 @@ import com.google.api.client.util.Clock; import com.google.auto.value.AutoValue; -import com.google.protobuf.ByteString; import com.google.protobuf.Descriptors.Descriptor; import com.google.protobuf.DynamicMessage; import com.google.protobuf.InvalidProtocolBufferException; @@ -85,7 +84,6 @@ import org.apache.beam.sdk.values.ValueInSingleWindow; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.MoreObjects; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; @@ -1644,24 +1642,22 @@ public void processElement(@Element PubsubMessage message, @Timestamp Instant ti throws IOException, SizeLimitExceededException { // Validate again here just as a sanity check. PreparePubsubWriteDoFn.validatePubsubMessageSize(message, maxPublishBatchByteSize); - byte[] payload = message.getPayload(); - int messageSize = payload.length; + // NOTE: The record id is always null. + final OutgoingMessage msg = + OutgoingMessage.of(message, timestamp.getMillis(), null, message.getTopic()); + final int messageSize = msg.getMessage().getData().size(); - PubsubTopic pubsubTopic; + final PubsubTopic pubsubTopic; if (getTopicProvider() != null) { pubsubTopic = getTopicProvider().get(); } else { - pubsubTopic = - PubsubTopic.fromPath(Preconditions.checkArgumentNotNull(message.getTopic())); + pubsubTopic = PubsubTopic.fromPath(Preconditions.checkArgumentNotNull(msg.topic())); } - // NOTE: Protobuf strings are non-null, null and empty keys are equivalent. - @Nullable String orderingKey = message.getOrderingKey(); - // Checking before adding the message stops us from violating max batch size or bytes - OutgoingData currentTopicAndOrderingKeyOutput = + final OutgoingData currentTopicAndOrderingKeyOutput = output.computeIfAbsent( - KV.of(pubsubTopic, Strings.emptyIfNull(orderingKey)), t -> new OutgoingData()); + KV.of(pubsubTopic, msg.getMessage().getOrderingKey()), t -> new OutgoingData()); if (currentTopicAndOrderingKeyOutput.messages.size() >= maxPublishBatchSize || (!currentTopicAndOrderingKeyOutput.messages.isEmpty() && (currentTopicAndOrderingKeyOutput.bytes + messageSize) @@ -1671,21 +1667,7 @@ public void processElement(@Element PubsubMessage message, @Timestamp Instant ti currentTopicAndOrderingKeyOutput.bytes = 0; } - Map attributes = message.getAttributeMap(); - - com.google.pubsub.v1.PubsubMessage.Builder msgBuilder = - com.google.pubsub.v1.PubsubMessage.newBuilder() - .setData(ByteString.copyFrom(payload)) - .putAllAttributes(attributes); - - if (orderingKey != null) { - msgBuilder.setOrderingKey(orderingKey); - } - - // NOTE: The record id is always null. - currentTopicAndOrderingKeyOutput.messages.add( - OutgoingMessage.of( - msgBuilder.build(), timestamp.getMillis(), null, message.getTopic())); + currentTopicAndOrderingKeyOutput.messages.add(msg); currentTopicAndOrderingKeyOutput.bytes += messageSize; } From 394d135a86a7445a0c86e8ca3abcc73b920d95b8 Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Wed, 10 Jul 2024 14:34:07 +0200 Subject: [PATCH 11/43] Improve readability of PubsubUnboundedSink batch assignment --- .../sdk/io/gcp/pubsub/PubsubUnboundedSink.java | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java index 6751d6429870..e5b93cee9fce 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java @@ -28,7 +28,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Objects; import java.util.UUID; import java.util.concurrent.ThreadLocalRandom; import org.apache.beam.sdk.coders.AtomicCoder; @@ -338,18 +337,12 @@ public void processElement(ProcessContext c) throws Exception { @MonotonicNonNull String currentOrderingKey = null; @Nullable OutgoingData currentBatch = null; for (OutgoingMessage message : c.element()) { - // If currentBatch is null set currentOrderingKey before entering the then clause. If - // currentBatch is not null and currentOrderingKey does not equal messageOrderingKey set - // currentOrderingKey and currentBatch before entering the then or else clause and only - // enter the then clause if currentBatch is null. This ensures currentBatch is initialized - // and contains at least one element before entering the else clause if currentOrderingKey - // is equal to messageOrderingKey. String messageOrderingKey = message.getMessage().getOrderingKey(); - if ((currentBatch == null - && (currentOrderingKey = messageOrderingKey) == messageOrderingKey) - || (!Objects.equals(currentOrderingKey, messageOrderingKey) - && (currentBatch = orderingKeyBatches.get(currentOrderingKey = messageOrderingKey)) - == null)) { + if (currentOrderingKey == null || !currentOrderingKey.equals(messageOrderingKey)) { + currentOrderingKey = messageOrderingKey; + currentBatch = orderingKeyBatches.get(currentOrderingKey); + } + if (currentBatch == null) { currentBatch = new OutgoingData(); currentBatch.messages.add(message); currentBatch.bytes += message.getMessage().getData().size(); From 1043961fd86d4d731dff664c5e6bfff2d7d155a9 Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Wed, 10 Jul 2024 14:38:45 +0200 Subject: [PATCH 12/43] Add size validation TODOs --- .../main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java | 4 ++++ .../apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java | 3 +++ 2 files changed, 7 insertions(+) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java index c9305cda0c43..2a7ead6a05f0 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java @@ -1641,10 +1641,12 @@ public void startBundle(StartBundleContext c) throws IOException { public void processElement(@Element PubsubMessage message, @Timestamp Instant timestamp) throws IOException, SizeLimitExceededException { // Validate again here just as a sanity check. + // TODO(sjvanrossum): https://github.com/apache/beam/issues/31800 PreparePubsubWriteDoFn.validatePubsubMessageSize(message, maxPublishBatchByteSize); // NOTE: The record id is always null. final OutgoingMessage msg = OutgoingMessage.of(message, timestamp.getMillis(), null, message.getTopic()); + // TODO(sjvanrossum): https://github.com/apache/beam/issues/31800 final int messageSize = msg.getMessage().getData().size(); final PubsubTopic pubsubTopic; @@ -1658,6 +1660,7 @@ public void processElement(@Element PubsubMessage message, @Timestamp Instant ti final OutgoingData currentTopicAndOrderingKeyOutput = output.computeIfAbsent( KV.of(pubsubTopic, msg.getMessage().getOrderingKey()), t -> new OutgoingData()); + // TODO(sjvanrossum): https://github.com/apache/beam/issues/31800 if (currentTopicAndOrderingKeyOutput.messages.size() >= maxPublishBatchSize || (!currentTopicAndOrderingKeyOutput.messages.isEmpty() && (currentTopicAndOrderingKeyOutput.bytes + messageSize) @@ -1668,6 +1671,7 @@ public void processElement(@Element PubsubMessage message, @Timestamp Instant ti } currentTopicAndOrderingKeyOutput.messages.add(msg); + // TODO(sjvanrossum): https://github.com/apache/beam/issues/31800 currentTopicAndOrderingKeyOutput.bytes += messageSize; } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java index e5b93cee9fce..6c9aa08a222d 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java @@ -345,9 +345,11 @@ public void processElement(ProcessContext c) throws Exception { if (currentBatch == null) { currentBatch = new OutgoingData(); currentBatch.messages.add(message); + // TODO(sjvanrossum): https://github.com/apache/beam/issues/31800 currentBatch.bytes += message.getMessage().getData().size(); orderingKeyBatches.put(currentOrderingKey, currentBatch); } else { + // TODO(sjvanrossum): https://github.com/apache/beam/issues/31800 if (currentBatch.bytes + message.getMessage().getData().size() > publishBatchBytes) { // Break large (in bytes) batches into smaller. // (We've already broken by batch size using the trigger below, though that may @@ -359,6 +361,7 @@ public void processElement(ProcessContext c) throws Exception { currentBatch.bytes = 0; } currentBatch.messages.add(message); + // TODO(sjvanrossum): https://github.com/apache/beam/issues/31800 currentBatch.bytes += message.getMessage().getData().size(); } } From cd727c2555b0970fd1fed06d2630e7fb2639704f Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Wed, 10 Jul 2024 15:10:33 +0200 Subject: [PATCH 13/43] Replace auto-sharding sink comment with FR link, move to relevant place --- .../apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java index 6c9aa08a222d..b46e3768fca7 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java @@ -206,6 +206,7 @@ public void processElement( break; } + // TODO(sjvanrossum): https://github.com/apache/beam/issues/31828 // NOTE: Null and empty ordering keys are treated as equivalent. @Nullable String topic = dynamicTopicFn.apply(element); @Nullable String orderingKey = message.getOrderingKey(); @@ -330,9 +331,6 @@ public void startBundle(StartBundleContext c) throws Exception { public void processElement(ProcessContext c) throws Exception { // TODO(sjvanrossum): Refactor the write transform so this map can be indexed with topic + // ordering key and have bundle scoped lifetime. - // NB: A larger, breaking refactor could make this irrelevant with a GBK on topic + ordering - // key (or GroupIntoBatches with configurable shard for ShardedKey?) and unify - // bounded/unbounded writes and static/dynamic destinations. Map orderingKeyBatches = new HashMap<>(); @MonotonicNonNull String currentOrderingKey = null; @Nullable OutgoingData currentBatch = null; From 20e7bb912b5284b0a1b4e1f3af45cb770b8473a8 Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Wed, 10 Jul 2024 17:50:14 +0200 Subject: [PATCH 14/43] Add links to Pub/Sub documentation --- .../java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java | 5 +++++ .../apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java | 2 ++ 2 files changed, 7 insertions(+) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java index 2a7ead6a05f0..6224a1b817e0 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java @@ -1464,6 +1464,9 @@ public Write withMaxBatchBytesSize(int maxBatchBytesSize) { * enabled will receive messages published in the same region with the same ordering key in the * order in which they were received by the service. Note that the order in which Beam publishes * records to the service remains unspecified. + * + * @see Pub/Sub documentation on message + * ordering */ public Write withOrderingKey() { return toBuilder().setNeedsOrderingKey(true).build(); @@ -1610,6 +1613,8 @@ private class OutgoingData { } } + // NOTE: A single publish request may only write to one ordering key. + // See https://cloud.google.com/pubsub/docs/publisher#using-ordering-keys for details. private transient Map, OutgoingData> output; private transient PubsubClient pubsubClient; diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java index b46e3768fca7..ef555c9472cd 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java @@ -331,6 +331,8 @@ public void startBundle(StartBundleContext c) throws Exception { public void processElement(ProcessContext c) throws Exception { // TODO(sjvanrossum): Refactor the write transform so this map can be indexed with topic + // ordering key and have bundle scoped lifetime. + // NOTE: A single publish request may only write to one ordering key. + // See https://cloud.google.com/pubsub/docs/publisher#using-ordering-keys for details. Map orderingKeyBatches = new HashMap<>(); @MonotonicNonNull String currentOrderingKey = null; @Nullable OutgoingData currentBatch = null; From 5911f63f21adcc322f63c18666f8778201f3d530 Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Wed, 10 Jul 2024 17:51:56 +0200 Subject: [PATCH 15/43] Refine comment about lack of ordering key support in Dataflow's sink Co-authored-by: Ahmed Abualsaud <65791736+ahmedabu98@users.noreply.github.com> --- .../org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java index ef555c9472cd..565474bdceb6 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java @@ -422,7 +422,7 @@ public void populateDisplayData(DisplayData.Builder builder) { */ private final int numShards; - /** Publish messages with an ordering key. Currently unsupported in DataflowRunner. */ + /** Publish messages with an ordering key. Currently unsupported with DataflowRunner's Pubsub sink override. */ private final boolean publishBatchWithOrderingKey; /** Maximum number of messages per publish. */ From ad397aada26ea989c3ec1bd27a06a10fca35c456 Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Wed, 10 Jul 2024 19:08:39 +0200 Subject: [PATCH 16/43] Add TODO to remove ordering key check once all sinks support this --- .../apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java | 1 + 1 file changed, 1 insertion(+) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java index 491fd37fbaef..736195729c76 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java @@ -193,6 +193,7 @@ public void process( .add("pubsub", "topic", PubsubClient.topicPathFromPath(topic).getDataCatalogSegments()); reportedLineage = topic; } + // TODO: Remove this check once Dataflow's native sink supports ordering keys. if (!allowOrderingKey && !Strings.isNullOrEmpty(message.getOrderingKey())) { badRecordRouter.route( o, From 53134d6546f7c25364e2d24381b42ede0eccd668 Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Tue, 10 Sep 2024 21:33:48 +0200 Subject: [PATCH 17/43] Add missing return statement Co-authored-by: Ahmed Abualsaud <65791736+ahmedabu98@users.noreply.github.com> --- .../apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java | 1 + 1 file changed, 1 insertion(+) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java index 736195729c76..ce9e7a098cdf 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java @@ -201,6 +201,7 @@ public void process( inputCoder, null, "The transform was not configured to publish messages with ordering keys"); + return; } try { validatePubsubMessageSize(message, maxPublishBatchSize); From 402ec9481fbd71fdfa46fccdf7214a76de3efabe Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Tue, 10 Sep 2024 22:07:21 +0200 Subject: [PATCH 18/43] Remove duplicated statements --- .../io/gcp/pubsub/PubsubUnboundedSink.java | 30 ++++++++----------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java index 565474bdceb6..ef9739652f4e 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java @@ -344,26 +344,22 @@ public void processElement(ProcessContext c) throws Exception { } if (currentBatch == null) { currentBatch = new OutgoingData(); - currentBatch.messages.add(message); - // TODO(sjvanrossum): https://github.com/apache/beam/issues/31800 - currentBatch.bytes += message.getMessage().getData().size(); orderingKeyBatches.put(currentOrderingKey, currentBatch); - } else { - // TODO(sjvanrossum): https://github.com/apache/beam/issues/31800 - if (currentBatch.bytes + message.getMessage().getData().size() > publishBatchBytes) { - // Break large (in bytes) batches into smaller. - // (We've already broken by batch size using the trigger below, though that may - // run slightly over the actual PUBLISH_BATCH_SIZE. We'll consider that ok since - // the hard limit from Pubsub is by bytes rather than number of messages.) - // BLOCKS until published. - publishBatch(currentBatch.messages, currentBatch.bytes); - currentBatch.messages.clear(); - currentBatch.bytes = 0; - } - currentBatch.messages.add(message); + } else if (currentBatch.bytes + message.getMessage().getData().size() > publishBatchBytes) { // TODO(sjvanrossum): https://github.com/apache/beam/issues/31800 - currentBatch.bytes += message.getMessage().getData().size(); + + // Break large (in bytes) batches into smaller. + // (We've already broken by batch size using the trigger below, though that may + // run slightly over the actual PUBLISH_BATCH_SIZE. We'll consider that ok since + // the hard limit from Pubsub is by bytes rather than number of messages.) + // BLOCKS until published. + publishBatch(currentBatch.messages, currentBatch.bytes); + currentBatch.messages.clear(); + currentBatch.bytes = 0; } + currentBatch.messages.add(message); + // TODO(sjvanrossum): https://github.com/apache/beam/issues/31800 + currentBatch.bytes += message.getMessage().getData().size(); } for (OutgoingData batch : orderingKeyBatches.values()) { if (!batch.messages.isEmpty()) { From a97f64c5a9513332adcfc644487b763eadf4f3e5 Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Tue, 10 Sep 2024 22:45:38 +0200 Subject: [PATCH 19/43] Apply Spotless --- .../apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java index ef9739652f4e..bec1751831a1 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java @@ -418,7 +418,10 @@ public void populateDisplayData(DisplayData.Builder builder) { */ private final int numShards; - /** Publish messages with an ordering key. Currently unsupported with DataflowRunner's Pubsub sink override. */ + /** + * Publish messages with an ordering key. Currently unsupported with DataflowRunner's Pubsub sink + * override. + */ private final boolean publishBatchWithOrderingKey; /** Maximum number of messages per publish. */ From 4513db408ba0b2b9286e0c1be45f8bb943f798e3 Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Tue, 10 Sep 2024 23:17:33 +0200 Subject: [PATCH 20/43] Add notable changes --- CHANGES.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 47f1831b446e..f1c7aac12065 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -68,6 +68,7 @@ * Dataflow worker can install packages from Google Artifact Registry Python repositories (Python) ([#32123](https://github.com/apache/beam/issues/32123)). * Added support for Zstd codec in SerializableAvroCodecFactory (Java) ([#32349](https://github.com/apache/beam/issues/32349)) +* Added support for ordering key writes in `PubsubIO.Write` (Java) ([#21162](https://github.com/apache/beam/issues/21162)) * X feature added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). ## Breaking Changes @@ -77,6 +78,12 @@ as strings rather than silently coerced (and possibly truncated) to numeric values. To retain the old behavior, pass `dtype=True` (or any other value accepted by `pandas.read_json`). +* In Java, `PubsubIO.Write` used to silently drop the ordering key property. + If the ordering key property is set on a message it will now be sent to the + bad record handler unless `withOrderingKeys()` is applied to the transform. + Dataflow uses a custom Pub/Sub sink by default without support for this and + will reject a transform if `withOrderingKeys()` is applied. Users can set + `--experiments=enable_custom_pubsub_sink` to use Beam's Pub/Sub sink instead. * X behavior was changed ([#X](https://github.com/apache/beam/issues/X)). ## Deprecations From a60d689df3a22fb89aa5e4e3f2817ba9c50a55da Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Fri, 13 Sep 2024 13:39:00 -0400 Subject: [PATCH 21/43] address comments --- .../org/apache/beam/runners/dataflow/DataflowRunner.java | 6 ++---- .../beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java | 5 +++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java index 75a31c37b232..2d7ef10f289a 100644 --- a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java +++ b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java @@ -2037,12 +2037,10 @@ private static void translate( if (overriddenTransform.getPublishBatchWithOrderingKey()) { throw new UnsupportedOperationException( String.format( - "%s does not currently support publishing with ordering keys. " + "The DataflowRunner does not currently support publishing to Pubsub with ordering keys. " + "%s is required to support publishing with ordering keys. " + "Set the pipeline option --experiments=%s to use this PTransform.", - StreamingPubsubIOWrite.class.getSimpleName(), - PubsubUnboundedSink.class.getSimpleName(), - ENABLE_CUSTOM_PUBSUB_SINK)); + PubsubUnboundedSink.class.getSimpleName(), ENABLE_CUSTOM_PUBSUB_SINK)); } stepContext.addInput(PropertyNames.FORMAT, "pubsub"); if (overriddenTransform.getTopicProvider() != null) { diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java index ce9e7a098cdf..958b318b6893 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java @@ -39,6 +39,7 @@ public class PreparePubsubWriteDoFn extends DoFn private static final int PUBSUB_MESSAGE_MAX_ATTRIBUTES = 100; private static final int PUBSUB_MESSAGE_ATTRIBUTE_MAX_KEY_BYTES = 256; private static final int PUBSUB_MESSAGE_ATTRIBUTE_MAX_VALUE_BYTES = 1024; + private static final int ORDERING_KEY_MAX_VALUE_BYTES = 1024; // The amount of bytes that each attribute entry adds up to the request private static final int PUBSUB_MESSAGE_ATTRIBUTE_ENCODE_ADDITIONAL_BYTES = 6; private boolean allowOrderingKey; @@ -71,12 +72,12 @@ static int validatePubsubMessageSize(PubsubMessage message, int maxPublishBatchS @Nullable String orderingKey = message.getOrderingKey(); if (orderingKey != null) { int orderingKeySize = orderingKey.getBytes(StandardCharsets.UTF_8).length; - if (orderingKeySize > PUBSUB_MESSAGE_ATTRIBUTE_MAX_VALUE_BYTES) { + if (orderingKeySize > ORDERING_KEY_MAX_VALUE_BYTES) { throw new SizeLimitExceededException( "Pubsub message ordering key of length " + orderingKeySize + " exceeds maximum of " - + PUBSUB_MESSAGE_ATTRIBUTE_MAX_VALUE_BYTES + + ORDERING_KEY_MAX_VALUE_BYTES + " bytes. See https://cloud.google.com/pubsub/quotas#resource_limits"); } totalSize += orderingKeySize; From 5bac7625e77a38f751d87bbfa206efdd7c723e24 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Sun, 15 Sep 2024 16:20:56 -0400 Subject: [PATCH 22/43] allow messages with ordering keys even when the sink isn't configured to accept ordering keys --- CHANGES.md | 8 +------- .../beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java | 10 ---------- 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index f1c7aac12065..83a8299f9b0e 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -68,7 +68,7 @@ * Dataflow worker can install packages from Google Artifact Registry Python repositories (Python) ([#32123](https://github.com/apache/beam/issues/32123)). * Added support for Zstd codec in SerializableAvroCodecFactory (Java) ([#32349](https://github.com/apache/beam/issues/32349)) -* Added support for ordering key writes in `PubsubIO.Write` (Java) ([#21162](https://github.com/apache/beam/issues/21162)) +* Added support for writing to Pubsub with ordering keys (Java) ([#21162](https://github.com/apache/beam/issues/21162)) * X feature added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). ## Breaking Changes @@ -78,12 +78,6 @@ as strings rather than silently coerced (and possibly truncated) to numeric values. To retain the old behavior, pass `dtype=True` (or any other value accepted by `pandas.read_json`). -* In Java, `PubsubIO.Write` used to silently drop the ordering key property. - If the ordering key property is set on a message it will now be sent to the - bad record handler unless `withOrderingKeys()` is applied to the transform. - Dataflow uses a custom Pub/Sub sink by default without support for this and - will reject a transform if `withOrderingKeys()` is applied. Users can set - `--experiments=enable_custom_pubsub_sink` to use Beam's Pub/Sub sink instead. * X behavior was changed ([#X](https://github.com/apache/beam/issues/X)). ## Deprecations diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java index 958b318b6893..cc87b958bc4d 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java @@ -194,16 +194,6 @@ public void process( .add("pubsub", "topic", PubsubClient.topicPathFromPath(topic).getDataCatalogSegments()); reportedLineage = topic; } - // TODO: Remove this check once Dataflow's native sink supports ordering keys. - if (!allowOrderingKey && !Strings.isNullOrEmpty(message.getOrderingKey())) { - badRecordRouter.route( - o, - element, - inputCoder, - null, - "The transform was not configured to publish messages with ordering keys"); - return; - } try { validatePubsubMessageSize(message, maxPublishBatchSize); } catch (SizeLimitExceededException e) { From 53d47a75d2ff4c51e26b2ba7ee1dbceb96c61a48 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Mon, 16 Sep 2024 06:18:14 -0400 Subject: [PATCH 23/43] spotless --- .../apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java | 1 - 1 file changed, 1 deletion(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java index cc87b958bc4d..a3c54d622251 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java @@ -29,7 +29,6 @@ import org.apache.beam.sdk.transforms.windowing.PaneInfo; import org.apache.beam.sdk.values.TupleTag; import org.apache.beam.sdk.values.ValueInSingleWindow; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; import org.checkerframework.checker.nullness.qual.Nullable; import org.joda.time.Instant; From cb9e7fb19699b002aa9245a4d1e7e36a1db31f4a Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Mon, 16 Sep 2024 06:45:20 -0400 Subject: [PATCH 24/43] spotless --- .../beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java | 9 +++------ .../java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java | 1 - 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java index a3c54d622251..6df3f1a8e9cf 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java @@ -38,10 +38,9 @@ public class PreparePubsubWriteDoFn extends DoFn private static final int PUBSUB_MESSAGE_MAX_ATTRIBUTES = 100; private static final int PUBSUB_MESSAGE_ATTRIBUTE_MAX_KEY_BYTES = 256; private static final int PUBSUB_MESSAGE_ATTRIBUTE_MAX_VALUE_BYTES = 1024; - private static final int ORDERING_KEY_MAX_VALUE_BYTES = 1024; + private static final int ORDERING_KEY_MAX_BYTE_SIZE = 1024; // The amount of bytes that each attribute entry adds up to the request private static final int PUBSUB_MESSAGE_ATTRIBUTE_ENCODE_ADDITIONAL_BYTES = 6; - private boolean allowOrderingKey; private int maxPublishBatchSize; private SerializableFunction, PubsubMessage> formatFunction; @@ -71,12 +70,12 @@ static int validatePubsubMessageSize(PubsubMessage message, int maxPublishBatchS @Nullable String orderingKey = message.getOrderingKey(); if (orderingKey != null) { int orderingKeySize = orderingKey.getBytes(StandardCharsets.UTF_8).length; - if (orderingKeySize > ORDERING_KEY_MAX_VALUE_BYTES) { + if (orderingKeySize > ORDERING_KEY_MAX_BYTE_SIZE) { throw new SizeLimitExceededException( "Pubsub message ordering key of length " + orderingKeySize + " exceeds maximum of " - + ORDERING_KEY_MAX_VALUE_BYTES + + ORDERING_KEY_MAX_BYTE_SIZE + " bytes. See https://cloud.google.com/pubsub/quotas#resource_limits"); } totalSize += orderingKeySize; @@ -141,14 +140,12 @@ static int validatePubsubMessageSize(PubsubMessage message, int maxPublishBatchS SerializableFunction, PubsubMessage> formatFunction, @Nullable SerializableFunction, PubsubIO.PubsubTopic> topicFunction, - boolean allowOrderingKey, int maxPublishBatchSize, BadRecordRouter badRecordRouter, Coder inputCoder, TupleTag outputTag) { this.formatFunction = formatFunction; this.topicFunction = topicFunction; - this.allowOrderingKey = allowOrderingKey; this.maxPublishBatchSize = maxPublishBatchSize; this.badRecordRouter = badRecordRouter; this.inputCoder = inputCoder; diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java index 6224a1b817e0..1694083aeeca 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java @@ -1543,7 +1543,6 @@ public PDone expand(PCollection input) { new PreparePubsubWriteDoFn<>( getFormatFn(), topicFunction, - getNeedsOrderingKey(), maxMessageSize, getBadRecordRouter(), input.getCoder(), From bbe25ca5f3c538e368e467c2c43b10a16620ff58 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Mon, 16 Sep 2024 07:58:29 -0400 Subject: [PATCH 25/43] add warning log when ordering key is not configured --- .../io/gcp/pubsub/PreparePubsubWriteDoFn.java | 17 ++++++++++++++++- .../apache/beam/sdk/io/gcp/pubsub/PubsubIO.java | 5 +++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java index 6df3f1a8e9cf..9cdf6b788e4e 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java @@ -29,10 +29,14 @@ import org.apache.beam.sdk.transforms.windowing.PaneInfo; import org.apache.beam.sdk.values.TupleTag; import org.apache.beam.sdk.values.ValueInSingleWindow; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; import org.checkerframework.checker.nullness.qual.Nullable; import org.joda.time.Instant; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class PreparePubsubWriteDoFn extends DoFn { + private static final Logger LOG = LoggerFactory.getLogger(PreparePubsubWriteDoFn.class); // See https://cloud.google.com/pubsub/quotas#resource_limits. private static final int PUBSUB_MESSAGE_DATA_MAX_BYTES = 10 << 20; private static final int PUBSUB_MESSAGE_MAX_ATTRIBUTES = 100; @@ -41,8 +45,9 @@ public class PreparePubsubWriteDoFn extends DoFn private static final int ORDERING_KEY_MAX_BYTE_SIZE = 1024; // The amount of bytes that each attribute entry adds up to the request private static final int PUBSUB_MESSAGE_ATTRIBUTE_ENCODE_ADDITIONAL_BYTES = 6; + private final boolean usesOrderingKey; private int maxPublishBatchSize; - + private boolean logOrderingKeyUnconfigured = false; private SerializableFunction, PubsubMessage> formatFunction; @Nullable SerializableFunction, PubsubIO.PubsubTopic> topicFunction; /** Last TopicPath that reported Lineage. */ @@ -140,12 +145,14 @@ static int validatePubsubMessageSize(PubsubMessage message, int maxPublishBatchS SerializableFunction, PubsubMessage> formatFunction, @Nullable SerializableFunction, PubsubIO.PubsubTopic> topicFunction, + boolean usesOrderingKey, int maxPublishBatchSize, BadRecordRouter badRecordRouter, Coder inputCoder, TupleTag outputTag) { this.formatFunction = formatFunction; this.topicFunction = topicFunction; + this.usesOrderingKey = usesOrderingKey; this.maxPublishBatchSize = maxPublishBatchSize; this.badRecordRouter = badRecordRouter; this.inputCoder = inputCoder; @@ -190,6 +197,14 @@ public void process( .add("pubsub", "topic", PubsubClient.topicPathFromPath(topic).getDataCatalogSegments()); reportedLineage = topic; } + if (!usesOrderingKey + && !Strings.isNullOrEmpty(message.getOrderingKey()) + && !logOrderingKeyUnconfigured) { + LOG.warn( + "Encountered Pubsub message with ordering key but this sink was not configured to " + + "retain ordering keys, so they will be dropped. Please set #withOrderingKeys()."); + logOrderingKeyUnconfigured = true; + } try { validatePubsubMessageSize(message, maxPublishBatchSize); } catch (SizeLimitExceededException e) { diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java index 1694083aeeca..4d1d1a79d7ee 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java @@ -1543,6 +1543,7 @@ public PDone expand(PCollection input) { new PreparePubsubWriteDoFn<>( getFormatFn(), topicFunction, + getNeedsOrderingKey(), maxMessageSize, getBadRecordRouter(), input.getCoder(), @@ -1661,9 +1662,9 @@ public void processElement(@Element PubsubMessage message, @Timestamp Instant ti } // Checking before adding the message stops us from violating max batch size or bytes + String orderingKey = getNeedsOrderingKey() ? msg.getMessage().getOrderingKey() : ""; final OutgoingData currentTopicAndOrderingKeyOutput = - output.computeIfAbsent( - KV.of(pubsubTopic, msg.getMessage().getOrderingKey()), t -> new OutgoingData()); + output.computeIfAbsent(KV.of(pubsubTopic, orderingKey), t -> new OutgoingData()); // TODO(sjvanrossum): https://github.com/apache/beam/issues/31800 if (currentTopicAndOrderingKeyOutput.messages.size() >= maxPublishBatchSize || (!currentTopicAndOrderingKeyOutput.messages.isEmpty() From fa80a24957197638bab4d2ed35b1540ec0a82643 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Fri, 27 Sep 2024 16:10:04 -0400 Subject: [PATCH 26/43] address comments --- .../beam/runners/dataflow/DataflowRunner.java | 3 ++- .../apache/beam/sdk/io/gcp/pubsub/PubsubIO.java | 16 ++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java index 2d7ef10f289a..953cc9484e9f 100644 --- a/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java +++ b/runners/google-cloud-dataflow-java/src/main/java/org/apache/beam/runners/dataflow/DataflowRunner.java @@ -2039,7 +2039,8 @@ private static void translate( String.format( "The DataflowRunner does not currently support publishing to Pubsub with ordering keys. " + "%s is required to support publishing with ordering keys. " - + "Set the pipeline option --experiments=%s to use this PTransform.", + + "Set the pipeline option --experiments=%s to use this PTransform. " + + "See https://issuetracker.google.com/issues/200955424 for current status.", PubsubUnboundedSink.class.getSimpleName(), ENABLE_CUSTOM_PUBSUB_SINK)); } stepContext.addInput(PropertyNames.FORMAT, "pubsub"); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java index 4d1d1a79d7ee..be573d5be52a 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java @@ -1337,7 +1337,7 @@ public abstract static class Write extends PTransform, PDone> abstract @Nullable String getPubsubRootUrl(); - abstract boolean getNeedsOrderingKey(); + abstract boolean getPublishWithOrderingKey(); abstract BadRecordRouter getBadRecordRouter(); @@ -1352,7 +1352,7 @@ static Builder newBuilder( builder.setFormatFn(formatFn); builder.setBadRecordRouter(BadRecordRouter.THROWING_ROUTER); builder.setBadRecordErrorHandler(new DefaultErrorHandler<>()); - builder.setNeedsOrderingKey(false); + builder.setPublishWithOrderingKey(false); return builder; } @@ -1384,7 +1384,7 @@ abstract Builder setFormatFn( abstract Builder setPubsubRootUrl(String pubsubRootUrl); - abstract Builder setNeedsOrderingKey(boolean needsOrderingKey); + abstract Builder setPublishWithOrderingKey(boolean publishWithOrderingKey); abstract Builder setBadRecordRouter(BadRecordRouter badRecordRouter); @@ -1469,7 +1469,7 @@ public Write withMaxBatchBytesSize(int maxBatchBytesSize) { * ordering */ public Write withOrderingKey() { - return toBuilder().setNeedsOrderingKey(true).build(); + return toBuilder().setPublishWithOrderingKey(true).build(); } /** @@ -1543,7 +1543,7 @@ public PDone expand(PCollection input) { new PreparePubsubWriteDoFn<>( getFormatFn(), topicFunction, - getNeedsOrderingKey(), + getPublishWithOrderingKey(), maxMessageSize, getBadRecordRouter(), input.getCoder(), @@ -1556,7 +1556,7 @@ public PDone expand(PCollection input) { .get(BAD_RECORD_TAG) .setCoder(BadRecord.getCoder(input.getPipeline()))); PCollection pubsubMessages = pubsubMessageTuple.get(pubsubMessageTupleTag); - if (getNeedsOrderingKey()) { + if (getPublishWithOrderingKey()) { pubsubMessages.setCoder(PubsubMessageSchemaCoder.getSchemaCoder()); } else { pubsubMessages.setCoder(PubsubMessageWithTopicCoder.of()); @@ -1580,7 +1580,7 @@ public PDone expand(PCollection input) { getTimestampAttribute(), getIdAttribute(), 100 /* numShards */, - getNeedsOrderingKey(), + getPublishWithOrderingKey(), MoreObjects.firstNonNull( getMaxBatchSize(), PubsubUnboundedSink.DEFAULT_PUBLISH_BATCH_SIZE), MoreObjects.firstNonNull( @@ -1662,7 +1662,7 @@ public void processElement(@Element PubsubMessage message, @Timestamp Instant ti } // Checking before adding the message stops us from violating max batch size or bytes - String orderingKey = getNeedsOrderingKey() ? msg.getMessage().getOrderingKey() : ""; + String orderingKey = getPublishWithOrderingKey() ? msg.getMessage().getOrderingKey() : ""; final OutgoingData currentTopicAndOrderingKeyOutput = output.computeIfAbsent(KV.of(pubsubTopic, orderingKey), t -> new OutgoingData()); // TODO(sjvanrossum): https://github.com/apache/beam/issues/31800 From 6197c210e5ce4758cf5254b6b36ccf9b98f4862c Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Fri, 25 Apr 2025 15:02:35 +0200 Subject: [PATCH 27/43] Distribute ordering keys across shards with consistent hashing --- .../apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java index bec1751831a1..90269bb77064 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java @@ -213,7 +213,9 @@ public void processElement( int shard = Strings.isNullOrEmpty(orderingKey) ? ThreadLocalRandom.current().nextInt(numShards) - : Hashing.murmur3_32_fixed().hashString(orderingKey, StandardCharsets.UTF_8).asInt(); + : Hashing.consistentHash( + Hashing.farmHashFingerprint64().hashString(orderingKey, StandardCharsets.UTF_8), + numShards); K key = keyFunction.apply(shard, topic); o.output(KV.of(key, OutgoingMessage.of(message, timestampMsSinceEpoch, recordId, topic))); } From 03b9159fc8095944190c9ba7bbd4cf415556594b Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Fri, 25 Apr 2025 13:35:48 +0000 Subject: [PATCH 28/43] Drop the ordering key field if ordering key writes are disabled --- .../io/gcp/pubsub/PreparePubsubWriteDoFn.java | 16 +++++++++------- .../beam/sdk/io/gcp/pubsub/PubsubMessage.java | 10 ++++++++++ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java index 9cdf6b788e4e..ad188ac9d64d 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PreparePubsubWriteDoFn.java @@ -197,13 +197,15 @@ public void process( .add("pubsub", "topic", PubsubClient.topicPathFromPath(topic).getDataCatalogSegments()); reportedLineage = topic; } - if (!usesOrderingKey - && !Strings.isNullOrEmpty(message.getOrderingKey()) - && !logOrderingKeyUnconfigured) { - LOG.warn( - "Encountered Pubsub message with ordering key but this sink was not configured to " - + "retain ordering keys, so they will be dropped. Please set #withOrderingKeys()."); - logOrderingKeyUnconfigured = true; + if (!usesOrderingKey && !Strings.isNullOrEmpty(message.getOrderingKey())) { + if (!logOrderingKeyUnconfigured) { + LOG.warn( + "Encountered Pubsub message with ordering key but this sink was not configured to " + + "retain ordering keys, so they will be dropped. Please set #withOrderingKeys()."); + + logOrderingKeyUnconfigured = true; + } + message = message.withOrderingKey(null); } try { validatePubsubMessageSize(message, maxPublishBatchSize); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessage.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessage.java index d57ac3d80207..779a9b847874 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessage.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessage.java @@ -112,6 +112,16 @@ public byte[] getPayload() { return impl.getMessageId(); } + public PubsubMessage withOrderingKey(@Nullable String orderingKey) { + return new PubsubMessage( + Impl.create( + impl.getTopic(), + impl.getPayload(), + impl.getAttributeMap(), + impl.getMessageId(), + orderingKey)); + } + /** Returns the ordering key of the message. */ public @Nullable String getOrderingKey() { return impl.getOrderingKey(); From c5e44a2088b0a1c20671bc6b604dd3462f9d2c07 Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Mon, 28 Apr 2025 15:40:58 +0200 Subject: [PATCH 29/43] Add more context to TODOs and remove one TODO occurrence --- .../java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java index c95ac2a13957..22b180d3338e 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java @@ -1736,11 +1736,15 @@ public void processElement(@Element PubsubMessage message, @Timestamp Instant ti throws IOException, SizeLimitExceededException { // Validate again here just as a sanity check. // TODO(sjvanrossum): https://github.com/apache/beam/issues/31800 + // - Size validation makes no distinction between JSON and Protobuf encoding + // - Accounting for HTTP to gRPC transcoding is non-trivial PreparePubsubWriteDoFn.validatePubsubMessageSize(message, maxPublishBatchByteSize); // NOTE: The record id is always null. final OutgoingMessage msg = OutgoingMessage.of(message, timestamp.getMillis(), null, message.getTopic()); // TODO(sjvanrossum): https://github.com/apache/beam/issues/31800 + // - Size validation makes no distinction between JSON and Protobuf encoding + // - Accounting for HTTP to gRPC transcoding is non-trivial final int messageSize = msg.getMessage().getData().size(); final PubsubTopic pubsubTopic; @@ -1765,7 +1769,6 @@ public void processElement(@Element PubsubMessage message, @Timestamp Instant ti } currentTopicAndOrderingKeyOutput.messages.add(msg); - // TODO(sjvanrossum): https://github.com/apache/beam/issues/31800 currentTopicAndOrderingKeyOutput.bytes += messageSize; } From b24ecbd7a4784f3af1b77dfd616d262f9868ba8a Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Mon, 28 Apr 2025 14:19:23 +0000 Subject: [PATCH 30/43] Add soft deprecation notice to coders for maintainers of PubsubIO --- .../beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java | 2 +- ...ssageWithAttributesAndMessageIdAndOrderingKeyCoder.java | 7 ++++++- .../PubsubMessageWithAttributesAndMessageIdCoder.java | 7 ++++++- .../io/gcp/pubsub/PubsubMessageWithAttributesCoder.java | 7 ++++++- .../sdk/io/gcp/pubsub/PubsubMessageWithMessageIdCoder.java | 3 +++ .../sdk/io/gcp/pubsub/PubsubMessageWithTopicCoder.java | 7 ++++++- 6 files changed, 28 insertions(+), 5 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java index 72eb32d87668..47443e88ad43 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java @@ -31,7 +31,7 @@ * PubSub message from server. * *

{@link SchemaCoder} is used so that fields can be added in the future without breaking update - * compatibility. + * compatibility. Maintainers should prefer this coder when adding new features to {@link PubsubIO}. */ public class PubsubMessageSchemaCoder { private static final Schema PUBSUB_MESSAGE_SCHEMA = diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithAttributesAndMessageIdAndOrderingKeyCoder.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithAttributesAndMessageIdAndOrderingKeyCoder.java index 7c2a4250e87c..968d76584bc3 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithAttributesAndMessageIdAndOrderingKeyCoder.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithAttributesAndMessageIdAndOrderingKeyCoder.java @@ -31,7 +31,12 @@ import org.apache.beam.sdk.extensions.protobuf.ProtoCoder; import org.apache.beam.sdk.values.TypeDescriptor; -/** A coder for PubsubMessage including all fields of a PubSub message from server. */ +/** + * A coder for PubsubMessage including all fields of a PubSub message from server. + * + *

Maintainers should prefer {@link PubsubMessageSchemaCoder} over this coder when adding + * features to {@link PubsubIO}. + */ @SuppressWarnings({ "nullness" // TODO(https://github.com/apache/beam/issues/20497) }) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithAttributesAndMessageIdCoder.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithAttributesAndMessageIdCoder.java index f6fa8a0cc973..803ff83583ce 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithAttributesAndMessageIdCoder.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithAttributesAndMessageIdCoder.java @@ -29,7 +29,12 @@ import org.apache.beam.sdk.coders.StringUtf8Coder; import org.apache.beam.sdk.values.TypeDescriptor; -/** A coder for PubsubMessage including attributes and the message id from the PubSub server. */ +/** + * A coder for PubsubMessage including attributes and the message id from the PubSub server. + * + *

Maintainers should prefer {@link PubsubMessageSchemaCoder} over this coder when adding + * features to {@link PubsubIO}. + */ @SuppressWarnings({ "nullness" // TODO(https://github.com/apache/beam/issues/20497) }) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithAttributesCoder.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithAttributesCoder.java index 3be321891d33..9dd3ab56a72a 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithAttributesCoder.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithAttributesCoder.java @@ -29,7 +29,12 @@ import org.apache.beam.sdk.coders.StringUtf8Coder; import org.apache.beam.sdk.values.TypeDescriptor; -/** A coder for PubsubMessage including attributes. */ +/** + * A coder for PubsubMessage including attributes. + * + *

Maintainers should prefer {@link PubsubMessageSchemaCoder} over this coder when adding + * features to {@link PubsubIO}. + */ @SuppressWarnings({ "nullness" // TODO(https://github.com/apache/beam/issues/20497) }) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithMessageIdCoder.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithMessageIdCoder.java index 95bd43c53a66..7016146b3671 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithMessageIdCoder.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithMessageIdCoder.java @@ -29,6 +29,9 @@ /** * A coder for PubsubMessage treating the raw bytes being decoded as the message's payload, with the * message id from the PubSub server. + * + *

Maintainers should prefer {@link PubsubMessageSchemaCoder} over this coder when adding + * features to {@link PubsubIO}. */ @SuppressWarnings({ "nullness" // TODO(https://github.com/apache/beam/issues/20497) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithTopicCoder.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithTopicCoder.java index 768aebe54e65..c3270625fce1 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithTopicCoder.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageWithTopicCoder.java @@ -30,7 +30,12 @@ import org.apache.beam.sdk.values.TypeDescriptor; import org.checkerframework.checker.nullness.qual.Nullable; -/** A coder for PubsubMessage including the topic from the PubSub server. */ +/** + * A coder for PubsubMessage including the topic from the PubSub server. + * + *

Maintainers should prefer {@link PubsubMessageSchemaCoder} over this coder when adding + * features to {@link PubsubIO}. + */ public class PubsubMessageWithTopicCoder extends CustomCoder { // A message's payload cannot be null private static final Coder PAYLOAD_CODER = ByteArrayCoder.of(); From 35be569d98ebcfa6dabb73b63d5a43f460727900 Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Mon, 28 Apr 2025 14:21:18 +0000 Subject: [PATCH 31/43] Use attachValues instead of constructing a map of named fields --- .../gcp/pubsub/PubsubMessageSchemaCoder.java | 32 ++++++------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java index 47443e88ad43..1efb8f4a9687 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java @@ -17,8 +17,6 @@ */ package org.apache.beam.sdk.io.gcp.pubsub; -import java.util.HashMap; -import java.util.Map; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.SchemaCoder; import org.apache.beam.sdk.transforms.SerializableFunction; @@ -34,6 +32,7 @@ * compatibility. Maintainers should prefer this coder when adding new features to {@link PubsubIO}. */ public class PubsubMessageSchemaCoder { + // NOTE: Fields must not be reordered. private static final Schema PUBSUB_MESSAGE_SCHEMA = Schema.builder() .addByteArrayField("payload") @@ -45,26 +44,15 @@ public class PubsubMessageSchemaCoder { private static final SerializableFunction TO_ROW = (PubsubMessage message) -> { - Map fieldValues = new HashMap<>(); - fieldValues.put("payload", message.getPayload()); - - String topic = message.getTopic(); - if (topic != null) { - fieldValues.put("topic", topic); - } - Map attributeMap = message.getAttributeMap(); - if (attributeMap != null) { - fieldValues.put("attributes", attributeMap); - } - String messageId = message.getMessageId(); - if (messageId != null) { - fieldValues.put("message_id", messageId); - } - String orderingKey = message.getOrderingKey(); - if (orderingKey != null) { - fieldValues.put("ordering_key", orderingKey); - } - return Row.withSchema(PUBSUB_MESSAGE_SCHEMA).withFieldValues(fieldValues).build(); + // NOTE: The row's value attachment order must match the schema's definition order. + return Row.withSchema(PUBSUB_MESSAGE_SCHEMA) + .attachValues( + message.getPayload(), + message.getTopic(), + message.getAttributeMap(), + message.getMessageId(), + message.getOrderingKey()) + .build(); }; private static final SerializableFunction FROM_ROW = From 0acc1b13ddaa0a99fd7f3ff6bc73de817cb520de Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Mon, 28 Apr 2025 16:31:29 +0200 Subject: [PATCH 32/43] Apply suggestions from code review --- .../main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java | 2 +- .../apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java | 2 +- .../org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java index 22b180d3338e..3c08bcbf2819 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubIO.java @@ -1739,7 +1739,7 @@ public void processElement(@Element PubsubMessage message, @Timestamp Instant ti // - Size validation makes no distinction between JSON and Protobuf encoding // - Accounting for HTTP to gRPC transcoding is non-trivial PreparePubsubWriteDoFn.validatePubsubMessageSize(message, maxPublishBatchByteSize); - // NOTE: The record id is always null. + // NOTE: The record id is always null since it will be assigned by Pub/Sub. final OutgoingMessage msg = OutgoingMessage.of(message, timestamp.getMillis(), null, message.getTopic()); // TODO(sjvanrossum): https://github.com/apache/beam/issues/31800 diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java index 1efb8f4a9687..de7275f923eb 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java @@ -64,7 +64,7 @@ public class PubsubMessageSchemaCoder { row.getString("message_id"), row.getString("ordering_key")); - String topic = row.getString("topic"); + @Nullable String topic = row.getString("topic"); if (topic != null) { message = message.withTopic(topic); } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java index fddae3069990..a23889b83447 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java @@ -240,7 +240,7 @@ private class OutgoingData { OutgoingData() { this.bytes = 0; - this.messages = new ArrayList<>(publishBatchSize); + this.messages = new ArrayList<>(); } } From 8fd553fac5479fff5036b3374aaca112837c97dd Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Mon, 28 Apr 2025 15:05:53 +0000 Subject: [PATCH 33/43] Add missing import of Nullable --- .../apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java | 1 + 1 file changed, 1 insertion(+) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java index de7275f923eb..e4c0651a5684 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java @@ -23,6 +23,7 @@ import org.apache.beam.sdk.values.Row; import org.apache.beam.sdk.values.TypeDescriptor; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; +import org.checkerframework.checker.nullness.qual.Nullable; /** * Provides a {@link SchemaCoder} for {@link PubsubMessage}, including the topic and all fields of a From d3e5fbdfb6d12f1d604d2e73968e5eda95c2fb20 Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Mon, 28 Apr 2025 16:17:25 +0000 Subject: [PATCH 34/43] Fix row builder --- .../beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java index e4c0651a5684..e3df56a98185 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessageSchemaCoder.java @@ -52,8 +52,7 @@ public class PubsubMessageSchemaCoder { message.getTopic(), message.getAttributeMap(), message.getMessageId(), - message.getOrderingKey()) - .build(); + message.getOrderingKey()); }; private static final SerializableFunction FROM_ROW = From 7e3466d3ab6861aa8dfd149b58519bdb30daed23 Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Mon, 28 Apr 2025 19:32:27 +0200 Subject: [PATCH 35/43] Add missing nullable annotation to attachValues --- .../java/core/src/main/java/org/apache/beam/sdk/values/Row.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/values/Row.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/values/Row.java index 591a83600561..cf7ad3de7b7b 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/values/Row.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/values/Row.java @@ -829,7 +829,7 @@ public Row attachValues(List<@Nullable Object> attachedValues) { return new RowWithStorage(schema, attachedValues); } - public Row attachValues(Object... values) { + public Row attachValues(@Nullable Object... values) { return attachValues(Arrays.asList(values)); } From 2cb82f4300d04aef0812408b81edc6a99c3d4a59 Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Mon, 28 Apr 2025 19:33:06 +0200 Subject: [PATCH 36/43] Remove unused field pubishBatchSize --- .../beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java index a23889b83447..3fe7d51aec1e 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSink.java @@ -234,7 +234,7 @@ public void populateDisplayData(DisplayData.Builder builder) { /** Publish messages to Pubsub in batches. */ @VisibleForTesting static class WriterFn extends DoFn, Void> { - private class OutgoingData { + private static class OutgoingData { int bytes; List messages; @@ -248,7 +248,6 @@ private class OutgoingData { private final @Nullable ValueProvider topic; private final String timestampAttribute; private final String idAttribute; - private final int publishBatchSize; private final int publishBatchBytes; private final String pubsubRootUrl; @@ -271,7 +270,6 @@ private class OutgoingData { this.topic = topic; this.timestampAttribute = timestampAttribute; this.idAttribute = idAttribute; - this.publishBatchSize = publishBatchSize; this.publishBatchBytes = publishBatchBytes; this.pubsubRootUrl = null; } @@ -281,14 +279,12 @@ private class OutgoingData { @Nullable ValueProvider topic, String timestampAttribute, String idAttribute, - int publishBatchSize, int publishBatchBytes, String pubsubRootUrl) { this.pubsubFactory = pubsubFactory; this.topic = topic; this.timestampAttribute = timestampAttribute; this.idAttribute = idAttribute; - this.publishBatchSize = publishBatchSize; this.publishBatchBytes = publishBatchBytes; this.pubsubRootUrl = pubsubRootUrl; } @@ -663,7 +659,6 @@ public PDone expand(PCollection> input) { outer.topic, outer.timestampAttribute, outer.idAttribute, - outer.publishBatchSize, outer.publishBatchBytes, outer.pubsubRootUrl))); return PDone.in(input.getPipeline()); @@ -716,7 +711,6 @@ public PDone expand(PCollection input) { outer.topic, outer.timestampAttribute, outer.idAttribute, - outer.publishBatchSize, outer.publishBatchBytes, outer.pubsubRootUrl))); return PDone.in(input.getPipeline()); From 50573992adc2690cfb1a9a3a4da6672541d5a8fd Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Mon, 28 Apr 2025 23:59:42 +0200 Subject: [PATCH 37/43] Rewrite integration test for ordering key writes --- .../beam/sdk/io/gcp/pubsub/PubsubWriteIT.java | 70 +++++++++++-------- 1 file changed, 41 insertions(+), 29 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteIT.java index fe36e31e69bb..6b188b515cea 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteIT.java @@ -18,23 +18,26 @@ package org.apache.beam.sdk.io.gcp.pubsub; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import com.google.protobuf.UnsafeByteOperations; import java.io.IOException; import java.nio.charset.StandardCharsets; -import java.util.Arrays; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Objects; import org.apache.beam.sdk.io.GenerateSequence; +import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.IncomingMessage; import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.SubscriptionPath; import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.TopicPath; import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.transforms.Create; import org.apache.beam.sdk.transforms.MapElements; import org.apache.beam.sdk.values.TypeDescriptors; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.commons.lang3.RandomStringUtils; +import org.checkerframework.checker.nullness.qual.Nullable; import org.joda.time.Instant; import org.junit.After; import org.junit.Before; @@ -111,7 +114,7 @@ public void testBoundedWriteMessageWithAttributes() { } @Test - public void testBoundedWriteMessageWithAttributesAndMessageIdAndOrderingKey() throws IOException { + public void testBoundedWriteMessageWithAttributesAndOrderingKey() throws IOException { TopicPath testTopicPath = PubsubClient.topicPathFromName( project, "pubsub-write-ordering-key-" + Instant.now().getMillis()); @@ -120,42 +123,51 @@ public void testBoundedWriteMessageWithAttributesAndMessageIdAndOrderingKey() th pubsubClient.createRandomSubscription( PubsubClient.projectPathFromId(project), testTopicPath, 10); - byte[] payload = RandomStringUtils.randomAscii(1_000_000).getBytes(StandardCharsets.UTF_8); - Map attributes = - ImmutableMap.builder() - .put("id", "1") - .put("description", RandomStringUtils.randomAscii(100)) - .build(); + byte[] payload = new byte[] {-16, -97, -89, -86}; // U+1F9EA - PubsubMessage outgoingMessage = - new PubsubMessage(payload, attributes, "test_message", "111222"); + Map outgoingMessages = new HashMap<>(); + outgoingMessages.put("0", new PubsubMessage(payload, ImmutableMap.of("id", "0"))); + outgoingMessages.put( + "1", new PubsubMessage(payload, ImmutableMap.of("id", "1")).withOrderingKey("12")); + outgoingMessages.put( + "2", new PubsubMessage(payload, ImmutableMap.of("id", "2")).withOrderingKey("12")); + outgoingMessages.put( + "3", new PubsubMessage(payload, ImmutableMap.of("id", "3")).withOrderingKey("3")); pipeline - .apply(Create.of(outgoingMessage).withCoder(PubsubMessageSchemaCoder.getSchemaCoder())) + .apply( + Create.of(ImmutableList.copyOf(outgoingMessages.values())) + .withCoder(PubsubMessageSchemaCoder.getSchemaCoder())) .apply(PubsubIO.writeMessages().withOrderingKey().to(testTopicPath.getPath())); pipeline.run().waitUntilFinish(); - List incomingMessages = - pubsubClient.pull(Instant.now().getMillis(), testSubscriptionPath, 1, true); - // sometimes the first pull comes up short. try 4 pulls to avoid flaky false-negatives - int numPulls = 1; - while (incomingMessages.isEmpty()) { - if (numPulls >= 4) { + int numPulls = 0; + while (!outgoingMessages.isEmpty()) { + boolean emptyOrDuplicatePull = true; + List incomingMessages = + pubsubClient.pull(Instant.now().getMillis(), testSubscriptionPath, 4, true); + + for (IncomingMessage incomingMessage : incomingMessages) { + com.google.pubsub.v1.PubsubMessage message = incomingMessage.message(); + @Nullable + PubsubMessage outgoingMessage = + outgoingMessages.remove(message.getAttributesMap().get("id")); + if (outgoingMessage != null) { + emptyOrDuplicatePull = false; + assertEquals( + UnsafeByteOperations.unsafeWrap(outgoingMessage.getPayload()), message.getData()); + assertEquals(outgoingMessage.getAttributeMap(), message.getAttributesMap()); + assertEquals(outgoingMessage.getOrderingKey(), message.getOrderingKey()); + } + } + + if (emptyOrDuplicatePull && ++numPulls > 4) { throw new RuntimeException( - String.format("Pulled %s times from PubSub but retrieved no elements.", numPulls)); + String.format( + "Pulled %s times from PubSub but retrieved no expected elements.", numPulls)); } - incomingMessages = - pubsubClient.pull(Instant.now().getMillis(), testSubscriptionPath, 1, true); - numPulls++; } - assertEquals(1, incomingMessages.size()); - - com.google.pubsub.v1.PubsubMessage incomingMessage = incomingMessages.get(0).message(); - assertTrue( - Arrays.equals(outgoingMessage.getPayload(), incomingMessage.getData().toByteArray())); - assertEquals(outgoingMessage.getAttributeMap(), incomingMessage.getAttributesMap()); - assertEquals(outgoingMessage.getOrderingKey(), incomingMessage.getOrderingKey()); pubsubClient.deleteSubscription(testSubscriptionPath); pubsubClient.deleteTopic(testTopicPath); From e69aae102ca783f2339006e521c8e0a383bc482c Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Tue, 29 Apr 2025 08:43:23 +0200 Subject: [PATCH 38/43] Fix assertion failure on empty ordering key --- .../java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteIT.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteIT.java index 6b188b515cea..0b01befef491 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteIT.java @@ -126,7 +126,8 @@ public void testBoundedWriteMessageWithAttributesAndOrderingKey() throws IOExcep byte[] payload = new byte[] {-16, -97, -89, -86}; // U+1F9EA Map outgoingMessages = new HashMap<>(); - outgoingMessages.put("0", new PubsubMessage(payload, ImmutableMap.of("id", "0"))); + outgoingMessages.put( + "0", new PubsubMessage(payload, ImmutableMap.of("id", "0")).withOrderingKey("")); outgoingMessages.put( "1", new PubsubMessage(payload, ImmutableMap.of("id", "1")).withOrderingKey("12")); outgoingMessages.put( From 05369b953afb310e9d5e58fd4768440b5fe44423 Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Tue, 29 Apr 2025 08:47:19 +0200 Subject: [PATCH 39/43] Add comments --- .../beam/sdk/io/gcp/pubsub/PubsubWriteIT.java | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteIT.java index 0b01befef491..45c85183d536 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubWriteIT.java @@ -125,15 +125,29 @@ public void testBoundedWriteMessageWithAttributesAndOrderingKey() throws IOExcep byte[] payload = new byte[] {-16, -97, -89, -86}; // U+1F9EA + // Messages with and without ordering keys can be mixed together in a collection. The writer + // ensures that a publish request only sends message batches with the same ordering key, + // otherwise the Pub/Sub service will reject the request. + // Outgoing messages may specify either null or empty string to represent messages without + // ordering keys, but Protobuf treats strings as primitives so we explicitly use empty string in + // this test for the round trip assertion. Map outgoingMessages = new HashMap<>(); outgoingMessages.put( - "0", new PubsubMessage(payload, ImmutableMap.of("id", "0")).withOrderingKey("")); + "0", + new PubsubMessage(payload, ImmutableMap.of("id", "0")) + .withOrderingKey("")); // No ordering key outgoingMessages.put( - "1", new PubsubMessage(payload, ImmutableMap.of("id", "1")).withOrderingKey("12")); + "1", + new PubsubMessage(payload, ImmutableMap.of("id", "1")) + .withOrderingKey("12")); // Repeated ordering key outgoingMessages.put( - "2", new PubsubMessage(payload, ImmutableMap.of("id", "2")).withOrderingKey("12")); + "2", + new PubsubMessage(payload, ImmutableMap.of("id", "2")) + .withOrderingKey("12")); // Repeated ordering key outgoingMessages.put( - "3", new PubsubMessage(payload, ImmutableMap.of("id", "3")).withOrderingKey("3")); + "3", + new PubsubMessage(payload, ImmutableMap.of("id", "3")) + .withOrderingKey("3")); // Distinct ordering key pipeline .apply( From 2aa00bf8b98887538f899e4541ea7c506d5a08c1 Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Tue, 29 Apr 2025 11:05:49 +0200 Subject: [PATCH 40/43] Add unit tests for ordering key writes --- .../sdk/io/gcp/pubsub/PubsubTestClient.java | 6 + .../gcp/pubsub/PubsubUnboundedSinkTest.java | 162 +++++++++++++++++- 2 files changed, 163 insertions(+), 5 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubTestClient.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubTestClient.java index 3d5a879fce15..87ef641fbe2b 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubTestClient.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubTestClient.java @@ -32,6 +32,7 @@ import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Sets; +import org.checkerframework.checker.nullness.qual.MonotonicNonNull; import org.checkerframework.checker.nullness.qual.Nullable; /** @@ -461,7 +462,12 @@ public int publish(TopicPath topic, List outgoingMessages) thro topic, STATE.expectedTopic); } + @MonotonicNonNull String batchOrderingKey = null; for (OutgoingMessage outgoingMessage : outgoingMessages) { + if (batchOrderingKey == null) { + batchOrderingKey = outgoingMessage.getMessage().getOrderingKey(); + } + checkState(outgoingMessage.getMessage().getOrderingKey().equals(batchOrderingKey)); if (isDynamic) { checkState(outgoingMessage.topic().equals(topic.getPath())); } else { diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSinkTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSinkTest.java index 5296f4850ef1..9a78f1c40865 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSinkTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSinkTest.java @@ -17,6 +17,8 @@ */ package org.apache.beam.sdk.io.gcp.pubsub; +import static org.junit.Assert.assertThrows; + import com.google.protobuf.ByteString; import java.io.IOException; import java.io.Serializable; @@ -25,6 +27,7 @@ import java.util.List; import java.util.Map; import java.util.stream.Collectors; +import org.apache.beam.sdk.Pipeline.PipelineExecutionException; import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.OutgoingMessage; import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.TopicPath; import org.apache.beam.sdk.io.gcp.pubsub.PubsubTestClient.PubsubTestClientFactory; @@ -35,10 +38,12 @@ import org.apache.beam.sdk.transforms.Create; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.SerializableFunction; import org.apache.beam.sdk.values.TimestampedValue; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.hash.Hashing; +import org.checkerframework.checker.nullness.qual.Nullable; import org.joda.time.Duration; import org.joda.time.Instant; import org.junit.Rule; @@ -53,26 +58,41 @@ public class PubsubUnboundedSinkTest implements Serializable { private static final String DATA = "testData"; private static final ImmutableMap ATTRIBUTES = ImmutableMap.builder().put("a", "b").put("c", "d").build(); + private static final SerializableFunction ORDERING_KEY_FN = + e -> String.valueOf(e.length()); private static final long TIMESTAMP = 1234L; private static final String TIMESTAMP_ATTRIBUTE = "timestamp"; private static final String ID_ATTRIBUTE = "id"; private static final int NUM_SHARDS = 10; private static class Stamp extends DoFn { - private final Map attributes; + private final @Nullable Map attributes; + private final SerializableFunction orderingKeyFn; private Stamp() { - this(ImmutableMap.of()); + this(ImmutableMap.of(), e -> null); + } + + private Stamp(SerializableFunction orderingKeyFn) { + this(ImmutableMap.of(), orderingKeyFn); } - private Stamp(Map attributes) { + private Stamp(@Nullable Map attributes) { + this(attributes, e -> null); + } + + private Stamp( + @Nullable Map attributes, + SerializableFunction orderingKeyFn) { this.attributes = attributes; + this.orderingKeyFn = orderingKeyFn; } @ProcessElement public void processElement(ProcessContext c) { c.outputWithTimestamp( - new PubsubMessage(c.element().getBytes(StandardCharsets.UTF_8), attributes), + new PubsubMessage(c.element().getBytes(StandardCharsets.UTF_8), attributes) + .withOrderingKey(orderingKeyFn.apply(c.element())), new Instant(TIMESTAMP)); } } @@ -160,7 +180,7 @@ public void sendOneMessageWithoutAttributes() throws IOException { RecordIdMethod.DETERMINISTIC, null); p.apply(Create.of(ImmutableList.of(DATA))) - .apply(ParDo.of(new Stamp(null /* attributes */))) + .apply(ParDo.of((new Stamp((@Nullable Map) null /* attributes */)))) .apply(sink); p.run(); } @@ -320,6 +340,138 @@ public void sendMoreThanOneBatchByByteSize() throws IOException { // message does not match the expected publish message. } + @Test + public void sendOneMessageWithWrongCoderForOrderingKey() throws IOException { + List outgoing = + ImmutableList.of( + OutgoingMessage.of( + com.google.pubsub.v1.PubsubMessage.newBuilder() + .setData(ByteString.copyFromUtf8(DATA)) + .putAllAttributes(ATTRIBUTES) + .setOrderingKey(ORDERING_KEY_FN.apply(DATA)) + .build(), + TIMESTAMP, + getRecordId(DATA), + null)); + assertThrows( + PipelineExecutionException.class, + () -> { + try (PubsubTestClientFactory factory = + PubsubTestClient.createFactoryForPublish(TOPIC, outgoing, ImmutableList.of())) { + PubsubUnboundedSink sink = + new PubsubUnboundedSink( + factory, + StaticValueProvider.of(TOPIC), + TIMESTAMP_ATTRIBUTE, + ID_ATTRIBUTE, + NUM_SHARDS, + true, + 1 /* batchSize */, + 1 /* batchBytes */, + Duration.standardSeconds(2), + RecordIdMethod.DETERMINISTIC, + null); + p.apply(Create.of(DATA)) + .apply(ParDo.of(new Stamp(ATTRIBUTES, ORDERING_KEY_FN))) + .apply(sink); + p.run(); + } + }); + // The PubsubTestClientFactory will assert fail on close if the actual published + // message does not match the expected publish message. + } + + @Test + public void sendOneMessagePerOrderingKey() throws IOException { + List outgoing = + ImmutableList.of( + OutgoingMessage.of( + com.google.pubsub.v1.PubsubMessage.newBuilder() + .setData(ByteString.copyFromUtf8(DATA)) + .putAllAttributes(ATTRIBUTES) + .setOrderingKey(ORDERING_KEY_FN.apply(DATA)) + .build(), + TIMESTAMP, + getRecordId(DATA), + null), + OutgoingMessage.of( + com.google.pubsub.v1.PubsubMessage.newBuilder() + .setData(ByteString.copyFromUtf8(DATA + DATA)) + .putAllAttributes(ATTRIBUTES) + .setOrderingKey(ORDERING_KEY_FN.apply(DATA + DATA)) + .build(), + TIMESTAMP, + getRecordId(DATA + DATA), + null)); + try (PubsubTestClientFactory factory = + PubsubTestClient.createFactoryForPublish(TOPIC, outgoing, ImmutableList.of())) { + PubsubUnboundedSink sink = + new PubsubUnboundedSink( + factory, + StaticValueProvider.of(TOPIC), + TIMESTAMP_ATTRIBUTE, + ID_ATTRIBUTE, + NUM_SHARDS, + true, + 1 /* batchSize */, + 1 /* batchBytes */, + Duration.standardSeconds(2), + RecordIdMethod.DETERMINISTIC, + null); + p.apply(Create.of(DATA, DATA + DATA)) + .apply(ParDo.of(new Stamp(ATTRIBUTES, ORDERING_KEY_FN))) + .setCoder(PubsubMessageSchemaCoder.getSchemaCoder()) + .apply(sink); + p.run(); + } + // The PubsubTestClientFactory will assert fail on close if the actual published + // message does not match the expected publish message. + } + + @Test + public void sendMoreThanOneBatchByOrderingKey() throws IOException { + List outgoing = new ArrayList<>(); + List data = new ArrayList<>(); + int batchSize = 2; + int batchBytes = 1000; + for (int i = 0; i < batchSize * 10; i++) { + String str = String.valueOf(i); + outgoing.add( + OutgoingMessage.of( + com.google.pubsub.v1.PubsubMessage.newBuilder() + .setData(ByteString.copyFromUtf8(str)) + .setOrderingKey(ORDERING_KEY_FN.apply(str)) + .build(), + TIMESTAMP, + getRecordId(str), + null)); + data.add(str); + } + try (PubsubTestClientFactory factory = + PubsubTestClient.createFactoryForPublish(TOPIC, outgoing, ImmutableList.of())) { + PubsubUnboundedSink sink = + new PubsubUnboundedSink( + factory, + StaticValueProvider.of(TOPIC), + TIMESTAMP_ATTRIBUTE, + ID_ATTRIBUTE, + NUM_SHARDS, + true, + batchSize, + batchBytes, + Duration.standardSeconds(2), + RecordIdMethod.DETERMINISTIC, + null); + p.apply(Create.of(data)) + .apply(ParDo.of(new Stamp(ORDERING_KEY_FN))) + .setCoder(PubsubMessageSchemaCoder.getSchemaCoder()) + .apply(sink); + p.run(); + } + // The PubsubTestClientFactory will assert fail on close if the actual published + // message does not match the expected publish message. + } + // TODO: We would like to test that failed Pubsub publish calls cause the already assigned // (and random) record ids to be reused. However that can't be done without the test runnner // supporting retrying bundles. From 8730e488bb3fc187b450fd8cb52cc8aed345092d Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Tue, 29 Apr 2025 11:07:55 +0200 Subject: [PATCH 41/43] Unconditionally reset test client state to prevent global state corruption across tests --- .../sdk/io/gcp/pubsub/PubsubTestClient.java | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubTestClient.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubTestClient.java index 87ef641fbe2b..7bf342eee8c6 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubTestClient.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubTestClient.java @@ -305,12 +305,23 @@ private static void activate(Runnable setStateValues) { private static void deactivate(Runnable runFinalChecks) { synchronized (STATE) { checkState(STATE.isActive, "No test still in flight"); - runFinalChecks.run(); - STATE.remainingExpectedOutgoingMessages = null; - STATE.remainingPendingIncomingMessages = null; - STATE.pendingAckIncomingMessages = null; - STATE.ackDeadline = null; - STATE.isActive = false; + try { + runFinalChecks.run(); + } finally { + STATE.isPublish = false; + STATE.expectedTopic = null; + STATE.remainingExpectedOutgoingMessages = null; + STATE.remainingFailingOutgoingMessages = null; + STATE.clock = null; + STATE.expectedSubscription = null; + STATE.ackTimeoutSec = 0; + STATE.remainingPendingIncomingMessages = null; + STATE.pendingAckIncomingMessages = null; + STATE.ackDeadline = null; + STATE.expectedSchemaPath = null; + STATE.expectedSchema = null; + STATE.isActive = false; + } } } From d4764065751fb0ef7bf5c2b4e8effebaf7438423 Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Wed, 30 Apr 2025 09:45:31 +0000 Subject: [PATCH 42/43] Shuffle input for multiple ordering key batches --- .../apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSinkTest.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSinkTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSinkTest.java index 9a78f1c40865..4853a09a1875 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSinkTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSinkTest.java @@ -24,6 +24,7 @@ import java.io.Serializable; import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.Map; import java.util.stream.Collectors; @@ -447,6 +448,7 @@ public void sendMoreThanOneBatchByOrderingKey() throws IOException { null)); data.add(str); } + Collections.shuffle(data); try (PubsubTestClientFactory factory = PubsubTestClient.createFactoryForPublish(TOPIC, outgoing, ImmutableList.of())) { PubsubUnboundedSink sink = From 7f617616238eb55ffaf11836c9b865d1ae17bb35 Mon Sep 17 00:00:00 2001 From: Steven van Rossum Date: Wed, 30 Apr 2025 09:56:18 +0000 Subject: [PATCH 43/43] Add comment above call to shuffle --- .../apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSinkTest.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSinkTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSinkTest.java index 4853a09a1875..49b7f88f9e25 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSinkTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubUnboundedSinkTest.java @@ -448,7 +448,10 @@ public void sendMoreThanOneBatchByOrderingKey() throws IOException { null)); data.add(str); } + + // Randomly shuffle test input to highlight potential issues caused by order dependent logic. Collections.shuffle(data); + try (PubsubTestClientFactory factory = PubsubTestClient.createFactoryForPublish(TOPIC, outgoing, ImmutableList.of())) { PubsubUnboundedSink sink =