From ce14b965f1eba83535144069a38048952a1db24f Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Mon, 3 Jun 2024 19:14:14 -0400 Subject: [PATCH 01/24] managed bigqueryio --- .../io/google-cloud-platform/build.gradle | 1 + .../BigQuerySchemaTransformTranslation.java | 85 +++++++ ...ueryDirectReadSchemaTransformProvider.java | 28 ++- ...torageWriteApiSchemaTransformProvider.java | 26 ++- .../io/gcp/bigquery/BigQueryManagedIT.java | 103 +++++++++ ...igQuerySchemaTransformTranslationTest.java | 207 ++++++++++++++++++ .../org/apache/beam/sdk/managed/Managed.java | 3 + .../managed/ManagedTransformConstants.java | 25 +++ 8 files changed, 475 insertions(+), 3 deletions(-) create mode 100644 sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySchemaTransformTranslation.java create mode 100644 sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java create mode 100644 sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySchemaTransformTranslationTest.java diff --git a/sdks/java/io/google-cloud-platform/build.gradle b/sdks/java/io/google-cloud-platform/build.gradle index 5554d9964ac3..e54ce28aa277 100644 --- a/sdks/java/io/google-cloud-platform/build.gradle +++ b/sdks/java/io/google-cloud-platform/build.gradle @@ -43,6 +43,7 @@ dependencies { implementation project(":sdks:java:extensions:google-cloud-platform-core") implementation project(":sdks:java:extensions:protobuf") implementation project(":sdks:java:extensions:arrow") + implementation project(":sdks:java:managed") implementation library.java.avro implementation library.java.bigdataoss_util implementation library.java.error_prone_annotations diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySchemaTransformTranslation.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySchemaTransformTranslation.java new file mode 100644 index 000000000000..102a1840e177 --- /dev/null +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySchemaTransformTranslation.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.io.gcp.bigquery; + +import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryDirectReadSchemaTransformProvider.BigQueryDirectReadSchemaTransform; +import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider.BigQueryStorageWriteApiSchemaTransform; + +import com.google.auto.service.AutoService; +import java.util.Map; +import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryDirectReadSchemaTransformProvider; +import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider; +import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; +import org.apache.beam.sdk.schemas.transforms.SchemaTransformTranslation; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.util.construction.PTransformTranslation; +import org.apache.beam.sdk.util.construction.TransformPayloadTranslatorRegistrar; +import org.apache.beam.sdk.values.Row; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; + +public class BigQuerySchemaTransformTranslation { + public static class BigQueryStorageReadSchemaTransformTranslator + extends SchemaTransformTranslation.SchemaTransformPayloadTranslator< + BigQueryDirectReadSchemaTransform> { + @Override + public SchemaTransformProvider provider() { + return new BigQueryDirectReadSchemaTransformProvider(); + } + + @Override + public Row toConfigRow(BigQueryDirectReadSchemaTransform transform) { + return transform.getConfigurationRow(); + } + } + + public static class BigQueryStorageWriteSchemaTransformTranslator + extends SchemaTransformTranslation.SchemaTransformPayloadTranslator< + BigQueryStorageWriteApiSchemaTransform> { + @Override + public SchemaTransformProvider provider() { + return new BigQueryStorageWriteApiSchemaTransformProvider(); + } + + @Override + public Row toConfigRow(BigQueryStorageWriteApiSchemaTransform transform) { + return transform.getConfigurationRow(); + } + } + + @AutoService(TransformPayloadTranslatorRegistrar.class) + public static class ReadWriteRegistrar implements TransformPayloadTranslatorRegistrar { + @Override + @SuppressWarnings({ + "rawtypes", + }) + public Map< + ? extends Class, + ? 
extends PTransformTranslation.TransformPayloadTranslator> + getTransformPayloadTranslators() { + return ImmutableMap + ., PTransformTranslation.TransformPayloadTranslator>builder() + .put( + BigQueryDirectReadSchemaTransform.class, + new BigQueryStorageReadSchemaTransformTranslator()) + .put( + BigQueryStorageWriteApiSchemaTransform.class, + new BigQueryStorageWriteSchemaTransformTranslator()) + .build(); + } + } +} diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryDirectReadSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryDirectReadSchemaTransformProvider.java index 8b8e8179ce7d..d90a0c3ed970 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryDirectReadSchemaTransformProvider.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryDirectReadSchemaTransformProvider.java @@ -33,7 +33,9 @@ import org.apache.beam.sdk.io.gcp.bigquery.BigQueryUtils; import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryDirectReadSchemaTransformProvider.BigQueryDirectReadSchemaTransformConfiguration; import org.apache.beam.sdk.schemas.AutoValueSchema; +import org.apache.beam.sdk.schemas.NoSuchSchemaException; import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.schemas.SchemaRegistry; import org.apache.beam.sdk.schemas.annotations.DefaultSchema; import org.apache.beam.sdk.schemas.annotations.SchemaFieldDescription; import org.apache.beam.sdk.schemas.transforms.SchemaTransform; @@ -62,7 +64,7 @@ public class BigQueryDirectReadSchemaTransformProvider extends TypedSchemaTransformProvider { - private static final String OUTPUT_TAG = "OUTPUT_ROWS"; + public static final String OUTPUT_TAG = "OUTPUT_ROWS"; @Override protected Class configurationClass() { @@ -139,6 +141,10 @@ public static Builder builder() { @Nullable public abstract List getSelectedFields(); + @SchemaFieldDescription("Use this Cloud KMS key to encrypt your data") + @Nullable + public abstract String getKmsKey(); + @Nullable /** Builder for the {@link BigQueryDirectReadSchemaTransformConfiguration}. */ @AutoValue.Builder @@ -151,6 +157,8 @@ public abstract static class Builder { public abstract Builder setSelectedFields(List selectedFields); + public abstract Builder setKmsKey(String kmsKey); + /** Builds a {@link BigQueryDirectReadSchemaTransformConfiguration} instance. */ public abstract BigQueryDirectReadSchemaTransformConfiguration build(); } @@ -161,7 +169,7 @@ public abstract static class Builder { * BigQueryDirectReadSchemaTransformConfiguration} and instantiated by {@link * BigQueryDirectReadSchemaTransformProvider}. 
*/ - protected static class BigQueryDirectReadSchemaTransform extends SchemaTransform { + public static class BigQueryDirectReadSchemaTransform extends SchemaTransform { private BigQueryServices testBigQueryServices = null; private final BigQueryDirectReadSchemaTransformConfiguration configuration; @@ -172,6 +180,19 @@ protected static class BigQueryDirectReadSchemaTransform extends SchemaTransform this.configuration = configuration; } + public Row getConfigurationRow() { + try { + // To stay consistent with our SchemaTransform configuration naming conventions, + // we sort lexicographically + return SchemaRegistry.createDefault() + .getToRowFunction(BigQueryDirectReadSchemaTransformConfiguration.class) + .apply(configuration) + .sorted(); + } catch (NoSuchSchemaException e) { + throw new RuntimeException(e); + } + } + @VisibleForTesting public void setBigQueryServices(BigQueryServices testBigQueryServices) { this.testBigQueryServices = testBigQueryServices; @@ -211,6 +232,9 @@ BigQueryIO.TypedRead createDirectReadTransform() { } else { read = read.fromQuery(configuration.getQuery()); } + if (!Strings.isNullOrEmpty(configuration.getKmsKey())) { + read = read.withKmsKey(configuration.getKmsKey()); + } if (this.testBigQueryServices != null) { read = read.withTestServices(testBigQueryServices); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java index 980d783ec43c..e5f6b370fa87 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java @@ -43,9 +43,11 @@ import org.apache.beam.sdk.metrics.Counter; import org.apache.beam.sdk.metrics.Metrics; import org.apache.beam.sdk.schemas.AutoValueSchema; +import org.apache.beam.sdk.schemas.NoSuchSchemaException; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.Schema.Field; import org.apache.beam.sdk.schemas.Schema.FieldType; +import org.apache.beam.sdk.schemas.SchemaRegistry; import org.apache.beam.sdk.schemas.annotations.DefaultSchema; import org.apache.beam.sdk.schemas.annotations.SchemaFieldDescription; import org.apache.beam.sdk.schemas.transforms.SchemaTransform; @@ -253,6 +255,10 @@ public static Builder builder() { @Nullable public abstract Integer getNumStreams(); + @SchemaFieldDescription("Use this Cloud KMS key to encrypt your data") + @Nullable + public abstract String getKmsKey(); + @SchemaFieldDescription("This option specifies whether and where to output unwritable rows.") @Nullable public abstract ErrorHandling getErrorHandling(); @@ -275,6 +281,8 @@ public abstract static class Builder { public abstract Builder setNumStreams(Integer numStreams); + public abstract Builder setKmsKey(String kmsKey); + public abstract Builder setErrorHandling(ErrorHandling errorHandling); /** Builds a {@link BigQueryStorageWriteApiSchemaTransformConfiguration} instance. */ @@ -289,7 +297,7 @@ public abstract static class Builder { * BigQueryStorageWriteApiSchemaTransformConfiguration} and instantiated by {@link * BigQueryStorageWriteApiSchemaTransformProvider}. 
*/ - protected static class BigQueryStorageWriteApiSchemaTransform extends SchemaTransform { + public static class BigQueryStorageWriteApiSchemaTransform extends SchemaTransform { private BigQueryServices testBigQueryServices = null; private final BigQueryStorageWriteApiSchemaTransformConfiguration configuration; @@ -453,6 +461,19 @@ public PCollectionRowTuple expand(PCollectionRowTuple input) { } } + public Row getConfigurationRow() { + try { + // To stay consistent with our SchemaTransform configuration naming conventions, + // we sort lexicographically + return SchemaRegistry.createDefault() + .getToRowFunction(BigQueryStorageWriteApiSchemaTransformConfiguration.class) + .apply(configuration) + .sorted(); + } catch (NoSuchSchemaException e) { + throw new RuntimeException(e); + } + } + BigQueryIO.Write createStorageWriteApiTransform(Schema schema) { Method writeMethod = configuration.getUseAtLeastOnceSemantics() != null @@ -491,6 +512,9 @@ BigQueryIO.Write createStorageWriteApiTransform(Schema schema) { configuration.getWriteDisposition().toUpperCase()); write = write.withWriteDisposition(writeDisposition); } + if (!Strings.isNullOrEmpty(configuration.getKmsKey())) { + write = write.withKmsKey(configuration.getKmsKey()); + } if (this.testBigQueryServices != null) { write = write.withTestServices(testBigQueryServices); diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java new file mode 100644 index 000000000000..e52b5fa24b45 --- /dev/null +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.io.gcp.bigquery; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.LongStream; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; +import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryDirectReadSchemaTransformProvider; +import org.apache.beam.sdk.io.gcp.testing.BigqueryClient; +import org.apache.beam.sdk.managed.Managed; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionRowTuple; +import org.apache.beam.sdk.values.Row; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; +import org.testcontainers.shaded.com.google.common.collect.ImmutableMap; + +@RunWith(JUnit4.class) +public class BigQueryManagedIT { + private static final Schema SCHEMA = + Schema.of( + Schema.Field.of("str", Schema.FieldType.STRING), + Schema.Field.of("number", Schema.FieldType.INT64)); + + private static final List ROWS = + LongStream.range(0, 20) + .mapToObj( + i -> + Row.withSchema(SCHEMA) + .withFieldValue("str", Long.toString(i)) + .withFieldValue("number", i) + .build()) + .collect(Collectors.toList()); + + private static final BigqueryClient BQ_CLIENT = new BigqueryClient("BigQueryManagedIT"); + + private static final String PROJECT = + TestPipeline.testingPipelineOptions().as(GcpOptions.class).getProject(); + private static final String BIG_QUERY_DATASET_ID = "bigquery_managed_" + System.nanoTime(); + + @BeforeClass + public static void setUpTestEnvironment() throws IOException, InterruptedException { + // Create one BQ dataset for all test cases. 
+ BQ_CLIENT.createNewDataset(PROJECT, BIG_QUERY_DATASET_ID, null); + } + + @AfterClass + public static void cleanup() { + BQ_CLIENT.deleteDataset(PROJECT, BIG_QUERY_DATASET_ID); + } + + @Test + public void testSimpleStorageWriteRead() { + String table = String.format("%s:%s.managed_read_write", PROJECT, BIG_QUERY_DATASET_ID); + + Map writeConfig = + ImmutableMap.builder() + .put("table", table) + .put("create_disposition", "create_if_needed") + .put("at_least_once", false) + .build(); + Pipeline p = Pipeline.create(); + PCollectionRowTuple.of("input", p.apply(Create.of(ROWS)).setRowSchema(SCHEMA)) + .apply(Managed.write(Managed.BIGQUERY_STORAGE).withConfig(writeConfig)); + p.run().waitUntilFinish(); + + Map readConfig = + ImmutableMap.builder().put("table", table).build(); + Pipeline q = Pipeline.create(); + PCollection outputRows = + PCollectionRowTuple.empty(p) + .apply(Managed.read(Managed.BIGQUERY_STORAGE).withConfig(readConfig)) + .get(BigQueryDirectReadSchemaTransformProvider.OUTPUT_TAG); + PAssert.that(outputRows).containsInAnyOrder(ROWS); + q.run().waitUntilFinish(); + } +} diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySchemaTransformTranslationTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySchemaTransformTranslationTest.java new file mode 100644 index 000000000000..d1dd787253c1 --- /dev/null +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySchemaTransformTranslationTest.java @@ -0,0 +1,207 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.io.gcp.bigquery; + +import static org.apache.beam.model.pipeline.v1.ExternalTransforms.ExpansionMethods.Enum.SCHEMA_TRANSFORM; +import static org.apache.beam.sdk.io.gcp.bigquery.BigQuerySchemaTransformTranslation.BigQueryStorageReadSchemaTransformTranslator; +import static org.apache.beam.sdk.io.gcp.bigquery.BigQuerySchemaTransformTranslation.BigQueryStorageWriteSchemaTransformTranslator; +import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryDirectReadSchemaTransformProvider.BigQueryDirectReadSchemaTransform; +import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider.BigQueryStorageWriteApiSchemaTransform; +import static org.junit.Assert.assertEquals; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; +import org.apache.beam.model.pipeline.v1.ExternalTransforms.SchemaTransformPayload; +import org.apache.beam.model.pipeline.v1.RunnerApi; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.coders.RowCoder; +import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryDirectReadSchemaTransformProvider; +import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.schemas.SchemaTranslation; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.util.construction.BeamUrns; +import org.apache.beam.sdk.util.construction.PipelineTranslation; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionRowTuple; +import org.apache.beam.sdk.values.Row; +import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.InvalidProtocolBufferException; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class BigQuerySchemaTransformTranslationTest { + static final BigQueryStorageWriteApiSchemaTransformProvider WRITE_PROVIDER = + new BigQueryStorageWriteApiSchemaTransformProvider(); + static final BigQueryDirectReadSchemaTransformProvider READ_PROVIDER = + new BigQueryDirectReadSchemaTransformProvider(); + static final Row WRITE_CONFIG_ROW = + Row.withSchema(WRITE_PROVIDER.configurationSchema()) + .withFieldValue("table", "project:dataset.table") + .withFieldValue("createDisposition", "create_never") + .withFieldValue("writeDisposition", "write_append") + .withFieldValue("triggeringFrequencySeconds", 5L) + .withFieldValue("useAtLeastOnceSemantics", false) + .withFieldValue("autoSharding", false) + .withFieldValue("numStreams", 5) + .withFieldValue("errorHandling", null) + .build(); + static final Row READ_CONFIG_ROW = + Row.withSchema(READ_PROVIDER.configurationSchema()) + .withFieldValue("query", null) + .withFieldValue("tableSpec", "apache-beam-testing.samples.weather_stations") + .withFieldValue("rowRestriction", "col < 5") + .withFieldValue("selectedFields", Arrays.asList("col1", "col2", "col3")) + .build(); + + @Test + public void testRecreateWriteTransformFromRow() { + BigQueryStorageWriteApiSchemaTransform writeTransform = + (BigQueryStorageWriteApiSchemaTransform) WRITE_PROVIDER.from(WRITE_CONFIG_ROW); + + BigQueryStorageWriteSchemaTransformTranslator translator = + new BigQueryStorageWriteSchemaTransformTranslator(); + Row translatedRow = translator.toConfigRow(writeTransform); + + 
BigQueryStorageWriteApiSchemaTransform writeTransformFromRow = + translator.fromConfigRow(translatedRow, PipelineOptionsFactory.create()); + + assertEquals(WRITE_CONFIG_ROW, writeTransformFromRow.getConfigurationRow()); + } + + @Test + public void testWriteTransformProtoTranslation() + throws InvalidProtocolBufferException, IOException { + // First build a pipeline + Pipeline p = Pipeline.create(); + Schema inputSchema = Schema.builder().addByteArrayField("b").build(); + PCollection input = + p.apply( + Create.of( + Collections.singletonList( + Row.withSchema(inputSchema).addValue(new byte[] {1, 2, 3}).build()))) + .setRowSchema(inputSchema); + + BigQueryStorageWriteApiSchemaTransform writeTransform = + (BigQueryStorageWriteApiSchemaTransform) WRITE_PROVIDER.from(WRITE_CONFIG_ROW); + PCollectionRowTuple.of("input", input).apply(writeTransform); + + // Then translate the pipeline to a proto and extract KafkaWriteSchemaTransform proto + RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p); + List writeTransformProto = + pipelineProto.getComponents().getTransformsMap().values().stream() + .filter( + tr -> { + RunnerApi.FunctionSpec spec = tr.getSpec(); + try { + return spec.getUrn().equals(BeamUrns.getUrn(SCHEMA_TRANSFORM)) + && SchemaTransformPayload.parseFrom(spec.getPayload()) + .getIdentifier() + .equals(WRITE_PROVIDER.identifier()); + } catch (InvalidProtocolBufferException e) { + throw new RuntimeException(e); + } + }) + .collect(Collectors.toList()); + assertEquals(1, writeTransformProto.size()); + RunnerApi.FunctionSpec spec = writeTransformProto.get(0).getSpec(); + + // Check that the proto contains correct values + SchemaTransformPayload payload = SchemaTransformPayload.parseFrom(spec.getPayload()); + Schema schemaFromSpec = SchemaTranslation.schemaFromProto(payload.getConfigurationSchema()); + assertEquals(WRITE_PROVIDER.configurationSchema(), schemaFromSpec); + Row rowFromSpec = RowCoder.of(schemaFromSpec).decode(payload.getConfigurationRow().newInput()); + + assertEquals(WRITE_CONFIG_ROW, rowFromSpec); + + // Use the information in the proto to recreate the KafkaWriteSchemaTransform + BigQueryStorageWriteSchemaTransformTranslator translator = + new BigQueryStorageWriteSchemaTransformTranslator(); + BigQueryStorageWriteApiSchemaTransform writeTransformFromSpec = + translator.fromConfigRow(rowFromSpec, PipelineOptionsFactory.create()); + + assertEquals(WRITE_CONFIG_ROW, writeTransformFromSpec.getConfigurationRow()); + } + + @Test + public void testReCreateReadTransformFromRow() { + BigQueryDirectReadSchemaTransform readTransform = + (BigQueryDirectReadSchemaTransform) READ_PROVIDER.from(READ_CONFIG_ROW); + + BigQueryStorageReadSchemaTransformTranslator translator = + new BigQueryStorageReadSchemaTransformTranslator(); + Row row = translator.toConfigRow(readTransform); + + BigQueryDirectReadSchemaTransform readTransformFromRow = + translator.fromConfigRow(row, PipelineOptionsFactory.create()); + + assertEquals(READ_CONFIG_ROW, readTransformFromRow.getConfigurationRow()); + } + + @Test + public void testReadTransformProtoTranslation() + throws InvalidProtocolBufferException, IOException { + // First build a pipeline + Pipeline p = Pipeline.create(); + + BigQueryDirectReadSchemaTransform readTransform = + (BigQueryDirectReadSchemaTransform) READ_PROVIDER.from(READ_CONFIG_ROW); + + PCollectionRowTuple.empty(p).apply(readTransform); + + // Then translate the pipeline to a proto and extract KafkaReadSchemaTransform proto + RunnerApi.Pipeline pipelineProto = 
PipelineTranslation.toProto(p); + List readTransformProto = + pipelineProto.getComponents().getTransformsMap().values().stream() + .filter( + tr -> { + RunnerApi.FunctionSpec spec = tr.getSpec(); + try { + return spec.getUrn().equals(BeamUrns.getUrn(SCHEMA_TRANSFORM)) + && SchemaTransformPayload.parseFrom(spec.getPayload()) + .getIdentifier() + .equals(READ_PROVIDER.identifier()); + } catch (InvalidProtocolBufferException e) { + throw new RuntimeException(e); + } + }) + .collect(Collectors.toList()); + assertEquals(1, readTransformProto.size()); + RunnerApi.FunctionSpec spec = readTransformProto.get(0).getSpec(); + + // Check that the proto contains correct values + SchemaTransformPayload payload = SchemaTransformPayload.parseFrom(spec.getPayload()); + Schema schemaFromSpec = SchemaTranslation.schemaFromProto(payload.getConfigurationSchema()); + assertEquals(READ_PROVIDER.configurationSchema(), schemaFromSpec); + Row rowFromSpec = RowCoder.of(schemaFromSpec).decode(payload.getConfigurationRow().newInput()); + assertEquals(READ_CONFIG_ROW, rowFromSpec); + + // Use the information in the proto to recreate the KafkaReadSchemaTransform + BigQueryStorageReadSchemaTransformTranslator translator = + new BigQueryStorageReadSchemaTransformTranslator(); + BigQueryDirectReadSchemaTransform readTransformFromSpec = + translator.fromConfigRow(rowFromSpec, PipelineOptionsFactory.create()); + + assertEquals(READ_CONFIG_ROW, readTransformFromSpec.getConfigurationRow()); + } +} diff --git a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/Managed.java b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/Managed.java index da4a0853fb39..e02cf8c6ae95 100644 --- a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/Managed.java +++ b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/Managed.java @@ -78,17 +78,20 @@ public class Managed { // TODO: Dynamically generate a list of supported transforms public static final String ICEBERG = "iceberg"; public static final String KAFKA = "kafka"; + public static final String BIGQUERY_STORAGE = "bigquery_storage"; // Supported SchemaTransforms public static final Map READ_TRANSFORMS = ImmutableMap.builder() .put(ICEBERG, ManagedTransformConstants.ICEBERG_READ) .put(KAFKA, ManagedTransformConstants.KAFKA_READ) + .put(BIGQUERY_STORAGE, ManagedTransformConstants.BIGQUERY_STORAGE_READ) .build(); public static final Map WRITE_TRANSFORMS = ImmutableMap.builder() .put(ICEBERG, ManagedTransformConstants.ICEBERG_WRITE) .put(KAFKA, ManagedTransformConstants.KAFKA_WRITE) + .put(BIGQUERY_STORAGE, ManagedTransformConstants.BIGQUERY_STORAGE_WRITE) .build(); /** diff --git a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java index 8165633cf15e..575aba1ff25e 100644 --- a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java +++ b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java @@ -43,6 +43,10 @@ public class ManagedTransformConstants { "beam:schematransform:org.apache.beam:iceberg_write:v1"; public static final String KAFKA_READ = "beam:schematransform:org.apache.beam:kafka_read:v1"; public static final String KAFKA_WRITE = "beam:schematransform:org.apache.beam:kafka_write:v1"; + public static final String BIGQUERY_STORAGE_READ = + "beam:schematransform:org.apache.beam:bigquery_storage_read:v1"; + public static final String 
BIGQUERY_STORAGE_WRITE = + "beam:schematransform:org.apache.beam:bigquery_storage_write:v2"; private static final Map KAFKA_READ_MAPPINGS = ImmutableMap.builder() @@ -67,9 +71,30 @@ public class ManagedTransformConstants { .put("message_name", "messageName") .build(); + private static final Map BIGQUERY_STORAGE_READ_MAPPINGS = + ImmutableMap.builder() + .put("table", "tableSpec") + .put("query", "query") + .put("row_restriction", "rowRestriction") + .put("columns", "selectedFields") + .put("kms_key", "kmsKey") + .build(); + + private static final Map BIGQUERY_STORAGE_WRITE_MAPPINGS = + ImmutableMap.builder() + .put("table", "table") + .put("num_shards", "numStreams") + .put("triggering_frequency", "triggeringFrequencySeconds") + .put("create_disposition", "createDisposition") + .put("at_least_once", "useAtLeastOnceSemantics") + .put("kms_key", "kmsKey") + .build(); + public static final Map> MAPPINGS = ImmutableMap.>builder() .put(KAFKA_READ, KAFKA_READ_MAPPINGS) .put(KAFKA_WRITE, KAFKA_WRITE_MAPPINGS) + .put(BIGQUERY_STORAGE_READ, BIGQUERY_STORAGE_READ_MAPPINGS) + .put(BIGQUERY_STORAGE_WRITE, BIGQUERY_STORAGE_WRITE_MAPPINGS) .build(); } From 550c1b4de19b63239d8e7cb09b752a3392137e95 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Mon, 3 Jun 2024 21:25:45 -0400 Subject: [PATCH 02/24] spotless --- .../org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java index e52b5fa24b45..eaf8c234043e 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java @@ -34,12 +34,12 @@ import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionRowTuple; import org.apache.beam.sdk.values.Row; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; -import org.testcontainers.shaded.com.google.common.collect.ImmutableMap; @RunWith(JUnit4.class) public class BigQueryManagedIT { From c94de3c55e76cc88a0a422d718675d2c0c504c99 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Tue, 4 Jun 2024 08:57:18 -0400 Subject: [PATCH 03/24] move managed dependency to test only --- sdks/java/io/google-cloud-platform/build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/java/io/google-cloud-platform/build.gradle b/sdks/java/io/google-cloud-platform/build.gradle index e54ce28aa277..dba987f7cee9 100644 --- a/sdks/java/io/google-cloud-platform/build.gradle +++ b/sdks/java/io/google-cloud-platform/build.gradle @@ -43,7 +43,6 @@ dependencies { implementation project(":sdks:java:extensions:google-cloud-platform-core") implementation project(":sdks:java:extensions:protobuf") implementation project(":sdks:java:extensions:arrow") - implementation project(":sdks:java:managed") implementation library.java.avro implementation library.java.bigdataoss_util implementation library.java.error_prone_annotations @@ -167,6 +166,7 @@ dependencies { testImplementation project(path: ":runners:direct-java", configuration: "shadow") testImplementation 
project(path: ":sdks:java:io:common", configuration: "testRuntimeMigration") testImplementation project(path: ":sdks:java:testing:test-utils", configuration: "testRuntimeMigration") + testImplementation project(":sdks:java:managed") testImplementation library.java.mockito_core testImplementation library.java.powermock testImplementation library.java.powermock_mockito From f436e62104d04fc14098680569e34ad1dc021540 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Wed, 5 Jun 2024 10:30:32 -0400 Subject: [PATCH 04/24] cleanup after merging snake_case PR --- ...ueryDirectReadSchemaTransformProvider.java | 3 ++- ...torageWriteApiSchemaTransformProvider.java | 3 ++- ...igQuerySchemaTransformTranslationTest.java | 20 +++++++++---------- .../managed/ManagedTransformConstants.java | 14 +++---------- 4 files changed, 17 insertions(+), 23 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryDirectReadSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryDirectReadSchemaTransformProvider.java index d90a0c3ed970..e9db777801de 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryDirectReadSchemaTransformProvider.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryDirectReadSchemaTransformProvider.java @@ -187,7 +187,8 @@ public Row getConfigurationRow() { return SchemaRegistry.createDefault() .getToRowFunction(BigQueryDirectReadSchemaTransformConfiguration.class) .apply(configuration) - .sorted(); + .sorted() + .toSnakeCase(); } catch (NoSuchSchemaException e) { throw new RuntimeException(e); } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java index e5f6b370fa87..5fb9e9550454 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java @@ -468,7 +468,8 @@ public Row getConfigurationRow() { return SchemaRegistry.createDefault() .getToRowFunction(BigQueryStorageWriteApiSchemaTransformConfiguration.class) .apply(configuration) - .sorted(); + .sorted() + .toSnakeCase(); } catch (NoSuchSchemaException e) { throw new RuntimeException(e); } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySchemaTransformTranslationTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySchemaTransformTranslationTest.java index d1dd787253c1..bc6624bd9371 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySchemaTransformTranslationTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySchemaTransformTranslationTest.java @@ -58,20 +58,20 @@ public class BigQuerySchemaTransformTranslationTest { static final Row WRITE_CONFIG_ROW = Row.withSchema(WRITE_PROVIDER.configurationSchema()) .withFieldValue("table", "project:dataset.table") - 
.withFieldValue("createDisposition", "create_never") - .withFieldValue("writeDisposition", "write_append") - .withFieldValue("triggeringFrequencySeconds", 5L) - .withFieldValue("useAtLeastOnceSemantics", false) - .withFieldValue("autoSharding", false) - .withFieldValue("numStreams", 5) - .withFieldValue("errorHandling", null) + .withFieldValue("create_disposition", "create_never") + .withFieldValue("write_disposition", "write_append") + .withFieldValue("triggering_frequency_seconds", 5L) + .withFieldValue("use_at_least_once_semantics", false) + .withFieldValue("auto_sharding", false) + .withFieldValue("num_streams", 5) + .withFieldValue("error_handling", null) .build(); static final Row READ_CONFIG_ROW = Row.withSchema(READ_PROVIDER.configurationSchema()) .withFieldValue("query", null) - .withFieldValue("tableSpec", "apache-beam-testing.samples.weather_stations") - .withFieldValue("rowRestriction", "col < 5") - .withFieldValue("selectedFields", Arrays.asList("col1", "col2", "col3")) + .withFieldValue("table_spec", "apache-beam-testing.samples.weather_stations") + .withFieldValue("row_restriction", "col < 5") + .withFieldValue("selected_fields", Arrays.asList("col1", "col2", "col3")) .build(); @Test diff --git a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java index 238a554adb21..79ea648fb5fa 100644 --- a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java +++ b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java @@ -59,21 +59,13 @@ public class ManagedTransformConstants { private static final Map BIGQUERY_STORAGE_READ_MAPPINGS = ImmutableMap.builder() - .put("table", "tableSpec") - .put("query", "query") - .put("row_restriction", "rowRestriction") - .put("columns", "selectedFields") - .put("kms_key", "kmsKey") + .put("table", "table_spec") + .put("columns", "selected_fields") .build(); private static final Map BIGQUERY_STORAGE_WRITE_MAPPINGS = ImmutableMap.builder() - .put("table", "table") - .put("num_shards", "numStreams") - .put("triggering_frequency", "triggeringFrequencySeconds") - .put("create_disposition", "createDisposition") - .put("at_least_once", "useAtLeastOnceSemantics") - .put("kms_key", "kmsKey") + .put("at_least_once", "use_at_least_once_semantics") .build(); public static final Map> MAPPINGS = From fe6090420b72dfb6af21fe397dbd0eb9e0b23f47 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Tue, 9 Jul 2024 17:14:43 -0400 Subject: [PATCH 05/24] choose write method based on boundedness and pipeline options --- .../io/google-cloud-platform/build.gradle | 1 + .../expansion-service/build.gradle | 3 + ...oadsWriteSchemaTransformConfiguration.java | 72 ------- ...FileLoadsWriteSchemaTransformProvider.java | 200 ++++-------------- ...torageWriteApiSchemaTransformProvider.java | 58 ++--- ...LoadsWriteSchemaTransformProviderTest.java | 155 +------------- .../io/gcp/bigquery/BigQueryManagedIT.java | 105 ++++++++- sdks/java/managed/build.gradle | 3 + .../org/apache/beam/sdk/managed/Managed.java | 11 +- .../ManagedSchemaTransformProvider.java | 99 ++++++--- .../managed/ManagedTransformConstants.java | 16 +- .../ManagedSchemaTransformProviderTest.java | 57 ++++- 12 files changed, 303 insertions(+), 477 deletions(-) delete mode 100644 sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformConfiguration.java diff 
--git a/sdks/java/io/google-cloud-platform/build.gradle b/sdks/java/io/google-cloud-platform/build.gradle index dba987f7cee9..9ec077d36153 100644 --- a/sdks/java/io/google-cloud-platform/build.gradle +++ b/sdks/java/io/google-cloud-platform/build.gradle @@ -164,6 +164,7 @@ dependencies { testImplementation project(path: ":sdks:java:extensions:google-cloud-platform-core", configuration: "testRuntimeMigration") testImplementation project(path: ":sdks:java:extensions:protobuf", configuration: "testRuntimeMigration") testImplementation project(path: ":runners:direct-java", configuration: "shadow") + testImplementation project(":runners:google-cloud-dataflow-java") testImplementation project(path: ":sdks:java:io:common", configuration: "testRuntimeMigration") testImplementation project(path: ":sdks:java:testing:test-utils", configuration: "testRuntimeMigration") testImplementation project(":sdks:java:managed") diff --git a/sdks/java/io/google-cloud-platform/expansion-service/build.gradle b/sdks/java/io/google-cloud-platform/expansion-service/build.gradle index 1288d91964e1..f6c6f07d0cdf 100644 --- a/sdks/java/io/google-cloud-platform/expansion-service/build.gradle +++ b/sdks/java/io/google-cloud-platform/expansion-service/build.gradle @@ -36,6 +36,9 @@ dependencies { permitUnusedDeclared project(":sdks:java:io:google-cloud-platform") // BEAM-11761 implementation project(":sdks:java:extensions:schemaio-expansion-service") permitUnusedDeclared project(":sdks:java:extensions:schemaio-expansion-service") // BEAM-11761 + implementation project(":sdks:java:managed") + permitUnusedDeclared project(":sdks:java:managed") // BEAM-11761 + runtimeOnly library.java.slf4j_jdk14 } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformConfiguration.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformConfiguration.java deleted file mode 100644 index f634b5ec6f60..000000000000 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformConfiguration.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.sdk.io.gcp.bigquery; - -import com.google.auto.value.AutoValue; -import org.apache.beam.sdk.schemas.AutoValueSchema; -import org.apache.beam.sdk.schemas.annotations.DefaultSchema; - -/** - * Configuration for writing to BigQuery. - * - *
<p>
This class is meant to be used with {@link BigQueryFileLoadsWriteSchemaTransformProvider}. - * - * <p>
Internal only: This class is actively being worked on, and it will likely change. We - * provide no backwards compatibility guarantees, and it should not be implemented outside the Beam - * repository. - */ -@DefaultSchema(AutoValueSchema.class) -@AutoValue -public abstract class BigQueryFileLoadsWriteSchemaTransformConfiguration { - - /** Instantiates a {@link BigQueryFileLoadsWriteSchemaTransformConfiguration.Builder}. */ - public static Builder builder() { - return new AutoValue_BigQueryFileLoadsWriteSchemaTransformConfiguration.Builder(); - } - - /** - * Writes to the given table specification. See {@link BigQueryIO.Write#to(String)}} for the - * expected format. - */ - public abstract String getTableSpec(); - - /** Specifies whether the table should be created if it does not exist. */ - public abstract String getCreateDisposition(); - - /** Specifies what to do with existing data in the table, in case the table already exists. */ - public abstract String getWriteDisposition(); - - @AutoValue.Builder - public abstract static class Builder { - - /** - * Writes to the given table specification. See {@link BigQueryIO.Write#to(String)}} for the - * expected format. - */ - public abstract Builder setTableSpec(String value); - - /** Specifies whether the table should be created if it does not exist. */ - public abstract Builder setCreateDisposition(String value); - - /** Specifies what to do with existing data in the table, in case the table already exists. */ - public abstract Builder setWriteDisposition(String value); - - /** Builds the {@link BigQueryFileLoadsWriteSchemaTransformConfiguration} configuration. */ - public abstract BigQueryFileLoadsWriteSchemaTransformConfiguration build(); - } -} diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProvider.java index 3212e2a30348..4fc97e297cee 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProvider.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProvider.java @@ -17,34 +17,28 @@ */ package org.apache.beam.sdk.io.gcp.bigquery; -import com.google.api.services.bigquery.model.Table; -import com.google.api.services.bigquery.model.TableReference; +import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider.BigQueryStorageWriteApiSchemaTransformConfiguration; + import com.google.api.services.bigquery.model.TableRow; -import com.google.api.services.bigquery.model.TableSchema; import com.google.auto.service.AutoService; -import java.io.IOException; import java.util.Collections; import java.util.List; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; -import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService; -import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.schemas.Schema; -import org.apache.beam.sdk.schemas.io.InvalidConfigurationException; import org.apache.beam.sdk.schemas.transforms.SchemaTransform; import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; import 
org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; -import org.apache.beam.sdk.transforms.MapElements; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionRowTuple; import org.apache.beam.sdk.values.Row; -import org.apache.beam.sdk.values.TypeDescriptor; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; /** * An implementation of {@link TypedSchemaTransformProvider} for BigQuery write jobs configured - * using {@link BigQueryFileLoadsWriteSchemaTransformConfiguration}. + * using {@link BigQueryStorageWriteApiSchemaTransformConfiguration}. * *
<p>
Internal only: This class is actively being worked on, and it will likely change. We * provide no backwards compatibility guarantees, and it should not be implemented outside the Beam @@ -56,201 +50,87 @@ @Internal @AutoService(SchemaTransformProvider.class) public class BigQueryFileLoadsWriteSchemaTransformProvider - extends TypedSchemaTransformProvider { + extends TypedSchemaTransformProvider { private static final String IDENTIFIER = - "beam:schematransform:org.apache.beam:bigquery_fileloads_write:v1"; - static final String INPUT_TAG = "INPUT"; - - /** Returns the expected class of the configuration. */ - @Override - protected Class configurationClass() { - return BigQueryFileLoadsWriteSchemaTransformConfiguration.class; - } + "beam:schematransform:org.apache.beam:bigquery_fileloads:v1"; + static final String INPUT_TAG = "input"; - /** Returns the expected {@link SchemaTransform} of the configuration. */ @Override - protected SchemaTransform from(BigQueryFileLoadsWriteSchemaTransformConfiguration configuration) { + protected SchemaTransform from( + BigQueryStorageWriteApiSchemaTransformConfiguration configuration) { return new BigQueryWriteSchemaTransform(configuration); } - /** Implementation of the {@link TypedSchemaTransformProvider} identifier method. */ @Override public String identifier() { return IDENTIFIER; } - /** - * Implementation of the {@link TypedSchemaTransformProvider} inputCollectionNames method. Since a - * single is expected, this returns a list with a single name. - */ @Override public List inputCollectionNames() { return Collections.singletonList(INPUT_TAG); } - /** - * Implementation of the {@link TypedSchemaTransformProvider} outputCollectionNames method. Since - * no output is expected, this returns an empty list. - */ @Override public List outputCollectionNames() { return Collections.emptyList(); } - /** - * A {@link SchemaTransform} that performs {@link BigQueryIO.Write}s based on a {@link - * BigQueryFileLoadsWriteSchemaTransformConfiguration}. - */ protected static class BigQueryWriteSchemaTransform extends SchemaTransform { /** An instance of {@link BigQueryServices} used for testing. 
*/ private BigQueryServices testBigQueryServices = null; - private final BigQueryFileLoadsWriteSchemaTransformConfiguration configuration; + private final BigQueryStorageWriteApiSchemaTransformConfiguration configuration; - BigQueryWriteSchemaTransform(BigQueryFileLoadsWriteSchemaTransformConfiguration configuration) { + BigQueryWriteSchemaTransform( + BigQueryStorageWriteApiSchemaTransformConfiguration configuration) { + configuration.validate(); this.configuration = configuration; } - @Override - public void validate(PipelineOptions options) { - if (!configuration.getCreateDisposition().equals(CreateDisposition.CREATE_NEVER.name())) { - return; - } - - BigQueryOptions bigQueryOptions = options.as(BigQueryOptions.class); - - BigQueryServices bigQueryServices = new BigQueryServicesImpl(); - if (testBigQueryServices != null) { - bigQueryServices = testBigQueryServices; - } - - DatasetService datasetService = bigQueryServices.getDatasetService(bigQueryOptions); - TableReference tableReference = BigQueryUtils.toTableReference(configuration.getTableSpec()); - - try { - Table table = datasetService.getTable(tableReference); - if (table == null) { - throw new NullPointerException(); - } - - if (table.getSchema() == null) { - throw new InvalidConfigurationException( - String.format("could not fetch schema for table: %s", configuration.getTableSpec())); - } - - } catch (NullPointerException | InterruptedException | IOException ex) { - throw new InvalidConfigurationException( - String.format( - "could not fetch table %s, error: %s", - configuration.getTableSpec(), ex.getMessage())); - } - } - @Override public PCollectionRowTuple expand(PCollectionRowTuple input) { - validate(input); - PCollection rowPCollection = input.get(INPUT_TAG); - Schema schema = rowPCollection.getSchema(); - BigQueryIO.Write write = toWrite(schema); - if (testBigQueryServices != null) { - write = write.withTestServices(testBigQueryServices); - } + PCollection rowPCollection = input.getSinglePCollection(); + BigQueryIO.Write write = toWrite(); + rowPCollection.apply(write); - PCollection tableRowPCollection = - rowPCollection.apply( - MapElements.into(TypeDescriptor.of(TableRow.class)).via(BigQueryUtils::toTableRow)); - tableRowPCollection.apply(write); return PCollectionRowTuple.empty(input.getPipeline()); } /** Instantiates a {@link BigQueryIO.Write} from a {@link Schema}. */ - BigQueryIO.Write toWrite(Schema schema) { - TableSchema tableSchema = BigQueryUtils.toTableSchema(schema); - CreateDisposition createDisposition = - CreateDisposition.valueOf(configuration.getCreateDisposition()); - WriteDisposition writeDisposition = - WriteDisposition.valueOf(configuration.getWriteDisposition()); - - return BigQueryIO.writeTableRows() - .to(configuration.getTableSpec()) - .withCreateDisposition(createDisposition) - .withWriteDisposition(writeDisposition) - .withSchema(tableSchema); - } - - /** Setter for testing using {@link BigQueryServices}. */ - @VisibleForTesting - void setTestBigQueryServices(BigQueryServices testBigQueryServices) { - this.testBigQueryServices = testBigQueryServices; - } - - /** Validate a {@link PCollectionRowTuple} input. 
*/ - void validate(PCollectionRowTuple input) { - if (!input.has(INPUT_TAG)) { - throw new IllegalArgumentException( - String.format( - "%s %s is missing expected tag: %s", - getClass().getSimpleName(), input.getClass().getSimpleName(), INPUT_TAG)); + BigQueryIO.Write toWrite() { + BigQueryIO.Write write = + BigQueryIO.write() + .to(configuration.getTable()) + .withMethod(BigQueryIO.Write.Method.FILE_LOADS) + .withFormatFunction(BigQueryUtils.toTableRow()) + .useBeamSchema(); + + if (!Strings.isNullOrEmpty(configuration.getCreateDisposition())) { + CreateDisposition createDisposition = + CreateDisposition.valueOf(configuration.getCreateDisposition().toUpperCase()); + write = write.withCreateDisposition(createDisposition); } - - PCollection rowInput = input.get(INPUT_TAG); - Schema sourceSchema = rowInput.getSchema(); - - if (sourceSchema == null) { - throw new IllegalArgumentException( - String.format("%s is null for input of tag: %s", Schema.class, INPUT_TAG)); + if (!Strings.isNullOrEmpty(configuration.getWriteDisposition())) { + WriteDisposition writeDisposition = + WriteDisposition.valueOf(configuration.getWriteDisposition().toUpperCase()); + write = write.withWriteDisposition(writeDisposition); } - - if (!configuration.getCreateDisposition().equals(CreateDisposition.CREATE_NEVER.name())) { - return; + if (!Strings.isNullOrEmpty(configuration.getKmsKey())) { + write = write.withKmsKey(configuration.getKmsKey()); } - - BigQueryOptions bigQueryOptions = input.getPipeline().getOptions().as(BigQueryOptions.class); - - BigQueryServices bigQueryServices = new BigQueryServicesImpl(); if (testBigQueryServices != null) { - bigQueryServices = testBigQueryServices; + write = write.withTestServices(testBigQueryServices); } - DatasetService datasetService = bigQueryServices.getDatasetService(bigQueryOptions); - TableReference tableReference = BigQueryUtils.toTableReference(configuration.getTableSpec()); - - try { - Table table = datasetService.getTable(tableReference); - if (table == null) { - throw new NullPointerException(); - } - - TableSchema tableSchema = table.getSchema(); - if (tableSchema == null) { - throw new NullPointerException(); - } - - Schema destinationSchema = BigQueryUtils.fromTableSchema(tableSchema); - if (destinationSchema == null) { - throw new NullPointerException(); - } - - validateMatching(sourceSchema, destinationSchema); - - } catch (NullPointerException | InterruptedException | IOException e) { - throw new InvalidConfigurationException( - String.format( - "could not validate input for create disposition: %s and table: %s, error: %s", - configuration.getCreateDisposition(), - configuration.getTableSpec(), - e.getMessage())); - } + return write; } - void validateMatching(Schema sourceSchema, Schema destinationSchema) { - if (!sourceSchema.equals(destinationSchema)) { - throw new IllegalArgumentException( - String.format( - "source and destination schema mismatch for table: %s", - configuration.getTableSpec())); - } + /** Setter for testing using {@link BigQueryServices}. 
*/ + @VisibleForTesting + void setTestBigQueryServices(BigQueryServices testBigQueryServices) { + this.testBigQueryServices = testBigQueryServices; } } } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java index 5fb9e9550454..c90564331ec1 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java @@ -26,7 +26,7 @@ import java.util.Arrays; import java.util.Collections; import java.util.List; -import java.util.Map; +import java.util.stream.Collectors; import javax.annotation.Nullable; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; @@ -63,8 +63,8 @@ import org.apache.beam.sdk.values.TypeDescriptors; import org.apache.beam.sdk.values.ValueInSingleWindow; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.joda.time.Duration; /** @@ -125,20 +125,6 @@ public List outputCollectionNames() { @DefaultSchema(AutoValueSchema.class) @AutoValue public abstract static class BigQueryStorageWriteApiSchemaTransformConfiguration { - - static final Map CREATE_DISPOSITIONS = - ImmutableMap.builder() - .put(CreateDisposition.CREATE_IF_NEEDED.name(), CreateDisposition.CREATE_IF_NEEDED) - .put(CreateDisposition.CREATE_NEVER.name(), CreateDisposition.CREATE_NEVER) - .build(); - - static final Map WRITE_DISPOSITIONS = - ImmutableMap.builder() - .put(WriteDisposition.WRITE_TRUNCATE.name(), WriteDisposition.WRITE_TRUNCATE) - .put(WriteDisposition.WRITE_EMPTY.name(), WriteDisposition.WRITE_EMPTY) - .put(WriteDisposition.WRITE_APPEND.name(), WriteDisposition.WRITE_APPEND) - .build(); - @AutoValue public abstract static class ErrorHandling { @SchemaFieldDescription("The name of the output PCollection containing failed writes.") @@ -171,21 +157,23 @@ public void validate() { } // validate create and write dispositions - if (!Strings.isNullOrEmpty(this.getCreateDisposition())) { - checkNotNull( - CREATE_DISPOSITIONS.get(this.getCreateDisposition().toUpperCase()), - invalidConfigMessage - + "Invalid create disposition (%s) was specified. Available dispositions are: %s", - this.getCreateDisposition(), - CREATE_DISPOSITIONS.keySet()); + if (!Strings.isNullOrEmpty(getCreateDisposition())) { + List createDispostions = + Arrays.stream(CreateDisposition.values()).map(Enum::name).collect(Collectors.toList()); + Preconditions.checkArgument( + createDispostions.contains(getCreateDisposition()), + "Invalid create disposition (%s) was specified. Available dispositions are: %s", + getCreateDisposition(), + createDispostions); } - if (!Strings.isNullOrEmpty(this.getWriteDisposition())) { - checkNotNull( - WRITE_DISPOSITIONS.get(this.getWriteDisposition().toUpperCase()), - invalidConfigMessage - + "Invalid write disposition (%s) was specified. 
Available dispositions are: %s", - this.getWriteDisposition(), - WRITE_DISPOSITIONS.keySet()); + if (!Strings.isNullOrEmpty(getWriteDisposition())) { + List writeDispostions = + Arrays.stream(WriteDisposition.values()).map(Enum::name).collect(Collectors.toList()); + Preconditions.checkArgument( + writeDispostions.contains(getWriteDisposition()), + "Invalid write disposition (%s) was specified. Available dispositions are: %s", + getWriteDisposition(), + writeDispostions); } if (this.getErrorHandling() != null) { @@ -376,8 +364,7 @@ public TableSchema getSchema(String destination) { @Override public PCollectionRowTuple expand(PCollectionRowTuple input) { // Check that the input exists - checkArgument(input.has(INPUT_ROWS_TAG), "Missing expected input tag: %s", INPUT_ROWS_TAG); - PCollection inputRows = input.get(INPUT_ROWS_TAG); + PCollection inputRows = input.getSinglePCollection(); BigQueryIO.Write write = createStorageWriteApiTransform(inputRows.getSchema()); @@ -503,20 +490,17 @@ BigQueryIO.Write createStorageWriteApiTransform(Schema schema) { if (!Strings.isNullOrEmpty(configuration.getCreateDisposition())) { CreateDisposition createDisposition = - BigQueryStorageWriteApiSchemaTransformConfiguration.CREATE_DISPOSITIONS.get( - configuration.getCreateDisposition().toUpperCase()); + CreateDisposition.valueOf(configuration.getCreateDisposition().toUpperCase()); write = write.withCreateDisposition(createDisposition); } if (!Strings.isNullOrEmpty(configuration.getWriteDisposition())) { WriteDisposition writeDisposition = - BigQueryStorageWriteApiSchemaTransformConfiguration.WRITE_DISPOSITIONS.get( - configuration.getWriteDisposition().toUpperCase()); + WriteDisposition.valueOf(configuration.getWriteDisposition().toUpperCase()); write = write.withWriteDisposition(writeDisposition); } if (!Strings.isNullOrEmpty(configuration.getKmsKey())) { write = write.withKmsKey(configuration.getKmsKey()); } - if (this.testBigQueryServices != null) { write = write.withTestServices(testBigQueryServices); } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProviderTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProviderTest.java index dd8bb9fc8664..4264533410ce 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProviderTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProviderTest.java @@ -17,20 +17,14 @@ */ package org.apache.beam.sdk.io.gcp.bigquery; -import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryFileLoadsWriteSchemaTransformProvider.INPUT_TAG; +import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider.BigQueryStorageWriteApiSchemaTransformConfiguration; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertThrows; import com.google.api.services.bigquery.model.TableReference; -import com.google.api.services.bigquery.model.TableRow; -import com.google.api.services.bigquery.model.TableSchema; import java.io.IOException; import java.util.Arrays; -import java.util.HashSet; import java.util.List; -import java.util.Map; -import java.util.Set; import 
org.apache.beam.sdk.io.gcp.bigquery.BigQueryFileLoadsWriteSchemaTransformProvider.BigQueryWriteSchemaTransform; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; @@ -40,15 +34,10 @@ import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.Schema.Field; import org.apache.beam.sdk.schemas.Schema.FieldType; -import org.apache.beam.sdk.schemas.io.InvalidConfigurationException; import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.transforms.Create; -import org.apache.beam.sdk.transforms.display.DisplayData; -import org.apache.beam.sdk.transforms.display.DisplayData.Identifier; -import org.apache.beam.sdk.transforms.display.DisplayData.Item; import org.apache.beam.sdk.values.PCollectionRowTuple; import org.apache.beam.sdk.values.Row; -import org.apache.commons.lang3.tuple.Pair; import org.junit.After; import org.junit.Before; import org.junit.Rule; @@ -71,8 +60,6 @@ public class BigQueryFileLoadsWriteSchemaTransformProviderTest { private static final Schema SCHEMA = Schema.of(Field.of("name", FieldType.STRING), Field.of("number", FieldType.INT64)); - private static final TableSchema TABLE_SCHEMA = BigQueryUtils.toTableSchema(SCHEMA); - private static final List ROWS = Arrays.asList( Row.withSchema(SCHEMA).withFieldValue("name", "a").withFieldValue("number", 1L).build(), @@ -109,9 +96,9 @@ public void tearDown() { public void testLoad() throws IOException, InterruptedException { BigQueryFileLoadsWriteSchemaTransformProvider provider = new BigQueryFileLoadsWriteSchemaTransformProvider(); - BigQueryFileLoadsWriteSchemaTransformConfiguration configuration = - BigQueryFileLoadsWriteSchemaTransformConfiguration.builder() - .setTableSpec(BigQueryHelpers.toTableSpec(TABLE_REFERENCE)) + BigQueryStorageWriteApiSchemaTransformConfiguration configuration = + BigQueryStorageWriteApiSchemaTransformConfiguration.builder() + .setTable(BigQueryHelpers.toTableSpec(TABLE_REFERENCE)) .setWriteDisposition(WriteDisposition.WRITE_TRUNCATE.name()) .setCreateDisposition(CreateDisposition.CREATE_IF_NEEDED.name()) .build(); @@ -128,138 +115,4 @@ public void testLoad() throws IOException, InterruptedException { assertNotNull(fakeDatasetService.getTable(TABLE_REFERENCE)); assertEquals(ROWS.size(), fakeDatasetService.getAllRows(PROJECT, DATASET, TABLE_ID).size()); } - - @Test - public void testValidatePipelineOptions() { - List< - Pair< - BigQueryFileLoadsWriteSchemaTransformConfiguration.Builder, - Class>> - cases = - Arrays.asList( - Pair.of( - BigQueryFileLoadsWriteSchemaTransformConfiguration.builder() - .setTableSpec("project.doesnot.exist") - .setCreateDisposition(CreateDisposition.CREATE_NEVER.name()) - .setWriteDisposition(WriteDisposition.WRITE_APPEND.name()), - InvalidConfigurationException.class), - Pair.of( - BigQueryFileLoadsWriteSchemaTransformConfiguration.builder() - .setTableSpec(String.format("%s.%s.%s", PROJECT, DATASET, "doesnotexist")) - .setCreateDisposition(CreateDisposition.CREATE_NEVER.name()) - .setWriteDisposition(WriteDisposition.WRITE_EMPTY.name()), - InvalidConfigurationException.class), - Pair.of( - BigQueryFileLoadsWriteSchemaTransformConfiguration.builder() - .setTableSpec("project.doesnot.exist") - .setCreateDisposition(CreateDisposition.CREATE_IF_NEEDED.name()) - .setWriteDisposition(WriteDisposition.WRITE_APPEND.name()), - null)); - for (Pair< - BigQueryFileLoadsWriteSchemaTransformConfiguration.Builder, Class> - caze : cases) { - 
BigQueryWriteSchemaTransform transform = transformFrom(caze.getLeft().build()); - if (caze.getRight() != null) { - assertThrows(caze.getRight(), () -> transform.validate(p.getOptions())); - } else { - transform.validate(p.getOptions()); - } - } - } - - @Test - public void testToWrite() { - List< - Pair< - BigQueryFileLoadsWriteSchemaTransformConfiguration.Builder, - BigQueryIO.Write>> - cases = - Arrays.asList( - Pair.of( - BigQueryFileLoadsWriteSchemaTransformConfiguration.builder() - .setTableSpec(BigQueryHelpers.toTableSpec(TABLE_REFERENCE)) - .setCreateDisposition(CreateDisposition.CREATE_NEVER.name()) - .setWriteDisposition(WriteDisposition.WRITE_EMPTY.name()), - BigQueryIO.writeTableRows() - .to(TABLE_REFERENCE) - .withCreateDisposition(CreateDisposition.CREATE_NEVER) - .withWriteDisposition(WriteDisposition.WRITE_EMPTY) - .withSchema(TABLE_SCHEMA)), - Pair.of( - BigQueryFileLoadsWriteSchemaTransformConfiguration.builder() - .setTableSpec(BigQueryHelpers.toTableSpec(TABLE_REFERENCE)) - .setCreateDisposition(CreateDisposition.CREATE_IF_NEEDED.name()) - .setWriteDisposition(WriteDisposition.WRITE_TRUNCATE.name()), - BigQueryIO.writeTableRows() - .to(TABLE_REFERENCE) - .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) - .withWriteDisposition(WriteDisposition.WRITE_TRUNCATE) - .withSchema(TABLE_SCHEMA))); - for (Pair< - BigQueryFileLoadsWriteSchemaTransformConfiguration.Builder, BigQueryIO.Write> - caze : cases) { - BigQueryWriteSchemaTransform transform = transformFrom(caze.getLeft().build()); - Map gotDisplayData = DisplayData.from(transform.toWrite(SCHEMA)).asMap(); - Map wantDisplayData = DisplayData.from(caze.getRight()).asMap(); - Set keys = new HashSet<>(); - keys.addAll(gotDisplayData.keySet()); - keys.addAll(wantDisplayData.keySet()); - for (Identifier key : keys) { - Item got = null; - Item want = null; - if (gotDisplayData.containsKey(key)) { - got = gotDisplayData.get(key); - } - if (wantDisplayData.containsKey(key)) { - want = wantDisplayData.get(key); - } - assertEquals(want, got); - } - } - } - - @Test - public void validatePCollectionRowTupleInput() { - PCollectionRowTuple empty = PCollectionRowTuple.empty(p); - PCollectionRowTuple valid = - PCollectionRowTuple.of( - INPUT_TAG, p.apply("CreateRowsWithValidSchema", Create.of(ROWS)).setRowSchema(SCHEMA)); - - PCollectionRowTuple invalid = - PCollectionRowTuple.of( - INPUT_TAG, - p.apply( - "CreateRowsWithInvalidSchema", - Create.of( - Row.nullRow( - Schema.builder().addNullableField("name", FieldType.STRING).build())))); - - BigQueryWriteSchemaTransform transform = - transformFrom( - BigQueryFileLoadsWriteSchemaTransformConfiguration.builder() - .setTableSpec(BigQueryHelpers.toTableSpec(TABLE_REFERENCE)) - .setCreateDisposition(CreateDisposition.CREATE_IF_NEEDED.name()) - .setWriteDisposition(WriteDisposition.WRITE_APPEND.name()) - .build()); - - assertThrows(IllegalArgumentException.class, () -> transform.validate(empty)); - - assertThrows(IllegalStateException.class, () -> transform.validate(invalid)); - - transform.validate(valid); - - p.run(); - } - - private BigQueryWriteSchemaTransform transformFrom( - BigQueryFileLoadsWriteSchemaTransformConfiguration configuration) { - BigQueryFileLoadsWriteSchemaTransformProvider provider = - new BigQueryFileLoadsWriteSchemaTransformProvider(); - BigQueryWriteSchemaTransform transform = - (BigQueryWriteSchemaTransform) provider.from(configuration); - - transform.setTestBigQueryServices(fakeBigQueryServices); - - return transform; - } } diff --git 
a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java index eaf8c234043e..ad3af3ffcb85 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java @@ -18,29 +18,42 @@ package org.apache.beam.sdk.io.gcp.bigquery; import java.io.IOException; +import java.util.Collections; import java.util.List; import java.util.Map; import java.util.stream.Collectors; import java.util.stream.LongStream; +import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryDirectReadSchemaTransformProvider; import org.apache.beam.sdk.io.gcp.testing.BigqueryClient; import org.apache.beam.sdk.managed.Managed; +import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.testing.PAssert; import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.testing.TestPipelineOptions; import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.transforms.PeriodicImpulse; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionRowTuple; import org.apache.beam.sdk.values.Row; +import org.apache.beam.sdk.values.TypeDescriptors; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; +import org.joda.time.Duration; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; +/** + * This class tests the execution of {@link Managed} BigQueryIO. Tests validating that the correct + * write transform is requested can be found in + * ManagedSchemaTransformProviderTest.testResolveBigQueryWrite. 
+ */ @RunWith(JUnit4.class) public class BigQueryManagedIT { private static final Schema SCHEMA = @@ -76,18 +89,42 @@ public static void cleanup() { } @Test - public void testSimpleStorageWriteRead() { - String table = String.format("%s:%s.managed_read_write", PROJECT, BIG_QUERY_DATASET_ID); + public void testBatchFileLoadsWriteRead() { + String table = String.format("%s:%s.managed_file_loads_read", PROJECT, BIG_QUERY_DATASET_ID); + + Map writeConfig = + ImmutableMap.builder().put("table", table).build(); + + // file loads requires a GCS temp location + TestPipelineOptions options = + TestPipeline.testingPipelineOptions().as(TestPipelineOptions.class); + options.setTempLocation(options.getTempRoot()); + + Pipeline p = Pipeline.create(options); + PCollectionRowTuple.of("input", getInput(p, false)) + .apply(Managed.write(Managed.BIGQUERY).withConfig(writeConfig)); + p.run().waitUntilFinish(); + + Map readConfig = + ImmutableMap.builder().put("table", table).build(); + Pipeline q = Pipeline.create(); + PCollection outputRows = + PCollectionRowTuple.empty(p) + .apply(Managed.read(Managed.BIGQUERY).withConfig(readConfig)) + .get(BigQueryDirectReadSchemaTransformProvider.OUTPUT_TAG); + PAssert.that(outputRows).containsInAnyOrder(ROWS); + q.run().waitUntilFinish(); + } + + @Test + public void testStreamingStorageWriteRead() { + String table = String.format("%s:%s.managed_storage_write_read", PROJECT, BIG_QUERY_DATASET_ID); Map writeConfig = - ImmutableMap.builder() - .put("table", table) - .put("create_disposition", "create_if_needed") - .put("at_least_once", false) - .build(); + ImmutableMap.builder().put("table", table).build(); Pipeline p = Pipeline.create(); - PCollectionRowTuple.of("input", p.apply(Create.of(ROWS)).setRowSchema(SCHEMA)) - .apply(Managed.write(Managed.BIGQUERY_STORAGE).withConfig(writeConfig)); + PCollectionRowTuple.of("input", getInput(p, true)) + .apply(Managed.write(Managed.BIGQUERY).withConfig(writeConfig)); p.run().waitUntilFinish(); Map readConfig = @@ -95,9 +132,57 @@ public void testSimpleStorageWriteRead() { Pipeline q = Pipeline.create(); PCollection outputRows = PCollectionRowTuple.empty(p) - .apply(Managed.read(Managed.BIGQUERY_STORAGE).withConfig(readConfig)) + .apply(Managed.read(Managed.BIGQUERY).withConfig(readConfig)) .get(BigQueryDirectReadSchemaTransformProvider.OUTPUT_TAG); PAssert.that(outputRows).containsInAnyOrder(ROWS); q.run().waitUntilFinish(); } + + @Test + public void testStreamingStorageWriteAtLeastOnceRead() { + String table = + String.format( + "%s:%s.managed_storage_write_at_least_once_read", PROJECT, BIG_QUERY_DATASET_ID); + + Map writeConfig = + ImmutableMap.builder().put("table", table).build(); + + DataflowPipelineOptions options = + PipelineOptionsFactory.create().as(DataflowPipelineOptions.class); + options.setDataflowServiceOptions(Collections.singletonList("streaming_mode_at_least_once")); + Pipeline p = Pipeline.create(options); + + PCollectionRowTuple.of("input", getInput(p, true)) + .apply(Managed.write(Managed.BIGQUERY).withConfig(writeConfig)); + p.run().waitUntilFinish(); + + Map readConfig = + ImmutableMap.builder().put("table", table).build(); + Pipeline q = Pipeline.create(); + PCollection outputRows = + PCollectionRowTuple.empty(p) + .apply(Managed.read(Managed.BIGQUERY).withConfig(readConfig)) + .get(BigQueryDirectReadSchemaTransformProvider.OUTPUT_TAG); + PAssert.that(outputRows).containsInAnyOrder(ROWS); + q.run().waitUntilFinish(); + } + + public PCollection getInput(Pipeline p, boolean isStreaming) { + if (isStreaming) { 
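+ // PeriodicImpulse emits one element per configured interval until stopAfter elapses;
+ // each emitted timestamp is mapped to a Row below, simulating an unbounded (streaming) input.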
+ return p.apply( + PeriodicImpulse.create() + .stopAfter(Duration.millis(20)) + .withInterval(Duration.millis(1))) + .apply( + MapElements.into(TypeDescriptors.rows()) + .via( + i -> + Row.withSchema(SCHEMA) + .withFieldValue("str", Long.toString(i.getMillis())) + .withFieldValue("number", i.getMillis()) + .build())) + .setRowSchema(SCHEMA); + } + return p.apply(Create.of(ROWS)).setRowSchema(SCHEMA); + } } diff --git a/sdks/java/managed/build.gradle b/sdks/java/managed/build.gradle index add0d7f3cc0d..e4d98d91a15d 100644 --- a/sdks/java/managed/build.gradle +++ b/sdks/java/managed/build.gradle @@ -28,6 +28,9 @@ ext.summary = """Library that provides managed IOs.""" dependencies { implementation project(path: ":sdks:java:core", configuration: "shadow") + implementation project(":runners:google-cloud-dataflow-java") + implementation project(":sdks:java:io:google-cloud-platform") + implementation project(":sdks:java:extensions:google-cloud-platform-core") implementation library.java.vendored_guava_32_1_2_jre implementation library.java.slf4j_api diff --git a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/Managed.java b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/Managed.java index 0d0eeec9c4a7..03a724a4c7c5 100644 --- a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/Managed.java +++ b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/Managed.java @@ -82,20 +82,20 @@ public class Managed { // TODO: Dynamically generate a list of supported transforms public static final String ICEBERG = "iceberg"; public static final String KAFKA = "kafka"; - public static final String BIGQUERY_STORAGE = "bigquery_storage"; + public static final String BIGQUERY = "bigquery"; // Supported SchemaTransforms public static final Map READ_TRANSFORMS = ImmutableMap.builder() .put(ICEBERG, ManagedTransformConstants.ICEBERG_READ) .put(KAFKA, ManagedTransformConstants.KAFKA_READ) - .put(BIGQUERY_STORAGE, ManagedTransformConstants.BIGQUERY_STORAGE_READ) + .put(BIGQUERY, ManagedTransformConstants.BIGQUERY_READ) .build(); public static final Map WRITE_TRANSFORMS = ImmutableMap.builder() .put(ICEBERG, ManagedTransformConstants.ICEBERG_WRITE) .put(KAFKA, ManagedTransformConstants.KAFKA_WRITE) - .put(BIGQUERY_STORAGE, ManagedTransformConstants.BIGQUERY_STORAGE_WRITE) + .put(BIGQUERY, ManagedTransformConstants.BIGQUERY_STORAGE_WRITE) .build(); /** @@ -127,6 +127,9 @@ public static ManagedTransform read(String source) { * */ public static ManagedTransform write(String sink) { + List supportedIdentifiers = new ArrayList<>(WRITE_TRANSFORMS.values()); + supportedIdentifiers.add(ManagedTransformConstants.BIGQUERY_FILE_LOADS); + return new AutoValue_Managed_ManagedTransform.Builder() .setIdentifier( Preconditions.checkNotNull( @@ -134,7 +137,7 @@ public static ManagedTransform write(String sink) { "An unsupported sink was specified: '%s'. 
Please specify one of the following sinks: %s", sink, WRITE_TRANSFORMS.keySet())) - .setSupportedIdentifiers(new ArrayList<>(WRITE_TRANSFORMS.values())) + .setSupportedIdentifiers(supportedIdentifiers) .build(); } diff --git a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProvider.java b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProvider.java index 6f97983d3260..832c1492e27f 100644 --- a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProvider.java +++ b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProvider.java @@ -32,8 +32,10 @@ import java.util.Map; import java.util.ServiceLoader; import javax.annotation.Nullable; +import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions; import org.apache.beam.sdk.io.FileSystems; import org.apache.beam.sdk.io.fs.MatchResult; +import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.schemas.AutoValueSchema; import org.apache.beam.sdk.schemas.NoSuchSchemaException; import org.apache.beam.sdk.schemas.Schema; @@ -44,6 +46,7 @@ import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; import org.apache.beam.sdk.schemas.utils.YamlUtils; +import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionRowTuple; import org.apache.beam.sdk.values.Row; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; @@ -117,7 +120,8 @@ protected void validate() { "Please specify a config or a config URL, but not both."); } - public @Nullable String resolveUnderlyingConfig() { + @VisibleForTesting + Map resolveUnderlyingConfig(PipelineOptions options) { String yamlTransformConfig = getConfig(); // If YAML string is empty, then attempt to read from YAML file if (Strings.isNullOrEmpty(yamlTransformConfig)) { @@ -131,55 +135,82 @@ protected void validate() { throw new RuntimeException(e); } } - return yamlTransformConfig; + + Map config = YamlUtils.yamlStringToMap(yamlTransformConfig); + return maybeModify(config, options); + } + + private Map maybeModify(Map config, PipelineOptions options) { + DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class); + if (getTransformIdentifier().equals(ManagedTransformConstants.BIGQUERY_STORAGE_WRITE) + && dataflowOptions.getDataflowServiceOptions() != null + && dataflowOptions.getDataflowServiceOptions().contains("streaming_mode_at_least_once")) { + config.put("at_least_once", true); + } + return config; + } + + @VisibleForTesting + String resolveUnderlyingTransform(PCollectionRowTuple input) { + String identifier = getTransformIdentifier(); + if (identifier.equals(ManagedTransformConstants.BIGQUERY_STORAGE_WRITE)) { + if (input.getSinglePCollection().isBounded().equals(PCollection.IsBounded.BOUNDED)) { + return ManagedTransformConstants.BIGQUERY_FILE_LOADS; + } + } + + return identifier; } } @Override protected SchemaTransform from(ManagedConfig managedConfig) { managedConfig.validate(); - SchemaTransformProvider schemaTransformProvider = - Preconditions.checkNotNull( - getAllProviders().get(managedConfig.getTransformIdentifier()), - "Could not find a transform with the identifier " - + "%s. 
This could be either due to the dependency with the " - + "transform not being available in the classpath or due to " - + "the specified transform not being supported.", - managedConfig.getTransformIdentifier()); - - return new ManagedSchemaTransform(managedConfig, schemaTransformProvider); + return new ManagedSchemaTransform(managedConfig, getAllProviders()); } static class ManagedSchemaTransform extends SchemaTransform { private final ManagedConfig managedConfig; - private final Row underlyingTransformConfig; - private final SchemaTransformProvider underlyingTransformProvider; + private final Map transformProviders; ManagedSchemaTransform( - ManagedConfig managedConfig, SchemaTransformProvider underlyingTransformProvider) { - // parse config before expansion to check if it matches underlying transform's config schema + ManagedConfig managedConfig, Map transformProviders) { + this.transformProviders = transformProviders; + this.managedConfig = managedConfig; + } + + @Override + public PCollectionRowTuple expand(PCollectionRowTuple input) { + String identifier = managedConfig.resolveUnderlyingTransform(input); + Map underlyingConfig = + managedConfig.resolveUnderlyingConfig(input.getPipeline().getOptions()); + + System.out.println("providers: " + transformProviders); + + SchemaTransformProvider underlyingTransformProvider = + Preconditions.checkNotNull( + transformProviders.get(identifier), + "Could not find a transform with the identifier " + + "%s. This could be either due to the dependency with the " + + "transform not being available in the classpath or due to " + + "the specified transform not being supported.", + identifier); Schema transformConfigSchema = underlyingTransformProvider.configurationSchema(); - Row underlyingTransformConfig; + + Row underlyingRowConfig; try { - underlyingTransformConfig = getRowConfig(managedConfig, transformConfigSchema); + underlyingRowConfig = getRowConfig(identifier, underlyingConfig, transformConfigSchema); } catch (Exception e) { throw new IllegalArgumentException( - "Encountered an error when retrieving a Row configuration", e); + "Encountered an error when retrieving Row configuration", e); } - this.managedConfig = managedConfig; - this.underlyingTransformConfig = underlyingTransformConfig; - this.underlyingTransformProvider = underlyingTransformProvider; - } - - @Override - public PCollectionRowTuple expand(PCollectionRowTuple input) { LOG.debug( "Building transform \"{}\" with Row configuration: {}", underlyingTransformProvider.identifier(), - underlyingTransformConfig); + underlyingRowConfig); - return input.apply(underlyingTransformProvider.from(underlyingTransformConfig)); + return input.apply(underlyingTransformProvider.from(underlyingRowConfig)); } public ManagedConfig getManagedConfig() { @@ -201,17 +232,15 @@ Row getConfigurationRow() { } } + // May return an empty row (perhaps the underlying transform doesn't have any required + // parameters) @VisibleForTesting - static Row getRowConfig(ManagedConfig config, Schema transformSchema) { - // May return an empty row (perhaps the underlying transform doesn't have any required - // parameters) - String yamlConfig = config.resolveUnderlyingConfig(); - Map configMap = YamlUtils.yamlStringToMap(yamlConfig); - + static Row getRowConfig( + String identifier, Map configMap, Schema transformSchema) { // The config Row object will be used to build the underlying SchemaTransform. 
// If a mapping for the SchemaTransform exists, we use it to update parameter names and align // with the underlying config schema - Map mapping = MAPPINGS.get(config.getTransformIdentifier()); + Map mapping = MAPPINGS.get(identifier); if (mapping != null && configMap != null) { Map remappedConfig = new HashMap<>(); for (Map.Entry entry : configMap.entrySet()) { diff --git a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java index 79ea648fb5fa..5dbc209591b2 100644 --- a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java +++ b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java @@ -46,10 +46,12 @@ public class ManagedTransformConstants { "beam:schematransform:org.apache.beam:iceberg_write:v1"; public static final String KAFKA_READ = "beam:schematransform:org.apache.beam:kafka_read:v1"; public static final String KAFKA_WRITE = "beam:schematransform:org.apache.beam:kafka_write:v1"; - public static final String BIGQUERY_STORAGE_READ = + public static final String BIGQUERY_READ = "beam:schematransform:org.apache.beam:bigquery_storage_read:v1"; public static final String BIGQUERY_STORAGE_WRITE = "beam:schematransform:org.apache.beam:bigquery_storage_write:v2"; + public static final String BIGQUERY_FILE_LOADS = + "beam:schematransform:org.apache.beam:bigquery_fileloads:v1"; private static final Map KAFKA_READ_MAPPINGS = ImmutableMap.builder().put("data_format", "format").build(); @@ -57,22 +59,24 @@ public class ManagedTransformConstants { private static final Map KAFKA_WRITE_MAPPINGS = ImmutableMap.builder().put("data_format", "format").build(); - private static final Map BIGQUERY_STORAGE_READ_MAPPINGS = + private static final Map BIGQUERY_READ_MAPPINGS = ImmutableMap.builder() .put("table", "table_spec") - .put("columns", "selected_fields") + .put("fields", "selected_fields") .build(); - private static final Map BIGQUERY_STORAGE_WRITE_MAPPINGS = + private static final Map BIGQUERY_WRITE_MAPPINGS = ImmutableMap.builder() .put("at_least_once", "use_at_least_once_semantics") + .put("triggering_frequency", "triggering_frequency_seconds") .build(); public static final Map> MAPPINGS = ImmutableMap.>builder() .put(KAFKA_READ, KAFKA_READ_MAPPINGS) .put(KAFKA_WRITE, KAFKA_WRITE_MAPPINGS) - .put(BIGQUERY_STORAGE_READ, BIGQUERY_STORAGE_READ_MAPPINGS) - .put(BIGQUERY_STORAGE_WRITE, BIGQUERY_STORAGE_WRITE_MAPPINGS) + .put(BIGQUERY_READ, BIGQUERY_READ_MAPPINGS) + .put(BIGQUERY_STORAGE_WRITE, BIGQUERY_WRITE_MAPPINGS) + .put(BIGQUERY_FILE_LOADS, BIGQUERY_WRITE_MAPPINGS) .build(); } diff --git a/sdks/java/managed/src/test/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProviderTest.java b/sdks/java/managed/src/test/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProviderTest.java index e9edf8751e34..101954ba808e 100644 --- a/sdks/java/managed/src/test/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProviderTest.java +++ b/sdks/java/managed/src/test/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProviderTest.java @@ -19,13 +19,22 @@ import static org.apache.beam.sdk.managed.ManagedSchemaTransformProvider.ManagedConfig; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import java.net.URISyntaxException; import java.nio.file.Paths; import java.util.Arrays; +import java.util.Collections; +import 
java.util.Map; +import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions; +import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.managed.testing.TestSchemaTransformProvider; +import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionRowTuple; import org.apache.beam.sdk.values.Row; import org.junit.Rule; import org.junit.Test; @@ -65,7 +74,10 @@ public void testGetConfigRowFromYamlString() { .build(); Row returnedRow = - ManagedSchemaTransformProvider.getRowConfig(config, TestSchemaTransformProvider.SCHEMA); + ManagedSchemaTransformProvider.getRowConfig( + config.getTransformIdentifier(), + config.resolveUnderlyingConfig(PipelineOptionsFactory.create()), + TestSchemaTransformProvider.SCHEMA); assertEquals(expectedRow, returnedRow); } @@ -89,7 +101,9 @@ public void testGetConfigRowFromYamlFile() throws URISyntaxException { .build(); Row configRow = ManagedSchemaTransformProvider.getRowConfig( - config, new TestSchemaTransformProvider().configurationSchema()); + config.getTransformIdentifier(), + config.resolveUnderlyingConfig(PipelineOptionsFactory.create()), + new TestSchemaTransformProvider().configurationSchema()); assertEquals(expectedRow, configRow); } @@ -130,4 +144,43 @@ public void testDiscoverTestProvider() { assertTrue(provider.getAllProviders().containsKey(TestSchemaTransformProvider.IDENTIFIER)); } + + @Test + public void testResolveBigQueryWrite() { + String yamlString = "table: test-table"; + ManagedConfig config = + ManagedConfig.builder() + .setTransformIdentifier(ManagedTransformConstants.BIGQUERY_STORAGE_WRITE) + .setConfig(yamlString) + .build(); + + DataflowPipelineOptions options = + PipelineOptionsFactory.create().as(DataflowPipelineOptions.class); + Pipeline p = Pipeline.create(); + + // streaming case, pick Storage Write API + PCollection unboundedInput = + p.apply(Create.of(Row.nullRow(Schema.builder().build()))) + .setIsBoundedInternal(PCollection.IsBounded.UNBOUNDED); + String identifier = + config.resolveUnderlyingTransform(PCollectionRowTuple.of("input", unboundedInput)); + assertEquals(ManagedTransformConstants.BIGQUERY_STORAGE_WRITE, identifier); + + // batch case, pick File Loads + PCollection boundedInput = + p.apply(Create.of(Row.nullRow(Schema.builder().build()))) + .setIsBoundedInternal(PCollection.IsBounded.BOUNDED); + identifier = config.resolveUnderlyingTransform(PCollectionRowTuple.of("input", boundedInput)); + assertEquals(ManagedTransformConstants.BIGQUERY_FILE_LOADS, identifier); + + // "streaming_mode_at_least_once" dataflow service option is not set: config is unaffected + Map modifiedConfig = config.resolveUnderlyingConfig(options); + assertFalse(modifiedConfig.containsKey("at_least_once")); + + // "streaming_mode_at_least_once" dataflow service option is not set: inject + // "at_least_once=true" to user STORAGE_API_AT_LEAST_ONCE + options.setDataflowServiceOptions(Collections.singletonList("streaming_mode_at_least_once")); + modifiedConfig = config.resolveUnderlyingConfig(options); + assertEquals(true, modifiedConfig.get("at_least_once")); + } } From d45159ffb4c0ee5dbe0f75e4cbc237784bff2060 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Tue, 9 Jul 2024 17:35:49 -0400 Subject: [PATCH 06/24] rename bigquery write config class --- ...FileLoadsWriteSchemaTransformProvider.java | 14 +++--- ...torageWriteApiSchemaTransformProvider.java | 
39 +++++++--------- ...LoadsWriteSchemaTransformProviderTest.java | 6 +-- ...geWriteApiSchemaTransformProviderTest.java | 45 ++++++++----------- 4 files changed, 45 insertions(+), 59 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProvider.java index 4fc97e297cee..9005d5e49c78 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProvider.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProvider.java @@ -17,7 +17,7 @@ */ package org.apache.beam.sdk.io.gcp.bigquery; -import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider.BigQueryStorageWriteApiSchemaTransformConfiguration; +import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider.BigQueryWriteConfiguration; import com.google.api.services.bigquery.model.TableRow; import com.google.auto.service.AutoService; @@ -38,7 +38,7 @@ /** * An implementation of {@link TypedSchemaTransformProvider} for BigQuery write jobs configured - * using {@link BigQueryStorageWriteApiSchemaTransformConfiguration}. + * using {@link BigQueryWriteConfiguration}. * *
<p>
Internal only: This class is actively being worked on, and it will likely change. We * provide no backwards compatibility guarantees, and it should not be implemented outside the Beam @@ -50,15 +50,14 @@ @Internal @AutoService(SchemaTransformProvider.class) public class BigQueryFileLoadsWriteSchemaTransformProvider - extends TypedSchemaTransformProvider { + extends TypedSchemaTransformProvider { private static final String IDENTIFIER = "beam:schematransform:org.apache.beam:bigquery_fileloads:v1"; static final String INPUT_TAG = "input"; @Override - protected SchemaTransform from( - BigQueryStorageWriteApiSchemaTransformConfiguration configuration) { + protected SchemaTransform from(BigQueryWriteConfiguration configuration) { return new BigQueryWriteSchemaTransform(configuration); } @@ -81,10 +80,9 @@ protected static class BigQueryWriteSchemaTransform extends SchemaTransform { /** An instance of {@link BigQueryServices} used for testing. */ private BigQueryServices testBigQueryServices = null; - private final BigQueryStorageWriteApiSchemaTransformConfiguration configuration; + private final BigQueryWriteConfiguration configuration; - BigQueryWriteSchemaTransform( - BigQueryStorageWriteApiSchemaTransformConfiguration configuration) { + BigQueryWriteSchemaTransform(BigQueryWriteConfiguration configuration) { configuration.validate(); this.configuration = configuration; } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java index c90564331ec1..d4dc6ce6cfe7 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java @@ -39,7 +39,7 @@ import org.apache.beam.sdk.io.gcp.bigquery.DynamicDestinations; import org.apache.beam.sdk.io.gcp.bigquery.TableDestination; import org.apache.beam.sdk.io.gcp.bigquery.WriteResult; -import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider.BigQueryStorageWriteApiSchemaTransformConfiguration; +import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider.BigQueryWriteConfiguration; import org.apache.beam.sdk.metrics.Counter; import org.apache.beam.sdk.metrics.Metrics; import org.apache.beam.sdk.schemas.AutoValueSchema; @@ -69,7 +69,7 @@ /** * An implementation of {@link TypedSchemaTransformProvider} for BigQuery Storage Write API jobs - * configured via {@link BigQueryStorageWriteApiSchemaTransformConfiguration}. + * configured via {@link BigQueryWriteConfiguration}. * *
<p>
Internal only: This class is actively being worked on, and it will likely change. We * provide no backwards compatibility guarantees, and it should not be implemented outside the Beam @@ -80,7 +80,7 @@ }) @AutoService(SchemaTransformProvider.class) public class BigQueryStorageWriteApiSchemaTransformProvider - extends TypedSchemaTransformProvider { + extends TypedSchemaTransformProvider { private static final Integer DEFAULT_TRIGGER_FREQUENCY_SECS = 5; private static final Duration DEFAULT_TRIGGERING_FREQUENCY = Duration.standardSeconds(DEFAULT_TRIGGER_FREQUENCY_SECS); @@ -91,8 +91,7 @@ public class BigQueryStorageWriteApiSchemaTransformProvider protected static final String DYNAMIC_DESTINATIONS = "DYNAMIC_DESTINATIONS"; @Override - protected SchemaTransform from( - BigQueryStorageWriteApiSchemaTransformConfiguration configuration) { + protected SchemaTransform from(BigQueryWriteConfiguration configuration) { return new BigQueryStorageWriteApiSchemaTransform(configuration); } @@ -124,14 +123,14 @@ public List outputCollectionNames() { /** Configuration for writing to BigQuery with Storage Write API. */ @DefaultSchema(AutoValueSchema.class) @AutoValue - public abstract static class BigQueryStorageWriteApiSchemaTransformConfiguration { + public abstract static class BigQueryWriteConfiguration { @AutoValue public abstract static class ErrorHandling { @SchemaFieldDescription("The name of the output PCollection containing failed writes.") public abstract String getOutput(); public static Builder builder() { - return new AutoValue_BigQueryStorageWriteApiSchemaTransformProvider_BigQueryStorageWriteApiSchemaTransformConfiguration_ErrorHandling + return new AutoValue_BigQueryStorageWriteApiSchemaTransformProvider_BigQueryWriteConfiguration_ErrorHandling .Builder(); } @@ -161,7 +160,7 @@ public void validate() { List createDispostions = Arrays.stream(CreateDisposition.values()).map(Enum::name).collect(Collectors.toList()); Preconditions.checkArgument( - createDispostions.contains(getCreateDisposition()), + createDispostions.contains(getCreateDisposition().toUpperCase()), "Invalid create disposition (%s) was specified. Available dispositions are: %s", getCreateDisposition(), createDispostions); @@ -170,7 +169,7 @@ public void validate() { List writeDispostions = Arrays.stream(WriteDisposition.values()).map(Enum::name).collect(Collectors.toList()); Preconditions.checkArgument( - writeDispostions.contains(getWriteDisposition()), + writeDispostions.contains(getWriteDisposition().toUpperCase()), "Invalid write disposition (%s) was specified. Available dispositions are: %s", getWriteDisposition(), writeDispostions); @@ -192,11 +191,9 @@ public void validate() { } } - /** - * Instantiates a {@link BigQueryStorageWriteApiSchemaTransformConfiguration.Builder} instance. - */ + /** Instantiates a {@link BigQueryWriteConfiguration.Builder} instance. */ public static Builder builder() { - return new AutoValue_BigQueryStorageWriteApiSchemaTransformProvider_BigQueryStorageWriteApiSchemaTransformConfiguration + return new AutoValue_BigQueryStorageWriteApiSchemaTransformProvider_BigQueryWriteConfiguration .Builder(); } @@ -251,7 +248,7 @@ public static Builder builder() { @Nullable public abstract ErrorHandling getErrorHandling(); - /** Builder for {@link BigQueryStorageWriteApiSchemaTransformConfiguration}. */ + /** Builder for {@link BigQueryWriteConfiguration}. 
*/ @AutoValue.Builder public abstract static class Builder { @@ -273,25 +270,23 @@ public abstract static class Builder { public abstract Builder setErrorHandling(ErrorHandling errorHandling); - /** Builds a {@link BigQueryStorageWriteApiSchemaTransformConfiguration} instance. */ - public abstract BigQueryStorageWriteApiSchemaTransformProvider - .BigQueryStorageWriteApiSchemaTransformConfiguration + /** Builds a {@link BigQueryWriteConfiguration} instance. */ + public abstract BigQueryStorageWriteApiSchemaTransformProvider.BigQueryWriteConfiguration build(); } } /** * A {@link SchemaTransform} for BigQuery Storage Write API, configured with {@link - * BigQueryStorageWriteApiSchemaTransformConfiguration} and instantiated by {@link + * BigQueryWriteConfiguration} and instantiated by {@link * BigQueryStorageWriteApiSchemaTransformProvider}. */ public static class BigQueryStorageWriteApiSchemaTransform extends SchemaTransform { private BigQueryServices testBigQueryServices = null; - private final BigQueryStorageWriteApiSchemaTransformConfiguration configuration; + private final BigQueryWriteConfiguration configuration; - BigQueryStorageWriteApiSchemaTransform( - BigQueryStorageWriteApiSchemaTransformConfiguration configuration) { + BigQueryStorageWriteApiSchemaTransform(BigQueryWriteConfiguration configuration) { configuration.validate(); this.configuration = configuration; } @@ -453,7 +448,7 @@ public Row getConfigurationRow() { // To stay consistent with our SchemaTransform configuration naming conventions, // we sort lexicographically return SchemaRegistry.createDefault() - .getToRowFunction(BigQueryStorageWriteApiSchemaTransformConfiguration.class) + .getToRowFunction(BigQueryWriteConfiguration.class) .apply(configuration) .sorted() .toSnakeCase(); diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProviderTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProviderTest.java index 4264533410ce..4d6596fa7bbc 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProviderTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProviderTest.java @@ -17,7 +17,7 @@ */ package org.apache.beam.sdk.io.gcp.bigquery; -import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider.BigQueryStorageWriteApiSchemaTransformConfiguration; +import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider.BigQueryWriteConfiguration; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; @@ -96,8 +96,8 @@ public void tearDown() { public void testLoad() throws IOException, InterruptedException { BigQueryFileLoadsWriteSchemaTransformProvider provider = new BigQueryFileLoadsWriteSchemaTransformProvider(); - BigQueryStorageWriteApiSchemaTransformConfiguration configuration = - BigQueryStorageWriteApiSchemaTransformConfiguration.builder() + BigQueryWriteConfiguration configuration = + BigQueryWriteConfiguration.builder() .setTable(BigQueryHelpers.toTableSpec(TABLE_REFERENCE)) .setWriteDisposition(WriteDisposition.WRITE_TRUNCATE.name()) .setCreateDisposition(CreateDisposition.CREATE_IF_NEEDED.name()) diff --git 
a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProviderTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProviderTest.java index 64ea0b11d1b9..382ef016213a 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProviderTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProviderTest.java @@ -33,7 +33,7 @@ import org.apache.beam.sdk.PipelineResult; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers; import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider.BigQueryStorageWriteApiSchemaTransform; -import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider.BigQueryStorageWriteApiSchemaTransformConfiguration; +import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider.BigQueryWriteConfiguration; import org.apache.beam.sdk.io.gcp.testing.FakeBigQueryServices; import org.apache.beam.sdk.io.gcp.testing.FakeDatasetService; import org.apache.beam.sdk.io.gcp.testing.FakeJobService; @@ -105,15 +105,14 @@ public void setUp() throws Exception { @Test public void testInvalidConfig() { - List invalidConfigs = + List invalidConfigs = Arrays.asList( - BigQueryStorageWriteApiSchemaTransformConfiguration.builder() - .setTable("not_a_valid_table_spec"), - BigQueryStorageWriteApiSchemaTransformConfiguration.builder() + BigQueryWriteConfiguration.builder().setTable("not_a_valid_table_spec"), + BigQueryWriteConfiguration.builder() .setTable("project:dataset.table") .setCreateDisposition("INVALID_DISPOSITION")); - for (BigQueryStorageWriteApiSchemaTransformConfiguration.Builder config : invalidConfigs) { + for (BigQueryWriteConfiguration.Builder config : invalidConfigs) { assertThrows( Exception.class, () -> { @@ -122,13 +121,11 @@ public void testInvalidConfig() { } } - public PCollectionRowTuple runWithConfig( - BigQueryStorageWriteApiSchemaTransformConfiguration config) { + public PCollectionRowTuple runWithConfig(BigQueryWriteConfiguration config) { return runWithConfig(config, ROWS); } - public PCollectionRowTuple runWithConfig( - BigQueryStorageWriteApiSchemaTransformConfiguration config, List inputRows) { + public PCollectionRowTuple runWithConfig(BigQueryWriteConfiguration config, List inputRows) { BigQueryStorageWriteApiSchemaTransformProvider provider = new BigQueryStorageWriteApiSchemaTransformProvider(); @@ -173,8 +170,8 @@ public boolean rowEquals(Row expectedRow, TableRow actualRow) { @Test public void testSimpleWrite() throws Exception { String tableSpec = "project:dataset.simple_write"; - BigQueryStorageWriteApiSchemaTransformConfiguration config = - BigQueryStorageWriteApiSchemaTransformConfiguration.builder().setTable(tableSpec).build(); + BigQueryWriteConfiguration config = + BigQueryWriteConfiguration.builder().setTable(tableSpec).build(); runWithConfig(config, ROWS); p.run().waitUntilFinish(); @@ -187,8 +184,8 @@ public void testSimpleWrite() throws Exception { @Test public void testWriteToDynamicDestinations() throws Exception { String dynamic = BigQueryStorageWriteApiSchemaTransformProvider.DYNAMIC_DESTINATIONS; - BigQueryStorageWriteApiSchemaTransformConfiguration config = - 
BigQueryStorageWriteApiSchemaTransformConfiguration.builder().setTable(dynamic).build(); + BigQueryWriteConfiguration config = + BigQueryWriteConfiguration.builder().setTable(dynamic).build(); String baseTableSpec = "project:dataset.dynamic_write_"; @@ -224,8 +221,8 @@ public void testWriteToDynamicDestinations() throws Exception { @Test public void testInputElementCount() throws Exception { String tableSpec = "project:dataset.input_count"; - BigQueryStorageWriteApiSchemaTransformConfiguration config = - BigQueryStorageWriteApiSchemaTransformConfiguration.builder().setTable(tableSpec).build(); + BigQueryWriteConfiguration config = + BigQueryWriteConfiguration.builder().setTable(tableSpec).build(); runWithConfig(config); PipelineResult result = p.run(); @@ -254,13 +251,11 @@ public void testInputElementCount() throws Exception { @Test public void testFailedRows() throws Exception { String tableSpec = "project:dataset.write_with_fail"; - BigQueryStorageWriteApiSchemaTransformConfiguration config = - BigQueryStorageWriteApiSchemaTransformConfiguration.builder() + BigQueryWriteConfiguration config = + BigQueryWriteConfiguration.builder() .setTable(tableSpec) .setErrorHandling( - BigQueryStorageWriteApiSchemaTransformConfiguration.ErrorHandling.builder() - .setOutput("FailedRows") - .build()) + BigQueryWriteConfiguration.ErrorHandling.builder().setOutput("FailedRows").build()) .build(); String failValue = "fail_me"; @@ -307,13 +302,11 @@ public void testFailedRows() throws Exception { @Test public void testErrorCount() throws Exception { String tableSpec = "project:dataset.error_count"; - BigQueryStorageWriteApiSchemaTransformConfiguration config = - BigQueryStorageWriteApiSchemaTransformConfiguration.builder() + BigQueryWriteConfiguration config = + BigQueryWriteConfiguration.builder() .setTable(tableSpec) .setErrorHandling( - BigQueryStorageWriteApiSchemaTransformConfiguration.ErrorHandling.builder() - .setOutput("FailedRows") - .build()) + BigQueryWriteConfiguration.ErrorHandling.builder().setOutput("FailedRows").build()) .build(); Function shouldFailRow = From 989ad0f13aa392a310ee59f959fc764a7d219b05 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Tue, 9 Jul 2024 17:38:06 -0400 Subject: [PATCH 07/24] spotless --- .../BigQueryStorageWriteApiSchemaTransformProvider.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java index d4dc6ce6cfe7..aa840c53700a 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java @@ -158,7 +158,9 @@ public void validate() { // validate create and write dispositions if (!Strings.isNullOrEmpty(getCreateDisposition())) { List createDispostions = - Arrays.stream(CreateDisposition.values()).map(Enum::name).collect(Collectors.toList()); + Arrays.stream(CreateDisposition.values()) + .map(c -> c.name()) + .collect(Collectors.toList()); Preconditions.checkArgument( createDispostions.contains(getCreateDisposition().toUpperCase()), "Invalid create disposition (%s) was specified. 
Available dispositions are: %s", @@ -167,7 +169,9 @@ public void validate() { } if (!Strings.isNullOrEmpty(getWriteDisposition())) { List writeDispostions = - Arrays.stream(WriteDisposition.values()).map(Enum::name).collect(Collectors.toList()); + Arrays.stream(WriteDisposition.values()) + .map(w -> w.name()) + .collect(Collectors.toList()); Preconditions.checkArgument( writeDispostions.contains(getWriteDisposition().toUpperCase()), "Invalid write disposition (%s) was specified. Available dispositions are: %s", From b9b49e71cfe74eef956698f4cdbd1af787b7ddaa Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Tue, 9 Jul 2024 17:45:31 -0400 Subject: [PATCH 08/24] change read output tag to 'output' --- .../providers/BigQueryDirectReadSchemaTransformProvider.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryDirectReadSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryDirectReadSchemaTransformProvider.java index e9db777801de..7a443a3f402d 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryDirectReadSchemaTransformProvider.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryDirectReadSchemaTransformProvider.java @@ -64,7 +64,7 @@ public class BigQueryDirectReadSchemaTransformProvider extends TypedSchemaTransformProvider { - public static final String OUTPUT_TAG = "OUTPUT_ROWS"; + public static final String OUTPUT_TAG = "output"; @Override protected Class configurationClass() { From a119bbcf75fbf242e6c8127f2e90fb718a1ed556 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Tue, 9 Jul 2024 17:57:59 -0400 Subject: [PATCH 09/24] spotless --- .../BigQueryFileLoadsWriteSchemaTransformProvider.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProvider.java index 9005d5e49c78..a9fdceff3f80 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProvider.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProvider.java @@ -19,14 +19,12 @@ import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider.BigQueryWriteConfiguration; -import com.google.api.services.bigquery.model.TableRow; import com.google.auto.service.AutoService; import java.util.Collections; import java.util.List; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; -import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.transforms.SchemaTransform; import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; @@ -96,7 +94,6 @@ public PCollectionRowTuple expand(PCollectionRowTuple input) { return PCollectionRowTuple.empty(input.getPipeline()); } - /** Instantiates a {@link BigQueryIO.Write} 
from a {@link Schema}. */ BigQueryIO.Write toWrite() { BigQueryIO.Write write = BigQueryIO.write() From 74bc178c79aaed8329b50ed5fa0b50f65bccc3e8 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Tue, 16 Jul 2024 12:06:56 -0400 Subject: [PATCH 10/24] revert logic that depends on DataflowServiceOptions. switching BQ methods can instead be done in Dataflow service side --- .../io/google-cloud-platform/build.gradle | 1 - .../io/gcp/bigquery/BigQueryManagedIT.java | 61 +----------- sdks/java/managed/build.gradle | 3 - .../org/apache/beam/sdk/managed/Managed.java | 7 +- .../ManagedSchemaTransformProvider.java | 92 +++++++------------ .../managed/ManagedTransformConstants.java | 7 +- .../ManagedSchemaTransformProviderTest.java | 58 +----------- 7 files changed, 38 insertions(+), 191 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/build.gradle b/sdks/java/io/google-cloud-platform/build.gradle index 291a3863c3e2..ef48e8403ffb 100644 --- a/sdks/java/io/google-cloud-platform/build.gradle +++ b/sdks/java/io/google-cloud-platform/build.gradle @@ -159,7 +159,6 @@ dependencies { testImplementation project(path: ":sdks:java:extensions:google-cloud-platform-core", configuration: "testRuntimeMigration") testImplementation project(path: ":sdks:java:extensions:protobuf", configuration: "testRuntimeMigration") testImplementation project(path: ":runners:direct-java", configuration: "shadow") - testImplementation project(":runners:google-cloud-dataflow-java") testImplementation project(":sdks:java:managed") testImplementation project(path: ":sdks:java:io:common") testImplementation project(path: ":sdks:java:testing:test-utils") diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java index ad3af3ffcb85..d72ad5e7320e 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java @@ -50,9 +50,7 @@ import org.junit.runners.JUnit4; /** - * This class tests the execution of {@link Managed} BigQueryIO. Tests validating that the correct - * write transform is requested can be found in - * ManagedSchemaTransformProviderTest.testResolveBigQueryWrite. + * This class tests the execution of {@link Managed} BigQueryIO. 
*/ @RunWith(JUnit4.class) public class BigQueryManagedIT { @@ -88,34 +86,6 @@ public static void cleanup() { BQ_CLIENT.deleteDataset(PROJECT, BIG_QUERY_DATASET_ID); } - @Test - public void testBatchFileLoadsWriteRead() { - String table = String.format("%s:%s.managed_file_loads_read", PROJECT, BIG_QUERY_DATASET_ID); - - Map writeConfig = - ImmutableMap.builder().put("table", table).build(); - - // file loads requires a GCS temp location - TestPipelineOptions options = - TestPipeline.testingPipelineOptions().as(TestPipelineOptions.class); - options.setTempLocation(options.getTempRoot()); - - Pipeline p = Pipeline.create(options); - PCollectionRowTuple.of("input", getInput(p, false)) - .apply(Managed.write(Managed.BIGQUERY).withConfig(writeConfig)); - p.run().waitUntilFinish(); - - Map readConfig = - ImmutableMap.builder().put("table", table).build(); - Pipeline q = Pipeline.create(); - PCollection outputRows = - PCollectionRowTuple.empty(p) - .apply(Managed.read(Managed.BIGQUERY).withConfig(readConfig)) - .get(BigQueryDirectReadSchemaTransformProvider.OUTPUT_TAG); - PAssert.that(outputRows).containsInAnyOrder(ROWS); - q.run().waitUntilFinish(); - } - @Test public void testStreamingStorageWriteRead() { String table = String.format("%s:%s.managed_storage_write_read", PROJECT, BIG_QUERY_DATASET_ID); @@ -138,35 +108,6 @@ public void testStreamingStorageWriteRead() { q.run().waitUntilFinish(); } - @Test - public void testStreamingStorageWriteAtLeastOnceRead() { - String table = - String.format( - "%s:%s.managed_storage_write_at_least_once_read", PROJECT, BIG_QUERY_DATASET_ID); - - Map writeConfig = - ImmutableMap.builder().put("table", table).build(); - - DataflowPipelineOptions options = - PipelineOptionsFactory.create().as(DataflowPipelineOptions.class); - options.setDataflowServiceOptions(Collections.singletonList("streaming_mode_at_least_once")); - Pipeline p = Pipeline.create(options); - - PCollectionRowTuple.of("input", getInput(p, true)) - .apply(Managed.write(Managed.BIGQUERY).withConfig(writeConfig)); - p.run().waitUntilFinish(); - - Map readConfig = - ImmutableMap.builder().put("table", table).build(); - Pipeline q = Pipeline.create(); - PCollection outputRows = - PCollectionRowTuple.empty(p) - .apply(Managed.read(Managed.BIGQUERY).withConfig(readConfig)) - .get(BigQueryDirectReadSchemaTransformProvider.OUTPUT_TAG); - PAssert.that(outputRows).containsInAnyOrder(ROWS); - q.run().waitUntilFinish(); - } - public PCollection getInput(Pipeline p, boolean isStreaming) { if (isStreaming) { return p.apply( diff --git a/sdks/java/managed/build.gradle b/sdks/java/managed/build.gradle index e4d98d91a15d..add0d7f3cc0d 100644 --- a/sdks/java/managed/build.gradle +++ b/sdks/java/managed/build.gradle @@ -28,9 +28,6 @@ ext.summary = """Library that provides managed IOs.""" dependencies { implementation project(path: ":sdks:java:core", configuration: "shadow") - implementation project(":runners:google-cloud-dataflow-java") - implementation project(":sdks:java:io:google-cloud-platform") - implementation project(":sdks:java:extensions:google-cloud-platform-core") implementation library.java.vendored_guava_32_1_2_jre implementation library.java.slf4j_api diff --git a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/Managed.java b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/Managed.java index 570da89fadab..371c21645b41 100644 --- a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/Managed.java +++ b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/Managed.java 
@@ -96,7 +96,7 @@ public class Managed { ImmutableMap.builder() .put(ICEBERG, ManagedTransformConstants.ICEBERG_WRITE) .put(KAFKA, ManagedTransformConstants.KAFKA_WRITE) - .put(BIGQUERY, ManagedTransformConstants.BIGQUERY_STORAGE_WRITE) + .put(BIGQUERY, ManagedTransformConstants.BIGQUERY_WRITE) .build(); /** @@ -128,9 +128,6 @@ public static ManagedTransform read(String source) { * */ public static ManagedTransform write(String sink) { - List supportedIdentifiers = new ArrayList<>(WRITE_TRANSFORMS.values()); - supportedIdentifiers.add(ManagedTransformConstants.BIGQUERY_FILE_LOADS); - return new AutoValue_Managed_ManagedTransform.Builder() .setIdentifier( Preconditions.checkNotNull( @@ -138,7 +135,7 @@ public static ManagedTransform write(String sink) { "An unsupported sink was specified: '%s'. Please specify one of the following sinks: %s", sink, WRITE_TRANSFORMS.keySet())) - .setSupportedIdentifiers(supportedIdentifiers) + .setSupportedIdentifiers(new ArrayList<>(WRITE_TRANSFORMS.values())) .build(); } diff --git a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProvider.java b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProvider.java index 832c1492e27f..6ca883c96698 100644 --- a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProvider.java +++ b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProvider.java @@ -32,10 +32,8 @@ import java.util.Map; import java.util.ServiceLoader; import javax.annotation.Nullable; -import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions; import org.apache.beam.sdk.io.FileSystems; import org.apache.beam.sdk.io.fs.MatchResult; -import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.schemas.AutoValueSchema; import org.apache.beam.sdk.schemas.NoSuchSchemaException; import org.apache.beam.sdk.schemas.Schema; @@ -46,7 +44,6 @@ import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; import org.apache.beam.sdk.schemas.utils.YamlUtils; -import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionRowTuple; import org.apache.beam.sdk.values.Row; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; @@ -120,8 +117,7 @@ protected void validate() { "Please specify a config or a config URL, but not both."); } - @VisibleForTesting - Map resolveUnderlyingConfig(PipelineOptions options) { + private Map resolveUnderlyingConfig() { String yamlTransformConfig = getConfig(); // If YAML string is empty, then attempt to read from YAML file if (Strings.isNullOrEmpty(yamlTransformConfig)) { @@ -136,75 +132,49 @@ Map resolveUnderlyingConfig(PipelineOptions options) { } } - Map config = YamlUtils.yamlStringToMap(yamlTransformConfig); - return maybeModify(config, options); - } - - private Map maybeModify(Map config, PipelineOptions options) { - DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class); - if (getTransformIdentifier().equals(ManagedTransformConstants.BIGQUERY_STORAGE_WRITE) - && dataflowOptions.getDataflowServiceOptions() != null - && dataflowOptions.getDataflowServiceOptions().contains("streaming_mode_at_least_once")) { - config.put("at_least_once", true); - } - return config; - } - - @VisibleForTesting - String resolveUnderlyingTransform(PCollectionRowTuple input) { - String identifier = 
getTransformIdentifier(); - if (identifier.equals(ManagedTransformConstants.BIGQUERY_STORAGE_WRITE)) { - if (input.getSinglePCollection().isBounded().equals(PCollection.IsBounded.BOUNDED)) { - return ManagedTransformConstants.BIGQUERY_FILE_LOADS; - } - } - - return identifier; + return YamlUtils.yamlStringToMap(yamlTransformConfig); } } @Override protected SchemaTransform from(ManagedConfig managedConfig) { managedConfig.validate(); - return new ManagedSchemaTransform(managedConfig, getAllProviders()); + SchemaTransformProvider schemaTransformProvider = + Preconditions.checkNotNull( + getAllProviders().get(managedConfig.getTransformIdentifier()), + "Could not find a transform with the identifier " + + "%s. This could be either due to the dependency with the " + + "transform not being available in the classpath or due to " + + "the specified transform not being supported.", + managedConfig.getTransformIdentifier()); + + return new ManagedSchemaTransform(managedConfig, schemaTransformProvider); } static class ManagedSchemaTransform extends SchemaTransform { private final ManagedConfig managedConfig; - private final Map transformProviders; + private final Row underlyingRowConfig; + private final SchemaTransformProvider underlyingTransformProvider; ManagedSchemaTransform( - ManagedConfig managedConfig, Map transformProviders) { - this.transformProviders = transformProviders; - this.managedConfig = managedConfig; - } - - @Override - public PCollectionRowTuple expand(PCollectionRowTuple input) { - String identifier = managedConfig.resolveUnderlyingTransform(input); - Map underlyingConfig = - managedConfig.resolveUnderlyingConfig(input.getPipeline().getOptions()); - - System.out.println("providers: " + transformProviders); - - SchemaTransformProvider underlyingTransformProvider = - Preconditions.checkNotNull( - transformProviders.get(identifier), - "Could not find a transform with the identifier " - + "%s. This could be either due to the dependency with the " - + "transform not being available in the classpath or due to " - + "the specified transform not being supported.", - identifier); + ManagedConfig managedConfig, SchemaTransformProvider underlyingTransformProvider) { + // parse config before expansion to check if it matches underlying transform's config schema Schema transformConfigSchema = underlyingTransformProvider.configurationSchema(); - Row underlyingRowConfig; try { - underlyingRowConfig = getRowConfig(identifier, underlyingConfig, transformConfigSchema); + underlyingRowConfig = getRowConfig(managedConfig, transformConfigSchema); } catch (Exception e) { throw new IllegalArgumentException( - "Encountered an error when retrieving Row configuration", e); + "Encountered an error when retrieving a Row configuration", e); } + this.underlyingRowConfig = underlyingRowConfig; + this.underlyingTransformProvider = underlyingTransformProvider; + this.managedConfig = managedConfig; + } + + @Override + public PCollectionRowTuple expand(PCollectionRowTuple input) { LOG.debug( "Building transform \"{}\" with Row configuration: {}", underlyingTransformProvider.identifier(), @@ -235,12 +205,12 @@ Row getConfigurationRow() { // May return an empty row (perhaps the underlying transform doesn't have any required // parameters) @VisibleForTesting - static Row getRowConfig( - String identifier, Map configMap, Schema transformSchema) { - // The config Row object will be used to build the underlying SchemaTransform. 
- // If a mapping for the SchemaTransform exists, we use it to update parameter names and align - // with the underlying config schema - Map mapping = MAPPINGS.get(identifier); + static Row getRowConfig(ManagedConfig config, Schema transformSchema) { + Map configMap = config.resolveUnderlyingConfig(); + // Build a config Row that will be used to build the underlying SchemaTransform. + // If a mapping for the SchemaTransform exists, we use it to update parameter names to align + // with the underlying SchemaTransform config schema + Map mapping = MAPPINGS.get(config.getTransformIdentifier()); if (mapping != null && configMap != null) { Map remappedConfig = new HashMap<>(); for (Map.Entry entry : configMap.entrySet()) { diff --git a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java index 5dbc209591b2..77d5f29e926b 100644 --- a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java +++ b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java @@ -48,10 +48,8 @@ public class ManagedTransformConstants { public static final String KAFKA_WRITE = "beam:schematransform:org.apache.beam:kafka_write:v1"; public static final String BIGQUERY_READ = "beam:schematransform:org.apache.beam:bigquery_storage_read:v1"; - public static final String BIGQUERY_STORAGE_WRITE = + public static final String BIGQUERY_WRITE = "beam:schematransform:org.apache.beam:bigquery_storage_write:v2"; - public static final String BIGQUERY_FILE_LOADS = - "beam:schematransform:org.apache.beam:bigquery_fileloads:v1"; private static final Map KAFKA_READ_MAPPINGS = ImmutableMap.builder().put("data_format", "format").build(); @@ -76,7 +74,6 @@ public class ManagedTransformConstants { .put(KAFKA_READ, KAFKA_READ_MAPPINGS) .put(KAFKA_WRITE, KAFKA_WRITE_MAPPINGS) .put(BIGQUERY_READ, BIGQUERY_READ_MAPPINGS) - .put(BIGQUERY_STORAGE_WRITE, BIGQUERY_WRITE_MAPPINGS) - .put(BIGQUERY_FILE_LOADS, BIGQUERY_WRITE_MAPPINGS) + .put(BIGQUERY_WRITE, BIGQUERY_WRITE_MAPPINGS) .build(); } diff --git a/sdks/java/managed/src/test/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProviderTest.java b/sdks/java/managed/src/test/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProviderTest.java index 101954ba808e..a287ec6260ce 100644 --- a/sdks/java/managed/src/test/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProviderTest.java +++ b/sdks/java/managed/src/test/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProviderTest.java @@ -19,22 +19,13 @@ import static org.apache.beam.sdk.managed.ManagedSchemaTransformProvider.ManagedConfig; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import java.net.URISyntaxException; import java.nio.file.Paths; import java.util.Arrays; -import java.util.Collections; -import java.util.Map; -import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions; -import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.managed.testing.TestSchemaTransformProvider; -import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.schemas.Schema; -import org.apache.beam.sdk.transforms.Create; -import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.sdk.values.PCollectionRowTuple; import org.apache.beam.sdk.values.Row; import org.junit.Rule; import org.junit.Test; @@ -74,10 +65,7 @@ 
public void testGetConfigRowFromYamlString() { .build(); Row returnedRow = - ManagedSchemaTransformProvider.getRowConfig( - config.getTransformIdentifier(), - config.resolveUnderlyingConfig(PipelineOptionsFactory.create()), - TestSchemaTransformProvider.SCHEMA); + ManagedSchemaTransformProvider.getRowConfig(config, TestSchemaTransformProvider.SCHEMA); assertEquals(expectedRow, returnedRow); } @@ -100,10 +88,7 @@ public void testGetConfigRowFromYamlFile() throws URISyntaxException { .withFieldValue("extra_integer", 123) .build(); Row configRow = - ManagedSchemaTransformProvider.getRowConfig( - config.getTransformIdentifier(), - config.resolveUnderlyingConfig(PipelineOptionsFactory.create()), - new TestSchemaTransformProvider().configurationSchema()); + ManagedSchemaTransformProvider.getRowConfig(config, TestSchemaTransformProvider.SCHEMA); assertEquals(expectedRow, configRow); } @@ -144,43 +129,4 @@ public void testDiscoverTestProvider() { assertTrue(provider.getAllProviders().containsKey(TestSchemaTransformProvider.IDENTIFIER)); } - - @Test - public void testResolveBigQueryWrite() { - String yamlString = "table: test-table"; - ManagedConfig config = - ManagedConfig.builder() - .setTransformIdentifier(ManagedTransformConstants.BIGQUERY_STORAGE_WRITE) - .setConfig(yamlString) - .build(); - - DataflowPipelineOptions options = - PipelineOptionsFactory.create().as(DataflowPipelineOptions.class); - Pipeline p = Pipeline.create(); - - // streaming case, pick Storage Write API - PCollection unboundedInput = - p.apply(Create.of(Row.nullRow(Schema.builder().build()))) - .setIsBoundedInternal(PCollection.IsBounded.UNBOUNDED); - String identifier = - config.resolveUnderlyingTransform(PCollectionRowTuple.of("input", unboundedInput)); - assertEquals(ManagedTransformConstants.BIGQUERY_STORAGE_WRITE, identifier); - - // batch case, pick File Loads - PCollection boundedInput = - p.apply(Create.of(Row.nullRow(Schema.builder().build()))) - .setIsBoundedInternal(PCollection.IsBounded.BOUNDED); - identifier = config.resolveUnderlyingTransform(PCollectionRowTuple.of("input", boundedInput)); - assertEquals(ManagedTransformConstants.BIGQUERY_FILE_LOADS, identifier); - - // "streaming_mode_at_least_once" dataflow service option is not set: config is unaffected - Map modifiedConfig = config.resolveUnderlyingConfig(options); - assertFalse(modifiedConfig.containsKey("at_least_once")); - - // "streaming_mode_at_least_once" dataflow service option is not set: inject - // "at_least_once=true" to user STORAGE_API_AT_LEAST_ONCE - options.setDataflowServiceOptions(Collections.singletonList("streaming_mode_at_least_once")); - modifiedConfig = config.resolveUnderlyingConfig(options); - assertEquals(true, modifiedConfig.get("at_least_once")); - } } From 528b504361ce2f77b7c2402a2042c8b1c5e2d960 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Tue, 16 Jul 2024 12:22:50 -0400 Subject: [PATCH 11/24] spotless --- .../beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java index d72ad5e7320e..a39e27bf7d85 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java @@ -18,22 
+18,18 @@ package org.apache.beam.sdk.io.gcp.bigquery; import java.io.IOException; -import java.util.Collections; import java.util.List; import java.util.Map; import java.util.stream.Collectors; import java.util.stream.LongStream; -import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryDirectReadSchemaTransformProvider; import org.apache.beam.sdk.io.gcp.testing.BigqueryClient; import org.apache.beam.sdk.managed.Managed; -import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.testing.PAssert; import org.apache.beam.sdk.testing.TestPipeline; -import org.apache.beam.sdk.testing.TestPipelineOptions; import org.apache.beam.sdk.transforms.Create; import org.apache.beam.sdk.transforms.MapElements; import org.apache.beam.sdk.transforms.PeriodicImpulse; @@ -49,9 +45,7 @@ import org.junit.runner.RunWith; import org.junit.runners.JUnit4; -/** - * This class tests the execution of {@link Managed} BigQueryIO. - */ +/** This class tests the execution of {@link Managed} BigQueryIO. */ @RunWith(JUnit4.class) public class BigQueryManagedIT { private static final Schema SCHEMA = From dcc398a0f14145ea2c15337525700ee8e99d296b Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Mon, 29 Jul 2024 16:50:40 -0400 Subject: [PATCH 12/24] fix typo --- .../BigQueryStorageWriteApiSchemaTransformProvider.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java index aa840c53700a..04180ea4b70f 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java @@ -157,15 +157,15 @@ public void validate() { // validate create and write dispositions if (!Strings.isNullOrEmpty(getCreateDisposition())) { - List createDispostions = + List createDispositions = Arrays.stream(CreateDisposition.values()) .map(c -> c.name()) .collect(Collectors.toList()); Preconditions.checkArgument( - createDispostions.contains(getCreateDisposition().toUpperCase()), + createDispositions.contains(getCreateDisposition().toUpperCase()), "Invalid create disposition (%s) was specified. 
Available dispositions are: %s", getCreateDisposition(), - createDispostions); + createDispositions); } if (!Strings.isNullOrEmpty(getWriteDisposition())) { List writeDispostions = From 36edc38dabc560d47f0c0bc7452e87a227db23a8 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Tue, 6 Aug 2024 12:45:35 -0400 Subject: [PATCH 13/24] separate BQ write config to a new class --- ...FileLoadsWriteSchemaTransformProvider.java | 3 +- ...torageWriteApiSchemaTransformProvider.java | 172 +-------------- .../providers/BigQueryWriteConfiguration.java | 196 ++++++++++++++++++ ...LoadsWriteSchemaTransformProviderTest.java | 2 +- ...geWriteApiSchemaTransformProviderTest.java | 3 +- 5 files changed, 200 insertions(+), 176 deletions(-) create mode 100644 sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteConfiguration.java diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProvider.java index a9fdceff3f80..7c89cb09ef6d 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProvider.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProvider.java @@ -17,14 +17,13 @@ */ package org.apache.beam.sdk.io.gcp.bigquery; -import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider.BigQueryWriteConfiguration; - import com.google.auto.service.AutoService; import java.util.Collections; import java.util.List; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; +import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryWriteConfiguration; import org.apache.beam.sdk.schemas.transforms.SchemaTransform; import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java index 04180ea4b70f..602a6e6faf3e 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java @@ -17,18 +17,14 @@ */ package org.apache.beam.sdk.io.gcp.bigquery.providers; +import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryWriteConfiguration.DYNAMIC_DESTINATIONS; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; -import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkNotNull; import com.google.api.services.bigquery.model.TableSchema; import com.google.auto.service.AutoService; -import com.google.auto.value.AutoValue; import java.util.Arrays; import java.util.Collections; import 
java.util.List; -import java.util.stream.Collectors; -import javax.annotation.Nullable; -import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.Method; @@ -39,17 +35,13 @@ import org.apache.beam.sdk.io.gcp.bigquery.DynamicDestinations; import org.apache.beam.sdk.io.gcp.bigquery.TableDestination; import org.apache.beam.sdk.io.gcp.bigquery.WriteResult; -import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider.BigQueryWriteConfiguration; import org.apache.beam.sdk.metrics.Counter; import org.apache.beam.sdk.metrics.Metrics; -import org.apache.beam.sdk.schemas.AutoValueSchema; import org.apache.beam.sdk.schemas.NoSuchSchemaException; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.Schema.Field; import org.apache.beam.sdk.schemas.Schema.FieldType; import org.apache.beam.sdk.schemas.SchemaRegistry; -import org.apache.beam.sdk.schemas.annotations.DefaultSchema; -import org.apache.beam.sdk.schemas.annotations.SchemaFieldDescription; import org.apache.beam.sdk.schemas.transforms.SchemaTransform; import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; @@ -63,7 +55,6 @@ import org.apache.beam.sdk.values.TypeDescriptors; import org.apache.beam.sdk.values.ValueInSingleWindow; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; import org.joda.time.Duration; @@ -88,7 +79,6 @@ public class BigQueryStorageWriteApiSchemaTransformProvider private static final String FAILED_ROWS_TAG = "FailedRows"; private static final String FAILED_ROWS_WITH_ERRORS_TAG = "FailedRowsWithErrors"; // magic string that tells us to write to dynamic destinations - protected static final String DYNAMIC_DESTINATIONS = "DYNAMIC_DESTINATIONS"; @Override protected SchemaTransform from(BigQueryWriteConfiguration configuration) { @@ -120,166 +110,6 @@ public List outputCollectionNames() { return Arrays.asList(FAILED_ROWS_TAG, FAILED_ROWS_WITH_ERRORS_TAG, "errors"); } - /** Configuration for writing to BigQuery with Storage Write API. 
*/ - @DefaultSchema(AutoValueSchema.class) - @AutoValue - public abstract static class BigQueryWriteConfiguration { - @AutoValue - public abstract static class ErrorHandling { - @SchemaFieldDescription("The name of the output PCollection containing failed writes.") - public abstract String getOutput(); - - public static Builder builder() { - return new AutoValue_BigQueryStorageWriteApiSchemaTransformProvider_BigQueryWriteConfiguration_ErrorHandling - .Builder(); - } - - @AutoValue.Builder - public abstract static class Builder { - public abstract Builder setOutput(String output); - - public abstract ErrorHandling build(); - } - } - - public void validate() { - String invalidConfigMessage = "Invalid BigQuery Storage Write configuration: "; - - // validate output table spec - checkArgument( - !Strings.isNullOrEmpty(this.getTable()), - invalidConfigMessage + "Table spec for a BigQuery Write must be specified."); - - // if we have an input table spec, validate it - if (!this.getTable().equals(DYNAMIC_DESTINATIONS)) { - checkNotNull(BigQueryHelpers.parseTableSpec(this.getTable())); - } - - // validate create and write dispositions - if (!Strings.isNullOrEmpty(getCreateDisposition())) { - List createDispositions = - Arrays.stream(CreateDisposition.values()) - .map(c -> c.name()) - .collect(Collectors.toList()); - Preconditions.checkArgument( - createDispositions.contains(getCreateDisposition().toUpperCase()), - "Invalid create disposition (%s) was specified. Available dispositions are: %s", - getCreateDisposition(), - createDispositions); - } - if (!Strings.isNullOrEmpty(getWriteDisposition())) { - List writeDispostions = - Arrays.stream(WriteDisposition.values()) - .map(w -> w.name()) - .collect(Collectors.toList()); - Preconditions.checkArgument( - writeDispostions.contains(getWriteDisposition().toUpperCase()), - "Invalid write disposition (%s) was specified. Available dispositions are: %s", - getWriteDisposition(), - writeDispostions); - } - - if (this.getErrorHandling() != null) { - checkArgument( - !Strings.isNullOrEmpty(this.getErrorHandling().getOutput()), - invalidConfigMessage + "Output must not be empty if error handling specified."); - } - - if (this.getAutoSharding() != null - && this.getAutoSharding() - && this.getNumStreams() != null) { - checkArgument( - this.getNumStreams() == 0, - invalidConfigMessage - + "Cannot set a fixed number of streams when auto-sharding is enabled. Please pick only one of the two options."); - } - } - - /** Instantiates a {@link BigQueryWriteConfiguration.Builder} instance. */ - public static Builder builder() { - return new AutoValue_BigQueryStorageWriteApiSchemaTransformProvider_BigQueryWriteConfiguration - .Builder(); - } - - @SchemaFieldDescription( - "The bigquery table to write to. Format: [${PROJECT}:]${DATASET}.${TABLE}") - public abstract String getTable(); - - @SchemaFieldDescription( - "Optional field that specifies whether the job is allowed to create new tables. " - + "The following values are supported: CREATE_IF_NEEDED (the job may create the table), CREATE_NEVER (" - + "the job must fail if the table does not exist already).") - @Nullable - public abstract String getCreateDisposition(); - - @SchemaFieldDescription( - "Specifies the action that occurs if the destination table already exists. 
" - + "The following values are supported: " - + "WRITE_TRUNCATE (overwrites the table data), " - + "WRITE_APPEND (append the data to the table), " - + "WRITE_EMPTY (job must fail if the table is not empty).") - @Nullable - public abstract String getWriteDisposition(); - - @SchemaFieldDescription( - "Determines how often to 'commit' progress into BigQuery. Default is every 5 seconds.") - @Nullable - public abstract Long getTriggeringFrequencySeconds(); - - @SchemaFieldDescription( - "This option enables lower latency for insertions to BigQuery but may ocassionally " - + "duplicate data elements.") - @Nullable - public abstract Boolean getUseAtLeastOnceSemantics(); - - @SchemaFieldDescription( - "This option enables using a dynamically determined number of Storage Write API streams to write to " - + "BigQuery. Only applicable to unbounded data.") - @Nullable - public abstract Boolean getAutoSharding(); - - @SchemaFieldDescription( - "Specifies the number of write streams that the Storage API sink will use. " - + "This parameter is only applicable when writing unbounded data.") - @Nullable - public abstract Integer getNumStreams(); - - @SchemaFieldDescription("Use this Cloud KMS key to encrypt your data") - @Nullable - public abstract String getKmsKey(); - - @SchemaFieldDescription("This option specifies whether and where to output unwritable rows.") - @Nullable - public abstract ErrorHandling getErrorHandling(); - - /** Builder for {@link BigQueryWriteConfiguration}. */ - @AutoValue.Builder - public abstract static class Builder { - - public abstract Builder setTable(String table); - - public abstract Builder setCreateDisposition(String createDisposition); - - public abstract Builder setWriteDisposition(String writeDisposition); - - public abstract Builder setTriggeringFrequencySeconds(Long seconds); - - public abstract Builder setUseAtLeastOnceSemantics(Boolean use); - - public abstract Builder setAutoSharding(Boolean autoSharding); - - public abstract Builder setNumStreams(Integer numStreams); - - public abstract Builder setKmsKey(String kmsKey); - - public abstract Builder setErrorHandling(ErrorHandling errorHandling); - - /** Builds a {@link BigQueryWriteConfiguration} instance. */ - public abstract BigQueryStorageWriteApiSchemaTransformProvider.BigQueryWriteConfiguration - build(); - } - } - /** * A {@link SchemaTransform} for BigQuery Storage Write API, configured with {@link * BigQueryWriteConfiguration} and instantiated by {@link diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteConfiguration.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteConfiguration.java new file mode 100644 index 000000000000..41243b8ebd62 --- /dev/null +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteConfiguration.java @@ -0,0 +1,196 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.gcp.bigquery.providers; + +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkNotNull; + +import com.google.auto.value.AutoValue; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import javax.annotation.Nullable; +import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers; +import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; +import org.apache.beam.sdk.schemas.AutoValueSchema; +import org.apache.beam.sdk.schemas.annotations.DefaultSchema; +import org.apache.beam.sdk.schemas.annotations.SchemaFieldDescription; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; + +/** Configuration for writing to BigQuery with Storage Write API. */ +@DefaultSchema(AutoValueSchema.class) +@AutoValue +public abstract class BigQueryWriteConfiguration { + protected static final String DYNAMIC_DESTINATIONS = "DYNAMIC_DESTINATIONS"; + + @AutoValue + public abstract static class ErrorHandling { + @SchemaFieldDescription("The name of the output PCollection containing failed writes.") + public abstract String getOutput(); + + public static Builder builder() { + return new AutoValue_BigQueryWriteConfiguration_ErrorHandling.Builder(); + } + + @AutoValue.Builder + public abstract static class Builder { + public abstract Builder setOutput(String output); + + public abstract ErrorHandling build(); + } + } + + public void validate() { + String invalidConfigMessage = "Invalid BigQuery Storage Write configuration: "; + + // validate output table spec + checkArgument( + !Strings.isNullOrEmpty(this.getTable()), + invalidConfigMessage + "Table spec for a BigQuery Write must be specified."); + + // if we have an input table spec, validate it + if (!this.getTable().equals(DYNAMIC_DESTINATIONS)) { + checkNotNull(BigQueryHelpers.parseTableSpec(this.getTable())); + } + + // validate create and write dispositions + String createDisposition = getCreateDisposition(); + if (createDisposition != null && !createDisposition.isEmpty()) { + List createDispositions = + Arrays.stream(BigQueryIO.Write.CreateDisposition.values()) + .map(c -> c.name()) + .collect(Collectors.toList()); + Preconditions.checkArgument( + createDispositions.contains(createDisposition.toUpperCase()), + "Invalid create disposition (%s) was specified. Available dispositions are: %s", + createDisposition, + createDispositions); + } + String writeDisposition = getWriteDisposition(); + if (writeDisposition != null && !writeDisposition.isEmpty()) { + List writeDispostions = + Arrays.stream(BigQueryIO.Write.WriteDisposition.values()) + .map(w -> w.name()) + .collect(Collectors.toList()); + Preconditions.checkArgument( + writeDispostions.contains(writeDisposition.toUpperCase()), + "Invalid write disposition (%s) was specified. 
Available dispositions are: %s", + writeDisposition, + writeDispostions); + } + + ErrorHandling errorHandling = getErrorHandling(); + if (errorHandling != null) { + checkArgument( + !Strings.isNullOrEmpty(errorHandling.getOutput()), + invalidConfigMessage + "Output must not be empty if error handling specified."); + } + + Boolean autoSharding = getAutoSharding(); + Integer numStreams = getNumStreams(); + if (autoSharding != null && autoSharding && numStreams != null) { + checkArgument( + numStreams == 0, + invalidConfigMessage + + "Cannot set a fixed number of streams when auto-sharding is enabled. Please pick only one of the two options."); + } + } + + /** Instantiates a {@link BigQueryWriteConfiguration.Builder} instance. */ + public static Builder builder() { + return new AutoValue_BigQueryWriteConfiguration.Builder(); + } + + @SchemaFieldDescription( + "The bigquery table to write to. Format: [${PROJECT}:]${DATASET}.${TABLE}") + public abstract String getTable(); + + @SchemaFieldDescription( + "Optional field that specifies whether the job is allowed to create new tables. " + + "The following values are supported: CREATE_IF_NEEDED (the job may create the table), CREATE_NEVER (" + + "the job must fail if the table does not exist already).") + @Nullable + public abstract String getCreateDisposition(); + + @SchemaFieldDescription( + "Specifies the action that occurs if the destination table already exists. " + + "The following values are supported: " + + "WRITE_TRUNCATE (overwrites the table data), " + + "WRITE_APPEND (append the data to the table), " + + "WRITE_EMPTY (job must fail if the table is not empty).") + @Nullable + public abstract String getWriteDisposition(); + + @SchemaFieldDescription( + "Determines how often to 'commit' progress into BigQuery. Default is every 5 seconds.") + @Nullable + public abstract Long getTriggeringFrequencySeconds(); + + @SchemaFieldDescription( + "This option enables lower latency for insertions to BigQuery but may ocassionally " + + "duplicate data elements.") + @Nullable + public abstract Boolean getUseAtLeastOnceSemantics(); + + @SchemaFieldDescription( + "This option enables using a dynamically determined number of Storage Write API streams to write to " + + "BigQuery. Only applicable to unbounded data.") + @Nullable + public abstract Boolean getAutoSharding(); + + @SchemaFieldDescription( + "Specifies the number of write streams that the Storage API sink will use. " + + "This parameter is only applicable when writing unbounded data.") + @Nullable + public abstract Integer getNumStreams(); + + @SchemaFieldDescription("Use this Cloud KMS key to encrypt your data") + @Nullable + public abstract String getKmsKey(); + + @SchemaFieldDescription("This option specifies whether and where to output unwritable rows.") + @Nullable + public abstract ErrorHandling getErrorHandling(); + + /** Builder for {@link BigQueryWriteConfiguration}. 
*/ + @AutoValue.Builder + public abstract static class Builder { + + public abstract Builder setTable(String table); + + public abstract Builder setCreateDisposition(String createDisposition); + + public abstract Builder setWriteDisposition(String writeDisposition); + + public abstract Builder setTriggeringFrequencySeconds(Long seconds); + + public abstract Builder setUseAtLeastOnceSemantics(Boolean use); + + public abstract Builder setAutoSharding(Boolean autoSharding); + + public abstract Builder setNumStreams(Integer numStreams); + + public abstract Builder setKmsKey(String kmsKey); + + public abstract Builder setErrorHandling(ErrorHandling errorHandling); + + /** Builds a {@link BigQueryWriteConfiguration} instance. */ + public abstract BigQueryWriteConfiguration build(); + } +} diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProviderTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProviderTest.java index 4d6596fa7bbc..1e4791b94e1c 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProviderTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProviderTest.java @@ -17,7 +17,6 @@ */ package org.apache.beam.sdk.io.gcp.bigquery; -import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider.BigQueryWriteConfiguration; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; @@ -28,6 +27,7 @@ import org.apache.beam.sdk.io.gcp.bigquery.BigQueryFileLoadsWriteSchemaTransformProvider.BigQueryWriteSchemaTransform; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; +import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryWriteConfiguration; import org.apache.beam.sdk.io.gcp.testing.FakeBigQueryServices; import org.apache.beam.sdk.io.gcp.testing.FakeDatasetService; import org.apache.beam.sdk.io.gcp.testing.FakeJobService; diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProviderTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProviderTest.java index 382ef016213a..a421aa648e8a 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProviderTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProviderTest.java @@ -33,7 +33,6 @@ import org.apache.beam.sdk.PipelineResult; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers; import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider.BigQueryStorageWriteApiSchemaTransform; -import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider.BigQueryWriteConfiguration; import org.apache.beam.sdk.io.gcp.testing.FakeBigQueryServices; import org.apache.beam.sdk.io.gcp.testing.FakeDatasetService; import org.apache.beam.sdk.io.gcp.testing.FakeJobService; @@ -183,7 +182,7 @@ public void 
testSimpleWrite() throws Exception { @Test public void testWriteToDynamicDestinations() throws Exception { - String dynamic = BigQueryStorageWriteApiSchemaTransformProvider.DYNAMIC_DESTINATIONS; + String dynamic = BigQueryWriteConfiguration.DYNAMIC_DESTINATIONS; BigQueryWriteConfiguration config = BigQueryWriteConfiguration.builder().setTable(dynamic).build(); From f9be86c471a727400579f4dc1fa4c8141cc6495c Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Tue, 6 Aug 2024 12:46:44 -0400 Subject: [PATCH 14/24] fix doc --- .../gcp/bigquery/providers/BigQueryWriteConfiguration.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteConfiguration.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteConfiguration.java index 41243b8ebd62..b674d5b2056a 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteConfiguration.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteConfiguration.java @@ -33,7 +33,11 @@ import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; -/** Configuration for writing to BigQuery with Storage Write API. */ +/** + * Configuration for writing to BigQuery with SchemaTransforms. Used by {@link + * BigQueryStorageWriteApiSchemaTransformProvider} and {@link + * org.apache.beam.sdk.io.gcp.bigquery.BigQueryFileLoadsWriteSchemaTransformProvider}. + */ @DefaultSchema(AutoValueSchema.class) @AutoValue public abstract class BigQueryWriteConfiguration { From a26765efb4927f09c0aa9b1e74e132304a502d71 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Sat, 26 Oct 2024 02:18:50 +0300 Subject: [PATCH 15/24] resolve after syncing to HEAD --- .../beam_PostCommit_Java_DataflowV2.json | 3 +- .../pipeline/v1/external_transforms.proto | 4 + ...ueryDirectReadSchemaTransformProvider.java | 2 +- ...torageWriteApiSchemaTransformProvider.java | 190 +----------------- .../providers/BigQueryWriteConfiguration.java | 18 ++ ...geWriteApiSchemaTransformProviderTest.java | 10 +- 6 files changed, 31 insertions(+), 196 deletions(-) diff --git a/.github/trigger_files/beam_PostCommit_Java_DataflowV2.json b/.github/trigger_files/beam_PostCommit_Java_DataflowV2.json index a03c067d2c4e..1efc8e9e4405 100644 --- a/.github/trigger_files/beam_PostCommit_Java_DataflowV2.json +++ b/.github/trigger_files/beam_PostCommit_Java_DataflowV2.json @@ -1,3 +1,4 @@ { - "comment": "Modify this file in a trivial way to cause this test suite to run" + "comment": "Modify this file in a trivial way to cause this test suite to run", + "modification": 1 } diff --git a/model/pipeline/src/main/proto/org/apache/beam/model/pipeline/v1/external_transforms.proto b/model/pipeline/src/main/proto/org/apache/beam/model/pipeline/v1/external_transforms.proto index b03350966d6c..e7990e2f1085 100644 --- a/model/pipeline/src/main/proto/org/apache/beam/model/pipeline/v1/external_transforms.proto +++ b/model/pipeline/src/main/proto/org/apache/beam/model/pipeline/v1/external_transforms.proto @@ -70,6 +70,10 @@ message ManagedTransforms { "beam:schematransform:org.apache.beam:kafka_read:v1"]; KAFKA_WRITE = 3 [(org.apache.beam.model.pipeline.v1.beam_urn) = "beam:schematransform:org.apache.beam:kafka_write:v1"]; + BIGQUERY_READ 
= 4 [(org.apache.beam.model.pipeline.v1.beam_urn) = + "beam:schematransform:org.apache.beam:bigquery_storage_read:v1"]; + BIGQUERY_WRITE = 5 [(org.apache.beam.model.pipeline.v1.beam_urn) = + "beam:schematransform:org.apache.beam:bigquery_storage_write:v2"]; } } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryDirectReadSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryDirectReadSchemaTransformProvider.java index 7a443a3f402d..76ba186d2e82 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryDirectReadSchemaTransformProvider.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryDirectReadSchemaTransformProvider.java @@ -78,7 +78,7 @@ protected SchemaTransform from(BigQueryDirectReadSchemaTransformConfiguration co @Override public String identifier() { - return "beam:schematransform:org.apache.beam:bigquery_storage_read:v1"; + return "beam:schematransform:org.apache.beam:bigquery_storage_read:v1"; // getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_READ); } @Override diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java index 73407606cd10..403e13264e20 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java @@ -26,10 +26,7 @@ import java.util.Arrays; import java.util.Collections; import java.util.List; -import java.util.Map; import java.util.Optional; -import javax.annotation.Nullable; -import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.Method; @@ -62,7 +59,6 @@ import org.apache.beam.sdk.values.ValueInSingleWindow; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.joda.time.Duration; /** @@ -86,7 +82,6 @@ public class BigQueryStorageWriteApiSchemaTransformProvider private static final String FAILED_ROWS_TAG = "FailedRows"; private static final String FAILED_ROWS_WITH_ERRORS_TAG = "FailedRowsWithErrors"; // magic string that tells us to write to dynamic destinations - protected static final String DYNAMIC_DESTINATIONS = "DYNAMIC_DESTINATIONS"; protected static final String ROW_PROPERTY_MUTATION_INFO = "row_mutation_info"; protected static final String ROW_PROPERTY_MUTATION_TYPE = "mutation_type"; protected static final String ROW_PROPERTY_MUTATION_SQN = "change_sequence_number"; @@ -103,7 +98,7 @@ protected SchemaTransform from(BigQueryWriteConfiguration configuration) { @Override public String identifier() { - return 
String.format("beam:schematransform:org.apache.beam:bigquery_storage_write:v2"); + return "beam:schematransform:org.apache.beam:bigquery_storage_write:v2"; // getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_WRITE); } @Override @@ -126,189 +121,6 @@ public List outputCollectionNames() { return Arrays.asList(FAILED_ROWS_TAG, FAILED_ROWS_WITH_ERRORS_TAG, "errors"); } - /** Configuration for writing to BigQuery with Storage Write API. */ - @DefaultSchema(AutoValueSchema.class) - @AutoValue - public abstract static class BigQueryStorageWriteApiSchemaTransformConfiguration { - - static final Map CREATE_DISPOSITIONS = - ImmutableMap.builder() - .put(CreateDisposition.CREATE_IF_NEEDED.name(), CreateDisposition.CREATE_IF_NEEDED) - .put(CreateDisposition.CREATE_NEVER.name(), CreateDisposition.CREATE_NEVER) - .build(); - - static final Map WRITE_DISPOSITIONS = - ImmutableMap.builder() - .put(WriteDisposition.WRITE_TRUNCATE.name(), WriteDisposition.WRITE_TRUNCATE) - .put(WriteDisposition.WRITE_EMPTY.name(), WriteDisposition.WRITE_EMPTY) - .put(WriteDisposition.WRITE_APPEND.name(), WriteDisposition.WRITE_APPEND) - .build(); - - @AutoValue - public abstract static class ErrorHandling { - @SchemaFieldDescription("The name of the output PCollection containing failed writes.") - public abstract String getOutput(); - - public static Builder builder() { - return new AutoValue_BigQueryStorageWriteApiSchemaTransformProvider_BigQueryStorageWriteApiSchemaTransformConfiguration_ErrorHandling - .Builder(); - } - - @AutoValue.Builder - public abstract static class Builder { - public abstract Builder setOutput(String output); - - public abstract ErrorHandling build(); - } - } - - public void validate() { - String invalidConfigMessage = "Invalid BigQuery Storage Write configuration: "; - - // validate output table spec - checkArgument( - !Strings.isNullOrEmpty(this.getTable()), - invalidConfigMessage + "Table spec for a BigQuery Write must be specified."); - - // if we have an input table spec, validate it - if (!this.getTable().equals(DYNAMIC_DESTINATIONS)) { - checkNotNull(BigQueryHelpers.parseTableSpec(this.getTable())); - } - - // validate create and write dispositions - if (!Strings.isNullOrEmpty(this.getCreateDisposition())) { - checkNotNull( - CREATE_DISPOSITIONS.get(this.getCreateDisposition().toUpperCase()), - invalidConfigMessage - + "Invalid create disposition (%s) was specified. Available dispositions are: %s", - this.getCreateDisposition(), - CREATE_DISPOSITIONS.keySet()); - } - if (!Strings.isNullOrEmpty(this.getWriteDisposition())) { - checkNotNull( - WRITE_DISPOSITIONS.get(this.getWriteDisposition().toUpperCase()), - invalidConfigMessage - + "Invalid write disposition (%s) was specified. Available dispositions are: %s", - this.getWriteDisposition(), - WRITE_DISPOSITIONS.keySet()); - } - - if (this.getErrorHandling() != null) { - checkArgument( - !Strings.isNullOrEmpty(this.getErrorHandling().getOutput()), - invalidConfigMessage + "Output must not be empty if error handling specified."); - } - - if (this.getAutoSharding() != null - && this.getAutoSharding() - && this.getNumStreams() != null) { - checkArgument( - this.getNumStreams() == 0, - invalidConfigMessage - + "Cannot set a fixed number of streams when auto-sharding is enabled. Please pick only one of the two options."); - } - } - - /** - * Instantiates a {@link BigQueryStorageWriteApiSchemaTransformConfiguration.Builder} instance. 
- */ - public static Builder builder() { - return new AutoValue_BigQueryStorageWriteApiSchemaTransformProvider_BigQueryStorageWriteApiSchemaTransformConfiguration - .Builder(); - } - - @SchemaFieldDescription( - "The bigquery table to write to. Format: [${PROJECT}:]${DATASET}.${TABLE}") - public abstract String getTable(); - - @SchemaFieldDescription( - "Optional field that specifies whether the job is allowed to create new tables. " - + "The following values are supported: CREATE_IF_NEEDED (the job may create the table), CREATE_NEVER (" - + "the job must fail if the table does not exist already).") - @Nullable - public abstract String getCreateDisposition(); - - @SchemaFieldDescription( - "Specifies the action that occurs if the destination table already exists. " - + "The following values are supported: " - + "WRITE_TRUNCATE (overwrites the table data), " - + "WRITE_APPEND (append the data to the table), " - + "WRITE_EMPTY (job must fail if the table is not empty).") - @Nullable - public abstract String getWriteDisposition(); - - @SchemaFieldDescription( - "Determines how often to 'commit' progress into BigQuery. Default is every 5 seconds.") - @Nullable - public abstract Long getTriggeringFrequencySeconds(); - - @SchemaFieldDescription( - "This option enables lower latency for insertions to BigQuery but may ocassionally " - + "duplicate data elements.") - @Nullable - public abstract Boolean getUseAtLeastOnceSemantics(); - - @SchemaFieldDescription( - "This option enables using a dynamically determined number of Storage Write API streams to write to " - + "BigQuery. Only applicable to unbounded data.") - @Nullable - public abstract Boolean getAutoSharding(); - - @SchemaFieldDescription( - "Specifies the number of write streams that the Storage API sink will use. " - + "This parameter is only applicable when writing unbounded data.") - @Nullable - public abstract Integer getNumStreams(); - - @SchemaFieldDescription("This option specifies whether and where to output unwritable rows.") - @Nullable - public abstract ErrorHandling getErrorHandling(); - - @SchemaFieldDescription( - "This option enables the use of BigQuery CDC functionality. The expected PCollection" - + " should contain Beam Rows with a schema wrapping the record to be inserted and" - + " adding the CDC info similar to: {row_mutation_info: {mutation_type:\"...\", " - + "change_sequence_number:\"...\"}, record: {...}}") - @Nullable - public abstract Boolean getUseCdcWrites(); - - @SchemaFieldDescription( - "If CREATE_IF_NEEDED disposition is set, BigQuery table(s) will be created with this" - + " columns as primary key. Required when CDC writes are enabled with CREATE_IF_NEEDED.") - @Nullable - public abstract List getPrimaryKey(); - - /** Builder for {@link BigQueryStorageWriteApiSchemaTransformConfiguration}. 
*/ - @AutoValue.Builder - public abstract static class Builder { - - public abstract Builder setTable(String table); - - public abstract Builder setCreateDisposition(String createDisposition); - - public abstract Builder setWriteDisposition(String writeDisposition); - - public abstract Builder setTriggeringFrequencySeconds(Long seconds); - - public abstract Builder setUseAtLeastOnceSemantics(Boolean use); - - public abstract Builder setAutoSharding(Boolean autoSharding); - - public abstract Builder setNumStreams(Integer numStreams); - - public abstract Builder setErrorHandling(ErrorHandling errorHandling); - - public abstract Builder setUseCdcWrites(Boolean cdcWrites); - - public abstract Builder setPrimaryKey(List pkColumns); - - /** Builds a {@link BigQueryStorageWriteApiSchemaTransformConfiguration} instance. */ - public abstract BigQueryStorageWriteApiSchemaTransformProvider - .BigQueryStorageWriteApiSchemaTransformConfiguration - build(); - } - } - /** * A {@link SchemaTransform} for BigQuery Storage Write API, configured with {@link * BigQueryWriteConfiguration} and instantiated by {@link diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteConfiguration.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteConfiguration.java index b674d5b2056a..acc5b1ff6ea4 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteConfiguration.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteConfiguration.java @@ -172,6 +172,20 @@ public static Builder builder() { @Nullable public abstract ErrorHandling getErrorHandling(); + @SchemaFieldDescription( + "This option enables the use of BigQuery CDC functionality. The expected PCollection" + + " should contain Beam Rows with a schema wrapping the record to be inserted and" + + " adding the CDC info similar to: {row_mutation_info: {mutation_type:\"...\", " + + "change_sequence_number:\"...\"}, record: {...}}") + @Nullable + public abstract Boolean getUseCdcWrites(); + + @SchemaFieldDescription( + "If CREATE_IF_NEEDED disposition is set, BigQuery table(s) will be created with this" + + " columns as primary key. Required when CDC writes are enabled with CREATE_IF_NEEDED.") + @Nullable + public abstract List getPrimaryKey(); + /** Builder for {@link BigQueryWriteConfiguration}. */ @AutoValue.Builder public abstract static class Builder { @@ -194,6 +208,10 @@ public abstract static class Builder { public abstract Builder setErrorHandling(ErrorHandling errorHandling); + public abstract Builder setUseCdcWrites(Boolean cdcWrites); + + public abstract Builder setPrimaryKey(List pkColumns); + /** Builds a {@link BigQueryWriteConfiguration} instance. 
*/ public abstract BigQueryWriteConfiguration build(); } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProviderTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProviderTest.java index d90560c6cdaf..3a23f5a3205a 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProviderTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProviderTest.java @@ -269,8 +269,8 @@ public void testCDCWrites() throws Exception { String tableSpec = "project:dataset.cdc_write"; List primaryKeyColumns = ImmutableList.of("name"); - BigQueryStorageWriteApiSchemaTransformConfiguration config = - BigQueryStorageWriteApiSchemaTransformConfiguration.builder() + BigQueryWriteConfiguration config = + BigQueryWriteConfiguration.builder() .setUseAtLeastOnceSemantics(true) .setTable(tableSpec) .setUseCdcWrites(true) @@ -300,9 +300,9 @@ public void testCDCWrites() throws Exception { @Test public void testCDCWriteToDynamicDestinations() throws Exception { List primaryKeyColumns = ImmutableList.of("name"); - String dynamic = BigQueryStorageWriteApiSchemaTransformProvider.DYNAMIC_DESTINATIONS; - BigQueryStorageWriteApiSchemaTransformConfiguration config = - BigQueryStorageWriteApiSchemaTransformConfiguration.builder() + String dynamic = BigQueryWriteConfiguration.DYNAMIC_DESTINATIONS; + BigQueryWriteConfiguration config = + BigQueryWriteConfiguration.builder() .setUseAtLeastOnceSemantics(true) .setTable(dynamic) .setUseCdcWrites(true) From 725f7bd6adaffa2569533d6fb383fcde076b7b9a Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Sat, 26 Oct 2024 07:52:08 +0300 Subject: [PATCH 16/24] spotless --- .../beam/sdk/managed/ManagedTransformConstants.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java index 5ab2ee18b80c..30476a30d373 100644 --- a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java +++ b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java @@ -66,7 +66,11 @@ public class ManagedTransformConstants { ImmutableMap.>builder() .put(getUrn(ExternalTransforms.ManagedTransforms.Urns.KAFKA_READ), KAFKA_READ_MAPPINGS) .put(getUrn(ExternalTransforms.ManagedTransforms.Urns.KAFKA_WRITE), KAFKA_WRITE_MAPPINGS) - .put(getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_READ), BIGQUERY_READ_MAPPINGS) - .put(getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_WRITE), BIGQUERY_WRITE_MAPPINGS) + .put( + getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_READ), + BIGQUERY_READ_MAPPINGS) + .put( + getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_WRITE), + BIGQUERY_WRITE_MAPPINGS) .build(); } From 2631104cf8d8ad69abb45afc2f3a113ad539e96b Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Tue, 5 Nov 2024 07:06:57 +0300 Subject: [PATCH 17/24] fork on batch/streaming --- ...am_PostCommit_Python_Xlang_Gcp_Direct.json | 2 +- .../pipeline/v1/external_transforms.proto | 4 +- ...FileLoadsWriteSchemaTransformProvider.java | 7 +- 
...ueryDirectReadSchemaTransformProvider.java | 4 +- ...torageWriteApiSchemaTransformProvider.java | 4 +- .../io/gcp/bigquery/BigQueryManagedIT.java | 67 +++++++++++----- .../managed/expansion-service/build.gradle | 61 +++++++++++++++ .../expansion-service/container/build.gradle | 19 +++++ .../org/apache/beam/sdk/managed/Managed.java | 15 +++- .../ManagedSchemaTransformProvider.java | 76 +++++++++++-------- .../managed/ManagedTransformConstants.java | 2 +- .../ManagedSchemaTransformProviderTest.java | 34 +++++++++ 12 files changed, 235 insertions(+), 60 deletions(-) create mode 100644 sdks/java/managed/expansion-service/build.gradle create mode 100644 sdks/java/managed/expansion-service/container/build.gradle diff --git a/.github/trigger_files/beam_PostCommit_Python_Xlang_Gcp_Direct.json b/.github/trigger_files/beam_PostCommit_Python_Xlang_Gcp_Direct.json index b26833333238..e3d6056a5de9 100644 --- a/.github/trigger_files/beam_PostCommit_Python_Xlang_Gcp_Direct.json +++ b/.github/trigger_files/beam_PostCommit_Python_Xlang_Gcp_Direct.json @@ -1,4 +1,4 @@ { "comment": "Modify this file in a trivial way to cause this test suite to run", - "modification": 2 + "modification": 1 } diff --git a/model/pipeline/src/main/proto/org/apache/beam/model/pipeline/v1/external_transforms.proto b/model/pipeline/src/main/proto/org/apache/beam/model/pipeline/v1/external_transforms.proto index e7990e2f1085..2d8356614ce7 100644 --- a/model/pipeline/src/main/proto/org/apache/beam/model/pipeline/v1/external_transforms.proto +++ b/model/pipeline/src/main/proto/org/apache/beam/model/pipeline/v1/external_transforms.proto @@ -72,8 +72,10 @@ message ManagedTransforms { "beam:schematransform:org.apache.beam:kafka_write:v1"]; BIGQUERY_READ = 4 [(org.apache.beam.model.pipeline.v1.beam_urn) = "beam:schematransform:org.apache.beam:bigquery_storage_read:v1"]; - BIGQUERY_WRITE = 5 [(org.apache.beam.model.pipeline.v1.beam_urn) = + BIGQUERY_STORAGE_WRITE = 5 [(org.apache.beam.model.pipeline.v1.beam_urn) = "beam:schematransform:org.apache.beam:bigquery_storage_write:v2"]; + BIGQUERY_FILE_LOADS = 6 [(org.apache.beam.model.pipeline.v1.beam_urn) = + "beam:schematransform:org.apache.beam:bigquery_fileloads:v1"]; } } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProvider.java index 7c89cb09ef6d..7fdcabf5c695 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProvider.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProvider.java @@ -17,9 +17,12 @@ */ package org.apache.beam.sdk.io.gcp.bigquery; +import static org.apache.beam.sdk.util.construction.BeamUrns.getUrn; + import com.google.auto.service.AutoService; import java.util.Collections; import java.util.List; +import org.apache.beam.model.pipeline.v1.ExternalTransforms; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; @@ -49,8 +52,6 @@ public class BigQueryFileLoadsWriteSchemaTransformProvider extends TypedSchemaTransformProvider { - private static final String IDENTIFIER = - "beam:schematransform:org.apache.beam:bigquery_fileloads:v1"; 
static final String INPUT_TAG = "input"; @Override @@ -60,7 +61,7 @@ protected SchemaTransform from(BigQueryWriteConfiguration configuration) { @Override public String identifier() { - return IDENTIFIER; + return getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_FILE_LOADS); } @Override diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryDirectReadSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryDirectReadSchemaTransformProvider.java index 76ba186d2e82..15b1b01d7f6c 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryDirectReadSchemaTransformProvider.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryDirectReadSchemaTransformProvider.java @@ -17,6 +17,7 @@ */ package org.apache.beam.sdk.io.gcp.bigquery.providers; +import static org.apache.beam.sdk.util.construction.BeamUrns.getUrn; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkNotNull; @@ -26,6 +27,7 @@ import java.util.Collections; import java.util.List; import javax.annotation.Nullable; +import org.apache.beam.model.pipeline.v1.ExternalTransforms; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.TypedRead; @@ -78,7 +80,7 @@ protected SchemaTransform from(BigQueryDirectReadSchemaTransformConfiguration co @Override public String identifier() { - return "beam:schematransform:org.apache.beam:bigquery_storage_read:v1"; // getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_READ); + return getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_READ); } @Override diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java index 403e13264e20..eba9dd61d510 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java @@ -18,6 +18,7 @@ package org.apache.beam.sdk.io.gcp.bigquery.providers; import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryWriteConfiguration.DYNAMIC_DESTINATIONS; +import static org.apache.beam.sdk.util.construction.BeamUrns.getUrn; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; import com.google.api.services.bigquery.model.TableConstraints; @@ -27,6 +28,7 @@ import java.util.Collections; import java.util.List; import java.util.Optional; +import org.apache.beam.model.pipeline.v1.ExternalTransforms; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.Method; @@ -98,7 +100,7 @@ protected SchemaTransform from(BigQueryWriteConfiguration configuration) { @Override public 
String identifier() { - return "beam:schematransform:org.apache.beam:bigquery_storage_write:v2"; // getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_WRITE); + return getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_STORAGE_WRITE); } @Override diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java index a39e27bf7d85..1ae97aaeee2e 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java @@ -24,12 +24,12 @@ import java.util.stream.LongStream; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; -import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryDirectReadSchemaTransformProvider; import org.apache.beam.sdk.io.gcp.testing.BigqueryClient; import org.apache.beam.sdk.managed.Managed; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.testing.PAssert; import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.testing.TestPipelineOptions; import org.apache.beam.sdk.transforms.Create; import org.apache.beam.sdk.transforms.MapElements; import org.apache.beam.sdk.transforms.PeriodicImpulse; @@ -39,15 +39,22 @@ import org.apache.beam.sdk.values.TypeDescriptors; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.joda.time.Duration; +import org.joda.time.Instant; import org.junit.AfterClass; import org.junit.BeforeClass; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.TestName; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; /** This class tests the execution of {@link Managed} BigQueryIO. 
*/ @RunWith(JUnit4.class) public class BigQueryManagedIT { + @Rule public TestName testName = new TestName(); + @Rule public transient TestPipeline writePipeline = TestPipeline.create(); + @Rule public transient TestPipeline readPipeline = TestPipeline.create(); + private static final Schema SCHEMA = Schema.of( Schema.Field.of("str", Schema.FieldType.STRING), @@ -79,34 +86,58 @@ public static void setUpTestEnvironment() throws IOException, InterruptedExcepti public static void cleanup() { BQ_CLIENT.deleteDataset(PROJECT, BIG_QUERY_DATASET_ID); } + @Test + public void testBatchFileLoadsWriteRead() { + String table = + String.format("%s:%s.%s", PROJECT, BIG_QUERY_DATASET_ID, testName.getMethodName()); + Map config = ImmutableMap.of("table", table); + + // file loads requires a GCS temp location + String tempLocation = writePipeline.getOptions().as(TestPipelineOptions.class).getTempRoot(); + writePipeline.getOptions().setTempLocation(tempLocation); + + // batch write + PCollectionRowTuple.of("input", getInput(writePipeline, false)) + .apply(Managed.write(Managed.BIGQUERY).withConfig(config)); + writePipeline.run().waitUntilFinish(); + + // read and validate + PCollection outputRows = + readPipeline + .apply(Managed.read(Managed.BIGQUERY).withConfig(config)) + .getSinglePCollection(); + PAssert.that(outputRows).containsInAnyOrder(ROWS); + + readPipeline.run().waitUntilFinish(); + } @Test public void testStreamingStorageWriteRead() { - String table = String.format("%s:%s.managed_storage_write_read", PROJECT, BIG_QUERY_DATASET_ID); - - Map writeConfig = - ImmutableMap.builder().put("table", table).build(); - Pipeline p = Pipeline.create(); - PCollectionRowTuple.of("input", getInput(p, true)) - .apply(Managed.write(Managed.BIGQUERY).withConfig(writeConfig)); - p.run().waitUntilFinish(); - - Map readConfig = - ImmutableMap.builder().put("table", table).build(); - Pipeline q = Pipeline.create(); + String table = + String.format("%s:%s.%s", PROJECT, BIG_QUERY_DATASET_ID, testName.getMethodName()); + Map config = ImmutableMap.of("table", table); + + // streaming write + PCollectionRowTuple.of("input", getInput(writePipeline, true)) + .apply(Managed.write(Managed.BIGQUERY).withConfig(config)); + writePipeline.run().waitUntilFinish(); + + // read and validate PCollection outputRows = - PCollectionRowTuple.empty(p) - .apply(Managed.read(Managed.BIGQUERY).withConfig(readConfig)) - .get(BigQueryDirectReadSchemaTransformProvider.OUTPUT_TAG); + readPipeline + .apply(Managed.read(Managed.BIGQUERY).withConfig(config)) + .getSinglePCollection(); PAssert.that(outputRows).containsInAnyOrder(ROWS); - q.run().waitUntilFinish(); + + readPipeline.run().waitUntilFinish(); } public PCollection getInput(Pipeline p, boolean isStreaming) { if (isStreaming) { return p.apply( PeriodicImpulse.create() - .stopAfter(Duration.millis(20)) + .startAt(new Instant(0)) + .stopAt(new Instant(19)) .withInterval(Duration.millis(1))) .apply( MapElements.into(TypeDescriptors.rows()) diff --git a/sdks/java/managed/expansion-service/build.gradle b/sdks/java/managed/expansion-service/build.gradle new file mode 100644 index 000000000000..ff78159fa5f7 --- /dev/null +++ b/sdks/java/managed/expansion-service/build.gradle @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +apply plugin: 'org.apache.beam.module' +apply plugin: 'application' +mainClassName = "org.apache.beam.sdk.expansion.service.ExpansionService" + +applyJavaNature( + automaticModuleName: 'org.apache.beam.sdk.managed.expansion.service', + exportJavadoc: false, + validateShadowJar: false, + shadowClosure: {}, +) + +// TODO(https://github.com/apache/beam/pull/32486/) Use library.java.kafka_clients once >=3.1.0 is set as default +configurations.runtimeClasspath { + // Pin kafka-clients version due to <3.1.0 missing auth callback classes + resolutionStrategy.force 'org.apache.kafka:kafka-clients:3.1.2' +} + +shadowJar { + mergeServiceFiles() + outputs.upToDateWhen { false } +} + +description = "Apache Beam :: SDKs :: Java :: Managed :: Expansion Service" +ext.summary = "Expansion service for Managed Transforms" + +dependencies { + runtimeOnly project(":sdks:java:expansion-service") + + // **** IcebergIO and dependencies **** + runtimeOnly project(":sdks:java:io:iceberg") + // Needed when writing to GCS + runtimeOnly library.java.bigdataoss_gcs_connector + runtimeOnly library.java.hadoop_client + // For HiveCatalog + runtimeOnly ("org.apache.iceberg:iceberg-hive-metastore:1.4.2") + runtimeOnly project(path: ":sdks:java:io:iceberg:hive:exec", configuration: "shadow") + + // **** KafkaIO and dependencies **** + runtimeOnly project(":sdks:java:io:kafka") + runtimeOnly library.java.kafka_clients + + runtimeOnly library.java.slf4j_jdk14 +} diff --git a/sdks/java/managed/expansion-service/container/build.gradle b/sdks/java/managed/expansion-service/container/build.gradle new file mode 100644 index 000000000000..556af1f4f3a9 --- /dev/null +++ b/sdks/java/managed/expansion-service/container/build.gradle @@ -0,0 +1,19 @@ +plugins { + id 'java' +} + +group = 'org.apache.beam.sdk.managed.expansion.service.container' +version = '2.61.0-SNAPSHOT' + +repositories { + mavenCentral() +} + +dependencies { + testImplementation platform('org.junit:junit-bom:5.9.1') + testImplementation 'org.junit.jupiter:junit-jupiter' +} + +test { + useJUnitPlatform() +} \ No newline at end of file diff --git a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/Managed.java b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/Managed.java index adc7fc7e2684..f6299a332ceb 100644 --- a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/Managed.java +++ b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/Managed.java @@ -99,7 +99,7 @@ public class Managed { ImmutableMap.builder() .put(ICEBERG, getUrn(ExternalTransforms.ManagedTransforms.Urns.ICEBERG_WRITE)) .put(KAFKA, getUrn(ExternalTransforms.ManagedTransforms.Urns.KAFKA_WRITE)) - .put(BIGQUERY, getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_WRITE)) + .put(BIGQUERY, getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_STORAGE_WRITE)) .build(); /** @@ -107,7 +107,9 @@ public class Managed { * supported managed sources are: * *

    - *
  <li>{@link Managed#ICEBERG} : Read from Apache Iceberg + *
  <li>{@link Managed#ICEBERG} : Read from Apache Iceberg tables + *
  <li>{@link Managed#KAFKA} : Read from Apache Kafka topics + *
  <li>{@link Managed#BIGQUERY} : Read from GCP BigQuery tables *
*/ public static ManagedTransform read(String source) { @@ -127,10 +129,15 @@ public static ManagedTransform read(String source) { * managed sinks are: * *
    - *
  <li>{@link Managed#ICEBERG} : Write to Apache Iceberg + *
  <li>{@link Managed#ICEBERG} : Write to Apache Iceberg tables + *
  <li>{@link Managed#KAFKA} : Write to Apache Kafka topics + *
  <li>{@link Managed#BIGQUERY} : Write to GCP BigQuery tables *
*/ public static ManagedTransform write(String sink) { + List supportedIdentifiers = new ArrayList<>(WRITE_TRANSFORMS.values()); + supportedIdentifiers.add(getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_FILE_LOADS)); + return new AutoValue_Managed_ManagedTransform.Builder() .setIdentifier( Preconditions.checkNotNull( @@ -138,7 +145,7 @@ public static ManagedTransform write(String sink) { "An unsupported sink was specified: '%s'. Please specify one of the following sinks: %s", sink, WRITE_TRANSFORMS.keySet())) - .setSupportedIdentifiers(new ArrayList<>(WRITE_TRANSFORMS.values())) + .setSupportedIdentifiers(supportedIdentifiers) .build(); } diff --git a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProvider.java b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProvider.java index 6ca883c96698..8bb4a6aedf5c 100644 --- a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProvider.java +++ b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProvider.java @@ -17,7 +17,10 @@ */ package org.apache.beam.sdk.managed; +import static org.apache.beam.sdk.managed.Managed.BIGQUERY; +import static org.apache.beam.sdk.managed.Managed.WRITE_TRANSFORMS; import static org.apache.beam.sdk.managed.ManagedTransformConstants.MAPPINGS; +import static org.apache.beam.sdk.util.construction.BeamUrns.getUrn; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; import com.google.auto.service.AutoService; @@ -32,6 +35,7 @@ import java.util.Map; import java.util.ServiceLoader; import javax.annotation.Nullable; +import org.apache.beam.model.pipeline.v1.ExternalTransforms; import org.apache.beam.sdk.io.FileSystems; import org.apache.beam.sdk.io.fs.MatchResult; import org.apache.beam.sdk.schemas.AutoValueSchema; @@ -44,6 +48,7 @@ import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; import org.apache.beam.sdk.schemas.utils.YamlUtils; +import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionRowTuple; import org.apache.beam.sdk.values.Row; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; @@ -97,6 +102,8 @@ public static Builder builder() { @SchemaFieldDescription("YAML string config used to build the underlying SchemaTransform.") public abstract @Nullable String getConfig(); + public abstract Builder toBuilder(); + @AutoValue.Builder public abstract static class Builder { public abstract Builder setTransformIdentifier(String identifier); @@ -134,53 +141,62 @@ private Map resolveUnderlyingConfig() { return YamlUtils.yamlStringToMap(yamlTransformConfig); } + + @VisibleForTesting + ManagedConfig resolveUnderlyingTransform(PCollectionRowTuple input) { + String identifier = getTransformIdentifier(); + if (identifier.equals(WRITE_TRANSFORMS.get(BIGQUERY))) { + if (input.getSinglePCollection().isBounded().equals(PCollection.IsBounded.BOUNDED)) { + identifier = getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_FILE_LOADS); + } + } + return toBuilder().setTransformIdentifier(identifier).build(); + } } @Override protected SchemaTransform from(ManagedConfig managedConfig) { managedConfig.validate(); - SchemaTransformProvider schemaTransformProvider = - Preconditions.checkNotNull( - getAllProviders().get(managedConfig.getTransformIdentifier()), - "Could not 
find a transform with the identifier " - + "%s. This could be either due to the dependency with the " - + "transform not being available in the classpath or due to " - + "the specified transform not being supported.", - managedConfig.getTransformIdentifier()); - - return new ManagedSchemaTransform(managedConfig, schemaTransformProvider); + return new ManagedSchemaTransform(managedConfig, getAllProviders()); } static class ManagedSchemaTransform extends SchemaTransform { - private final ManagedConfig managedConfig; - private final Row underlyingRowConfig; - private final SchemaTransformProvider underlyingTransformProvider; + private ManagedConfig managedConfig; + private final Map providers; ManagedSchemaTransform( - ManagedConfig managedConfig, SchemaTransformProvider underlyingTransformProvider) { - // parse config before expansion to check if it matches underlying transform's config schema - Schema transformConfigSchema = underlyingTransformProvider.configurationSchema(); - Row underlyingRowConfig; + ManagedConfig managedConfig, Map providers) { + this.providers = providers; + this.managedConfig = managedConfig; + } + + @Override + public PCollectionRowTuple expand(PCollectionRowTuple input) { + managedConfig = managedConfig.resolveUnderlyingTransform(input); + + SchemaTransformProvider schemaTransformProvider = + Preconditions.checkNotNull( + providers.get(managedConfig.getTransformIdentifier()), + "Could not find a transform with the identifier " + + "%s. This could be either due to the dependency with the " + + "transform not being available in the classpath or due to " + + "the specified transform not being supported.", + managedConfig.getTransformIdentifier()); + Schema transformConfigSchema = schemaTransformProvider.configurationSchema(); + Row transformRowConfig; try { - underlyingRowConfig = getRowConfig(managedConfig, transformConfigSchema); + transformRowConfig = getRowConfig(managedConfig, transformConfigSchema); } catch (Exception e) { throw new IllegalArgumentException( "Encountered an error when retrieving a Row configuration", e); } - this.underlyingRowConfig = underlyingRowConfig; - this.underlyingTransformProvider = underlyingTransformProvider; - this.managedConfig = managedConfig; - } - - @Override - public PCollectionRowTuple expand(PCollectionRowTuple input) { LOG.debug( - "Building transform \"{}\" with Row configuration: {}", - underlyingTransformProvider.identifier(), - underlyingRowConfig); + "Building transform \"{}\" with configuration: {}", + schemaTransformProvider.identifier(), + transformRowConfig); - return input.apply(underlyingTransformProvider.from(underlyingRowConfig)); + return input.apply(schemaTransformProvider.from(transformRowConfig)); } public ManagedConfig getManagedConfig() { @@ -226,7 +242,7 @@ static Row getRowConfig(ManagedConfig config, Schema transformSchema) { return YamlUtils.toBeamRow(configMap, transformSchema, false); } - // We load providers seperately, after construction, to prevent the + // We load providers separately, after construction, to prevent the // 'ManagedSchemaTransformProvider' from being initialized in a recursive loop // when being loaded using 'AutoValue'. 
synchronized Map getAllProviders() { diff --git a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java index 30476a30d373..fabbb0e971c5 100644 --- a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java +++ b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java @@ -70,7 +70,7 @@ public class ManagedTransformConstants { getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_READ), BIGQUERY_READ_MAPPINGS) .put( - getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_WRITE), + getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_STORAGE_WRITE), BIGQUERY_WRITE_MAPPINGS) .build(); } diff --git a/sdks/java/managed/src/test/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProviderTest.java b/sdks/java/managed/src/test/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProviderTest.java index a287ec6260ce..6070b4858ce2 100644 --- a/sdks/java/managed/src/test/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProviderTest.java +++ b/sdks/java/managed/src/test/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProviderTest.java @@ -17,15 +17,22 @@ */ package org.apache.beam.sdk.managed; +import static org.apache.beam.sdk.managed.Managed.BIGQUERY; import static org.apache.beam.sdk.managed.ManagedSchemaTransformProvider.ManagedConfig; +import static org.apache.beam.sdk.util.construction.BeamUrns.getUrn; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.net.URISyntaxException; import java.nio.file.Paths; import java.util.Arrays; +import org.apache.beam.model.pipeline.v1.ExternalTransforms; +import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.managed.testing.TestSchemaTransformProvider; import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionRowTuple; import org.apache.beam.sdk.values.Row; import org.junit.Rule; import org.junit.Test; @@ -129,4 +136,31 @@ public void testDiscoverTestProvider() { assertTrue(provider.getAllProviders().containsKey(TestSchemaTransformProvider.IDENTIFIER)); } + + @Test + public void testResolveBigQueryWrite() { + String yamlString = "table: test-table"; + String storageApiIdentifier = Managed.WRITE_TRANSFORMS.get(BIGQUERY); + String fileLoadsIdentifier = + getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_FILE_LOADS); + + ManagedConfig config = + ManagedConfig.builder() + .setTransformIdentifier(storageApiIdentifier) + .setConfig(yamlString) + .build(); + Pipeline p = Pipeline.create(); + PCollection input = p.apply(Create.of(Row.nullRow(Schema.builder().build()))); + + // streaming case, pick Storage Write API + PCollection unboundedInput = input.setIsBoundedInternal(PCollection.IsBounded.UNBOUNDED); + config = config.resolveUnderlyingTransform(PCollectionRowTuple.of("input", unboundedInput)); + assertEquals(storageApiIdentifier, config.getTransformIdentifier()); + + // batch case, pick File Loads + PCollection boundedInput = + unboundedInput.setIsBoundedInternal(PCollection.IsBounded.BOUNDED); + config = config.resolveUnderlyingTransform(PCollectionRowTuple.of("input", boundedInput)); + assertEquals(fileLoadsIdentifier, config.getTransformIdentifier()); + } } From 770cf50294275db69dfe35c2bc6ddd8f7bef1a8e Mon Sep 17 00:00:00 2001 From: Ahmed 
Abualsaud Date: Tue, 5 Nov 2024 07:14:02 +0300 Subject: [PATCH 18/24] cleanup --- .../managed/expansion-service/build.gradle | 61 ------------------- .../expansion-service/container/build.gradle | 19 ------ 2 files changed, 80 deletions(-) delete mode 100644 sdks/java/managed/expansion-service/build.gradle delete mode 100644 sdks/java/managed/expansion-service/container/build.gradle diff --git a/sdks/java/managed/expansion-service/build.gradle b/sdks/java/managed/expansion-service/build.gradle deleted file mode 100644 index ff78159fa5f7..000000000000 --- a/sdks/java/managed/expansion-service/build.gradle +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -apply plugin: 'org.apache.beam.module' -apply plugin: 'application' -mainClassName = "org.apache.beam.sdk.expansion.service.ExpansionService" - -applyJavaNature( - automaticModuleName: 'org.apache.beam.sdk.managed.expansion.service', - exportJavadoc: false, - validateShadowJar: false, - shadowClosure: {}, -) - -// TODO(https://github.com/apache/beam/pull/32486/) Use library.java.kafka_clients once >=3.1.0 is set as default -configurations.runtimeClasspath { - // Pin kafka-clients version due to <3.1.0 missing auth callback classes - resolutionStrategy.force 'org.apache.kafka:kafka-clients:3.1.2' -} - -shadowJar { - mergeServiceFiles() - outputs.upToDateWhen { false } -} - -description = "Apache Beam :: SDKs :: Java :: Managed :: Expansion Service" -ext.summary = "Expansion service for Managed Transforms" - -dependencies { - runtimeOnly project(":sdks:java:expansion-service") - - // **** IcebergIO and dependencies **** - runtimeOnly project(":sdks:java:io:iceberg") - // Needed when writing to GCS - runtimeOnly library.java.bigdataoss_gcs_connector - runtimeOnly library.java.hadoop_client - // For HiveCatalog - runtimeOnly ("org.apache.iceberg:iceberg-hive-metastore:1.4.2") - runtimeOnly project(path: ":sdks:java:io:iceberg:hive:exec", configuration: "shadow") - - // **** KafkaIO and dependencies **** - runtimeOnly project(":sdks:java:io:kafka") - runtimeOnly library.java.kafka_clients - - runtimeOnly library.java.slf4j_jdk14 -} diff --git a/sdks/java/managed/expansion-service/container/build.gradle b/sdks/java/managed/expansion-service/container/build.gradle deleted file mode 100644 index 556af1f4f3a9..000000000000 --- a/sdks/java/managed/expansion-service/container/build.gradle +++ /dev/null @@ -1,19 +0,0 @@ -plugins { - id 'java' -} - -group = 'org.apache.beam.sdk.managed.expansion.service.container' -version = '2.61.0-SNAPSHOT' - -repositories { - mavenCentral() -} - -dependencies { - testImplementation platform('org.junit:junit-bom:5.9.1') - testImplementation 'org.junit.jupiter:junit-jupiter' -} - -test { - useJUnitPlatform() -} \ No newline at end of file 
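The pipelines in BigQueryManagedIT above illustrate the end-user API these patches target. A minimal, self-contained batch sketch of that usage might look like the code below; it is illustrative only and not part of the patch series. The table spec "my-project:my_dataset.my_table" and the single-field schema are assumptions made up for this sketch, while the "table" config key, the "input" tag, and the Managed.write/Managed.read calls mirror the integration test. With a bounded input the managed layer routes the write to the file loads sink, and with an unbounded input it routes to the Storage Write API sink.

import java.util.Map;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.managed.Managed;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionRowTuple;
import org.apache.beam.sdk.values.Row;
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap;

public class ManagedBigQueryExample {
  public static void main(String[] args) {
    // Assumed table spec and schema, for illustration only. A real run also needs
    // GCP pipeline options and, for the file loads path, a GCS temp location.
    String table = "my-project:my_dataset.my_table";
    Map<String, Object> config = ImmutableMap.of("table", table);
    Schema schema = Schema.of(Schema.Field.of("str", Schema.FieldType.STRING));

    // Batch write: the input is bounded, so the managed layer picks the file loads sink.
    Pipeline writePipeline = Pipeline.create();
    PCollection<Row> rows =
        writePipeline
            .apply(Create.of(Row.withSchema(schema).addValue("a").build()))
            .setRowSchema(schema);
    PCollectionRowTuple.of("input", rows)
        .apply(Managed.write(Managed.BIGQUERY).withConfig(config));
    writePipeline.run().waitUntilFinish();

    // Read back with the same config, mirroring BigQueryManagedIT.
    Pipeline readPipeline = Pipeline.create();
    PCollection<Row> outputRows =
        readPipeline
            .apply(Managed.read(Managed.BIGQUERY).withConfig(config))
            .getSinglePCollection();
    readPipeline.run().waitUntilFinish();
  }
}
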
From 0a7046690af799bbd16bb294dbd1951807d085e9 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Tue, 5 Nov 2024 07:28:58 +0300 Subject: [PATCH 19/24] spotless --- .../org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java | 1 + 1 file changed, 1 insertion(+) diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java index 1ae97aaeee2e..dcd1b6b853e3 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java @@ -86,6 +86,7 @@ public static void setUpTestEnvironment() throws IOException, InterruptedExcepti public static void cleanup() { BQ_CLIENT.deleteDataset(PROJECT, BIG_QUERY_DATASET_ID); } + @Test public void testBatchFileLoadsWriteRead() { String table = From 01a01f709b729a4fec7350861d5c2f3d086762b8 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Wed, 6 Nov 2024 11:44:20 -0500 Subject: [PATCH 20/24] move forking logic to BQ schematransform side --- .../pipeline/v1/external_transforms.proto | 6 +- ...FileLoadsWriteSchemaTransformProvider.java | 13 ++-- ...torageWriteApiSchemaTransformProvider.java | 4 +- .../providers/BigQueryWriteConfiguration.java | 2 +- .../BigQueryWriteSchemaTransformProvider.java | 70 ++++++++++++++++++ ...LoadsWriteSchemaTransformProviderTest.java | 12 ++-- .../org/apache/beam/sdk/managed/Managed.java | 7 +- .../ManagedSchemaTransformProvider.java | 72 ++++++++----------- .../managed/ManagedTransformConstants.java | 2 +- .../ManagedSchemaTransformProviderTest.java | 34 --------- 10 files changed, 119 insertions(+), 103 deletions(-) rename sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/{ => providers}/BigQueryFileLoadsWriteSchemaTransformProvider.java (92%) create mode 100644 sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteSchemaTransformProvider.java rename sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/{ => providers}/BigQueryFileLoadsWriteSchemaTransformProviderTest.java (91%) diff --git a/model/pipeline/src/main/proto/org/apache/beam/model/pipeline/v1/external_transforms.proto b/model/pipeline/src/main/proto/org/apache/beam/model/pipeline/v1/external_transforms.proto index 2d8356614ce7..f102e82bafa6 100644 --- a/model/pipeline/src/main/proto/org/apache/beam/model/pipeline/v1/external_transforms.proto +++ b/model/pipeline/src/main/proto/org/apache/beam/model/pipeline/v1/external_transforms.proto @@ -72,10 +72,8 @@ message ManagedTransforms { "beam:schematransform:org.apache.beam:kafka_write:v1"]; BIGQUERY_READ = 4 [(org.apache.beam.model.pipeline.v1.beam_urn) = "beam:schematransform:org.apache.beam:bigquery_storage_read:v1"]; - BIGQUERY_STORAGE_WRITE = 5 [(org.apache.beam.model.pipeline.v1.beam_urn) = - "beam:schematransform:org.apache.beam:bigquery_storage_write:v2"]; - BIGQUERY_FILE_LOADS = 6 [(org.apache.beam.model.pipeline.v1.beam_urn) = - "beam:schematransform:org.apache.beam:bigquery_fileloads:v1"]; + BIGQUERY_WRITE = 5 [(org.apache.beam.model.pipeline.v1.beam_urn) = + "beam:schematransform:org.apache.beam:bigquery_write:v1"]; } } diff --git 
a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProvider.java similarity index 92% rename from sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProvider.java rename to sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProvider.java index 7fdcabf5c695..3241b0ec68b3 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProvider.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProvider.java @@ -15,18 +15,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.beam.sdk.io.gcp.bigquery; - -import static org.apache.beam.sdk.util.construction.BeamUrns.getUrn; +package org.apache.beam.sdk.io.gcp.bigquery.providers; import com.google.auto.service.AutoService; import java.util.Collections; import java.util.List; -import org.apache.beam.model.pipeline.v1.ExternalTransforms; import org.apache.beam.sdk.annotations.Internal; +import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; -import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryWriteConfiguration; +import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices; +import org.apache.beam.sdk.io.gcp.bigquery.BigQueryUtils; import org.apache.beam.sdk.schemas.transforms.SchemaTransform; import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; @@ -38,7 +37,7 @@ /** * An implementation of {@link TypedSchemaTransformProvider} for BigQuery write jobs configured - * using {@link BigQueryWriteConfiguration}. + * using {@link org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryWriteConfiguration}. * *

Internal only: This class is actively being worked on, and it will likely change. We * provide no backwards compatibility guarantees, and it should not be implemented outside the Beam @@ -61,7 +60,7 @@ protected SchemaTransform from(BigQueryWriteConfiguration configuration) { @Override public String identifier() { - return getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_FILE_LOADS); + return "beam:schematransform:org.apache.beam:bigquery_fileloads:v1"; } @Override diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java index eba9dd61d510..a159f4bfec8d 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java @@ -18,7 +18,6 @@ package org.apache.beam.sdk.io.gcp.bigquery.providers; import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryWriteConfiguration.DYNAMIC_DESTINATIONS; -import static org.apache.beam.sdk.util.construction.BeamUrns.getUrn; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; import com.google.api.services.bigquery.model.TableConstraints; @@ -28,7 +27,6 @@ import java.util.Collections; import java.util.List; import java.util.Optional; -import org.apache.beam.model.pipeline.v1.ExternalTransforms; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.Method; @@ -100,7 +98,7 @@ protected SchemaTransform from(BigQueryWriteConfiguration configuration) { @Override public String identifier() { - return getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_STORAGE_WRITE); + return "beam:schematransform:org.apache.beam:bigquery_storage_write:v2"; } @Override diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteConfiguration.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteConfiguration.java index acc5b1ff6ea4..32250fc7976b 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteConfiguration.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteConfiguration.java @@ -36,7 +36,7 @@ /** * Configuration for writing to BigQuery with SchemaTransforms. Used by {@link * BigQueryStorageWriteApiSchemaTransformProvider} and {@link - * org.apache.beam.sdk.io.gcp.bigquery.BigQueryFileLoadsWriteSchemaTransformProvider}. + * BigQueryFileLoadsWriteSchemaTransformProvider}. 
*/ @DefaultSchema(AutoValueSchema.class) @AutoValue diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteSchemaTransformProvider.java new file mode 100644 index 000000000000..a069815f3189 --- /dev/null +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteSchemaTransformProvider.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.gcp.bigquery.providers; + +import static org.apache.beam.sdk.util.construction.BeamUrns.getUrn; + +import com.google.auto.service.AutoService; +import org.apache.beam.model.pipeline.v1.ExternalTransforms; +import org.apache.beam.sdk.annotations.Internal; +import org.apache.beam.sdk.schemas.transforms.SchemaTransform; +import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; +import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionRowTuple; + +/** + * A BigQuery Write SchemaTransformProvider that routes to either {@link + * BigQueryFileLoadsWriteSchemaTransformProvider} or {@link + * BigQueryStorageWriteApiSchemaTransformProvider}. + * + *

Internal only. Used by the Managed Transform layer. + */ +@Internal +@AutoService(SchemaTransformProvider.class) +public class BigQueryWriteSchemaTransformProvider + extends TypedSchemaTransformProvider { + @Override + public String identifier() { + return getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_WRITE); + } + + @Override + protected SchemaTransform from(BigQueryWriteConfiguration configuration) { + return new BigQueryWriteRouter(configuration); + } + + static class BigQueryWriteRouter extends SchemaTransform { + private final BigQueryWriteConfiguration configuration; + + BigQueryWriteRouter(BigQueryWriteConfiguration configuration) { + configuration.validate(); + this.configuration = configuration; + } + + @Override + public PCollectionRowTuple expand(PCollectionRowTuple input) { + if (input.getSinglePCollection().isBounded().equals(PCollection.IsBounded.BOUNDED)) { + return input.apply(new BigQueryFileLoadsWriteSchemaTransformProvider().from(configuration)); + } else { // UNBOUNDED + return input.apply( + new BigQueryStorageWriteApiSchemaTransformProvider().from(configuration)); + } + } + } +} diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProviderTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProviderTest.java similarity index 91% rename from sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProviderTest.java rename to sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProviderTest.java index 1e4791b94e1c..278be65f96a7 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryFileLoadsWriteSchemaTransformProviderTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProviderTest.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.beam.sdk.io.gcp.bigquery; +package org.apache.beam.sdk.io.gcp.bigquery.providers; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; @@ -24,10 +24,11 @@ import java.io.IOException; import java.util.Arrays; import java.util.List; -import org.apache.beam.sdk.io.gcp.bigquery.BigQueryFileLoadsWriteSchemaTransformProvider.BigQueryWriteSchemaTransform; +import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; -import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryWriteConfiguration; +import org.apache.beam.sdk.io.gcp.bigquery.BigQueryOptions; +import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryFileLoadsWriteSchemaTransformProvider.BigQueryWriteSchemaTransform; import org.apache.beam.sdk.io.gcp.testing.FakeBigQueryServices; import org.apache.beam.sdk.io.gcp.testing.FakeDatasetService; import org.apache.beam.sdk.io.gcp.testing.FakeJobService; @@ -46,7 +47,10 @@ import org.junit.runner.RunWith; import org.junit.runners.JUnit4; -/** Test for {@link BigQueryFileLoadsWriteSchemaTransformProvider}. 
*/ +/** + * Test for {@link + * org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryFileLoadsWriteSchemaTransformProvider}. + */ @RunWith(JUnit4.class) public class BigQueryFileLoadsWriteSchemaTransformProviderTest { diff --git a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/Managed.java b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/Managed.java index f6299a332ceb..8e7e0862eff4 100644 --- a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/Managed.java +++ b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/Managed.java @@ -99,7 +99,7 @@ public class Managed { ImmutableMap.builder() .put(ICEBERG, getUrn(ExternalTransforms.ManagedTransforms.Urns.ICEBERG_WRITE)) .put(KAFKA, getUrn(ExternalTransforms.ManagedTransforms.Urns.KAFKA_WRITE)) - .put(BIGQUERY, getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_STORAGE_WRITE)) + .put(BIGQUERY, getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_WRITE)) .build(); /** @@ -135,9 +135,6 @@ public static ManagedTransform read(String source) { * */ public static ManagedTransform write(String sink) { - List supportedIdentifiers = new ArrayList<>(WRITE_TRANSFORMS.values()); - supportedIdentifiers.add(getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_FILE_LOADS)); - return new AutoValue_Managed_ManagedTransform.Builder() .setIdentifier( Preconditions.checkNotNull( @@ -145,7 +142,7 @@ public static ManagedTransform write(String sink) { "An unsupported sink was specified: '%s'. Please specify one of the following sinks: %s", sink, WRITE_TRANSFORMS.keySet())) - .setSupportedIdentifiers(supportedIdentifiers) + .setSupportedIdentifiers(new ArrayList<>(WRITE_TRANSFORMS.values())) .build(); } diff --git a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProvider.java b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProvider.java index 8bb4a6aedf5c..b705306b9478 100644 --- a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProvider.java +++ b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProvider.java @@ -17,10 +17,7 @@ */ package org.apache.beam.sdk.managed; -import static org.apache.beam.sdk.managed.Managed.BIGQUERY; -import static org.apache.beam.sdk.managed.Managed.WRITE_TRANSFORMS; import static org.apache.beam.sdk.managed.ManagedTransformConstants.MAPPINGS; -import static org.apache.beam.sdk.util.construction.BeamUrns.getUrn; import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; import com.google.auto.service.AutoService; @@ -35,7 +32,6 @@ import java.util.Map; import java.util.ServiceLoader; import javax.annotation.Nullable; -import org.apache.beam.model.pipeline.v1.ExternalTransforms; import org.apache.beam.sdk.io.FileSystems; import org.apache.beam.sdk.io.fs.MatchResult; import org.apache.beam.sdk.schemas.AutoValueSchema; @@ -48,7 +44,6 @@ import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; import org.apache.beam.sdk.schemas.utils.YamlUtils; -import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionRowTuple; import org.apache.beam.sdk.values.Row; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting; @@ -102,8 +97,6 @@ public static Builder builder() { @SchemaFieldDescription("YAML string config used to build the underlying 
SchemaTransform.") public abstract @Nullable String getConfig(); - public abstract Builder toBuilder(); - @AutoValue.Builder public abstract static class Builder { public abstract Builder setTransformIdentifier(String identifier); @@ -141,62 +134,53 @@ private Map resolveUnderlyingConfig() { return YamlUtils.yamlStringToMap(yamlTransformConfig); } - - @VisibleForTesting - ManagedConfig resolveUnderlyingTransform(PCollectionRowTuple input) { - String identifier = getTransformIdentifier(); - if (identifier.equals(WRITE_TRANSFORMS.get(BIGQUERY))) { - if (input.getSinglePCollection().isBounded().equals(PCollection.IsBounded.BOUNDED)) { - identifier = getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_FILE_LOADS); - } - } - return toBuilder().setTransformIdentifier(identifier).build(); - } } @Override protected SchemaTransform from(ManagedConfig managedConfig) { managedConfig.validate(); - return new ManagedSchemaTransform(managedConfig, getAllProviders()); + SchemaTransformProvider schemaTransformProvider = + Preconditions.checkNotNull( + getAllProviders().get(managedConfig.getTransformIdentifier()), + "Could not find a transform with the identifier " + + "%s. This could be either due to the dependency with the " + + "transform not being available in the classpath or due to " + + "the specified transform not being supported.", + managedConfig.getTransformIdentifier()); + + return new ManagedSchemaTransform(managedConfig, schemaTransformProvider); } static class ManagedSchemaTransform extends SchemaTransform { - private ManagedConfig managedConfig; - private final Map providers; + private final ManagedConfig managedConfig; + private final Row underlyingRowConfig; + private final SchemaTransformProvider underlyingTransformProvider; ManagedSchemaTransform( - ManagedConfig managedConfig, Map providers) { - this.providers = providers; - this.managedConfig = managedConfig; - } - - @Override - public PCollectionRowTuple expand(PCollectionRowTuple input) { - managedConfig = managedConfig.resolveUnderlyingTransform(input); - - SchemaTransformProvider schemaTransformProvider = - Preconditions.checkNotNull( - providers.get(managedConfig.getTransformIdentifier()), - "Could not find a transform with the identifier " - + "%s. 
This could be either due to the dependency with the " - + "transform not being available in the classpath or due to " - + "the specified transform not being supported.", - managedConfig.getTransformIdentifier()); - Schema transformConfigSchema = schemaTransformProvider.configurationSchema(); - Row transformRowConfig; + ManagedConfig managedConfig, SchemaTransformProvider underlyingTransformProvider) { + // parse config before expansion to check if it matches underlying transform's config schema + Schema transformConfigSchema = underlyingTransformProvider.configurationSchema(); + Row underlyingRowConfig; try { - transformRowConfig = getRowConfig(managedConfig, transformConfigSchema); + underlyingRowConfig = getRowConfig(managedConfig, transformConfigSchema); } catch (Exception e) { throw new IllegalArgumentException( "Encountered an error when retrieving a Row configuration", e); } + this.underlyingRowConfig = underlyingRowConfig; + this.underlyingTransformProvider = underlyingTransformProvider; + this.managedConfig = managedConfig; + } + + @Override + public PCollectionRowTuple expand(PCollectionRowTuple input) { LOG.debug( "Building transform \"{}\" with configuration: {}", - schemaTransformProvider.identifier(), - transformRowConfig); + underlyingTransformProvider.identifier(), + underlyingRowConfig); - return input.apply(schemaTransformProvider.from(transformRowConfig)); + return input.apply(underlyingTransformProvider.from(underlyingRowConfig)); } public ManagedConfig getManagedConfig() { diff --git a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java index fabbb0e971c5..30476a30d373 100644 --- a/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java +++ b/sdks/java/managed/src/main/java/org/apache/beam/sdk/managed/ManagedTransformConstants.java @@ -70,7 +70,7 @@ public class ManagedTransformConstants { getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_READ), BIGQUERY_READ_MAPPINGS) .put( - getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_STORAGE_WRITE), + getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_WRITE), BIGQUERY_WRITE_MAPPINGS) .build(); } diff --git a/sdks/java/managed/src/test/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProviderTest.java b/sdks/java/managed/src/test/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProviderTest.java index 6070b4858ce2..a287ec6260ce 100644 --- a/sdks/java/managed/src/test/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProviderTest.java +++ b/sdks/java/managed/src/test/java/org/apache/beam/sdk/managed/ManagedSchemaTransformProviderTest.java @@ -17,22 +17,15 @@ */ package org.apache.beam.sdk.managed; -import static org.apache.beam.sdk.managed.Managed.BIGQUERY; import static org.apache.beam.sdk.managed.ManagedSchemaTransformProvider.ManagedConfig; -import static org.apache.beam.sdk.util.construction.BeamUrns.getUrn; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.net.URISyntaxException; import java.nio.file.Paths; import java.util.Arrays; -import org.apache.beam.model.pipeline.v1.ExternalTransforms; -import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.managed.testing.TestSchemaTransformProvider; import org.apache.beam.sdk.schemas.Schema; -import org.apache.beam.sdk.transforms.Create; -import org.apache.beam.sdk.values.PCollection; -import 
org.apache.beam.sdk.values.PCollectionRowTuple; import org.apache.beam.sdk.values.Row; import org.junit.Rule; import org.junit.Test; @@ -136,31 +129,4 @@ public void testDiscoverTestProvider() { assertTrue(provider.getAllProviders().containsKey(TestSchemaTransformProvider.IDENTIFIER)); } - - @Test - public void testResolveBigQueryWrite() { - String yamlString = "table: test-table"; - String storageApiIdentifier = Managed.WRITE_TRANSFORMS.get(BIGQUERY); - String fileLoadsIdentifier = - getUrn(ExternalTransforms.ManagedTransforms.Urns.BIGQUERY_FILE_LOADS); - - ManagedConfig config = - ManagedConfig.builder() - .setTransformIdentifier(storageApiIdentifier) - .setConfig(yamlString) - .build(); - Pipeline p = Pipeline.create(); - PCollection input = p.apply(Create.of(Row.nullRow(Schema.builder().build()))); - - // streaming case, pick Storage Write API - PCollection unboundedInput = input.setIsBoundedInternal(PCollection.IsBounded.UNBOUNDED); - config = config.resolveUnderlyingTransform(PCollectionRowTuple.of("input", unboundedInput)); - assertEquals(storageApiIdentifier, config.getTransformIdentifier()); - - // batch case, pick File Loads - PCollection boundedInput = - unboundedInput.setIsBoundedInternal(PCollection.IsBounded.BOUNDED); - config = config.resolveUnderlyingTransform(PCollectionRowTuple.of("input", boundedInput)); - assertEquals(fileLoadsIdentifier, config.getTransformIdentifier()); - } } From 697c0b8010958cbd40f61e71b87241e4e7f63920 Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Thu, 7 Nov 2024 09:47:33 -0500 Subject: [PATCH 21/24] add file loads translation and tests; add test checks that the correct transform is chosen --- ...FileLoadsWriteSchemaTransformProvider.java | 22 ++++- .../BigQuerySchemaTransformTranslation.java | 22 ++++- ...LoadsWriteSchemaTransformProviderTest.java | 6 +- .../{ => providers}/BigQueryManagedIT.java | 41 +++++++- ...igQuerySchemaTransformTranslationTest.java | 97 ++++++++++++++++--- 5 files changed, 163 insertions(+), 25 deletions(-) rename sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/{ => providers}/BigQuerySchemaTransformTranslation.java (81%) rename sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/{ => providers}/BigQueryManagedIT.java (74%) rename sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/{ => providers}/BigQuerySchemaTransformTranslationTest.java (66%) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProvider.java index 3241b0ec68b3..16ced3642b61 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProvider.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProvider.java @@ -26,6 +26,8 @@ import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryUtils; +import org.apache.beam.sdk.schemas.NoSuchSchemaException; +import org.apache.beam.sdk.schemas.SchemaRegistry; import org.apache.beam.sdk.schemas.transforms.SchemaTransform; import 
org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; @@ -55,7 +57,7 @@ public class BigQueryFileLoadsWriteSchemaTransformProvider @Override protected SchemaTransform from(BigQueryWriteConfiguration configuration) { - return new BigQueryWriteSchemaTransform(configuration); + return new BigQueryFileLoadsSchemaTransform(configuration); } @Override @@ -73,13 +75,13 @@ public List outputCollectionNames() { return Collections.emptyList(); } - protected static class BigQueryWriteSchemaTransform extends SchemaTransform { + public static class BigQueryFileLoadsSchemaTransform extends SchemaTransform { /** An instance of {@link BigQueryServices} used for testing. */ private BigQueryServices testBigQueryServices = null; private final BigQueryWriteConfiguration configuration; - BigQueryWriteSchemaTransform(BigQueryWriteConfiguration configuration) { + BigQueryFileLoadsSchemaTransform(BigQueryWriteConfiguration configuration) { configuration.validate(); this.configuration = configuration; } @@ -126,5 +128,19 @@ BigQueryIO.Write toWrite() { void setTestBigQueryServices(BigQueryServices testBigQueryServices) { this.testBigQueryServices = testBigQueryServices; } + + public Row getConfigurationRow() { + try { + // To stay consistent with our SchemaTransform configuration naming conventions, + // we sort lexicographically + return SchemaRegistry.createDefault() + .getToRowFunction(BigQueryWriteConfiguration.class) + .apply(configuration) + .sorted() + .toSnakeCase(); + } catch (NoSuchSchemaException e) { + throw new RuntimeException(e); + } + } } } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySchemaTransformTranslation.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQuerySchemaTransformTranslation.java similarity index 81% rename from sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySchemaTransformTranslation.java rename to sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQuerySchemaTransformTranslation.java index 102a1840e177..c0c561ee3f57 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySchemaTransformTranslation.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQuerySchemaTransformTranslation.java @@ -15,15 +15,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
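// Illustrative sketch (not part of the patch): the getConfigurationRow() hook added
// above is what lets the payload translator in the next file round-trip a file-loads
// transform through its configuration Row. Only the required "table" field is set here,
// and the table spec is a placeholder.
import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryFileLoadsWriteSchemaTransformProvider;
import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryFileLoadsWriteSchemaTransformProvider.BigQueryFileLoadsSchemaTransform;
import org.apache.beam.sdk.values.Row;

class ConfigRowRoundTripSketch {
  static Row roundTrip() {
    BigQueryFileLoadsWriteSchemaTransformProvider provider =
        new BigQueryFileLoadsWriteSchemaTransformProvider();
    // Build a configuration Row against the provider's schema.
    Row configRow =
        Row.withSchema(provider.configurationSchema())
            .withFieldValue("table", "my-project:my_dataset.my_table")
            .build();
    // from() constructs the transform; getConfigurationRow() re-serializes the
    // configuration in sorted, snake_case form for the translator to rebuild later.
    BigQueryFileLoadsSchemaTransform transform =
        (BigQueryFileLoadsSchemaTransform) provider.from(configRow);
    // Per the round-trip tests added in this series, this is expected to equal configRow.
    return transform.getConfigurationRow();
  }
}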
*/ -package org.apache.beam.sdk.io.gcp.bigquery; +package org.apache.beam.sdk.io.gcp.bigquery.providers; import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryDirectReadSchemaTransformProvider.BigQueryDirectReadSchemaTransform; +import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryFileLoadsWriteSchemaTransformProvider.BigQueryFileLoadsSchemaTransform; import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider.BigQueryStorageWriteApiSchemaTransform; import com.google.auto.service.AutoService; import java.util.Map; -import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryDirectReadSchemaTransformProvider; -import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider; import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; import org.apache.beam.sdk.schemas.transforms.SchemaTransformTranslation; import org.apache.beam.sdk.transforms.PTransform; @@ -61,6 +60,20 @@ public Row toConfigRow(BigQueryStorageWriteApiSchemaTransform transform) { } } + public static class BigQueryFileLoadsSchemaTransformTranslator + extends SchemaTransformTranslation.SchemaTransformPayloadTranslator< + BigQueryFileLoadsSchemaTransform> { + @Override + public SchemaTransformProvider provider() { + return new BigQueryFileLoadsWriteSchemaTransformProvider(); + } + + @Override + public Row toConfigRow(BigQueryFileLoadsSchemaTransform transform) { + return transform.getConfigurationRow(); + } + } + @AutoService(TransformPayloadTranslatorRegistrar.class) public static class ReadWriteRegistrar implements TransformPayloadTranslatorRegistrar { @Override @@ -79,6 +92,9 @@ public static class ReadWriteRegistrar implements TransformPayloadTranslatorRegi .put( BigQueryStorageWriteApiSchemaTransform.class, new BigQueryStorageWriteSchemaTransformTranslator()) + .put( + BigQueryFileLoadsSchemaTransform.class, + new BigQueryFileLoadsSchemaTransformTranslator()) .build(); } } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProviderTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProviderTest.java index 278be65f96a7..d506661a3e3e 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProviderTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProviderTest.java @@ -28,7 +28,7 @@ import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryOptions; -import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryFileLoadsWriteSchemaTransformProvider.BigQueryWriteSchemaTransform; +import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryFileLoadsWriteSchemaTransformProvider.BigQueryFileLoadsSchemaTransform; import org.apache.beam.sdk.io.gcp.testing.FakeBigQueryServices; import org.apache.beam.sdk.io.gcp.testing.FakeDatasetService; import org.apache.beam.sdk.io.gcp.testing.FakeJobService; @@ -106,8 +106,8 @@ public void testLoad() throws IOException, InterruptedException { .setWriteDisposition(WriteDisposition.WRITE_TRUNCATE.name()) 
.setCreateDisposition(CreateDisposition.CREATE_IF_NEEDED.name()) .build(); - BigQueryWriteSchemaTransform schemaTransform = - (BigQueryWriteSchemaTransform) provider.from(configuration); + BigQueryFileLoadsSchemaTransform schemaTransform = + (BigQueryFileLoadsSchemaTransform) provider.from(configuration); schemaTransform.setTestBigQueryServices(fakeBigQueryServices); String tag = provider.inputCollectionNames().get(0); PCollectionRowTuple input = diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryManagedIT.java similarity index 74% rename from sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java rename to sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryManagedIT.java index dcd1b6b853e3..62d737da9110 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryManagedIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryManagedIT.java @@ -15,13 +15,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.beam.sdk.io.gcp.bigquery; +package org.apache.beam.sdk.io.gcp.bigquery.providers; + +import static org.apache.beam.model.pipeline.v1.ExternalTransforms.ExpansionMethods.Enum.SCHEMA_TRANSFORM; +import static org.apache.beam.sdk.util.construction.BeamUrns.getUrn; +import static org.junit.Assert.assertEquals; import java.io.IOException; import java.util.List; import java.util.Map; import java.util.stream.Collectors; import java.util.stream.LongStream; +import org.apache.beam.model.pipeline.v1.ExternalTransforms; +import org.apache.beam.model.pipeline.v1.RunnerApi; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; import org.apache.beam.sdk.io.gcp.testing.BigqueryClient; @@ -33,10 +39,12 @@ import org.apache.beam.sdk.transforms.Create; import org.apache.beam.sdk.transforms.MapElements; import org.apache.beam.sdk.transforms.PeriodicImpulse; +import org.apache.beam.sdk.util.construction.PipelineTranslation; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionRowTuple; import org.apache.beam.sdk.values.Row; import org.apache.beam.sdk.values.TypeDescriptors; +import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.InvalidProtocolBufferException; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.joda.time.Duration; import org.joda.time.Instant; @@ -87,6 +95,27 @@ public static void cleanup() { BQ_CLIENT.deleteDataset(PROJECT, BIG_QUERY_DATASET_ID); } + private void assertPipelineContainsTransformIdentifier( + Pipeline p, String schemaTransformIdentifier) { + RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p); + List writeTransformProto = + pipelineProto.getComponents().getTransformsMap().values().stream() + .filter( + tr -> { + RunnerApi.FunctionSpec spec = tr.getSpec(); + try { + return spec.getUrn().equals(getUrn(SCHEMA_TRANSFORM)) + && ExternalTransforms.SchemaTransformPayload.parseFrom(spec.getPayload()) + .getIdentifier() + .equals(schemaTransformIdentifier); + } catch (InvalidProtocolBufferException e) { + throw new RuntimeException(e); + } + }) + .collect(Collectors.toList()); 
+ assertEquals(1, writeTransformProto.size()); + } + @Test public void testBatchFileLoadsWriteRead() { String table = @@ -100,6 +129,8 @@ public void testBatchFileLoadsWriteRead() { // batch write PCollectionRowTuple.of("input", getInput(writePipeline, false)) .apply(Managed.write(Managed.BIGQUERY).withConfig(config)); + assertPipelineContainsTransformIdentifier( + writePipeline, new BigQueryFileLoadsWriteSchemaTransformProvider().identifier()); writePipeline.run().waitUntilFinish(); // read and validate @@ -108,7 +139,8 @@ public void testBatchFileLoadsWriteRead() { .apply(Managed.read(Managed.BIGQUERY).withConfig(config)) .getSinglePCollection(); PAssert.that(outputRows).containsInAnyOrder(ROWS); - + assertPipelineContainsTransformIdentifier( + readPipeline, new BigQueryDirectReadSchemaTransformProvider().identifier()); readPipeline.run().waitUntilFinish(); } @@ -121,6 +153,8 @@ public void testStreamingStorageWriteRead() { // streaming write PCollectionRowTuple.of("input", getInput(writePipeline, true)) .apply(Managed.write(Managed.BIGQUERY).withConfig(config)); + assertPipelineContainsTransformIdentifier( + writePipeline, new BigQueryStorageWriteApiSchemaTransformProvider().identifier()); writePipeline.run().waitUntilFinish(); // read and validate @@ -129,7 +163,8 @@ public void testStreamingStorageWriteRead() { .apply(Managed.read(Managed.BIGQUERY).withConfig(config)) .getSinglePCollection(); PAssert.that(outputRows).containsInAnyOrder(ROWS); - + assertPipelineContainsTransformIdentifier( + readPipeline, new BigQueryDirectReadSchemaTransformProvider().identifier()); readPipeline.run().waitUntilFinish(); } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySchemaTransformTranslationTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQuerySchemaTransformTranslationTest.java similarity index 66% rename from sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySchemaTransformTranslationTest.java rename to sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQuerySchemaTransformTranslationTest.java index bc6624bd9371..57010489c13f 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQuerySchemaTransformTranslationTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQuerySchemaTransformTranslationTest.java @@ -15,12 +15,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
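// Illustrative sketch (not part of the patch): the integration test above drives
// everything through the Managed API surface. The only required config key is "table";
// the fully-qualified table name below is a placeholder. For bounded input, Managed
// resolves the write to BigQuery file loads; for unbounded input, to the Storage Write
// API; reads resolve to the BigQuery direct read transform.
import java.util.Map;
import org.apache.beam.sdk.managed.Managed;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionRowTuple;
import org.apache.beam.sdk.values.Row;
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap;

class ManagedBigQueryWriteSketch {
  static void write(PCollection<Row> rows) {
    Map<String, Object> config = ImmutableMap.of("table", "my-project.my_dataset.my_table");
    // Managed picks the underlying SchemaTransform based on the input's boundedness.
    PCollectionRowTuple.of("input", rows)
        .apply(Managed.write(Managed.BIGQUERY).withConfig(config));
    // Reads are applied the same way with Managed.read(Managed.BIGQUERY).
  }
}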
*/ -package org.apache.beam.sdk.io.gcp.bigquery; +package org.apache.beam.sdk.io.gcp.bigquery.providers; import static org.apache.beam.model.pipeline.v1.ExternalTransforms.ExpansionMethods.Enum.SCHEMA_TRANSFORM; -import static org.apache.beam.sdk.io.gcp.bigquery.BigQuerySchemaTransformTranslation.BigQueryStorageReadSchemaTransformTranslator; -import static org.apache.beam.sdk.io.gcp.bigquery.BigQuerySchemaTransformTranslation.BigQueryStorageWriteSchemaTransformTranslator; import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryDirectReadSchemaTransformProvider.BigQueryDirectReadSchemaTransform; +import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryFileLoadsWriteSchemaTransformProvider.BigQueryFileLoadsSchemaTransform; +import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQuerySchemaTransformTranslation.BigQueryFileLoadsSchemaTransformTranslator; +import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQuerySchemaTransformTranslation.BigQueryStorageReadSchemaTransformTranslator; +import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQuerySchemaTransformTranslation.BigQueryStorageWriteSchemaTransformTranslator; import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider.BigQueryStorageWriteApiSchemaTransform; import static org.junit.Assert.assertEquals; @@ -33,8 +35,6 @@ import org.apache.beam.model.pipeline.v1.RunnerApi; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.coders.RowCoder; -import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryDirectReadSchemaTransformProvider; -import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.SchemaTranslation; @@ -51,12 +51,14 @@ @RunWith(JUnit4.class) public class BigQuerySchemaTransformTranslationTest { - static final BigQueryStorageWriteApiSchemaTransformProvider WRITE_PROVIDER = + static final BigQueryStorageWriteApiSchemaTransformProvider STORAGE_WRITE_PROVIDER = new BigQueryStorageWriteApiSchemaTransformProvider(); + static final BigQueryFileLoadsWriteSchemaTransformProvider FILE_LOADS_PROVIDER = + new BigQueryFileLoadsWriteSchemaTransformProvider(); static final BigQueryDirectReadSchemaTransformProvider READ_PROVIDER = new BigQueryDirectReadSchemaTransformProvider(); static final Row WRITE_CONFIG_ROW = - Row.withSchema(WRITE_PROVIDER.configurationSchema()) + Row.withSchema(STORAGE_WRITE_PROVIDER.configurationSchema()) .withFieldValue("table", "project:dataset.table") .withFieldValue("create_disposition", "create_never") .withFieldValue("write_disposition", "write_append") @@ -75,9 +77,9 @@ public class BigQuerySchemaTransformTranslationTest { .build(); @Test - public void testRecreateWriteTransformFromRow() { + public void testRecreateStorageWriteTransformFromRow() { BigQueryStorageWriteApiSchemaTransform writeTransform = - (BigQueryStorageWriteApiSchemaTransform) WRITE_PROVIDER.from(WRITE_CONFIG_ROW); + (BigQueryStorageWriteApiSchemaTransform) STORAGE_WRITE_PROVIDER.from(WRITE_CONFIG_ROW); BigQueryStorageWriteSchemaTransformTranslator translator = new BigQueryStorageWriteSchemaTransformTranslator(); @@ -90,7 +92,22 @@ public void testRecreateWriteTransformFromRow() { } @Test - public void testWriteTransformProtoTranslation() + public void testRecreateFileLoadsTransformFromRow() { + BigQueryFileLoadsSchemaTransform writeTransform = + 
(BigQueryFileLoadsSchemaTransform) FILE_LOADS_PROVIDER.from(WRITE_CONFIG_ROW); + + BigQueryFileLoadsSchemaTransformTranslator translator = + new BigQueryFileLoadsSchemaTransformTranslator(); + Row translatedRow = translator.toConfigRow(writeTransform); + + BigQueryFileLoadsSchemaTransform writeTransformFromRow = + translator.fromConfigRow(translatedRow, PipelineOptionsFactory.create()); + + assertEquals(WRITE_CONFIG_ROW, writeTransformFromRow.getConfigurationRow()); + } + + @Test + public void testStorageWriteTransformProtoTranslation() throws InvalidProtocolBufferException, IOException { // First build a pipeline Pipeline p = Pipeline.create(); @@ -103,7 +120,7 @@ public void testWriteTransformProtoTranslation() .setRowSchema(inputSchema); BigQueryStorageWriteApiSchemaTransform writeTransform = - (BigQueryStorageWriteApiSchemaTransform) WRITE_PROVIDER.from(WRITE_CONFIG_ROW); + (BigQueryStorageWriteApiSchemaTransform) STORAGE_WRITE_PROVIDER.from(WRITE_CONFIG_ROW); PCollectionRowTuple.of("input", input).apply(writeTransform); // Then translate the pipeline to a proto and extract KafkaWriteSchemaTransform proto @@ -117,7 +134,7 @@ public void testWriteTransformProtoTranslation() return spec.getUrn().equals(BeamUrns.getUrn(SCHEMA_TRANSFORM)) && SchemaTransformPayload.parseFrom(spec.getPayload()) .getIdentifier() - .equals(WRITE_PROVIDER.identifier()); + .equals(STORAGE_WRITE_PROVIDER.identifier()); } catch (InvalidProtocolBufferException e) { throw new RuntimeException(e); } @@ -129,7 +146,7 @@ public void testWriteTransformProtoTranslation() // Check that the proto contains correct values SchemaTransformPayload payload = SchemaTransformPayload.parseFrom(spec.getPayload()); Schema schemaFromSpec = SchemaTranslation.schemaFromProto(payload.getConfigurationSchema()); - assertEquals(WRITE_PROVIDER.configurationSchema(), schemaFromSpec); + assertEquals(STORAGE_WRITE_PROVIDER.configurationSchema(), schemaFromSpec); Row rowFromSpec = RowCoder.of(schemaFromSpec).decode(payload.getConfigurationRow().newInput()); assertEquals(WRITE_CONFIG_ROW, rowFromSpec); @@ -143,6 +160,60 @@ public void testWriteTransformProtoTranslation() assertEquals(WRITE_CONFIG_ROW, writeTransformFromSpec.getConfigurationRow()); } + @Test + public void testFileLoadsTransformProtoTranslation() + throws InvalidProtocolBufferException, IOException { + // First build a pipeline + Pipeline p = Pipeline.create(); + Schema inputSchema = Schema.builder().addByteArrayField("b").build(); + PCollection input = + p.apply( + Create.of( + Collections.singletonList( + Row.withSchema(inputSchema).addValue(new byte[] {1, 2, 3}).build()))) + .setRowSchema(inputSchema); + + BigQueryFileLoadsSchemaTransform writeTransform = + (BigQueryFileLoadsSchemaTransform) FILE_LOADS_PROVIDER.from(WRITE_CONFIG_ROW); + PCollectionRowTuple.of("input", input).apply(writeTransform); + + // Then translate the pipeline to a proto and extract KafkaWriteSchemaTransform proto + RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p); + List writeTransformProto = + pipelineProto.getComponents().getTransformsMap().values().stream() + .filter( + tr -> { + RunnerApi.FunctionSpec spec = tr.getSpec(); + try { + return spec.getUrn().equals(BeamUrns.getUrn(SCHEMA_TRANSFORM)) + && SchemaTransformPayload.parseFrom(spec.getPayload()) + .getIdentifier() + .equals(FILE_LOADS_PROVIDER.identifier()); + } catch (InvalidProtocolBufferException e) { + throw new RuntimeException(e); + } + }) + .collect(Collectors.toList()); + assertEquals(1, writeTransformProto.size()); + 
RunnerApi.FunctionSpec spec = writeTransformProto.get(0).getSpec(); + + // Check that the proto contains correct values + SchemaTransformPayload payload = SchemaTransformPayload.parseFrom(spec.getPayload()); + Schema schemaFromSpec = SchemaTranslation.schemaFromProto(payload.getConfigurationSchema()); + assertEquals(FILE_LOADS_PROVIDER.configurationSchema(), schemaFromSpec); + Row rowFromSpec = RowCoder.of(schemaFromSpec).decode(payload.getConfigurationRow().newInput()); + + assertEquals(WRITE_CONFIG_ROW, rowFromSpec); + + // Use the information in the proto to recreate the KafkaWriteSchemaTransform + BigQueryFileLoadsSchemaTransformTranslator translator = + new BigQueryFileLoadsSchemaTransformTranslator(); + BigQueryFileLoadsSchemaTransform writeTransformFromSpec = + translator.fromConfigRow(rowFromSpec, PipelineOptionsFactory.create()); + + assertEquals(WRITE_CONFIG_ROW, writeTransformFromSpec.getConfigurationRow()); + } + @Test public void testReCreateReadTransformFromRow() { BigQueryDirectReadSchemaTransform readTransform = From 105474bdeee97f14a374003df618c1d740eca95e Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Thu, 7 Nov 2024 19:55:51 -0500 Subject: [PATCH 22/24] set top-level wrapper to be the underlying managed BQ transform urn; change tests to verify underlying transform name --- ...ueryFileLoadsSchemaTransformProvider.java} | 18 +-- .../BigQuerySchemaTransformTranslation.java | 32 +---- ...torageWriteApiSchemaTransformProvider.java | 16 --- .../providers/BigQueryWriteConfiguration.java | 2 +- .../BigQueryWriteSchemaTransformProvider.java | 27 ++++- ...LoadsWriteSchemaTransformProviderTest.java | 11 +- .../bigquery/providers/BigQueryManagedIT.java | 44 +++---- ...igQuerySchemaTransformTranslationTest.java | 111 +++--------------- 8 files changed, 69 insertions(+), 192 deletions(-) rename sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/{BigQueryFileLoadsWriteSchemaTransformProvider.java => BigQueryFileLoadsSchemaTransformProvider.java} (88%) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsSchemaTransformProvider.java similarity index 88% rename from sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProvider.java rename to sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsSchemaTransformProvider.java index 16ced3642b61..6532c5319657 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProvider.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsSchemaTransformProvider.java @@ -26,8 +26,6 @@ import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryUtils; -import org.apache.beam.sdk.schemas.NoSuchSchemaException; -import org.apache.beam.sdk.schemas.SchemaRegistry; import org.apache.beam.sdk.schemas.transforms.SchemaTransform; import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; @@ 
-50,7 +48,7 @@ }) @Internal @AutoService(SchemaTransformProvider.class) -public class BigQueryFileLoadsWriteSchemaTransformProvider +public class BigQueryFileLoadsSchemaTransformProvider extends TypedSchemaTransformProvider { static final String INPUT_TAG = "input"; @@ -128,19 +126,5 @@ BigQueryIO.Write toWrite() { void setTestBigQueryServices(BigQueryServices testBigQueryServices) { this.testBigQueryServices = testBigQueryServices; } - - public Row getConfigurationRow() { - try { - // To stay consistent with our SchemaTransform configuration naming conventions, - // we sort lexicographically - return SchemaRegistry.createDefault() - .getToRowFunction(BigQueryWriteConfiguration.class) - .apply(configuration) - .sorted() - .toSnakeCase(); - } catch (NoSuchSchemaException e) { - throw new RuntimeException(e); - } - } } } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQuerySchemaTransformTranslation.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQuerySchemaTransformTranslation.java index c0c561ee3f57..555df0d0a2b8 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQuerySchemaTransformTranslation.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQuerySchemaTransformTranslation.java @@ -18,8 +18,7 @@ package org.apache.beam.sdk.io.gcp.bigquery.providers; import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryDirectReadSchemaTransformProvider.BigQueryDirectReadSchemaTransform; -import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryFileLoadsWriteSchemaTransformProvider.BigQueryFileLoadsSchemaTransform; -import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider.BigQueryStorageWriteApiSchemaTransform; +import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryWriteSchemaTransformProvider.BigQueryWriteSchemaTransform; import com.google.auto.service.AutoService; import java.util.Map; @@ -46,30 +45,16 @@ public Row toConfigRow(BigQueryDirectReadSchemaTransform transform) { } } - public static class BigQueryStorageWriteSchemaTransformTranslator + public static class BigQueryWriteSchemaTransformTranslator extends SchemaTransformTranslation.SchemaTransformPayloadTranslator< - BigQueryStorageWriteApiSchemaTransform> { + BigQueryWriteSchemaTransform> { @Override public SchemaTransformProvider provider() { - return new BigQueryStorageWriteApiSchemaTransformProvider(); + return new BigQueryWriteSchemaTransformProvider(); } @Override - public Row toConfigRow(BigQueryStorageWriteApiSchemaTransform transform) { - return transform.getConfigurationRow(); - } - } - - public static class BigQueryFileLoadsSchemaTransformTranslator - extends SchemaTransformTranslation.SchemaTransformPayloadTranslator< - BigQueryFileLoadsSchemaTransform> { - @Override - public SchemaTransformProvider provider() { - return new BigQueryFileLoadsWriteSchemaTransformProvider(); - } - - @Override - public Row toConfigRow(BigQueryFileLoadsSchemaTransform transform) { + public Row toConfigRow(BigQueryWriteSchemaTransform transform) { return transform.getConfigurationRow(); } } @@ -89,12 +74,7 @@ public static class ReadWriteRegistrar implements TransformPayloadTranslatorRegi .put( BigQueryDirectReadSchemaTransform.class, new BigQueryStorageReadSchemaTransformTranslator()) - .put( - 
BigQueryStorageWriteApiSchemaTransform.class, - new BigQueryStorageWriteSchemaTransformTranslator()) - .put( - BigQueryFileLoadsSchemaTransform.class, - new BigQueryFileLoadsSchemaTransformTranslator()) + .put(BigQueryWriteSchemaTransform.class, new BigQueryWriteSchemaTransformTranslator()) .build(); } } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java index a159f4bfec8d..c45433aaf0e7 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProvider.java @@ -40,11 +40,9 @@ import org.apache.beam.sdk.io.gcp.bigquery.WriteResult; import org.apache.beam.sdk.metrics.Counter; import org.apache.beam.sdk.metrics.Metrics; -import org.apache.beam.sdk.schemas.NoSuchSchemaException; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.Schema.Field; import org.apache.beam.sdk.schemas.Schema.FieldType; -import org.apache.beam.sdk.schemas.SchemaRegistry; import org.apache.beam.sdk.schemas.transforms.SchemaTransform; import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; @@ -311,20 +309,6 @@ public PCollectionRowTuple expand(PCollectionRowTuple input) { } } - public Row getConfigurationRow() { - try { - // To stay consistent with our SchemaTransform configuration naming conventions, - // we sort lexicographically - return SchemaRegistry.createDefault() - .getToRowFunction(BigQueryWriteConfiguration.class) - .apply(configuration) - .sorted() - .toSnakeCase(); - } catch (NoSuchSchemaException e) { - throw new RuntimeException(e); - } - } - void validateDynamicDestinationsExpectedSchema(Schema schema) { checkArgument( schema.getFieldNames().containsAll(Arrays.asList("destination", "record")), diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteConfiguration.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteConfiguration.java index 32250fc7976b..4296da7e0cd5 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteConfiguration.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteConfiguration.java @@ -36,7 +36,7 @@ /** * Configuration for writing to BigQuery with SchemaTransforms. Used by {@link * BigQueryStorageWriteApiSchemaTransformProvider} and {@link - * BigQueryFileLoadsWriteSchemaTransformProvider}. + * BigQueryFileLoadsSchemaTransformProvider}. 
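// Illustrative sketch (not part of the patch): both write providers share the single
// BigQueryWriteConfiguration referenced in the Javadoc above. Built programmatically it
// looks like this; the table spec is a placeholder and the dispositions mirror the
// values used in the tests in this series.
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryWriteConfiguration;

class WriteConfigurationSketch {
  static BigQueryWriteConfiguration sample() {
    return BigQueryWriteConfiguration.builder()
        .setTable("my-project:my_dataset.my_table")
        .setCreateDisposition(CreateDisposition.CREATE_IF_NEEDED.name())
        .setWriteDisposition(WriteDisposition.WRITE_APPEND.name())
        .build();
  }
}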
*/ @DefaultSchema(AutoValueSchema.class) @AutoValue diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteSchemaTransformProvider.java index a069815f3189..abab169d6932 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteSchemaTransformProvider.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryWriteSchemaTransformProvider.java @@ -22,15 +22,18 @@ import com.google.auto.service.AutoService; import org.apache.beam.model.pipeline.v1.ExternalTransforms; import org.apache.beam.sdk.annotations.Internal; +import org.apache.beam.sdk.schemas.NoSuchSchemaException; +import org.apache.beam.sdk.schemas.SchemaRegistry; import org.apache.beam.sdk.schemas.transforms.SchemaTransform; import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider; import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionRowTuple; +import org.apache.beam.sdk.values.Row; /** * A BigQuery Write SchemaTransformProvider that routes to either {@link - * BigQueryFileLoadsWriteSchemaTransformProvider} or {@link + * BigQueryFileLoadsSchemaTransformProvider} or {@link * BigQueryStorageWriteApiSchemaTransformProvider}. * *
<p>
Internal only. Used by the Managed Transform layer. @@ -46,13 +49,13 @@ public String identifier() { @Override protected SchemaTransform from(BigQueryWriteConfiguration configuration) { - return new BigQueryWriteRouter(configuration); + return new BigQueryWriteSchemaTransform(configuration); } - static class BigQueryWriteRouter extends SchemaTransform { + public static class BigQueryWriteSchemaTransform extends SchemaTransform { private final BigQueryWriteConfiguration configuration; - BigQueryWriteRouter(BigQueryWriteConfiguration configuration) { + BigQueryWriteSchemaTransform(BigQueryWriteConfiguration configuration) { configuration.validate(); this.configuration = configuration; } @@ -60,11 +63,25 @@ static class BigQueryWriteRouter extends SchemaTransform { @Override public PCollectionRowTuple expand(PCollectionRowTuple input) { if (input.getSinglePCollection().isBounded().equals(PCollection.IsBounded.BOUNDED)) { - return input.apply(new BigQueryFileLoadsWriteSchemaTransformProvider().from(configuration)); + return input.apply(new BigQueryFileLoadsSchemaTransformProvider().from(configuration)); } else { // UNBOUNDED return input.apply( new BigQueryStorageWriteApiSchemaTransformProvider().from(configuration)); } } + + public Row getConfigurationRow() { + try { + // To stay consistent with our SchemaTransform configuration naming conventions, + // we sort lexicographically + return SchemaRegistry.createDefault() + .getToRowFunction(BigQueryWriteConfiguration.class) + .apply(configuration) + .sorted() + .toSnakeCase(); + } catch (NoSuchSchemaException e) { + throw new RuntimeException(e); + } + } } } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProviderTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProviderTest.java index d506661a3e3e..422f82b61029 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProviderTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProviderTest.java @@ -28,7 +28,7 @@ import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryOptions; -import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryFileLoadsWriteSchemaTransformProvider.BigQueryFileLoadsSchemaTransform; +import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryFileLoadsSchemaTransformProvider.BigQueryFileLoadsSchemaTransform; import org.apache.beam.sdk.io.gcp.testing.FakeBigQueryServices; import org.apache.beam.sdk.io.gcp.testing.FakeDatasetService; import org.apache.beam.sdk.io.gcp.testing.FakeJobService; @@ -47,10 +47,7 @@ import org.junit.runner.RunWith; import org.junit.runners.JUnit4; -/** - * Test for {@link - * org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryFileLoadsWriteSchemaTransformProvider}. - */ +/** Test for {@link BigQueryFileLoadsSchemaTransformProvider}. 
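// Illustrative sketch (not part of the patch): a condensed view of the routing that the
// new BigQueryWriteSchemaTransform performs in expand(), as shown in the diff above.
// Bounded (batch) input is written with BigQuery file loads; unbounded (streaming)
// input is written with the Storage Write API.
import org.apache.beam.sdk.schemas.transforms.SchemaTransform;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionRowTuple;

class WriteRoutingSketch {
  static PCollectionRowTuple route(
      PCollectionRowTuple input, SchemaTransform fileLoads, SchemaTransform storageWriteApi) {
    boolean bounded =
        input.getSinglePCollection().isBounded().equals(PCollection.IsBounded.BOUNDED);
    return input.apply(bounded ? fileLoads : storageWriteApi);
  }
}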
*/ @RunWith(JUnit4.class) public class BigQueryFileLoadsWriteSchemaTransformProviderTest { @@ -98,8 +95,8 @@ public void tearDown() { @Test public void testLoad() throws IOException, InterruptedException { - BigQueryFileLoadsWriteSchemaTransformProvider provider = - new BigQueryFileLoadsWriteSchemaTransformProvider(); + BigQueryFileLoadsSchemaTransformProvider provider = + new BigQueryFileLoadsSchemaTransformProvider(); BigQueryWriteConfiguration configuration = BigQueryWriteConfiguration.builder() .setTable(BigQueryHelpers.toTableSpec(TABLE_REFERENCE)) diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryManagedIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryManagedIT.java index 62d737da9110..b81295df8500 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryManagedIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryManagedIT.java @@ -17,16 +17,17 @@ */ package org.apache.beam.sdk.io.gcp.bigquery.providers; -import static org.apache.beam.model.pipeline.v1.ExternalTransforms.ExpansionMethods.Enum.SCHEMA_TRANSFORM; -import static org.apache.beam.sdk.util.construction.BeamUrns.getUrn; -import static org.junit.Assert.assertEquals; +import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryDirectReadSchemaTransformProvider.BigQueryDirectReadSchemaTransform; +import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryFileLoadsSchemaTransformProvider.BigQueryFileLoadsSchemaTransform; +import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider.BigQueryStorageWriteApiSchemaTransform; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.greaterThan; import java.io.IOException; import java.util.List; import java.util.Map; import java.util.stream.Collectors; import java.util.stream.LongStream; -import org.apache.beam.model.pipeline.v1.ExternalTransforms; import org.apache.beam.model.pipeline.v1.RunnerApi; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; @@ -44,7 +45,6 @@ import org.apache.beam.sdk.values.PCollectionRowTuple; import org.apache.beam.sdk.values.Row; import org.apache.beam.sdk.values.TypeDescriptors; -import org.apache.beam.vendor.grpc.v1p60p1.com.google.protobuf.InvalidProtocolBufferException; import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.joda.time.Duration; import org.joda.time.Instant; @@ -95,25 +95,13 @@ public static void cleanup() { BQ_CLIENT.deleteDataset(PROJECT, BIG_QUERY_DATASET_ID); } - private void assertPipelineContainsTransformIdentifier( - Pipeline p, String schemaTransformIdentifier) { + private void assertPipelineContainsTransformName(Pipeline p, String transformName) { RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p); List writeTransformProto = pipelineProto.getComponents().getTransformsMap().values().stream() - .filter( - tr -> { - RunnerApi.FunctionSpec spec = tr.getSpec(); - try { - return spec.getUrn().equals(getUrn(SCHEMA_TRANSFORM)) - && ExternalTransforms.SchemaTransformPayload.parseFrom(spec.getPayload()) - .getIdentifier() - .equals(schemaTransformIdentifier); - } catch (InvalidProtocolBufferException e) { - throw new RuntimeException(e); - } - }) + .filter(tr -> 
tr.getUniqueName().contains(transformName)) .collect(Collectors.toList()); - assertEquals(1, writeTransformProto.size()); + assertThat(writeTransformProto.size(), greaterThan(0)); } @Test @@ -129,8 +117,8 @@ public void testBatchFileLoadsWriteRead() { // batch write PCollectionRowTuple.of("input", getInput(writePipeline, false)) .apply(Managed.write(Managed.BIGQUERY).withConfig(config)); - assertPipelineContainsTransformIdentifier( - writePipeline, new BigQueryFileLoadsWriteSchemaTransformProvider().identifier()); + assertPipelineContainsTransformName( + writePipeline, BigQueryFileLoadsSchemaTransform.class.getSimpleName()); writePipeline.run().waitUntilFinish(); // read and validate @@ -139,8 +127,8 @@ public void testBatchFileLoadsWriteRead() { .apply(Managed.read(Managed.BIGQUERY).withConfig(config)) .getSinglePCollection(); PAssert.that(outputRows).containsInAnyOrder(ROWS); - assertPipelineContainsTransformIdentifier( - readPipeline, new BigQueryDirectReadSchemaTransformProvider().identifier()); + assertPipelineContainsTransformName( + readPipeline, BigQueryDirectReadSchemaTransform.class.getSimpleName()); readPipeline.run().waitUntilFinish(); } @@ -153,8 +141,8 @@ public void testStreamingStorageWriteRead() { // streaming write PCollectionRowTuple.of("input", getInput(writePipeline, true)) .apply(Managed.write(Managed.BIGQUERY).withConfig(config)); - assertPipelineContainsTransformIdentifier( - writePipeline, new BigQueryStorageWriteApiSchemaTransformProvider().identifier()); + assertPipelineContainsTransformName( + writePipeline, BigQueryStorageWriteApiSchemaTransform.class.getSimpleName()); writePipeline.run().waitUntilFinish(); // read and validate @@ -163,8 +151,8 @@ public void testStreamingStorageWriteRead() { .apply(Managed.read(Managed.BIGQUERY).withConfig(config)) .getSinglePCollection(); PAssert.that(outputRows).containsInAnyOrder(ROWS); - assertPipelineContainsTransformIdentifier( - readPipeline, new BigQueryDirectReadSchemaTransformProvider().identifier()); + assertPipelineContainsTransformName( + readPipeline, BigQueryDirectReadSchemaTransform.class.getSimpleName()); readPipeline.run().waitUntilFinish(); } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQuerySchemaTransformTranslationTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQuerySchemaTransformTranslationTest.java index 57010489c13f..822c607aa3c9 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQuerySchemaTransformTranslationTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQuerySchemaTransformTranslationTest.java @@ -19,11 +19,9 @@ import static org.apache.beam.model.pipeline.v1.ExternalTransforms.ExpansionMethods.Enum.SCHEMA_TRANSFORM; import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryDirectReadSchemaTransformProvider.BigQueryDirectReadSchemaTransform; -import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryFileLoadsWriteSchemaTransformProvider.BigQueryFileLoadsSchemaTransform; -import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQuerySchemaTransformTranslation.BigQueryFileLoadsSchemaTransformTranslator; import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQuerySchemaTransformTranslation.BigQueryStorageReadSchemaTransformTranslator; -import static 
org.apache.beam.sdk.io.gcp.bigquery.providers.BigQuerySchemaTransformTranslation.BigQueryStorageWriteSchemaTransformTranslator; -import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider.BigQueryStorageWriteApiSchemaTransform; +import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQuerySchemaTransformTranslation.BigQueryWriteSchemaTransformTranslator; +import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryWriteSchemaTransformProvider.BigQueryWriteSchemaTransform; import static org.junit.Assert.assertEquals; import java.io.IOException; @@ -51,14 +49,12 @@ @RunWith(JUnit4.class) public class BigQuerySchemaTransformTranslationTest { - static final BigQueryStorageWriteApiSchemaTransformProvider STORAGE_WRITE_PROVIDER = - new BigQueryStorageWriteApiSchemaTransformProvider(); - static final BigQueryFileLoadsWriteSchemaTransformProvider FILE_LOADS_PROVIDER = - new BigQueryFileLoadsWriteSchemaTransformProvider(); + static final BigQueryWriteSchemaTransformProvider WRITE_PROVIDER = + new BigQueryWriteSchemaTransformProvider(); static final BigQueryDirectReadSchemaTransformProvider READ_PROVIDER = new BigQueryDirectReadSchemaTransformProvider(); static final Row WRITE_CONFIG_ROW = - Row.withSchema(STORAGE_WRITE_PROVIDER.configurationSchema()) + Row.withSchema(WRITE_PROVIDER.configurationSchema()) .withFieldValue("table", "project:dataset.table") .withFieldValue("create_disposition", "create_never") .withFieldValue("write_disposition", "write_append") @@ -77,91 +73,22 @@ public class BigQuerySchemaTransformTranslationTest { .build(); @Test - public void testRecreateStorageWriteTransformFromRow() { - BigQueryStorageWriteApiSchemaTransform writeTransform = - (BigQueryStorageWriteApiSchemaTransform) STORAGE_WRITE_PROVIDER.from(WRITE_CONFIG_ROW); + public void testRecreateWriteTransformFromRow() { + BigQueryWriteSchemaTransform writeTransform = + (BigQueryWriteSchemaTransform) WRITE_PROVIDER.from(WRITE_CONFIG_ROW); - BigQueryStorageWriteSchemaTransformTranslator translator = - new BigQueryStorageWriteSchemaTransformTranslator(); + BigQueryWriteSchemaTransformTranslator translator = + new BigQueryWriteSchemaTransformTranslator(); Row translatedRow = translator.toConfigRow(writeTransform); - BigQueryStorageWriteApiSchemaTransform writeTransformFromRow = + BigQueryWriteSchemaTransform writeTransformFromRow = translator.fromConfigRow(translatedRow, PipelineOptionsFactory.create()); assertEquals(WRITE_CONFIG_ROW, writeTransformFromRow.getConfigurationRow()); } @Test - public void testRecreateFileLoadsTransformFromRow() { - BigQueryFileLoadsSchemaTransform writeTransform = - (BigQueryFileLoadsSchemaTransform) FILE_LOADS_PROVIDER.from(WRITE_CONFIG_ROW); - - BigQueryFileLoadsSchemaTransformTranslator translator = - new BigQueryFileLoadsSchemaTransformTranslator(); - Row translatedRow = translator.toConfigRow(writeTransform); - - BigQueryFileLoadsSchemaTransform writeTransformFromRow = - translator.fromConfigRow(translatedRow, PipelineOptionsFactory.create()); - - assertEquals(WRITE_CONFIG_ROW, writeTransformFromRow.getConfigurationRow()); - } - - @Test - public void testStorageWriteTransformProtoTranslation() - throws InvalidProtocolBufferException, IOException { - // First build a pipeline - Pipeline p = Pipeline.create(); - Schema inputSchema = Schema.builder().addByteArrayField("b").build(); - PCollection input = - p.apply( - Create.of( - Collections.singletonList( - Row.withSchema(inputSchema).addValue(new byte[] {1, 2, 3}).build()))) - 
.setRowSchema(inputSchema); - - BigQueryStorageWriteApiSchemaTransform writeTransform = - (BigQueryStorageWriteApiSchemaTransform) STORAGE_WRITE_PROVIDER.from(WRITE_CONFIG_ROW); - PCollectionRowTuple.of("input", input).apply(writeTransform); - - // Then translate the pipeline to a proto and extract KafkaWriteSchemaTransform proto - RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p); - List writeTransformProto = - pipelineProto.getComponents().getTransformsMap().values().stream() - .filter( - tr -> { - RunnerApi.FunctionSpec spec = tr.getSpec(); - try { - return spec.getUrn().equals(BeamUrns.getUrn(SCHEMA_TRANSFORM)) - && SchemaTransformPayload.parseFrom(spec.getPayload()) - .getIdentifier() - .equals(STORAGE_WRITE_PROVIDER.identifier()); - } catch (InvalidProtocolBufferException e) { - throw new RuntimeException(e); - } - }) - .collect(Collectors.toList()); - assertEquals(1, writeTransformProto.size()); - RunnerApi.FunctionSpec spec = writeTransformProto.get(0).getSpec(); - - // Check that the proto contains correct values - SchemaTransformPayload payload = SchemaTransformPayload.parseFrom(spec.getPayload()); - Schema schemaFromSpec = SchemaTranslation.schemaFromProto(payload.getConfigurationSchema()); - assertEquals(STORAGE_WRITE_PROVIDER.configurationSchema(), schemaFromSpec); - Row rowFromSpec = RowCoder.of(schemaFromSpec).decode(payload.getConfigurationRow().newInput()); - - assertEquals(WRITE_CONFIG_ROW, rowFromSpec); - - // Use the information in the proto to recreate the KafkaWriteSchemaTransform - BigQueryStorageWriteSchemaTransformTranslator translator = - new BigQueryStorageWriteSchemaTransformTranslator(); - BigQueryStorageWriteApiSchemaTransform writeTransformFromSpec = - translator.fromConfigRow(rowFromSpec, PipelineOptionsFactory.create()); - - assertEquals(WRITE_CONFIG_ROW, writeTransformFromSpec.getConfigurationRow()); - } - - @Test - public void testFileLoadsTransformProtoTranslation() + public void testWriteTransformProtoTranslation() throws InvalidProtocolBufferException, IOException { // First build a pipeline Pipeline p = Pipeline.create(); @@ -173,8 +100,8 @@ public void testFileLoadsTransformProtoTranslation() Row.withSchema(inputSchema).addValue(new byte[] {1, 2, 3}).build()))) .setRowSchema(inputSchema); - BigQueryFileLoadsSchemaTransform writeTransform = - (BigQueryFileLoadsSchemaTransform) FILE_LOADS_PROVIDER.from(WRITE_CONFIG_ROW); + BigQueryWriteSchemaTransform writeTransform = + (BigQueryWriteSchemaTransform) WRITE_PROVIDER.from(WRITE_CONFIG_ROW); PCollectionRowTuple.of("input", input).apply(writeTransform); // Then translate the pipeline to a proto and extract KafkaWriteSchemaTransform proto @@ -188,7 +115,7 @@ public void testFileLoadsTransformProtoTranslation() return spec.getUrn().equals(BeamUrns.getUrn(SCHEMA_TRANSFORM)) && SchemaTransformPayload.parseFrom(spec.getPayload()) .getIdentifier() - .equals(FILE_LOADS_PROVIDER.identifier()); + .equals(WRITE_PROVIDER.identifier()); } catch (InvalidProtocolBufferException e) { throw new RuntimeException(e); } @@ -200,15 +127,15 @@ public void testFileLoadsTransformProtoTranslation() // Check that the proto contains correct values SchemaTransformPayload payload = SchemaTransformPayload.parseFrom(spec.getPayload()); Schema schemaFromSpec = SchemaTranslation.schemaFromProto(payload.getConfigurationSchema()); - assertEquals(FILE_LOADS_PROVIDER.configurationSchema(), schemaFromSpec); + assertEquals(WRITE_PROVIDER.configurationSchema(), schemaFromSpec); Row rowFromSpec = 
RowCoder.of(schemaFromSpec).decode(payload.getConfigurationRow().newInput()); assertEquals(WRITE_CONFIG_ROW, rowFromSpec); // Use the information in the proto to recreate the KafkaWriteSchemaTransform - BigQueryFileLoadsSchemaTransformTranslator translator = - new BigQueryFileLoadsSchemaTransformTranslator(); - BigQueryFileLoadsSchemaTransform writeTransformFromSpec = + BigQueryWriteSchemaTransformTranslator translator = + new BigQueryWriteSchemaTransformTranslator(); + BigQueryWriteSchemaTransform writeTransformFromSpec = translator.fromConfigRow(rowFromSpec, PipelineOptionsFactory.create()); assertEquals(WRITE_CONFIG_ROW, writeTransformFromSpec.getConfigurationRow()); From d6b9e69276f82635d34d2cc83ec3973852b4307a Mon Sep 17 00:00:00 2001 From: Ahmed Abualsaud Date: Thu, 7 Nov 2024 20:19:00 -0500 Subject: [PATCH 23/24] move unit tests to respectvie schematransform test classes --- ...LoadsWriteSchemaTransformProviderTest.java | 27 +++++++++++++++++++ .../bigquery/providers/BigQueryManagedIT.java | 25 ----------------- ...geWriteApiSchemaTransformProviderTest.java | 27 +++++++++++++++++++ 3 files changed, 54 insertions(+), 25 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProviderTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProviderTest.java index 422f82b61029..5c2b764ef2ec 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProviderTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProviderTest.java @@ -17,6 +17,8 @@ */ package org.apache.beam.sdk.io.gcp.bigquery.providers; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.greaterThan; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; @@ -24,6 +26,8 @@ import java.io.IOException; import java.util.Arrays; import java.util.List; +import java.util.stream.Collectors; +import org.apache.beam.model.pipeline.v1.RunnerApi; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; @@ -32,13 +36,17 @@ import org.apache.beam.sdk.io.gcp.testing.FakeBigQueryServices; import org.apache.beam.sdk.io.gcp.testing.FakeDatasetService; import org.apache.beam.sdk.io.gcp.testing.FakeJobService; +import org.apache.beam.sdk.managed.Managed; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.Schema.Field; import org.apache.beam.sdk.schemas.Schema.FieldType; import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.util.construction.PipelineTranslation; +import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionRowTuple; import org.apache.beam.sdk.values.Row; +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap; import org.junit.After; import org.junit.Before; import org.junit.Rule; @@ -116,4 +124,23 @@ public void testLoad() throws IOException, InterruptedException { assertNotNull(fakeDatasetService.getTable(TABLE_REFERENCE)); assertEquals(ROWS.size(), 
fakeDatasetService.getAllRows(PROJECT, DATASET, TABLE_ID).size()); } + + @Test + public void testManagedChoosesFileLoadsForBoundedWrites() { + PCollection batchInput = p.apply(Create.of(ROWS)).setRowSchema(SCHEMA); + batchInput.apply( + Managed.write(Managed.BIGQUERY) + .withConfig(ImmutableMap.of("table", "project.dataset.table"))); + + RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p); + List writeTransformProto = + pipelineProto.getComponents().getTransformsMap().values().stream() + .filter( + tr -> + tr.getUniqueName() + .contains(BigQueryFileLoadsSchemaTransform.class.getSimpleName())) + .collect(Collectors.toList()); + assertThat(writeTransformProto.size(), greaterThan(0)); + p.enableAbandonedNodeEnforcement(false); + } } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryManagedIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryManagedIT.java index b81295df8500..63727107a651 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryManagedIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryManagedIT.java @@ -17,18 +17,11 @@ */ package org.apache.beam.sdk.io.gcp.bigquery.providers; -import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryDirectReadSchemaTransformProvider.BigQueryDirectReadSchemaTransform; -import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryFileLoadsSchemaTransformProvider.BigQueryFileLoadsSchemaTransform; -import static org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider.BigQueryStorageWriteApiSchemaTransform; -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.greaterThan; - import java.io.IOException; import java.util.List; import java.util.Map; import java.util.stream.Collectors; import java.util.stream.LongStream; -import org.apache.beam.model.pipeline.v1.RunnerApi; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; import org.apache.beam.sdk.io.gcp.testing.BigqueryClient; @@ -40,7 +33,6 @@ import org.apache.beam.sdk.transforms.Create; import org.apache.beam.sdk.transforms.MapElements; import org.apache.beam.sdk.transforms.PeriodicImpulse; -import org.apache.beam.sdk.util.construction.PipelineTranslation; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionRowTuple; import org.apache.beam.sdk.values.Row; @@ -95,15 +87,6 @@ public static void cleanup() { BQ_CLIENT.deleteDataset(PROJECT, BIG_QUERY_DATASET_ID); } - private void assertPipelineContainsTransformName(Pipeline p, String transformName) { - RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p); - List writeTransformProto = - pipelineProto.getComponents().getTransformsMap().values().stream() - .filter(tr -> tr.getUniqueName().contains(transformName)) - .collect(Collectors.toList()); - assertThat(writeTransformProto.size(), greaterThan(0)); - } - @Test public void testBatchFileLoadsWriteRead() { String table = @@ -117,8 +100,6 @@ public void testBatchFileLoadsWriteRead() { // batch write PCollectionRowTuple.of("input", getInput(writePipeline, false)) .apply(Managed.write(Managed.BIGQUERY).withConfig(config)); - assertPipelineContainsTransformName( - writePipeline, BigQueryFileLoadsSchemaTransform.class.getSimpleName()); 
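// Illustrative sketch (not part of the patch): the unit test above, and its Storage Write
// API counterpart later in this series, verifies the routing by walking the pipeline proto
// for the chosen transform's simple class name. The repeated filter logic condenses to a
// helper like this.
import org.apache.beam.model.pipeline.v1.RunnerApi;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.util.construction.PipelineTranslation;

class PipelineProtoSketch {
  // Counts transforms whose unique name mentions the given simple name, e.g.
  // "BigQueryFileLoadsSchemaTransform" or "BigQueryStorageWriteApiSchemaTransform".
  static long countTransformsNamed(Pipeline p, String simpleName) {
    RunnerApi.Pipeline proto = PipelineTranslation.toProto(p);
    return proto.getComponents().getTransformsMap().values().stream()
        .filter(tr -> tr.getUniqueName().contains(simpleName))
        .count();
  }
}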
     writePipeline.run().waitUntilFinish();
 
     // read and validate
@@ -127,8 +108,6 @@ public void testBatchFileLoadsWriteRead() {
             .apply(Managed.read(Managed.BIGQUERY).withConfig(config))
             .getSinglePCollection();
     PAssert.that(outputRows).containsInAnyOrder(ROWS);
-    assertPipelineContainsTransformName(
-        readPipeline, BigQueryDirectReadSchemaTransform.class.getSimpleName());
     readPipeline.run().waitUntilFinish();
   }
 
@@ -141,8 +120,6 @@ public void testStreamingStorageWriteRead() {
     // streaming write
     PCollectionRowTuple.of("input", getInput(writePipeline, true))
         .apply(Managed.write(Managed.BIGQUERY).withConfig(config));
-    assertPipelineContainsTransformName(
-        writePipeline, BigQueryStorageWriteApiSchemaTransform.class.getSimpleName());
     writePipeline.run().waitUntilFinish();
 
     // read and validate
@@ -151,8 +128,6 @@ public void testStreamingStorageWriteRead() {
             .apply(Managed.read(Managed.BIGQUERY).withConfig(config))
             .getSinglePCollection();
     PAssert.that(outputRows).containsInAnyOrder(ROWS);
-    assertPipelineContainsTransformName(
-        readPipeline, BigQueryDirectReadSchemaTransform.class.getSimpleName());
     readPipeline.run().waitUntilFinish();
   }
 
diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProviderTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProviderTest.java
index 3a23f5a3205a..7b59552bbbe4 100644
--- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProviderTest.java
+++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryStorageWriteApiSchemaTransformProviderTest.java
@@ -17,6 +17,8 @@
  */
 package org.apache.beam.sdk.io.gcp.bigquery.providers;
 
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.greaterThan;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertThrows;
@@ -32,12 +34,14 @@
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 import java.util.stream.Stream;
+import org.apache.beam.model.pipeline.v1.RunnerApi;
 import org.apache.beam.sdk.PipelineResult;
 import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers;
 import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider.BigQueryStorageWriteApiSchemaTransform;
 import org.apache.beam.sdk.io.gcp.testing.FakeBigQueryServices;
 import org.apache.beam.sdk.io.gcp.testing.FakeDatasetService;
 import org.apache.beam.sdk.io.gcp.testing.FakeJobService;
+import org.apache.beam.sdk.managed.Managed;
 import org.apache.beam.sdk.metrics.MetricNameFilter;
 import org.apache.beam.sdk.metrics.MetricQueryResults;
 import org.apache.beam.sdk.metrics.MetricResult;
@@ -49,13 +53,16 @@
 import org.apache.beam.sdk.schemas.logicaltypes.SqlTypes;
 import org.apache.beam.sdk.testing.PAssert;
 import org.apache.beam.sdk.testing.TestPipeline;
+import org.apache.beam.sdk.testing.TestStream;
 import org.apache.beam.sdk.transforms.Create;
 import org.apache.beam.sdk.transforms.MapElements;
+import org.apache.beam.sdk.util.construction.PipelineTranslation;
 import org.apache.beam.sdk.values.PCollection;
 import org.apache.beam.sdk.values.PCollectionRowTuple;
 import org.apache.beam.sdk.values.Row;
 import org.apache.beam.sdk.values.TypeDescriptors;
 import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList;
+import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap;
 import org.junit.Before;
 import org.junit.Rule;
 import org.junit.Test;
@@ -448,4 +455,24 @@ public void testErrorCount() throws Exception {
       assertEquals(expectedCount, count.getAttempted());
     }
   }
+
+  @Test
+  public void testManagedChoosesStorageApiForUnboundedWrites() {
+    PCollection batchInput =
+        p.apply(TestStream.create(SCHEMA).addElements(ROWS.get(0)).advanceWatermarkToInfinity());
+    batchInput.apply(
+        Managed.write(Managed.BIGQUERY)
+            .withConfig(ImmutableMap.of("table", "project.dataset.table")));
+
+    RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p);
+    List writeTransformProto =
+        pipelineProto.getComponents().getTransformsMap().values().stream()
+            .filter(
+                tr ->
+                    tr.getUniqueName()
+                        .contains(BigQueryStorageWriteApiSchemaTransform.class.getSimpleName()))
+            .collect(Collectors.toList());
+    assertThat(writeTransformProto.size(), greaterThan(0));
+    p.enableAbandonedNodeEnforcement(false);
+  }
 }

From ad4dcd9080a2d29f836a6dd1a4e9a0d6a8684241 Mon Sep 17 00:00:00 2001
From: Ahmed Abualsaud
Date: Mon, 11 Nov 2024 08:30:35 -0500
Subject: [PATCH 24/24] expose to Python SDK as well

---
 .../BigQueryFileLoadsSchemaTransformProvider.java     | 11 +++++++++--
 ...BigQueryFileLoadsSchemaTransformProviderTest.java} |  2 +-
 sdks/python/apache_beam/transforms/managed.py         |  8 +++++++-
 3 files changed, 17 insertions(+), 4 deletions(-)
 rename sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/{BigQueryFileLoadsWriteSchemaTransformProviderTest.java => BigQueryFileLoadsSchemaTransformProviderTest.java} (99%)

diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsSchemaTransformProvider.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsSchemaTransformProvider.java
index 6532c5319657..092cf42a29a4 100644
--- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsSchemaTransformProvider.java
+++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsSchemaTransformProvider.java
@@ -26,6 +26,8 @@
 import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
 import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices;
 import org.apache.beam.sdk.io.gcp.bigquery.BigQueryUtils;
+import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.options.ValueProvider;
 import org.apache.beam.sdk.schemas.transforms.SchemaTransform;
 import org.apache.beam.sdk.schemas.transforms.SchemaTransformProvider;
 import org.apache.beam.sdk.schemas.transforms.TypedSchemaTransformProvider;
@@ -87,18 +89,23 @@ public static class BigQueryFileLoadsSchemaTransform extends SchemaTransform {
     @Override
     public PCollectionRowTuple expand(PCollectionRowTuple input) {
       PCollection rowPCollection = input.getSinglePCollection();
-      BigQueryIO.Write write = toWrite();
+      BigQueryIO.Write write = toWrite(input.getPipeline().getOptions());
       rowPCollection.apply(write);
 
       return PCollectionRowTuple.empty(input.getPipeline());
     }
 
-    BigQueryIO.Write toWrite() {
+    BigQueryIO.Write toWrite(PipelineOptions options) {
       BigQueryIO.Write write =
           BigQueryIO.write()
               .to(configuration.getTable())
               .withMethod(BigQueryIO.Write.Method.FILE_LOADS)
               .withFormatFunction(BigQueryUtils.toTableRow())
+              // TODO(https://github.com/apache/beam/issues/33074) BatchLoad's
+              // createTempFilePrefixView() doesn't pick up the pipeline option
+              .withCustomGcsTempLocation(
+                  ValueProvider.StaticValueProvider.of(options.getTempLocation()))
+              .withWriteDisposition(WriteDisposition.WRITE_APPEND)
               .useBeamSchema();
 
       if (!Strings.isNullOrEmpty(configuration.getCreateDisposition())) {
diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProviderTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsSchemaTransformProviderTest.java
similarity index 99%
rename from sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProviderTest.java
rename to sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsSchemaTransformProviderTest.java
index 5c2b764ef2ec..897d95da3b13 100644
--- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsWriteSchemaTransformProviderTest.java
+++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/providers/BigQueryFileLoadsSchemaTransformProviderTest.java
@@ -57,7 +57,7 @@
 /** Test for {@link BigQueryFileLoadsSchemaTransformProvider}. */
 @RunWith(JUnit4.class)
-public class BigQueryFileLoadsWriteSchemaTransformProviderTest {
+public class BigQueryFileLoadsSchemaTransformProviderTest {
   private static final String PROJECT = "fakeproject";
   private static final String DATASET = "fakedataset";
 
diff --git a/sdks/python/apache_beam/transforms/managed.py b/sdks/python/apache_beam/transforms/managed.py
index 22ee15b1de1c..cbcb6de56ed7 100644
--- a/sdks/python/apache_beam/transforms/managed.py
+++ b/sdks/python/apache_beam/transforms/managed.py
@@ -77,12 +77,16 @@
 ICEBERG = "iceberg"
 KAFKA = "kafka"
+BIGQUERY = "bigquery"
 _MANAGED_IDENTIFIER = "beam:transform:managed:v1"
 _EXPANSION_SERVICE_JAR_TARGETS = {
     "sdks:java:io:expansion-service:shadowJar": [KAFKA, ICEBERG],
+    "sdks:java:io:google-cloud-platform:expansion-service:shadowJar": [
+        BIGQUERY
+    ]
 }
-__all__ = ["ICEBERG", "KAFKA", "Read", "Write"]
+__all__ = ["ICEBERG", "KAFKA", "BIGQUERY", "Read", "Write"]
 
@@ -90,6 +94,7 @@ class Read(PTransform):
   _READ_TRANSFORMS = {
       ICEBERG: ManagedTransforms.Urns.ICEBERG_READ.urn,
       KAFKA: ManagedTransforms.Urns.KAFKA_READ.urn,
+      BIGQUERY: ManagedTransforms.Urns.BIGQUERY_READ.urn
   }
 
   def __init__(
@@ -130,6 +135,7 @@ class Write(PTransform):
   _WRITE_TRANSFORMS = {
       ICEBERG: ManagedTransforms.Urns.ICEBERG_WRITE.urn,
       KAFKA: ManagedTransforms.Urns.KAFKA_WRITE.urn,
+      BIGQUERY: ManagedTransforms.Urns.BIGQUERY_WRITE.urn
   }
 
   def __init__(
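For illustration only (this block is not part of the patch): once BIGQUERY is exposed in managed.py, a Python pipeline should be able to hand a schema'd PCollection to the managed BigQuery sink. The sketch below is hedged: it assumes managed.Write takes the transform identifier plus a config dict, mirroring the existing Iceberg/Kafka usage in this module, and the project/dataset/table name is a placeholder.

    # Hypothetical usage sketch; the table name and element fields are placeholders.
    import apache_beam as beam
    from apache_beam.transforms import managed

    with beam.Pipeline() as p:
        _ = (
            p
            # beam.Row elements give the PCollection a schema, as the managed sink expects.
            | beam.Create([beam.Row(name="a", number=1), beam.Row(name="b", number=2)])
            # "table" matches the config key exercised by the Java BigQuery schema transforms above.
            | managed.Write(managed.BIGQUERY, config={"table": "project.dataset.table"}))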