-
Notifications
You must be signed in to change notification settings - Fork 4.5k
Handle updates to table schema when using Storage API writes. #24145
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,23 +17,50 @@ | |
| */ | ||
| package org.apache.beam.sdk.io.gcp.bigquery; | ||
|
|
||
| import com.google.api.services.bigquery.model.TableRow; | ||
| import com.google.cloud.bigquery.storage.v1.ProtoRows; | ||
| import com.google.protobuf.ByteString; | ||
| import java.util.Iterator; | ||
| import java.util.NoSuchElementException; | ||
| import java.util.function.BiConsumer; | ||
| import java.util.function.Function; | ||
| import javax.annotation.Nullable; | ||
|
|
||
| /** | ||
| * Takes in an iterable and batches the results into multiple ProtoRows objects. The splitSize | ||
| * parameter controls how many rows are batched into a single ProtoRows object before we move on to | ||
| * the next one. | ||
| */ | ||
| class SplittingIterable implements Iterable<ProtoRows> { | ||
| interface ConvertUnknownFields { | ||
|
||
| ByteString convert(TableRow tableRow, boolean ignoreUnknownValues) | ||
| throws TableRowToStorageApiProto.SchemaConversionException; | ||
| } | ||
|
|
||
| private final Iterable<StorageApiWritePayload> underlying; | ||
| private final long splitSize; | ||
|
|
||
| public SplittingIterable(Iterable<StorageApiWritePayload> underlying, long splitSize) { | ||
| private final ConvertUnknownFields unknownFieldsToMessage; | ||
| private final Function<ByteString, TableRow> protoToTableRow; | ||
| private final BiConsumer<TableRow, String> failedRowsConsumer; | ||
| private final boolean autoUpdateSchema; | ||
| private final boolean ignoreUnknownValues; | ||
|
|
||
| public SplittingIterable( | ||
| Iterable<StorageApiWritePayload> underlying, | ||
| long splitSize, | ||
| ConvertUnknownFields unknownFieldsToMessage, | ||
| Function<ByteString, TableRow> protoToTableRow, | ||
| BiConsumer<TableRow, String> failedRowsConsumer, | ||
| boolean autoUpdateSchema, | ||
| boolean ignoreUnknownValues) { | ||
| this.underlying = underlying; | ||
| this.splitSize = splitSize; | ||
| this.unknownFieldsToMessage = unknownFieldsToMessage; | ||
| this.protoToTableRow = protoToTableRow; | ||
| this.failedRowsConsumer = failedRowsConsumer; | ||
| this.autoUpdateSchema = autoUpdateSchema; | ||
| this.ignoreUnknownValues = ignoreUnknownValues; | ||
| } | ||
|
|
||
| @Override | ||
|
|
@@ -57,7 +84,37 @@ public ProtoRows next() { | |
| while (underlyingIterator.hasNext()) { | ||
| StorageApiWritePayload payload = underlyingIterator.next(); | ||
| ByteString byteString = ByteString.copyFrom(payload.getPayload()); | ||
|
|
||
| if (autoUpdateSchema) { | ||
| try { | ||
| @Nullable TableRow unknownFields = payload.getUnknownFields(); | ||
| if (unknownFields != null) { | ||
| // Protocol buffer serialization format supports concatenation. We serialize any new | ||
| // "known" fields | ||
| // into a proto and concatenate to the existing proto. | ||
| try { | ||
| byteString = | ||
| byteString.concat( | ||
|
||
| unknownFieldsToMessage.convert(unknownFields, ignoreUnknownValues)); | ||
| } catch (TableRowToStorageApiProto.SchemaConversionException e) { | ||
| // This generally implies that ignoreUnknownValues=false and there were still | ||
| // unknown values here. | ||
| // Reconstitute the TableRow and send it to the failed-rows consumer. | ||
| TableRow tableRow = protoToTableRow.apply(byteString); | ||
| // TODO(24926, reuvenlax): We need to merge the unknown fields in! Currently we | ||
| // only execute this | ||
| // codepath when ignoreUnknownFields==true, so we should never hit this codepath. | ||
| // However once | ||
| // 24926 is fixed, we need to merge the unknownFields back into the main row | ||
| // before outputting to the | ||
| // failed-rows consumer. | ||
| failedRowsConsumer.accept(tableRow, e.toString()); | ||
| continue; | ||
| } | ||
| } | ||
| } catch (Exception e) { | ||
| throw new RuntimeException(e); | ||
| } | ||
| } | ||
| inserts.addSerializedRows(byteString); | ||
| bytesSize += byteString.size(); | ||
| if (bytesSize > splitSize) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -35,6 +35,8 @@ public interface MessageConverter<T> { | |
|
|
||
| StorageApiWritePayload toMessage(T element) throws Exception; | ||
|
|
||
| StorageApiWritePayload toMessage(TableRow tableRow, boolean respectRequired) throws Exception; | ||
|
||
|
|
||
| TableRow toTableRow(T element); | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If ignoreUnknownValues is false, will this be a no-op?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No - these are fields that are unknown to the prior step. They may actually end up being known to the current step due to the updated schema.