From 7f70c38d4f257eacebd902b586264242872db816 Mon Sep 17 00:00:00 2001 From: damccorm Date: Fri, 16 Jan 2026 19:28:30 +0000 Subject: [PATCH] Update managed-io.md for release 2.71.0-RC3. --- .../content/en/documentation/io/managed-io.md | 1022 ++++++----------- 1 file changed, 373 insertions(+), 649 deletions(-) diff --git a/website/www/site/content/en/documentation/io/managed-io.md b/website/www/site/content/en/documentation/io/managed-io.md index ced0443c6954..fab9e79e71a6 100644 --- a/website/www/site/content/en/documentation/io/managed-io.md +++ b/website/www/site/content/en/documentation/io/managed-io.md @@ -58,6 +58,31 @@ and Beam SQL is invoked via the Managed API under the hood. Read Configuration Write Configuration + + ICEBERG + + table (str)
+ catalog_name (str)
+ catalog_properties (map[str, str])
+ config_properties (map[str, str])
+ drop (list[str])
+ filter (str)
+ keep (list[str])
+ + + table (str)
+ catalog_name (str)
+ catalog_properties (map[str, str])
+ config_properties (map[str, str])
+ direct_write_byte_limit (int32)
+ drop (list[str])
+ keep (list[str])
+ only (str)
+ partition_fields (list[str])
+ table_properties (map[str, str])
+ triggering_frequency_seconds (int32)
+ + KAFKA @@ -86,31 +111,6 @@ and Beam SQL is invoked via the Managed API under the hood. schema (str)
- - ICEBERG - - table (str)
- catalog_name (str)
- catalog_properties (map[str, str])
- config_properties (map[str, str])
- drop (list[str])
- filter (str)
- keep (list[str])
- - - table (str)
- catalog_name (str)
- catalog_properties (map[str, str])
- config_properties (map[str, str])
- direct_write_byte_limit (int32)
- drop (list[str])
- keep (list[str])
- only (str)
- partition_fields (list[str])
- table_properties (map[str, str])
- triggering_frequency_seconds (int32)
- - ICEBERG_CDC @@ -134,34 +134,12 @@ and Beam SQL is invoked via the Managed API under the hood. - BIGQUERY - - kms_key (str)
- query (str)
- row_restriction (str)
- fields (list[str])
- table (str)
- - - table (str)
- drop (list[str])
- keep (list[str])
- kms_key (str)
- only (str)
- triggering_frequency_seconds (int64)
- - - - POSTGRES + SQLSERVER jdbc_url (str)
- connection_init_sql (list[str])
connection_properties (str)
disable_auto_commit (boolean)
- driver_class_name (str)
- driver_jars (str)
fetch_size (int32)
- jdbc_type (str)
location (str)
num_partitions (int32)
output_parallelization (boolean)
@@ -174,11 +152,7 @@ and Beam SQL is invoked via the Managed API under the hood. jdbc_url (str)
autosharding (boolean)
batch_size (int64)
- connection_init_sql (list[str])
connection_properties (str)
- driver_class_name (str)
- driver_jars (str)
- jdbc_type (str)
location (str)
password (str)
username (str)
@@ -186,16 +160,13 @@ and Beam SQL is invoked via the Managed API under the hood. - SQLSERVER + MYSQL jdbc_url (str)
connection_init_sql (list[str])
connection_properties (str)
disable_auto_commit (boolean)
- driver_class_name (str)
- driver_jars (str)
fetch_size (int32)
- jdbc_type (str)
location (str)
num_partitions (int32)
output_parallelization (boolean)
@@ -210,9 +181,6 @@ and Beam SQL is invoked via the Managed API under the hood. batch_size (int64)
connection_init_sql (list[str])
connection_properties (str)
- driver_class_name (str)
- driver_jars (str)
- jdbc_type (str)
location (str)
password (str)
username (str)
@@ -220,16 +188,29 @@ and Beam SQL is invoked via the Managed API under the hood. - MYSQL + BIGQUERY + + kms_key (str)
+ query (str)
+ row_restriction (str)
+ fields (list[str])
+ table (str)
+ + + table (str)
+ drop (list[str])
+ keep (list[str])
+ kms_key (str)
+ only (str)
+ triggering_frequency_seconds (int64)
+ + + + POSTGRES jdbc_url (str)
- connection_init_sql (list[str])
connection_properties (str)
- disable_auto_commit (boolean)
- driver_class_name (str)
- driver_jars (str)
fetch_size (int32)
- jdbc_type (str)
location (str)
num_partitions (int32)
output_parallelization (boolean)
@@ -242,11 +223,7 @@ and Beam SQL is invoked via the Managed API under the hood. jdbc_url (str)
autosharding (boolean)
batch_size (int64)
- connection_init_sql (list[str])
connection_properties (str)
- driver_class_name (str)
- driver_jars (str)
- jdbc_type (str)
location (str)
password (str)
username (str)
@@ -258,7 +235,7 @@ and Beam SQL is invoked via the Managed API under the hood. ## Configuration Details -### `KAFKA` Write +### `ICEBERG` Write
@@ -269,245 +246,135 @@ and Beam SQL is invoked via the Managed API under the hood. - - - - - - - - - - - - - - - -
- bootstrap_servers - - str - - A list of host/port pairs to use for establishing the initial connection to the Kafka cluster. The client will make use of all servers irrespective of which servers are specified here for bootstrapping—this list only impacts the initial hosts used to discover the full set of servers. | Format: host1:port1,host2:port2,... -
- format - - str - - The encoding format for the data stored in Kafka. Valid options are: RAW,JSON,AVRO,PROTO -
- topic - - str - - n/a -
- file_descriptor_path + table str - The path to the Protocol Buffer File Descriptor Set file. This file is used for schema definition and message serialization. + A fully-qualified table identifier. You may also provide a template to write to multiple dynamic destinations, for example: `dataset.my_{col1}_{col2.nested}_table`.
- message_name + catalog_name str - The name of the Protocol Buffer message to be used for schema extraction and data conversion. + Name of the catalog containing the table.
- producer_config_updates + catalog_properties map[str, str] - A list of key-value pairs that act as configuration parameters for Kafka producers. Most of these configurations will not be needed, but if you need to customize your Kafka producer, you may use this. See a detailed list: https://docs.confluent.io/platform/current/installation/configuration/producer-configs.html + Properties used to set up the Iceberg catalog.
- schema + config_properties - str + map[str, str] - n/a + Properties passed to the Hadoop Configuration.
-
- -### `KAFKA` Read - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ConfigurationTypeDescription
- bootstrap_servers + direct_write_byte_limit - str + int32 - A list of host/port pairs to use for establishing the initial connection to the Kafka cluster. The client will make use of all servers irrespective of which servers are specified here for bootstrapping—this list only impacts the initial hosts used to discover the full set of servers. This list should be in the form `host1:port1,host2:port2,...` + For a streaming pipeline, sets the limit for lifting bundles into the direct write path.
- topic + drop - str + list[str] - n/a + A list of field names to drop from the input record before writing. Is mutually exclusive with 'keep' and 'only'.
- allow_duplicates + keep - boolean + list[str] - If the Kafka read allows duplicates. + A list of field names to keep in the input record. All other fields are dropped before writing. Is mutually exclusive with 'drop' and 'only'.
- confluent_schema_registry_subject + only str - n/a + The name of a single record field that should be written. Is mutually exclusive with 'keep' and 'drop'.
- confluent_schema_registry_url + partition_fields - str + list[str] - n/a + Fields used to create a partition spec that is applied when tables are created. For a field 'foo', the available partition transforms are: + +- `foo` +- `truncate(foo, N)` +- `bucket(foo, N)` +- `hour(foo)` +- `day(foo)` +- `month(foo)` +- `year(foo)` +- `void(foo)` + +For more information on partition transforms, please visit https://iceberg.apache.org/spec/#partition-transforms.
- consumer_config_updates + table_properties map[str, str] - A list of key-value pairs that act as configuration parameters for Kafka consumers. Most of these configurations will not be needed, but if you need to customize your Kafka consumer, you may use this. See a detailed list: https://docs.confluent.io/platform/current/installation/configuration/consumer-configs.html -
- file_descriptor_path - - str - - The path to the Protocol Buffer File Descriptor Set file. This file is used for schema definition and message serialization. -
- format - - str - - The encoding format for the data stored in Kafka. Valid options are: RAW,STRING,AVRO,JSON,PROTO -
- message_name - - str - - The name of the Protocol Buffer message to be used for schema extraction and data conversion. -
- offset_deduplication - - boolean - - If the redistribute is using offset deduplication mode. -
- redistribute_by_record_key - - boolean - - If the redistribute keys by the Kafka record key. + Iceberg table properties to be set on the table when it is created. +For more information on table properties, please visit https://iceberg.apache.org/docs/latest/configuration/#table-properties.
- redistribute_num_keys + triggering_frequency_seconds int32 - The number of keys for redistributing Kafka inputs. -
- redistributed - - boolean - - If the Kafka read should be redistributed. -
- schema - - str - - The schema in which the data is encoded in the Kafka topic. For AVRO data, this is a schema defined with AVRO schema syntax (https://avro.apache.org/docs/1.10.2/spec.html#schemas). For JSON data, this is a schema defined with JSON-schema syntax (https://json-schema.org/). If a URL to Confluent Schema Registry is provided, then this field is ignored, and the schema is fetched from Confluent Schema Registry. + For a streaming pipeline, sets the frequency at which snapshots are produced.
@@ -602,7 +469,7 @@ and Beam SQL is invoked via the Managed API under the hood.
-### `ICEBERG` Write +### `KAFKA` Read
@@ -613,307 +480,162 @@ and Beam SQL is invoked via the Managed API under the hood. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- table - - str - - A fully-qualified table identifier. You may also provide a template to write to multiple dynamic destinations, for example: `dataset.my_{col1}_{col2.nested}_table`. -
- catalog_name + bootstrap_servers str - Name of the catalog containing the table. -
- catalog_properties - - map[str, str] - - Properties used to set up the Iceberg catalog. -
- config_properties - - map[str, str] - - Properties passed to the Hadoop Configuration. -
- direct_write_byte_limit - - int32 - - For a streaming pipeline, sets the limit for lifting bundles into the direct write path. -
- drop - - list[str] - - A list of field names to drop from the input record before writing. Is mutually exclusive with 'keep' and 'only'. -
- keep - - list[str] - - A list of field names to keep in the input record. All other fields are dropped before writing. Is mutually exclusive with 'drop' and 'only'. + A list of host/port pairs to use for establishing the initial connection to the Kafka cluster. The client will make use of all servers irrespective of which servers are specified here for bootstrapping—this list only impacts the initial hosts used to discover the full set of servers. This list should be in the form `host1:port1,host2:port2,...`
- only + topic str - The name of a single record field that should be written. Is mutually exclusive with 'keep' and 'drop'. -
- partition_fields - - list[str] - - Fields used to create a partition spec that is applied when tables are created. For a field 'foo', the available partition transforms are: - -- `foo` -- `truncate(foo, N)` -- `bucket(foo, N)` -- `hour(foo)` -- `day(foo)` -- `month(foo)` -- `year(foo)` -- `void(foo)` - -For more information on partition transforms, please visit https://iceberg.apache.org/spec/#partition-transforms. -
- table_properties - - map[str, str] - - Iceberg table properties to be set on the table when it is created. -For more information on table properties, please visit https://iceberg.apache.org/docs/latest/configuration/#table-properties. + n/a
- triggering_frequency_seconds + allow_duplicates - int32 + boolean - For a streaming pipeline, sets the frequency at which snapshots are produced. + If the Kafka read allows duplicates.
-
- -### `ICEBERG_CDC` Read - -
- - - - - - - - - - - - - - - - - - - - -
ConfigurationTypeDescription
- table + confluent_schema_registry_subject str - Identifier of the Iceberg table. + n/a
- catalog_name + confluent_schema_registry_url str - Name of the catalog containing the table. -
- catalog_properties - - map[str, str] - - Properties used to set up the Iceberg catalog. -
- config_properties - - map[str, str] - - Properties passed to the Hadoop Configuration. -
- drop - - list[str] - - A subset of column names to exclude from reading. If null or empty, all columns will be read. + n/a
- filter + consumer_config_updates - str + map[str, str] - SQL-like predicate to filter data at scan time. Example: "id > 5 AND status = 'ACTIVE'". Uses Apache Calcite syntax: https://calcite.apache.org/docs/reference.html + A list of key-value pairs that act as configuration parameters for Kafka consumers. Most of these configurations will not be needed, but if you need to customize your Kafka consumer, you may use this. See a detailed list: https://docs.confluent.io/platform/current/installation/configuration/consumer-configs.html
- from_snapshot + file_descriptor_path - int64 + str - Starts reading from this snapshot ID (inclusive). + The path to the Protocol Buffer File Descriptor Set file. This file is used for schema definition and message serialization.
- from_timestamp + format - int64 + str - Starts reading from the first snapshot (inclusive) that was created after this timestamp (in milliseconds). + The encoding format for the data stored in Kafka. Valid options are: RAW,STRING,AVRO,JSON,PROTO
- keep + message_name - list[str] + str - A subset of column names to read exclusively. If null or empty, all columns will be read. + The name of the Protocol Buffer message to be used for schema extraction and data conversion.
- poll_interval_seconds + offset_deduplication - int32 + boolean - The interval at which to poll for new snapshots. Defaults to 60 seconds. + If the redistribute is using offset deduplication mode.
- starting_strategy + redistribute_by_record_key - str + boolean - The source's starting strategy. Valid options are: "earliest" or "latest". Can be overriden by setting a starting snapshot or timestamp. Defaults to earliest for batch, and latest for streaming. + If the redistribute keys by the Kafka record key.
- streaming + redistribute_num_keys - boolean + int32 - Enables streaming reads, where source continuously polls for snapshots forever. + The number of keys for redistributing Kafka inputs.
- to_snapshot + redistributed - int64 + boolean - Reads up to this snapshot ID (inclusive). + If the Kafka read should be redistributed.
- to_timestamp + schema - int64 + str - Reads up to the latest snapshot (inclusive) created before this timestamp (in milliseconds). + The schema in which the data is encoded in the Kafka topic. For AVRO data, this is a schema defined with AVRO schema syntax (https://avro.apache.org/docs/1.10.2/spec.html#schemas). For JSON data, this is a schema defined with JSON-schema syntax (https://json-schema.org/). If a URL to Confluent Schema Registry is provided, then this field is ignored, and the schema is fetched from Confluent Schema Registry.
-### `BIGQUERY` Read +### `KAFKA` Write
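+
+A minimal Python sketch of a Managed Kafka write follows. The broker list and topic are illustrative placeholders, and `beam.managed.KAFKA` is assumed to be available in your Beam release:
+
+```python
+import apache_beam as beam
+
+with beam.Pipeline() as p:
+    _ = (
+        p
+        | beam.Create([beam.Row(id=1, name="a")])
+        | beam.managed.Write(
+            beam.managed.KAFKA,
+            config={
+                # Placeholder broker list and topic.
+                "bootstrap_servers": "broker1:9092,broker2:9092",
+                "topic": "my_topic",
+                "format": "JSON",
+            }))
+```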
@@ -924,63 +646,85 @@ For more information on table properties, please visit https://iceberg.apache.or + + + + + + + + + +
- kms_key + bootstrap_servers str - Use this Cloud KMS key to encrypt your data + A list of host/port pairs to use for establishing the initial connection to the Kafka cluster. The client will make use of all servers irrespective of which servers are specified here for bootstrapping—this list only impacts the initial hosts used to discover the full set of servers. | Format: host1:port1,host2:port2,...
- query + format str - The SQL query to be executed to read from the BigQuery table. + The encoding format for the data stored in Kafka. Valid options are: RAW,JSON,AVRO,PROTO
- row_restriction + topic str - Read only rows that match this filter, which must be compatible with Google standard SQL. This is not supported when reading via query. + n/a
- fields + file_descriptor_path - list[str] + str - Read only the specified fields (columns) from a BigQuery table. Fields may not be returned in the order specified. If no value is specified, then all fields are returned. Example: "col1, col2, col3" + The path to the Protocol Buffer File Descriptor Set file. This file is used for schema definition and message serialization.
- table + message_name str - The fully-qualified name of the BigQuery table to read from. Format: [${PROJECT}:]${DATASET}.${TABLE} + The name of the Protocol Buffer message to be used for schema extraction and data conversion. +
+ producer_config_updates + + map[str, str] + + A list of key-value pairs that act as configuration parameters for Kafka producers. Most of these configurations will not be needed, but if you need to customize your Kafka producer, you may use this. See a detailed list: https://docs.confluent.io/platform/current/installation/configuration/producer-configs.html +
+ schema + + str + + n/a
-### `BIGQUERY` Write +### `ICEBERG_CDC` Read
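+
+A minimal Python sketch of a streaming Managed Iceberg CDC read follows. The table, catalog, and warehouse path are illustrative placeholders, and `beam.managed.ICEBERG_CDC` is assumed to be available in your Beam release:
+
+```python
+import apache_beam as beam
+
+with beam.Pipeline() as p:
+    _ = (
+        p
+        | beam.managed.Read(
+            beam.managed.ICEBERG_CDC,
+            config={
+                # Placeholder identifiers; replace with your own table and catalog.
+                "table": "db.my_table",
+                "catalog_name": "my_catalog",
+                "catalog_properties": {
+                    "type": "hadoop",
+                    "warehouse": "gs://my-bucket/warehouse",
+                },
+                # Poll for new snapshots every 60 seconds, indefinitely.
+                "streaming": True,
+                "poll_interval_seconds": 60,
+            })
+        | beam.Map(print))
+```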
@@ -997,306 +741,306 @@ For more information on table properties, please visit https://iceberg.apache.or str -
- The bigquery table to write to. Format: [${PROJECT}:]${DATASET}.${TABLE} + Identifier of the Iceberg table.
- drop + catalog_name - list[str] + str - A list of field names to drop from the input record before writing. Is mutually exclusive with 'keep' and 'only'. + Name of the catalog containing the table.
- keep + catalog_properties - list[str] + map[str, str] - A list of field names to keep in the input record. All other fields are dropped before writing. Is mutually exclusive with 'drop' and 'only'. + Properties used to set up the Iceberg catalog.
- kms_key + config_properties - str + map[str, str] - Use this Cloud KMS key to encrypt your data + Properties passed to the Hadoop Configuration.
- only + drop - str + list[str] - The name of a single record field that should be written. Is mutually exclusive with 'keep' and 'drop'. + A subset of column names to exclude from reading. If null or empty, all columns will be read.
- triggering_frequency_seconds + filter - int64 + str - Determines how often to 'commit' progress into BigQuery. Default is every 5 seconds. + SQL-like predicate to filter data at scan time. Example: "id > 5 AND status = 'ACTIVE'". Uses Apache Calcite syntax: https://calcite.apache.org/docs/reference.html
-
- -### `POSTGRES` Write - -
- - - - - - +
ConfigurationTypeDescription
- jdbc_url + from_snapshot - str + int64 - Connection URL for the JDBC sink. + Starts reading from this snapshot ID (inclusive).
- autosharding + from_timestamp - boolean + int64 - If true, enables using a dynamically determined number of shards to write. + Starts reading from the first snapshot (inclusive) that was created after this timestamp (in milliseconds).
- batch_size + keep - int64 + list[str] - n/a + A subset of column names to read exclusively. If null or empty, all columns will be read.
- connection_init_sql + poll_interval_seconds - list[str] + int32 - Sets the connection init sql statements used by the Driver. Only MySQL and MariaDB support this. + The interval at which to poll for new snapshots. Defaults to 60 seconds.
- connection_properties + starting_strategy str - Used to set connection properties passed to the JDBC driver not already defined as standalone parameter (e.g. username and password can be set using parameters above accordingly). Format of the string must be "key1=value1;key2=value2;". + The source's starting strategy. Valid options are: "earliest" or "latest". Can be overriden by setting a starting snapshot or timestamp. Defaults to earliest for batch, and latest for streaming.
- driver_class_name + streaming - str + boolean - Name of a Java Driver class to use to connect to the JDBC source. For example, "com.mysql.jdbc.Driver". + Enables streaming reads, where source continuously polls for snapshots forever.
- driver_jars + to_snapshot - str + int64 - Comma separated path(s) for the JDBC driver jar(s). This can be a local path or GCS (gs://) path. + Reads up to this snapshot ID (inclusive).
- jdbc_type + to_timestamp - str + int64 - Type of JDBC source. When specified, an appropriate default Driver will be packaged with the transform. One of mysql, postgres, oracle, or mssql. + Reads up to the latest snapshot (inclusive) created before this timestamp (in milliseconds).
+
+ +### `SQLSERVER` Write + +
+ + + + + + -
ConfigurationTypeDescription
- location + jdbc_url str - Name of the table to write to. + Connection URL for the JDBC sink.
- password + autosharding - str + boolean - Password for the JDBC source. + If true, enables using a dynamically determined number of shards to write.
- username + batch_size - str + int64 - Username for the JDBC source. + n/a
- write_statement + connection_properties str - SQL query used to insert records into the JDBC sink. + Used to set connection properties passed to the JDBC driver not already defined as standalone parameter (e.g. username and password can be set using parameters above accordingly). Format of the string must be "key1=value1;key2=value2;".
-
- -### `POSTGRES` Read - -
- - - - - - +
ConfigurationTypeDescription
- jdbc_url + location str - Connection URL for the JDBC source. + Name of the table to write to.
- connection_init_sql + password - list[str] + str - Sets the connection init sql statements used by the Driver. Only MySQL and MariaDB support this. + Password for the JDBC source.
- connection_properties + username str - Used to set connection properties passed to the JDBC driver not already defined as standalone parameter (e.g. username and password can be set using parameters above accordingly). Format of the string must be "key1=value1;key2=value2;". + Username for the JDBC source.
- disable_auto_commit + write_statement - boolean + str - Whether to disable auto commit on read. Defaults to true if not provided. The need for this config varies depending on the database platform. Informix requires this to be set to false while Postgres requires this to be set to true. + SQL query used to insert records into the JDBC sink.
+
+ +### `SQLSERVER` Read + +
+ + + + + + @@ -1379,7 +1123,7 @@ For more information on table properties, please visit https://iceberg.apache.or
ConfigurationTypeDescription
- driver_class_name + jdbc_url str - Name of a Java Driver class to use to connect to the JDBC source. For example, "com.mysql.jdbc.Driver". + Connection URL for the JDBC source.
- driver_jars + connection_properties str - Comma separated path(s) for the JDBC driver jar(s). This can be a local path or GCS (gs://) path. + Used to set connection properties passed to the JDBC driver not already defined as standalone parameter (e.g. username and password can be set using parameters above accordingly). Format of the string must be "key1=value1;key2=value2;".
- fetch_size + disable_auto_commit - int32 + boolean - This method is used to override the size of the data that is going to be fetched and loaded in memory per every database call. It should ONLY be used if the default value throws memory errors. + Whether to disable auto commit on read. Defaults to true if not provided. The need for this config varies depending on the database platform. Informix requires this to be set to false while Postgres requires this to be set to true.
- jdbc_type + fetch_size - str + int32 - Type of JDBC source. When specified, an appropriate default Driver will be packaged with the transform. One of mysql, postgres, oracle, or mssql. + This method is used to override the size of the data that is going to be fetched and loaded in memory per every database call. It should ONLY be used if the default value throws memory errors.
-### `SQLSERVER` Read +### `MYSQL` Read
@@ -1432,28 +1176,6 @@ For more information on table properties, please visit https://iceberg.apache.or Whether to disable auto commit on read. Defaults to true if not provided. The need for this config varies depending on the database platform. Informix requires this to be set to false while Postgres requires this to be set to true. - - - - - - - - - - - - - - -
- driver_class_name - - str - - Name of a Java Driver class to use to connect to the JDBC source. For example, "com.mysql.jdbc.Driver". -
- driver_jars - - str - - Comma separated path(s) for the JDBC driver jar(s). This can be a local path or GCS (gs://) path. -
fetch_size @@ -1465,17 +1187,6 @@ For more information on table properties, please visit https://iceberg.apache.or This method is used to override the size of the data that is going to be fetched and loaded in memory per every database call. It should ONLY be used if the default value throws memory errors.
- jdbc_type - - str - - Type of JDBC source. When specified, an appropriate default Driver will be packaged with the transform. One of mysql, postgres, oracle, or mssql. -
location @@ -1556,7 +1267,7 @@ For more information on table properties, please visit https://iceberg.apache.or
-### `SQLSERVER` Write +### `MYSQL` Write
@@ -1622,223 +1333,258 @@ For more information on table properties, please visit https://iceberg.apache.or +
- driver_class_name + location str - Name of a Java Driver class to use to connect to the JDBC source. For example, "com.mysql.jdbc.Driver". + Name of the table to write to.
- driver_jars + password str - Comma separated path(s) for the JDBC driver jar(s). This can be a local path or GCS (gs://) path. + Password for the JDBC source.
- jdbc_type + username str - Type of JDBC source. When specified, an appropriate default Driver will be packaged with the transform. One of mysql, postgres, oracle, or mssql. + Username for the JDBC source.
- location + write_statement str - Name of the table to write to. + SQL query used to insert records into the JDBC sink.
+
+ +### `BIGQUERY` Write + +
+ + + + + + -
ConfigurationTypeDescription
- password + table str - Password for the JDBC source. + The bigquery table to write to. Format: [${PROJECT}:]${DATASET}.${TABLE}
- username + drop - str + list[str] - Username for the JDBC source. + A list of field names to drop from the input record before writing. Is mutually exclusive with 'keep' and 'only'.
- write_statement + keep - str + list[str] - SQL query used to insert records into the JDBC sink. + A list of field names to keep in the input record. All other fields are dropped before writing. Is mutually exclusive with 'drop' and 'only'.
-
- -### `MYSQL` Read - -
- - - - + + + +
ConfigurationTypeDescription + kms_key + + str + + Use this Cloud KMS key to encrypt your data +
- jdbc_url + only str - Connection URL for the JDBC source. + The name of a single record field that should be written. Is mutually exclusive with 'keep' and 'drop'.
- connection_init_sql + triggering_frequency_seconds - list[str] + int64 - Sets the connection init sql statements used by the Driver. Only MySQL and MariaDB support this. + Determines how often to 'commit' progress into BigQuery. Default is every 5 seconds.
+
+ +### `BIGQUERY` Read + +
+ + + + + + +
ConfigurationTypeDescription
- connection_properties + kms_key str - Used to set connection properties passed to the JDBC driver not already defined as standalone parameter (e.g. username and password can be set using parameters above accordingly). Format of the string must be "key1=value1;key2=value2;". + Use this Cloud KMS key to encrypt your data
- disable_auto_commit + query - boolean + str - Whether to disable auto commit on read. Defaults to true if not provided. The need for this config varies depending on the database platform. Informix requires this to be set to false while Postgres requires this to be set to true. + The SQL query to be executed to read from the BigQuery table.
- driver_class_name + row_restriction str - Name of a Java Driver class to use to connect to the JDBC source. For example, "com.mysql.jdbc.Driver". + Read only rows that match this filter, which must be compatible with Google standard SQL. This is not supported when reading via query.
- driver_jars + fields - str + list[str] - Comma separated path(s) for the JDBC driver jar(s). This can be a local path or GCS (gs://) path. + Read only the specified fields (columns) from a BigQuery table. Fields may not be returned in the order specified. If no value is specified, then all fields are returned. Example: "col1, col2, col3"
- fetch_size + table - int32 + str - This method is used to override the size of the data that is going to be fetched and loaded in memory per every database call. It should ONLY be used if the default value throws memory errors. + The fully-qualified name of the BigQuery table to read from. Format: [${PROJECT}:]${DATASET}.${TABLE}
+
+ +### `POSTGRES` Write + +
+ + + + + + @@ -1854,30 +1600,30 @@ For more information on table properties, please visit https://iceberg.apache.or
ConfigurationTypeDescription
- jdbc_type + jdbc_url str - Type of JDBC source. When specified, an appropriate default Driver will be packaged with the transform. One of mysql, postgres, oracle, or mssql. + Connection URL for the JDBC sink.
- location + autosharding - str + boolean - Name of the table to read from. + If true, enables using a dynamically determined number of shards to write.
- num_partitions + batch_size - int32 + int64 - The number of partitions + n/a
- output_parallelization + connection_properties - boolean + str - Whether to reshuffle the resulting PCollection so results are distributed to all workers. + Used to set connection properties passed to the JDBC driver not already defined as standalone parameter (e.g. username and password can be set using parameters above accordingly). Format of the string must be "key1=value1;key2=value2;".
- partition_column + location str - Name of a column of numeric type that will be used for partitioning. + Name of the table to write to.
- read_query + username str - SQL query used to query the JDBC source. + Username for the JDBC source.
- username + write_statement str - Username for the JDBC source. + SQL query used to insert records into the JDBC sink.
-### `MYSQL` Write +### `POSTGRES` Read
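+
+A minimal Python sketch of a Managed Postgres read follows. The `"postgres"` identifier, JDBC URL, table, and credentials are assumptions for illustration; check the `apache_beam.managed` module in your release for the exact identifier it exposes:
+
+```python
+import apache_beam as beam
+
+with beam.Pipeline() as p:
+    _ = (
+        p
+        | beam.managed.Read(
+            "postgres",  # assumed transform identifier
+            config={
+                # Placeholder connection details.
+                "jdbc_url": "jdbc:postgresql://host:5432/my_db",
+                "location": "my_table",
+                "username": "user",
+                "password": "pass",
+            })
+        | beam.Map(print))
+```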
@@ -1894,95 +1640,73 @@ For more information on table properties, please visit https://iceberg.apache.or str - - - - - - - - - - @@ -1998,24 +1722,24 @@ For more information on table properties, please visit https://iceberg.apache.or
- Connection URL for the JDBC sink. -
- autosharding - - boolean - - If true, enables using a dynamically determined number of shards to write. -
- batch_size - - int64 - - n/a + Connection URL for the JDBC source.
- connection_init_sql + connection_properties - list[str] + str - Sets the connection init sql statements used by the Driver. Only MySQL and MariaDB support this. + Used to set connection properties passed to the JDBC driver not already defined as standalone parameter (e.g. username and password can be set using parameters above accordingly). Format of the string must be "key1=value1;key2=value2;".
- connection_properties + fetch_size - str + int32 - Used to set connection properties passed to the JDBC driver not already defined as standalone parameter (e.g. username and password can be set using parameters above accordingly). Format of the string must be "key1=value1;key2=value2;". + This method is used to override the size of the data that is going to be fetched and loaded in memory per every database call. It should ONLY be used if the default value throws memory errors.
- driver_class_name + location str - Name of a Java Driver class to use to connect to the JDBC source. For example, "com.mysql.jdbc.Driver". + Name of the table to read from.
- driver_jars + num_partitions - str + int32 - Comma separated path(s) for the JDBC driver jar(s). This can be a local path or GCS (gs://) path. + The number of partitions
- jdbc_type + output_parallelization - str + boolean - Type of JDBC source. When specified, an appropriate default Driver will be packaged with the transform. One of mysql, postgres, oracle, or mssql. + Whether to reshuffle the resulting PCollection so results are distributed to all workers.
- location + partition_column str - Name of the table to write to. + Name of a column of numeric type that will be used for partitioning.
- username + read_query str - Username for the JDBC source. + SQL query used to query the JDBC source.
- write_statement + username str - SQL query used to insert records into the JDBC sink. + Username for the JDBC source.