diff --git a/.github/trigger_files/beam_PostCommit_Python.json b/.github/trigger_files/beam_PostCommit_Python.json index c6ec17f48412..06bd728be6d7 100644 --- a/.github/trigger_files/beam_PostCommit_Python.json +++ b/.github/trigger_files/beam_PostCommit_Python.json @@ -1,5 +1,5 @@ { "comment": "Modify this file in a trivial way to cause this test suite to run.", - "modification": 35 + "modification": 33 } diff --git a/.github/workflows/beam_PreCommit_Python_Transforms.yml b/.github/workflows/beam_PreCommit_Python_Transforms.yml index 8753777057c6..1a16e9b61756 100644 --- a/.github/workflows/beam_PreCommit_Python_Transforms.yml +++ b/.github/workflows/beam_PreCommit_Python_Transforms.yml @@ -53,7 +53,6 @@ env: DEVELOCITY_ACCESS_KEY: ${{ secrets.DEVELOCITY_ACCESS_KEY }} GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }} GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }} - ALLOYDB_PASSWORD: ${{ secrets.ALLOYDB_PASSWORD }} jobs: beam_PreCommit_Python_Transforms: diff --git a/CHANGES.md b/CHANGES.md index db88b8c79807..2cfeeba85a52 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -76,6 +76,10 @@ * BigtableRead Connector for BeamYaml added with new Config Param ([#35696](https://github.com/apache/beam/pull/35696)) * Introduced a dedicated module for JUnit-based testing support: `sdks/java/testing/junit`, which provides `TestPipelineExtension` for JUnit 5 while maintaining backward compatibility with existing JUnit 4 `TestRule`-based tests (Java) ([#18733](https://github.com/apache/beam/issues/18733), [#35688](https://github.com/apache/beam/pull/35688)). - To use JUnit 5 with Beam tests, add a test-scoped dependency on `org.apache.beam:beam-sdks-java-testing-junit`. +* Google CloudSQL enrichment handler added (Python) ([#34398](https://github.com/apache/beam/pull/34398)). + Beam now supports data enrichment capabilities using SQL databases, with built-in support for: + - Managed PostgreSQL, MySQL, and Microsoft SQL Server instances on CloudSQL + - Unmanaged SQL database instances not hosted on CloudSQL (e.g., self-hosted or on-premises databases) ## Breaking Changes diff --git a/sdks/python/apache_beam/transforms/enrichment_handlers/cloudsql.py b/sdks/python/apache_beam/transforms/enrichment_handlers/cloudsql.py new file mode 100644 index 000000000000..f070158d1c54 --- /dev/null +++ b/sdks/python/apache_beam/transforms/enrichment_handlers/cloudsql.py @@ -0,0 +1,660 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import re +from abc import ABC +from abc import abstractmethod +from collections.abc import Callable +from collections.abc import Mapping +from dataclasses import dataclass +from dataclasses import field +from enum import Enum +from typing import Any +from typing import Dict +from typing import List +from typing import Optional +from typing import Union + +import pg8000 +import pymysql +import pytds +from google.cloud.sql.connector import Connector as CloudSQLConnector +from google.cloud.sql.connector.enums import RefreshStrategy +from sqlalchemy import create_engine +from sqlalchemy import text +from sqlalchemy.engine import Connection as DBAPIConnection + +import apache_beam as beam +from apache_beam.transforms.enrichment import EnrichmentSourceHandler + +QueryFn = Callable[[beam.Row], str] +ConditionValueFn = Callable[[beam.Row], list[Any]] + + +@dataclass +class CustomQueryConfig: + """Configuration for using a custom query function.""" + query_fn: QueryFn + + def __post_init__(self): + if not self.query_fn: + raise ValueError("CustomQueryConfig must provide a valid query_fn") + + +@dataclass +class TableFieldsQueryConfig: + """Configuration for using table name, where clause, and field names.""" + table_id: str + where_clause_template: str + where_clause_fields: List[str] + + def __post_init__(self): + if not self.table_id or not self.where_clause_template: + raise ValueError( + "TableFieldsQueryConfig must provide table_id and " + + "where_clause_template") + + if not self.where_clause_fields: + raise ValueError( + "TableFieldsQueryConfig must provide non-empty " + + "where_clause_fields") + + +@dataclass +class TableFunctionQueryConfig: + """Configuration for using table name, where clause, and a value function.""" + table_id: str + where_clause_template: str + where_clause_value_fn: ConditionValueFn + + def __post_init__(self): + if not self.table_id or not self.where_clause_template: + raise ValueError( + "TableFunctionQueryConfig must provide table_id and " + + "where_clause_template") + + if not self.where_clause_value_fn: + raise ValueError( + "TableFunctionQueryConfig must provide " + "where_clause_value_fn") + + +class DatabaseTypeAdapter(Enum): + POSTGRESQL = "pg8000" + MYSQL = "pymysql" + SQLSERVER = "pytds" + + def to_sqlalchemy_dialect(self): + """Map the adapter type to its corresponding SQLAlchemy dialect. + + Returns: + str: SQLAlchemy dialect string. + """ + if self == DatabaseTypeAdapter.POSTGRESQL: + return f"postgresql+{self.value}" + elif self == DatabaseTypeAdapter.MYSQL: + return f"mysql+{self.value}" + elif self == DatabaseTypeAdapter.SQLSERVER: + return f"mssql+{self.value}" + else: + raise ValueError(f"Unsupported database adapter type: {self.name}") + + +class ConnectionConfig(ABC): + @abstractmethod + def get_connector_handler(self) -> Callable[[], DBAPIConnection]: + pass + + @abstractmethod + def get_db_url(self) -> str: + pass + + +@dataclass +class CloudSQLConnectionConfig(ConnectionConfig): + """Connects to Google Cloud SQL using Cloud SQL Python Connector. + + Args: + db_adapter: The database adapter type (PostgreSQL, MySQL, SQL Server). + instance_connection_uri: URI for connecting to the Cloud SQL instance. + user: Username for authentication. + password: Password for authentication. Defaults to None. + db_id: Database identifier/name. + refresh_strategy: Strategy for refreshing connection (default: LAZY). + connector_kwargs: Additional keyword arguments for the + Cloud SQL Python Connector. Enables forward compatibility. + connect_kwargs: Additional keyword arguments for the client connect + method. Enables forward compatibility. + """ + db_adapter: DatabaseTypeAdapter + instance_connection_uri: str + user: str = field(default_factory=str) + password: str = field(default_factory=str) + db_id: str = field(default_factory=str) + refresh_strategy: RefreshStrategy = RefreshStrategy.LAZY + connector_kwargs: Dict[str, Any] = field(default_factory=dict) + connect_kwargs: Dict[str, Any] = field(default_factory=dict) + + def __post_init__(self): + if not self.instance_connection_uri: + raise ValueError("Instance connection URI cannot be empty") + + def get_connector_handler(self) -> Callable[[], DBAPIConnection]: + """Returns a function that creates a new database connection. + + The returned connector function creates database connections that should + be properly closed by the caller when no longer needed. + """ + cloudsql_client = CloudSQLConnector( + refresh_strategy=self.refresh_strategy, **self.connector_kwargs) + + cloudsql_connector = lambda: cloudsql_client.connect( + instance_connection_string=self.instance_connection_uri, driver=self. + db_adapter.value, user=self.user, password=self.password, db=self.db_id, + **self.connect_kwargs) + + return cloudsql_connector + + def get_db_url(self) -> str: + return self.db_adapter.to_sqlalchemy_dialect() + "://" + + +@dataclass +class ExternalSQLDBConnectionConfig(ConnectionConfig): + """Connects to External SQL DBs (PostgreSQL, MySQL, SQL Server) over TCP. + + Args: + db_adapter: The database adapter type (PostgreSQL, MySQL, SQL Server). + host: Hostname or IP address of the database server. + port: Port number for the database connection. + user: Username for authentication. + password: Password for authentication. + db_id: Database identifier/name. + connect_kwargs: Additional keyword arguments for the client connect + method. Enables forward compatibility. + """ + db_adapter: DatabaseTypeAdapter + host: str + port: int + user: str = field(default_factory=str) + password: str = field(default_factory=str) + db_id: str = field(default_factory=str) + connect_kwargs: Dict[str, Any] = field(default_factory=dict) + + def __post_init__(self): + if not self.host: + raise ValueError("Database host cannot be empty") + + def get_connector_handler(self) -> Callable[[], DBAPIConnection]: + """Returns a function that creates a new database connection. + + The returned connector function creates database connections that should + be properly closed by the caller when no longer needed. + """ + if self.db_adapter == DatabaseTypeAdapter.POSTGRESQL: + return lambda: pg8000.connect( + host=self.host, port=self.port, database=self.db_id, user=self.user, + password=self.password, **self.connect_kwargs) + elif self.db_adapter == DatabaseTypeAdapter.MYSQL: + return lambda: pymysql.connect( + host=self.host, port=self.port, database=self.db_id, user=self.user, + password=self.password, **self.connect_kwargs) + elif self.db_adapter == DatabaseTypeAdapter.SQLSERVER: + return lambda: pytds.connect( + dsn=self.host, port=self.port, database=self.db_id, user=self.user, + password=self.password, **self.connect_kwargs) + else: + raise ValueError(f"Unsupported database adapter: {self.db_adapter}") + + def get_db_url(self) -> str: + return self.db_adapter.to_sqlalchemy_dialect() + "://" + + +QueryConfig = Union[CustomQueryConfig, + TableFieldsQueryConfig, + TableFunctionQueryConfig] + + +class CloudSQLEnrichmentHandler(EnrichmentSourceHandler[beam.Row, beam.Row]): + """Enrichment handler for Cloud SQL databases. + + This handler is designed to work with the + :class:`apache_beam.transforms.enrichment.Enrichment` transform. + + To use this handler, you need to provide one of the following query configs: + * CustomQueryConfig - For providing a custom query function + * TableFieldsQueryConfig - For specifying table, where clause, and fields + * TableFunctionQueryConfig - For specifying table, where clause, and val fn + + By default, the handler retrieves all columns from the specified table. + To limit the columns, use the `column_names` parameter to specify + the desired column names. + + This handler queries the Cloud SQL database per element by default. + To enable batching, set the `min_batch_size` and `max_batch_size` parameters. + These values control the batching behavior in the + :class:`apache_beam.transforms.utils.BatchElements` transform. + + NOTE: Batching is not supported when using the CustomQueryConfig. + """ + def __init__( + self, + connection_config: ConnectionConfig, + *, + query_config: QueryConfig, + column_names: Optional[list[str]] = None, + min_batch_size: int = 1, + max_batch_size: int = 10000, + **kwargs, + ): + """ + Example usage:: + + connection_config = CloudSQLConnectionConfig( + db_adapter=DatabaseTypeAdapter.POSTGRESQL, + instance_connection_uri="apache-beam-testing:us-central1:itests", + user='postgres', + password= os.getenv("CLOUDSQL_PG_PASSWORD")) + query_config=TableFieldsQueryConfig('my_table',"id = :param0",['id']), + cloudsql_handler = CloudSQLEnrichmentHandler( + connection_config=connection_config, + query_config=query_config, + min_batch_size=2, + max_batch_size=100) + + Args: + connection_config (ConnectionConfig): Configuration for connecting to + the SQL database. Must be an instance of a subclass of + `ConnectionConfig`, such as `CloudSQLConnectionConfig` or + `ExternalSQLDBConnectionConfig`. This determines how the handler + connects to the target SQL database. + query_config: Configuration for database queries. Must be one of: + * CustomQueryConfig: For providing a custom query function + * TableFieldsQueryConfig: specifies table, where clause, and field names + * TableFunctionQueryConfig: specifies table, where clause, and val func + column_names (Optional[list[str]]): List of column names to select from + the Cloud SQL table. If not provided, all columns (`*`) are selected. + min_batch_size (int): Minimum number of rows to batch together when + querying the database. Defaults to 1 if `query_fn` is not used. + max_batch_size (int): Maximum number of rows to batch together. Defaults + to 10,000 if `query_fn` is not used. + **kwargs: Additional keyword arguments for database connection or query + handling. + + Note: + * Cannot use `min_batch_size` or `max_batch_size` with `query_fn`. + * Either `where_clause_fields` or `where_clause_value_fn` must be provided + for query construction if `query_fn` is not provided. + * Ensure that the database user has the necessary permissions to query the + specified table. + """ + self._connection_config = connection_config + self._query_config = query_config + self._column_names = ",".join(column_names) if column_names else "*" + self.kwargs = kwargs + self._batching_kwargs = {} + table_query_configs = (TableFieldsQueryConfig, TableFunctionQueryConfig) + if isinstance(query_config, table_query_configs): + self.query_template = ( + f"SELECT {self._column_names} " + f"FROM {query_config.table_id} " + f"WHERE {query_config.where_clause_template}") + self._batching_kwargs['min_batch_size'] = min_batch_size + self._batching_kwargs['max_batch_size'] = max_batch_size + + def __enter__(self): + connector = self._connection_config.get_connector_handler() + self._engine = create_engine( + url=self._connection_config.get_db_url(), creator=connector) + + def __call__( + self, request: Union[beam.Row, list[beam.Row]], *_args, **_kwargs): + """Handle requests by delegating to single or batch processing.""" + if isinstance(request, list): + return self._process_batch_request(request) + else: + return self._process_single_request(request) + + def _process_single_request(self, request: beam.Row): + """Process a single request and return with its response.""" + response: Union[List[Dict[str, Any]], Dict[str, Any]] + if isinstance(self._query_config, CustomQueryConfig): + query = self._query_config.query_fn(request) + response = self._execute_query(query, is_batch=False) + else: + values = self._extract_values_from_request(request) + param_dict = self._build_single_param_dict(values) + response = self._execute_query( + self.query_template, params=param_dict, is_batch=False) + return request, beam.Row(**response) # type: ignore[arg-type] + + def _process_batch_request(self, requests: list[beam.Row]): + """Process batch requests and match responses to original requests.""" + values, responses = [], [] + requests_map: dict[Any, Any] = {} + batch_size = len(requests) + + # Build the appropriate query (single or batched). + raw_query = self._build_batch_query(requests, batch_size) + + # Extract where_clause_fields values and map the generated request key to + # the original request object.. + for req in requests: + current_values = self._extract_values_from_request(req) + values.extend(current_values) + requests_map[self.create_row_key(req)] = req + + # Build named parameters dictionary for parameterized query. + param_dict = self._build_parameters_dict(requests, batch_size) + + # Execute the parameterized query with validated parameters. + result: Union[List[Dict[str, Any]], Dict[str, Any]] = self._execute_query( + raw_query, params=param_dict, is_batch=True) + for response in result: + response_row = beam.Row(**response) # type: ignore[arg-type] + response_key = self.create_row_key(response_row) + if response_key in requests_map: + responses.append((requests_map[response_key], response_row)) + return responses + + def _execute_query( + self, + query: str, + params: Optional[dict] = None, + is_batch: bool = False) -> Union[List[Dict[str, Any]], Dict[str, Any]]: + connection = None + try: + connection = self._engine.connect() + transaction = connection.begin() + try: + if params: + result = connection.execute(text(query), params) + else: + result = connection.execute(text(query)) + # Materialize results while transaction is active. + data: Union[List[Dict[str, Any]], Dict[str, Any]] + if is_batch: + data = [row._asdict() for row in result] + else: + result_row = result.first() + data = result_row._asdict() if result_row else {} + # Explicitly commit the transaction. + transaction.commit() + return data + except Exception as e: + transaction.rollback() + raise RuntimeError(f"Database operation failed: {e}") + except Exception as e: + raise Exception( + f'Could not execute the query. Please check if the query is properly ' + f'formatted and the table exists. {e}') + finally: + if connection: + connection.close() + + def _build_batch_query( + self, requests: list[beam.Row], batch_size: int) -> str: + """Build batched query with unique parameter names for multiple requests. + + This method extracts parameter placeholders from the where_clause_template + using regex and creates unique parameter names for each batch item. The + parameter names in the template can be any valid identifiers (e.g., :id, + :param_0, :user_name) and don't need to match field names exactly. + + For batch queries, placeholders are replaced with unique names like + :batch_0_id, :batch_1_param_0, etc., based on the actual parameter names + found in the template. + + Args: + requests: List of beam.Row requests to process + batch_size: Number of requests in the batch + + Returns: + SQL query string with batched WHERE clauses using unique parameter names + """ + # Single request - return original query. + if batch_size <= 1: + return self.query_template + + # Only batch table-based query configs. + table_query_configs = (TableFieldsQueryConfig, TableFunctionQueryConfig) + if not isinstance(self._query_config, table_query_configs): + return self.query_template + + # Build batched WHERE clauses. + where_clauses = [self._create_batch_clause(i) for i in range(batch_size)] + + # Combine clauses and update query. + where_clause_batched = ' OR '.join(where_clauses) + # We know this is a table-based config from the check above. + assert isinstance(self._query_config, table_query_configs) + return self.query_template.replace( + self._query_config.where_clause_template, where_clause_batched) + + def _create_batch_clause(self, batch_index: int) -> str: + """Create a WHERE clause for a single batch item with unique parameter + names.""" + # This method is only called for table-based query configs + table_query_configs = (TableFieldsQueryConfig, TableFunctionQueryConfig) + assert isinstance(self._query_config, table_query_configs) + clause = self._query_config.where_clause_template + + # Extract parameter names from the template using regex. + param_names = self._extract_parameter_names( + self._query_config.where_clause_template) + for param_name in param_names: + old_param = f':{param_name}' + new_param = f':batch_{batch_index}_{param_name}' + clause = clause.replace(old_param, new_param) + + return f'({clause})' + + def _build_parameters_dict( + self, requests: list[beam.Row], batch_size: int) -> dict: + """Build named parameters dictionary for parameterized queries. + + Args: + requests: List of beam.Row requests to process + batch_size: Number of requests in the batch + + Returns: + Dictionary mapping parameter names to validated values + """ + param_dict = {} + for i, req in enumerate(requests): + current_values = self._extract_values_from_request(req) + + # For batched queries, use unique parameter names per batch item. + if batch_size > 1: + # Extract parameter names from the template using regex. + # Batching is only used with table-based query configs + table_query_configs = (TableFieldsQueryConfig, TableFunctionQueryConfig) + assert isinstance(self._query_config, table_query_configs) + param_names = self._extract_parameter_names( + self._query_config.where_clause_template) + for param_name, val in zip(param_names, current_values): + param_dict[f'batch_{i}_{param_name}'] = val + else: + # For single request, use the helper function. + single_param_dict = self._build_single_param_dict(current_values) + param_dict.update(single_param_dict) + + return param_dict + + def _build_single_param_dict(self, values: list[Any]) -> dict[str, Any]: + """Build parameter dictionary for single request processing. + + Args: + values: List of parameter values + + Returns: + Dictionary mapping parameter names to values + """ + if isinstance(self._query_config, TableFieldsQueryConfig): + return { + field_name: val + for field_name, val in zip( + self._query_config.where_clause_fields, values) + } + else: # TableFunctionQueryConfig. + assert isinstance(self._query_config, TableFunctionQueryConfig) + _, param_dict = self._get_unique_template_and_params( + self._query_config.where_clause_template, values) + return param_dict + + def _get_unique_template_and_params( + self, template: str, values: list[Any]) -> tuple[str, dict[str, Any]]: + """Generate unique binding parameter names for duplicate templates. + + Args: + template: SQL template with potentially duplicate binding parameter names + values: List of parameter values + + Returns: + Tuple of (updated_template, param_dict) with unique binding names. + """ + param_names = self._extract_parameter_names(template) + unique_param_names = [ + f"{param_name}_{i}" if param_names.count(param_name) > 1 else param_name + for i, param_name in enumerate(param_names) + ] + + # Update template by replacing each parameter occurrence in order. + updated_template = template + param_positions = [] + + # Find all parameter positions. + for match in re.finditer(r':(\w+)', template): + param_positions.append((match.start(), match.end(), match.group(1))) + + # Replace parameters from right to left to avoid position shifts. + for i in reversed(range(len(param_positions))): + start, end, _ = param_positions[i] + unique_name = unique_param_names[i] + updated_template = ( + updated_template[:start] + f':{unique_name}' + updated_template[end:]) + + # Build parameter dictionary. + param_dict = { + unique_name: val + for unique_name, val in zip(unique_param_names, values) + } + + return updated_template, param_dict + + def _extract_values_from_request(self, request: beam.Row) -> list[Any]: + """Extract parameter values from a request based on query configuration. + + Args: + request: The beam.Row request to extract values from + + Returns: + List of parameter values + + Raises: + KeyError: If required fields are missing from the request + """ + try: + if isinstance(self._query_config, TableFunctionQueryConfig): + return [ + val for val in self._query_config.where_clause_value_fn(request) + ] + elif isinstance(self._query_config, TableFieldsQueryConfig): + request_dict = request._asdict() + return [ + request_dict[field] + for field in self._query_config.where_clause_fields + ] + else: + raise ValueError("Unsupported query configuration type") + except KeyError as e: + raise KeyError( + "Make sure the values passed in `where_clause_fields` are " + "the keys in the input `beam.Row`." + str(e)) + + def _extract_parameter_names(self, template: str) -> list[str]: + """Extract parameter names from a SQL template string. + + Args: + template: SQL template string with named parameters (e.g., "id = :id") + + Returns: + List of parameter names found in the template (e.g., ["id"]) + """ + return re.findall(r':(\w+)', template) + + def create_row_key(self, row: beam.Row): + if isinstance(self._query_config, TableFunctionQueryConfig): + return tuple(self._query_config.where_clause_value_fn(row)) + if isinstance(self._query_config, TableFieldsQueryConfig): + row_dict = row._asdict() + return ( + tuple( + row_dict[where_clause_field] + for where_clause_field in self._query_config.where_clause_fields)) + raise ValueError( + "Either where_clause_fields or where_clause_value_fn must be specified") + + def get_cache_key(self, request: Union[beam.Row, list[beam.Row]]): + if isinstance(self._query_config, CustomQueryConfig): + raise NotImplementedError( + "Caching is not supported for CustomQueryConfig. " + "Consider using TableFieldsQueryConfig or " + + "TableFunctionQueryConfig instead.") + + if isinstance(request, list): + cache_keys = [] + for req in request: + req_dict = req._asdict() + try: + if isinstance(self._query_config, TableFunctionQueryConfig): + current_values = self._query_config.where_clause_value_fn(req) + elif isinstance(self._query_config, TableFieldsQueryConfig): + current_values = [ + req_dict[field] + for field in self._query_config.where_clause_fields + ] + key = ';'.join(map(repr, current_values)) + cache_keys.append(key) + except KeyError as e: + raise KeyError( + "Make sure the values passed in `where_clause_fields` are the " + "keys in the input `beam.Row`." + str(e)) + return cache_keys + else: + req_dict = request._asdict() + try: + if isinstance(self._query_config, TableFunctionQueryConfig): + current_values = self._query_config.where_clause_value_fn(request) + else: # TableFieldsQueryConfig. + current_values = [ + req_dict[field] + for field in self._query_config.where_clause_fields + ] + key = ";".join(["%s"] * len(current_values)) + cache_key = key % tuple(current_values) + except KeyError as e: + raise KeyError( + "Make sure the values passed in `where_clause_fields` are the " + "keys in the input `beam.Row`." + str(e)) + return cache_key + + def __exit__(self, _exc_type, _exc_val, _exc_tb): + self._engine.dispose(close=True) + self._engine = None + + def batch_elements_kwargs(self) -> Mapping[str, Any]: + """Returns a kwargs suitable for `beam.BatchElements`.""" + return self._batching_kwargs diff --git a/sdks/python/apache_beam/transforms/enrichment_handlers/cloudsql_it_test.py b/sdks/python/apache_beam/transforms/enrichment_handlers/cloudsql_it_test.py new file mode 100644 index 000000000000..3d9cd18151b6 --- /dev/null +++ b/sdks/python/apache_beam/transforms/enrichment_handlers/cloudsql_it_test.py @@ -0,0 +1,623 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import functools +import logging +import os +import unittest +import uuid +from dataclasses import dataclass +from typing import Optional +from unittest.mock import MagicMock + +import pytest + +import apache_beam as beam +from apache_beam.coders import coders +from apache_beam.testing.test_pipeline import TestPipeline +from apache_beam.testing.util import assert_that +from apache_beam.testing.util import equal_to + +# pylint: disable=ungrouped-imports +try: + from testcontainers.core.generic import DbContainer + from testcontainers.postgres import PostgresContainer + from testcontainers.mysql import MySqlContainer + from testcontainers.mssql import SqlServerContainer + from testcontainers.redis import RedisContainer + from sqlalchemy import ( + create_engine, MetaData, Table, Column, Integer, VARCHAR, Engine) + from apache_beam.transforms.enrichment import Enrichment + from apache_beam.transforms.enrichment_handlers.cloudsql import ( + CloudSQLEnrichmentHandler, + DatabaseTypeAdapter, + CustomQueryConfig, + TableFieldsQueryConfig, + TableFunctionQueryConfig, + CloudSQLConnectionConfig, + ExternalSQLDBConnectionConfig, + ConnectionConfig) +except ImportError as e: + raise unittest.SkipTest(f'CloudSQL dependencies not installed: {str(e)}') + +_LOGGER = logging.getLogger(__name__) + + +def where_clause_value_fn(row: beam.Row): + return [row.id] # type: ignore[attr-defined] + + +def query_fn(table, row: beam.Row): + return f"SELECT * FROM {table} WHERE id = {row.id}" # type: ignore[attr-defined] + + +@dataclass +class SQLDBContainerInfo: + adapter: DatabaseTypeAdapter + container: DbContainer + host: str + port: int + user: str + password: str + id: str + + @property + def address(self) -> str: + return f"{self.host}:{self.port}" + + @property + def url(self) -> str: + return self.adapter.to_sqlalchemy_dialect() + "://" + + +class SQLEnrichmentTestHelper: + @staticmethod + def start_sql_db_container( + database_type: DatabaseTypeAdapter, + sql_client_retries=3) -> Optional[SQLDBContainerInfo]: + info = None + for i in range(sql_client_retries): + sql_db_container = DbContainer("") + try: + if database_type == DatabaseTypeAdapter.POSTGRESQL: + user, password, db_id = "test", "test", "test" + sql_db_container = PostgresContainer( + image="postgres:16", + username=user, + password=password, + dbname=db_id, + driver=database_type.value) + sql_db_container.start() + host = sql_db_container.get_container_host_ip() + port = int(sql_db_container.get_exposed_port(5432)) + elif database_type == DatabaseTypeAdapter.MYSQL: + user, password, db_id = "test", "test", "test" + sql_db_container = MySqlContainer( + image="mysql:8.0", + username=user, + root_password=password, + password=password, + dbname=db_id) + sql_db_container.start() + host = sql_db_container.get_container_host_ip() + port = int(sql_db_container.get_exposed_port(3306)) + elif database_type == DatabaseTypeAdapter.SQLSERVER: + user, password, db_id = "SA", "A_Str0ng_Required_Password", "tempdb" + sql_db_container = SqlServerContainer( + image="mcr.microsoft.com/mssql/server:2022-latest", + username=user, + password=password, + dbname=db_id, + dialect=database_type.to_sqlalchemy_dialect()) + sql_db_container.start() + host = sql_db_container.get_container_host_ip() + port = int(sql_db_container.get_exposed_port(1433)) + else: + raise ValueError(f"Unsupported database type: {database_type}") + + info = SQLDBContainerInfo( + adapter=database_type, + container=sql_db_container, + host=host, + port=port, + user=user, + password=password, + id=db_id) + _LOGGER.info( + "%s container started successfully on %s.", + database_type.name, + info.address) + break + except Exception as e: + stdout_logs, stderr_logs = sql_db_container.get_logs() + stdout_logs = stdout_logs.decode("utf-8") + stderr_logs = stderr_logs.decode("utf-8") + _LOGGER.warning( + "Retry %d/%d: Failed to start %s container. Reason: %s. " + "STDOUT logs:\n%s\nSTDERR logs:\n%s", + i + 1, + sql_client_retries, + database_type.name, + e, + stdout_logs, + stderr_logs) + if i == sql_client_retries - 1: + _LOGGER.error( + "Unable to start %s container for I/O tests after %d " + "retries. Tests cannot proceed. STDOUT logs:\n%s\n" + "STDERR logs:\n%s", + database_type.name, + sql_client_retries, + stdout_logs, + stderr_logs) + raise e + + return info + + @staticmethod + def stop_sql_db_container(db_info: SQLDBContainerInfo): + try: + _LOGGER.debug("Stopping %s container.", db_info.adapter.name) + db_info.container.stop() + _LOGGER.info("%s container stopped successfully.", db_info.adapter.name) + except Exception as e: + _LOGGER.warning( + "Error encountered while stopping %s container: %s", + db_info.adapter.name, + e) + + @staticmethod + def create_table( + table_id: str, + engine: Engine, + columns: list[Column], + table_data: list[dict], + metadata: MetaData): + # Create table metadata. + table = Table(table_id, metadata, *columns) + + # Create contextual connection for schema creation. + with engine.connect() as schema_connection: + try: + metadata.create_all(schema_connection) + schema_connection.commit() + except Exception as e: + schema_connection.rollback() + raise RuntimeError(f"Failed to create table schema: {e}") + + # Now create a separate contextual connection for data insertion. + with engine.connect() as connection: + try: + connection.execute(table.insert(), table_data) + connection.commit() + except Exception as e: + connection.rollback() + raise Exception(f"Failed to insert table data: {e}") + + +@pytest.mark.uses_testcontainer +class BaseTestSQLEnrichment(unittest.TestCase): + _table_data = [ + { + "id": 1, "name": "A", 'quantity': 2, 'distribution_center_id': 3 + }, + { + "id": 2, "name": "B", 'quantity': 3, 'distribution_center_id': 1 + }, + { + "id": 3, "name": "C", 'quantity': 10, 'distribution_center_id': 4 + }, + { + "id": 4, "name": "D", 'quantity': 1, 'distribution_center_id': 3 + }, + { + "id": 5, "name": "C", 'quantity': 100, 'distribution_center_id': 4 + }, + { + "id": 6, "name": "D", 'quantity': 11, 'distribution_center_id': 3 + }, + { + "id": 7, "name": "C", 'quantity': 7, 'distribution_center_id': 1 + }, + { + "id": 8, "name": "D", 'quantity': 4, 'distribution_center_id': 1 + }, + ] + + @classmethod + def setUpClass(cls): + if not hasattr(cls, '_connection_config') or not hasattr(cls, '_metadata'): + # Skip setup for the base class. + raise unittest.SkipTest( + "Base class - no connection_config or metadata defined") + + # Type hint data from subclasses. + cls._table_id: str + cls._connection_config: ConnectionConfig + cls._metadata: MetaData + + connector = cls._connection_config.get_connector_handler() + cls._engine = create_engine( + url=cls._connection_config.get_db_url(), creator=connector) + + SQLEnrichmentTestHelper.create_table( + table_id=cls._table_id, + engine=cls._engine, + columns=cls.get_columns(), + table_data=cls._table_data, + metadata=cls._metadata) + + cls._cache_client_retries = 3 + + @classmethod + def get_columns(cls): + """Returns fresh column objects each time it's called.""" + return [ + Column("id", Integer, nullable=False), + Column("name", VARCHAR(255), nullable=False), + Column("quantity", Integer, nullable=False), + Column("distribution_center_id", Integer, nullable=False), + ] + + @pytest.fixture + def cache_container(self): + self._start_cache_container() + + # Hand control to the test. + yield + + self._cache_container.stop() + self._cache_container = None + + def _start_cache_container(self): + for i in range(self._cache_client_retries): + try: + self._cache_container = RedisContainer(image="redis:7.2.4") + self._cache_container.start() + host = self._cache_container.get_container_host_ip() + port = self._cache_container.get_exposed_port(6379) + self._cache_container_host = host + self._cache_container_port = port + self._cache_client = self._cache_container.get_client() + break + except Exception as e: + if i == self._cache_client_retries - 1: + _LOGGER.error( + "Unable to start redis container for RRIO tests after " + "%d retries.", + self._cache_client_retries) + raise e + + @classmethod + def tearDownClass(cls): + cls._metadata.drop_all(cls._engine) + cls._engine.dispose(close=True) + cls._engine = None + + def test_sql_enrichment(self): + expected_rows = [ + beam.Row(id=1, name="A", quantity=2, distribution_center_id=3), + beam.Row(id=2, name="B", quantity=3, distribution_center_id=1) + ] + fields = ['id'] + requests = [ + beam.Row(id=1, name='A'), + beam.Row(id=2, name='B'), + ] + + query_config = TableFieldsQueryConfig( + table_id=self._table_id, + where_clause_template="id = :id", + where_clause_fields=fields) + + handler = CloudSQLEnrichmentHandler( + connection_config=self._connection_config, + query_config=query_config, + min_batch_size=1, + max_batch_size=100, + ) + + with TestPipeline(is_integration_test=True) as test_pipeline: + pcoll = (test_pipeline | beam.Create(requests) | Enrichment(handler)) + + assert_that(pcoll, equal_to(expected_rows)) + + def test_sql_enrichment_batched(self): + expected_rows = [ + beam.Row(id=1, name="A", quantity=2, distribution_center_id=3), + beam.Row(id=2, name="B", quantity=3, distribution_center_id=1) + ] + fields = ['id'] + requests = [ + beam.Row(id=1, name='A'), + beam.Row(id=2, name='B'), + ] + + query_config = TableFieldsQueryConfig( + table_id=self._table_id, + where_clause_template="id = :id", + where_clause_fields=fields) + + handler = CloudSQLEnrichmentHandler( + connection_config=self._connection_config, + query_config=query_config, + min_batch_size=2, + max_batch_size=100, + ) + with TestPipeline(is_integration_test=True) as test_pipeline: + pcoll = (test_pipeline | beam.Create(requests) | Enrichment(handler)) + + assert_that(pcoll, equal_to(expected_rows)) + + def test_sql_enrichment_batched_multiple_fields(self): + expected_rows = [ + beam.Row(id=1, distribution_center_id=3, name="A", quantity=2), + beam.Row(id=2, distribution_center_id=1, name="B", quantity=3) + ] + fields = ['id', 'distribution_center_id'] + requests = [ + beam.Row(id=1, distribution_center_id=3), + beam.Row(id=2, distribution_center_id=1), + ] + + query_config = TableFieldsQueryConfig( + table_id=self._table_id, + where_clause_template="id = :id AND distribution_center_id = :param_1", + where_clause_fields=fields) + + handler = CloudSQLEnrichmentHandler( + connection_config=self._connection_config, + query_config=query_config, + min_batch_size=8, + max_batch_size=100, + ) + with TestPipeline(is_integration_test=True) as test_pipeline: + pcoll = (test_pipeline | beam.Create(requests) | Enrichment(handler)) + + assert_that(pcoll, equal_to(expected_rows)) + + def test_sql_enrichment_with_query_fn(self): + expected_rows = [ + beam.Row(id=1, name="A", quantity=2, distribution_center_id=3), + beam.Row(id=2, name="B", quantity=3, distribution_center_id=1) + ] + requests = [ + beam.Row(id=1, name='A'), + beam.Row(id=2, name='B'), + ] + fn = functools.partial(query_fn, self._table_id) + + query_config = CustomQueryConfig(query_fn=fn) + + handler = CloudSQLEnrichmentHandler( + connection_config=self._connection_config, query_config=query_config) + with TestPipeline(is_integration_test=True) as test_pipeline: + pcoll = (test_pipeline | beam.Create(requests) | Enrichment(handler)) + + assert_that(pcoll, equal_to(expected_rows)) + + def test_sql_enrichment_with_condition_value_fn(self): + expected_rows = [ + beam.Row(id=1, name="A", quantity=2, distribution_center_id=3), + beam.Row(id=2, name="B", quantity=3, distribution_center_id=1) + ] + requests = [ + beam.Row(id=1, name='A'), + beam.Row(id=2, name='B'), + ] + + query_config = TableFunctionQueryConfig( + table_id=self._table_id, + where_clause_template="id = :param_0", + where_clause_value_fn=where_clause_value_fn) + + handler = CloudSQLEnrichmentHandler( + connection_config=self._connection_config, + query_config=query_config, + min_batch_size=2, + max_batch_size=100) + with TestPipeline(is_integration_test=True) as test_pipeline: + pcoll = (test_pipeline | beam.Create(requests) | Enrichment(handler)) + + assert_that(pcoll, equal_to(expected_rows)) + + def test_sql_enrichment_on_non_existent_table(self): + requests = [ + beam.Row(id=1, name='A'), + beam.Row(id=2, name='B'), + ] + + query_config = TableFunctionQueryConfig( + table_id=self._table_id, + where_clause_template="id = :id", + where_clause_value_fn=where_clause_value_fn) + + handler = CloudSQLEnrichmentHandler( + connection_config=self._connection_config, + query_config=query_config, + column_names=["wrong_column"], + ) + + with self.assertRaises(Exception) as context: + with TestPipeline() as p: + _ = (p | beam.Create(requests) | Enrichment(handler)) + + expect_err_msg_contains = ( + "Could not execute the query. Please check if the query is properly " + "formatted and the table exists.") + self.assertIn(expect_err_msg_contains, str(context.exception)) + + @pytest.mark.usefixtures("cache_container") + def test_sql_enrichment_with_redis(self): + requests = [ + beam.Row(id=1, name='A'), + beam.Row(id=2, name='B'), + ] + expected_rows = [ + beam.Row(id=1, name="A", quantity=2, distribution_center_id=3), + beam.Row(id=2, name="B", quantity=3, distribution_center_id=1) + ] + + query_config = TableFunctionQueryConfig( + table_id=self._table_id, + where_clause_template="id = :param_0", + where_clause_value_fn=where_clause_value_fn) + + handler = CloudSQLEnrichmentHandler( + connection_config=self._connection_config, + query_config=query_config, + min_batch_size=2, + max_batch_size=100) + with TestPipeline(is_integration_test=True) as test_pipeline: + pcoll_populate_cache = ( + test_pipeline + | beam.Create(requests) + | Enrichment(handler).with_redis_cache( + self._cache_container_host, self._cache_container_port)) + + assert_that(pcoll_populate_cache, equal_to(expected_rows)) + + # Manually check cache entry to verify entries were correctly stored. + c = coders.StrUtf8Coder() + for req in requests: + key = handler.get_cache_key(req) + response = self._cache_client.get(c.encode(key)) + if not response: + raise ValueError("No cache entry found for %s" % key) + + # Mocks the CloudSQL enrichment handler to prevent actual database calls. + # This ensures that a cache hit scenario does not trigger any database + # interaction, raising an exception if an unexpected call occurs. + actual = CloudSQLEnrichmentHandler.__call__ + CloudSQLEnrichmentHandler.__call__ = MagicMock( + side_effect=Exception("Database should not be called on a cache hit.")) + + # Run a second pipeline to verify cache is being used. + with TestPipeline(is_integration_test=True) as test_pipeline: + pcoll_cached = ( + test_pipeline + | beam.Create(requests) + | Enrichment(handler).with_redis_cache( + self._cache_container_host, self._cache_container_port)) + + assert_that(pcoll_cached, equal_to(expected_rows)) + + # Restore the original CloudSQL enrichment handler implementation. + CloudSQLEnrichmentHandler.__call__ = actual + + +class BaseCloudSQLDBEnrichment(BaseTestSQLEnrichment): + @classmethod + def setUpClass(cls): + if not hasattr(cls, '_db_adapter'): + # Skip setup for the base class. + raise unittest.SkipTest("Base class - no db_adapter defined") + + # Type hint data from subclasses. + cls._db_adapter: DatabaseTypeAdapter + cls._instance_connection_uri: str + cls._user: str + cls._password: str + cls._db_id: str + + cls._connection_config = CloudSQLConnectionConfig( + db_adapter=cls._db_adapter, + instance_connection_uri=cls._instance_connection_uri, + user=cls._user, + password=cls._password, + db_id=cls._db_id) + super().setUpClass() + + @classmethod + def tearDownClass(cls): + super().tearDownClass() + + +@unittest.skipUnless( + os.environ.get('ALLOYDB_PASSWORD'), + "ALLOYDB_PASSWORD environment var is not provided") +class TestCloudSQLPostgresEnrichment(BaseCloudSQLDBEnrichment): + _db_adapter = DatabaseTypeAdapter.POSTGRESQL + + # Configuration required for locating the CloudSQL instance. + _table_id = "product_details_cloudsql_pg_enrichment" + _gcp_project_id = "apache-beam-testing" + _region = "us-central1" + _instance_name = "beam-integration-tests" + _instance_connection_uri = f"{_gcp_project_id}:{_region}:{_instance_name}" + + # Configuration required for authenticating to the CloudSQL instance. + _user = "postgres" + _password = os.getenv("ALLOYDB_PASSWORD") + _db_id = "postgres" + + _metadata = MetaData() + + +@pytest.mark.uses_testcontainer +class BaseExternalSQLDBEnrichment(BaseTestSQLEnrichment): + @classmethod + def setUpClass(cls): + if not hasattr(cls, '_db_adapter'): + # Skip setup for the base class. + raise unittest.SkipTest("Base class - no db_adapter defined") + + # Type hint data from subclasses. + cls._db_adapter: DatabaseTypeAdapter + + cls._db = SQLEnrichmentTestHelper.start_sql_db_container(cls._db_adapter) + cls._connection_config = ExternalSQLDBConnectionConfig( + db_adapter=cls._db_adapter, + host=cls._db.host, + user=cls._db.user, + password=cls._db.password, + db_id=cls._db.id, + port=cls._db.port) + super().setUpClass() + + @classmethod + def tearDownClass(cls): + super().tearDownClass() + SQLEnrichmentTestHelper.stop_sql_db_container(cls._db) + cls._db = None + + +@pytest.mark.uses_testcontainer +class TestExternalPostgresEnrichment(BaseExternalSQLDBEnrichment): + _db_adapter = DatabaseTypeAdapter.POSTGRESQL + _unique_suffix = str(uuid.uuid4())[:8] + _table_id = f"product_details_external_pg_enrichment_{_unique_suffix}" + _metadata = MetaData() + + +@pytest.mark.uses_testcontainer +class TestExternalMySQLEnrichment(BaseExternalSQLDBEnrichment): + _db_adapter = DatabaseTypeAdapter.MYSQL + _unique_suffix = str(uuid.uuid4())[:8] + _table_id = f"product_details_external_mysql_enrichment_{_unique_suffix}" + _metadata = MetaData() + + +@pytest.mark.uses_testcontainer +class TestExternalSQLServerEnrichment(BaseExternalSQLDBEnrichment): + _db_adapter = DatabaseTypeAdapter.SQLSERVER + _unique_suffix = str(uuid.uuid4())[:8] + _table_id = f"product_details_external_mssql_enrichment_{_unique_suffix}" + _metadata = MetaData() + + +if __name__ == "__main__": + unittest.main() diff --git a/sdks/python/apache_beam/transforms/enrichment_handlers/cloudsql_test.py b/sdks/python/apache_beam/transforms/enrichment_handlers/cloudsql_test.py new file mode 100644 index 000000000000..99823f6d89a6 --- /dev/null +++ b/sdks/python/apache_beam/transforms/enrichment_handlers/cloudsql_test.py @@ -0,0 +1,569 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import unittest + +from parameterized import parameterized + +# pylint: disable=ungrouped-imports +try: + from apache_beam.transforms.enrichment_handlers.cloudsql import ( + CloudSQLEnrichmentHandler, + DatabaseTypeAdapter, + CustomQueryConfig, + TableFieldsQueryConfig, + TableFunctionQueryConfig, + CloudSQLConnectionConfig, + ExternalSQLDBConnectionConfig) + from apache_beam.transforms.enrichment_handlers.cloudsql_it_test import ( + query_fn, + where_clause_value_fn, + ) +except ImportError as e: + raise unittest.SkipTest(f'CloudSQL dependencies not installed: {str(e)}') + + +class TestCloudSQLEnrichment(unittest.TestCase): + def test_invalid_external_db_connection_params(self): + with self.assertRaises(ValueError) as context: + _ = ExternalSQLDBConnectionConfig( + db_adapter=DatabaseTypeAdapter.POSTGRESQL, + host='', + port=5432, + user='', + password='', + db_id='') + self.assertIn("Database host cannot be empty", str(context.exception)) + + def test_invalid_cloudsql_db_connection_params(self): + with self.assertRaises(ValueError) as context: + _ = CloudSQLConnectionConfig( + db_adapter=DatabaseTypeAdapter.POSTGRESQL, + instance_connection_uri='', + user='', + password='', + db_id='') + self.assertIn( + "Instance connection URI cannot be empty", str(context.exception)) + + @parameterized.expand([ + # Empty TableFieldsQueryConfig. + ( + lambda: TableFieldsQueryConfig( + table_id="", where_clause_template="", where_clause_fields=[]), + 1, + 2, + "must provide table_id and where_clause_template" + ), + # Missing where_clause_template in TableFieldsQueryConfig. + ( + lambda: TableFieldsQueryConfig( + table_id="table", + where_clause_template="", + where_clause_fields=["id"]), + 2, + 10, + "must provide table_id and where_clause_template" + ), + # Invalid CustomQueryConfig with None query_fn. + ( + lambda: CustomQueryConfig(query_fn=None), # type: ignore[arg-type] + 2, + 10, + "must provide a valid query_fn" + ), + # Missing table_id in TableFunctionQueryConfig. + ( + lambda: TableFunctionQueryConfig( + table_id="", + where_clause_template="id='{}'", + where_clause_value_fn=where_clause_value_fn), + 2, + 10, + "must provide table_id and where_clause_template" + ), + # Missing where_clause_fields in TableFieldsQueryConfig. + ( + lambda: TableFieldsQueryConfig( + table_id="table", + where_clause_template="id = '{}'", + where_clause_fields=[]), + 1, + 10, + "must provide non-empty where_clause_fields" + ), + # Missing where_clause_value_fn in TableFunctionQueryConfig. + ( + lambda: TableFunctionQueryConfig( + table_id="table", + where_clause_template="id = '{}'", + where_clause_value_fn=None), # type: ignore[arg-type] + 1, + 10, + "must provide where_clause_value_fn" + ), + ]) + def test_invalid_query_config( + self, create_config, min_batch_size, max_batch_size, expected_error_msg): + """Test that validation errors are raised for invalid query configs. + + The test verifies both that the appropriate ValueError is raised and that + the error message contains the expected text. + """ + with self.assertRaises(ValueError) as context: + # Call the lambda to create the config. + query_config = create_config() + + connection_config = ExternalSQLDBConnectionConfig( + db_adapter=DatabaseTypeAdapter.POSTGRESQL, + host='localhost', + port=5432, + user='', + password='', + db_id='') + + _ = CloudSQLEnrichmentHandler( + connection_config=connection_config, + query_config=query_config, + min_batch_size=min_batch_size, + max_batch_size=max_batch_size, + ) + # Verify the error message contains the expected text. + self.assertIn(expected_error_msg, str(context.exception)) + + def test_valid_query_configs(self): + """Test valid query configuration cases.""" + # Valid TableFieldsQueryConfig. + table_fields_config = TableFieldsQueryConfig( + table_id="my_table", + where_clause_template="id = :id", + where_clause_fields=["id"]) + + connection_config = ExternalSQLDBConnectionConfig( + db_adapter=DatabaseTypeAdapter.POSTGRESQL, + host='localhost', + port=5432, + user='user', + password='password', + db_id='db') + + handler1 = CloudSQLEnrichmentHandler( + connection_config=connection_config, + query_config=table_fields_config, + min_batch_size=1, + max_batch_size=10) + + self.assertEqual( + handler1.query_template, "SELECT * FROM my_table WHERE id = :id") + + # Valid TableFunctionQueryConfig. + table_function_config = TableFunctionQueryConfig( + table_id="my_table", + where_clause_template="id = :id", + where_clause_value_fn=where_clause_value_fn) + + handler2 = CloudSQLEnrichmentHandler( + connection_config=connection_config, + query_config=table_function_config, + min_batch_size=1, + max_batch_size=10) + + self.assertEqual( + handler2.query_template, "SELECT * FROM my_table WHERE id = :id") + + # Valid CustomQueryConfig. + custom_config = CustomQueryConfig(query_fn=query_fn) + + handler3 = CloudSQLEnrichmentHandler( + connection_config=connection_config, query_config=custom_config) + + # Verify that batching kwargs are empty for CustomQueryConfig. + self.assertEqual(handler3.batch_elements_kwargs(), {}) + + def test_custom_query_config_cache_key_error(self): + """Test get_cache_key raises NotImplementedError with CustomQueryConfig.""" + custom_config = CustomQueryConfig(query_fn=query_fn) + + connection_config = ExternalSQLDBConnectionConfig( + db_adapter=DatabaseTypeAdapter.POSTGRESQL, + host='localhost', + port=5432, + user='user', + password='password', + db_id='db') + + handler = CloudSQLEnrichmentHandler( + connection_config=connection_config, query_config=custom_config) + + # Create a dummy request. + import apache_beam as beam + request = beam.Row(id=1) + + # Verify that get_cache_key raises NotImplementedError. + with self.assertRaises(NotImplementedError): + handler.get_cache_key(request) + + def test_extract_parameter_names(self): + """Test parameter extraction from SQL templates.""" + connection_config = ExternalSQLDBConnectionConfig( + db_adapter=DatabaseTypeAdapter.POSTGRESQL, + host='localhost', + port=5432, + user='user', + password='password', + db_id='db') + + config = TableFieldsQueryConfig( + table_id="users", + where_clause_template="id = :id", + where_clause_fields=["id"]) + + handler = CloudSQLEnrichmentHandler( + connection_config=connection_config, query_config=config) + + # Test simple parameter extraction. + self.assertEqual(handler._extract_parameter_names("id = :id"), ["id"]) + + # Test multiple parameters. + self.assertEqual( + handler._extract_parameter_names("id = :id AND name = :name"), + ["id", "name"]) + + # Test no parameters. + self.assertEqual( + handler._extract_parameter_names("SELECT * FROM users"), []) + + # Test complex query. + self.assertEqual( + handler._extract_parameter_names( + "age > :min_age AND city = :city AND status = :status"), + ["min_age", "city", "status"]) + + def test_build_single_param_dict(self): + """Test building parameter dictionaries for single requests.""" + connection_config = ExternalSQLDBConnectionConfig( + db_adapter=DatabaseTypeAdapter.POSTGRESQL, + host='localhost', + port=5432, + user='user', + password='password', + db_id='db') + + # Test TableFieldsQueryConfig. + table_config = TableFieldsQueryConfig( + table_id="users", + where_clause_template="id = :id AND name = :name", + where_clause_fields=["id", "name"]) + + handler1 = CloudSQLEnrichmentHandler( + connection_config=connection_config, query_config=table_config) + + result1 = handler1._build_single_param_dict([123, "John"]) + self.assertEqual(result1, {"id": 123, "name": "John"}) + + # Test TableFunctionQueryConfig. + table_func_config = TableFunctionQueryConfig( + table_id="users", + where_clause_template="age > :min_age AND city = :city", + where_clause_value_fn=lambda row: [row.min_age, row.city]) + + handler2 = CloudSQLEnrichmentHandler( + connection_config=connection_config, query_config=table_func_config) + + result2 = handler2._build_single_param_dict([3, "NYC"]) + self.assertEqual(result2, {"min_age": 3, "city": "NYC"}) + + def test_extract_values_from_request(self): + """Test extracting values from requests based on query configuration.""" + connection_config = ExternalSQLDBConnectionConfig( + db_adapter=DatabaseTypeAdapter.POSTGRESQL, + host='localhost', + port=5432, + user='user', + password='password', + db_id='db') + + # Test TableFieldsQueryConfig. + table_config = TableFieldsQueryConfig( + table_id="users", + where_clause_template="id = :id AND name = :name", + where_clause_fields=["id", "name"]) + + handler1 = CloudSQLEnrichmentHandler( + connection_config=connection_config, query_config=table_config) + + import apache_beam as beam + request1 = beam.Row(id=123, name="John", age=30) + result1 = handler1._extract_values_from_request(request1) + self.assertEqual(result1, [123, "John"]) + + # Test TableFunctionQueryConfig. + def test_value_fn(row): + return [row.age, row.city] + + table_func_config = TableFunctionQueryConfig( + table_id="users", + where_clause_template="age > :min_age AND city = :city", + where_clause_value_fn=test_value_fn) + + handler2 = CloudSQLEnrichmentHandler( + connection_config=connection_config, query_config=table_func_config) + + request2 = beam.Row(age=25, city="NYC", name="Jane") + result2 = handler2._extract_values_from_request(request2) + self.assertEqual(result2, [25, "NYC"]) + + # Test missing field error. + request3 = beam.Row(age=30) # Missing name field. + with self.assertRaises(KeyError) as context: + handler1._extract_values_from_request(request3) + self.assertIn("where_clause_fields", str(context.exception)) + + def test_build_batch_query_single_request(self): + """Test batch query building with single request returns original query.""" + connection_config = ExternalSQLDBConnectionConfig( + db_adapter=DatabaseTypeAdapter.POSTGRESQL, + host='localhost', + port=5432, + user='user', + password='password', + db_id='db') + + config = TableFieldsQueryConfig( + table_id="users", + where_clause_template="id = :id", + where_clause_fields=["id"]) + + handler = CloudSQLEnrichmentHandler( + connection_config=connection_config, query_config=config) + + import apache_beam as beam + requests = [beam.Row(id=1)] + + # Single request should return original query template. + result = handler._build_batch_query(requests, batch_size=1) + self.assertEqual(result, "SELECT * FROM users WHERE id = :id") + + def test_build_batch_query_multiple_requests(self): + """Test batch query building with multiple requests creates OR clauses.""" + connection_config = ExternalSQLDBConnectionConfig( + db_adapter=DatabaseTypeAdapter.POSTGRESQL, + host='localhost', + port=5432, + user='user', + password='password', + db_id='db') + + config = TableFieldsQueryConfig( + table_id="users", + where_clause_template="id = :id AND name = :name", + where_clause_fields=["id", "name"]) + + handler = CloudSQLEnrichmentHandler( + connection_config=connection_config, query_config=config) + + import apache_beam as beam + requests = [beam.Row(id=1, name="Alice"), beam.Row(id=2, name="Bob")] + + result = handler._build_batch_query(requests, batch_size=2) + expected = ( + "SELECT * FROM users WHERE " + "(id = :batch_0_id AND name = :batch_0_name) OR " + "(id = :batch_1_id AND name = :batch_1_name)") + self.assertEqual(result, expected) + + def test_build_parameters_dict_batch(self): + """Test parameter dictionary building for batch requests.""" + connection_config = ExternalSQLDBConnectionConfig( + db_adapter=DatabaseTypeAdapter.POSTGRESQL, + host='localhost', + port=5432, + user='user', + password='password', + db_id='db') + + config = TableFieldsQueryConfig( + table_id="users", + where_clause_template="id = :id", + where_clause_fields=["id"]) + + handler = CloudSQLEnrichmentHandler( + connection_config=connection_config, query_config=config) + + import apache_beam as beam + requests = [beam.Row(id=1), beam.Row(id=2)] + + result = handler._build_parameters_dict(requests, batch_size=2) + expected = {"batch_0_id": 1, "batch_1_id": 2} + self.assertEqual(result, expected) + + def test_create_batch_clause(self): + """Test batch clause creation with unique parameter names.""" + connection_config = ExternalSQLDBConnectionConfig( + db_adapter=DatabaseTypeAdapter.POSTGRESQL, + host='localhost', + port=5432, + user='user', + password='password', + db_id='db') + + config = TableFieldsQueryConfig( + table_id="users", + where_clause_template="id = :id AND name = :name", + where_clause_fields=["id", "name"]) + + handler = CloudSQLEnrichmentHandler( + connection_config=connection_config, query_config=config) + + result = handler._create_batch_clause(batch_index=0) + expected = "(id = :batch_0_id AND name = :batch_0_name)" + self.assertEqual(result, expected) + + result2 = handler._create_batch_clause(batch_index=1) + expected2 = "(id = :batch_1_id AND name = :batch_1_name)" + self.assertEqual(result2, expected2) + + def test_security_parameter_extraction_edge_cases(self): + """Test parameter extraction with edge cases and security issues.""" + connection_config = ExternalSQLDBConnectionConfig( + db_adapter=DatabaseTypeAdapter.POSTGRESQL, + host='localhost', + port=5432, + user='user', + password='password', + db_id='db') + + config = TableFieldsQueryConfig( + table_id="users", + where_clause_template="id = :id", + where_clause_fields=["id"]) + + handler = CloudSQLEnrichmentHandler( + connection_config=connection_config, query_config=config) + + # Test empty template. + self.assertEqual(handler._extract_parameter_names(""), []) + + # Test template with no parameters. + self.assertEqual( + handler._extract_parameter_names("SELECT * FROM users"), []) + + # Test template with malformed parameters (should not match). + self.assertEqual(handler._extract_parameter_names("id = :"), []) + + # Test template with numeric parameter names. + self.assertEqual( + handler._extract_parameter_names("col = :param_123"), ["param_123"]) + + # Test template with underscore parameter names. + self.assertEqual( + handler._extract_parameter_names("col = :user_id"), ["user_id"]) + + # Test duplicate parameter names. + result = handler._extract_parameter_names("id = :id OR id = :id") + # Note: re.findall returns all matches, so we'd get ["id", "id"]. + self.assertEqual(result, ["id", "id"]) + + def test_build_single_param_dict_with_generic_names(self): + """Test parameter dict building with generic names.""" + connection_config = ExternalSQLDBConnectionConfig( + db_adapter=DatabaseTypeAdapter.POSTGRESQL, + host='localhost', + port=5432, + user='user', + password='password', + db_id='db') + + # Test TableFunctionQueryConfig with generic parameter names. + config = TableFunctionQueryConfig( + table_id="users", + where_clause_template="id = :param_0 AND name = :param_1", + where_clause_value_fn=lambda row: [row.id, row.name]) + + handler = CloudSQLEnrichmentHandler( + connection_config=connection_config, query_config=config) + + result = handler._build_single_param_dict([123, "John"]) + expected = {"param_0": 123, "param_1": "John"} + self.assertEqual(result, expected) + + def test_duplicate_binding_parameter_names_handling(self): + """Test handling of duplicate binding parameter names.""" + connection_config = ExternalSQLDBConnectionConfig( + db_adapter=DatabaseTypeAdapter.POSTGRESQL, + host='localhost', + port=5432, + user='user', + password='password', + db_id='db') + + # Test TableFunctionQueryConfig with duplicate parameter names. + config = TableFunctionQueryConfig( + table_id="users", + where_clause_template="id = :param AND name = :param", + where_clause_value_fn=lambda row: [row.id, row.name]) + + handler = CloudSQLEnrichmentHandler( + connection_config=connection_config, query_config=config) + + # Test that duplicate parameter names are made unique. + result = handler._build_single_param_dict([123, "John"]) + expected = {"param_0": 123, "param_1": "John"} + self.assertEqual(result, expected) + + # Test the helper method for template and params. + template = "id = :param AND name = :param" + values = [123, "John"] + updated_template, param_dict = handler._get_unique_template_and_params( + template, values) + + expected_template = "id = :param_0 AND name = :param_1" + expected_params = {"param_0": 123, "param_1": "John"} + self.assertEqual(updated_template, expected_template) + self.assertEqual(param_dict, expected_params) + + def test_unsupported_query_config_type_error(self): + """Test that unsupported query config types raise ValueError.""" + connection_config = ExternalSQLDBConnectionConfig( + db_adapter=DatabaseTypeAdapter.POSTGRESQL, + host='localhost', + port=5432, + user='user', + password='password', + db_id='db') + + config = TableFieldsQueryConfig( + table_id="users", + where_clause_template="id = :id", + where_clause_fields=["id"]) + + handler = CloudSQLEnrichmentHandler( + connection_config=connection_config, query_config=config) + + # Temporarily replace the config with an unsupported type. + handler._query_config = "unsupported_type" + + import apache_beam as beam + request = beam.Row(id=1) + + with self.assertRaises(ValueError) as context: + handler._extract_values_from_request(request) + self.assertIn( + "Unsupported query configuration type", str(context.exception)) + + +if __name__ == '__main__': + unittest.main() diff --git a/sdks/python/container/py310/base_image_requirements.txt b/sdks/python/container/py310/base_image_requirements.txt index 56c4826d5d1c..b1abff83eecf 100644 --- a/sdks/python/container/py310/base_image_requirements.txt +++ b/sdks/python/container/py310/base_image_requirements.txt @@ -37,12 +37,12 @@ build==1.3.0 cachetools==5.5.2 certifi==2025.8.3 cffi==1.17.1 -charset-normalizer==3.4.2 +charset-normalizer==3.4.3 click==8.2.1 -cloud-sql-python-connector==1.18.3 +cloud-sql-python-connector==1.18.4 crcmod==1.7 cryptography==45.0.6 -Cython==3.1.2 +Cython==3.1.3 dill==0.3.1.1 dnspython==2.7.0 docker==7.1.0 @@ -51,16 +51,16 @@ docstring_parser==0.17.0 exceptiongroup==1.3.0 execnet==2.1.1 fastavro==1.12.0 -fasteners==0.19 -freezegun==1.5.4 +fasteners==0.20 +freezegun==1.5.5 frozenlist==1.7.0 future==1.0.0 google-api-core==2.25.1 -google-api-python-client==2.178.0 +google-api-python-client==2.179.0 google-apitools==0.5.31 google-auth==2.40.3 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.108.0 +google-cloud-aiplatform==1.109.0 google-cloud-bigquery==3.35.1 google-cloud-bigquery-storage==2.32.0 google-cloud-bigtable==2.32.0 @@ -73,12 +73,12 @@ google-cloud-pubsub==2.31.1 google-cloud-pubsublite==1.12.0 google-cloud-recommendations-ai==0.10.18 google-cloud-resource-manager==1.14.2 -google-cloud-spanner==3.56.0 +google-cloud-spanner==3.57.0 google-cloud-storage==2.19.0 google-cloud-videointelligence==2.16.2 google-cloud-vision==3.10.2 google-crc32c==1.7.1 -google-genai==1.29.0 +google-genai==1.30.0 google-resumable-media==2.7.2 googleapis-common-protos==1.70.0 greenlet==3.2.4 @@ -92,7 +92,7 @@ hdfs==2.7.3 httpcore==1.0.9 httplib2==0.22.0 httpx==0.28.1 -hypothesis==6.137.1 +hypothesis==6.138.2 idna==3.10 importlib_metadata==8.7.0 iniconfig==2.1.0 @@ -112,7 +112,7 @@ milvus==2.3.5 mmh3==5.2.0 mock==5.2.0 more-itertools==10.7.0 -multidict==6.6.3 +multidict==6.6.4 mysql-connector-python==9.4.0 nltk==3.9.1 numpy==2.2.6 @@ -122,7 +122,7 @@ opentelemetry-api==1.36.0 opentelemetry-sdk==1.36.0 opentelemetry-semantic-conventions==0.57b0 oracledb==3.3.0 -orjson==3.11.1 +orjson==3.11.2 overrides==7.7.0 packaging==25.0 pandas==2.2.3 @@ -173,7 +173,7 @@ six==1.17.0 sniffio==1.3.1 sortedcontainers==2.4.0 soupsieve==2.7 -SQLAlchemy==2.0.42 +SQLAlchemy==2.0.43 sqlalchemy_pytds==1.0.2 sqlparse==0.5.3 tenacity==8.5.0 @@ -189,7 +189,7 @@ urllib3==2.5.0 virtualenv-clone==0.5.7 websockets==15.0.1 wheel==0.45.1 -wrapt==1.17.2 +wrapt==1.17.3 yarl==1.20.1 zipp==3.23.0 zstandard==0.23.0 diff --git a/sdks/python/container/py310/ml_image_requirements.txt b/sdks/python/container/py310/ml_image_requirements.txt index db788b4afc3a..9943a7e2e23d 100644 --- a/sdks/python/container/py310/ml_image_requirements.txt +++ b/sdks/python/container/py310/ml_image_requirements.txt @@ -39,12 +39,12 @@ build==1.3.0 cachetools==5.5.2 certifi==2025.8.3 cffi==1.17.1 -charset-normalizer==3.4.2 +charset-normalizer==3.4.3 click==8.2.1 -cloud-sql-python-connector==1.18.3 +cloud-sql-python-connector==1.18.4 crcmod==1.7 cryptography==45.0.6 -Cython==3.1.2 +Cython==3.1.3 dill==0.3.1.1 dnspython==2.7.0 docker==7.1.0 @@ -53,20 +53,20 @@ docstring_parser==0.17.0 exceptiongroup==1.3.0 execnet==2.1.1 fastavro==1.12.0 -fasteners==0.19 -filelock==3.18.0 +fasteners==0.20 +filelock==3.19.1 flatbuffers==25.2.10 -freezegun==1.5.4 +freezegun==1.5.5 frozenlist==1.7.0 fsspec==2025.7.0 future==1.0.0 gast==0.6.0 google-api-core==2.25.1 -google-api-python-client==2.178.0 +google-api-python-client==2.179.0 google-apitools==0.5.31 google-auth==2.40.3 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.108.0 +google-cloud-aiplatform==1.109.0 google-cloud-bigquery==3.35.1 google-cloud-bigquery-storage==2.32.0 google-cloud-bigtable==2.32.0 @@ -79,12 +79,12 @@ google-cloud-pubsub==2.31.1 google-cloud-pubsublite==1.12.0 google-cloud-recommendations-ai==0.10.18 google-cloud-resource-manager==1.14.2 -google-cloud-spanner==3.56.0 +google-cloud-spanner==3.57.0 google-cloud-storage==2.19.0 google-cloud-videointelligence==2.16.2 google-cloud-vision==3.10.2 google-crc32c==1.7.1 -google-genai==1.29.0 +google-genai==1.30.0 google-pasta==0.2.0 google-resumable-media==2.7.2 googleapis-common-protos==1.70.0 @@ -102,7 +102,7 @@ httpcore==1.0.9 httplib2==0.22.0 httpx==0.28.1 huggingface-hub==0.34.4 -hypothesis==6.137.1 +hypothesis==6.138.2 idna==3.10 importlib_metadata==8.7.0 iniconfig==2.1.0 @@ -115,12 +115,12 @@ joblib==1.5.1 jsonpickle==3.4.2 jsonschema==4.25.0 jsonschema-specifications==2025.4.1 -keras==3.11.1 +keras==3.11.2 keyring==25.6.0 keyrings.google-artifactregistry-auth==1.1.2 libclang==18.1.1 Markdown==3.8.2 -markdown-it-py==3.0.0 +markdown-it-py==4.0.0 MarkupSafe==3.0.2 mdurl==0.1.2 milvus==2.3.5 @@ -129,7 +129,7 @@ mmh3==5.2.0 mock==5.2.0 more-itertools==10.7.0 mpmath==1.3.0 -multidict==6.6.3 +multidict==6.6.4 mysql-connector-python==9.4.0 namex==0.1.0 networkx==3.4.2 @@ -157,7 +157,7 @@ opentelemetry-semantic-conventions==0.57b0 opt_einsum==3.4.0 optree==0.17.0 oracledb==3.3.0 -orjson==3.11.1 +orjson==3.11.2 overrides==7.7.0 packaging==25.0 pandas==2.2.3 @@ -211,7 +211,7 @@ six==1.17.0 sniffio==1.3.1 sortedcontainers==2.4.0 soupsieve==2.7 -SQLAlchemy==2.0.42 +SQLAlchemy==2.0.43 sqlalchemy_pytds==1.0.2 sqlparse==0.5.3 sympy==1.14.0 @@ -228,7 +228,7 @@ tokenizers==0.21.4 tomli==2.2.1 torch==2.7.1 tqdm==4.67.1 -transformers==4.48.3 +transformers==4.55.2 triton==3.3.1 typing-inspection==0.4.1 typing_extensions==4.14.1 @@ -239,7 +239,7 @@ virtualenv-clone==0.5.7 websockets==15.0.1 Werkzeug==3.1.3 wheel==0.45.1 -wrapt==1.17.2 +wrapt==1.17.3 yarl==1.20.1 zipp==3.23.0 zstandard==0.23.0 diff --git a/sdks/python/container/py311/base_image_requirements.txt b/sdks/python/container/py311/base_image_requirements.txt index b751c7b71649..b9991224e751 100644 --- a/sdks/python/container/py311/base_image_requirements.txt +++ b/sdks/python/container/py311/base_image_requirements.txt @@ -36,12 +36,12 @@ build==1.3.0 cachetools==5.5.2 certifi==2025.8.3 cffi==1.17.1 -charset-normalizer==3.4.2 +charset-normalizer==3.4.3 click==8.2.1 -cloud-sql-python-connector==1.18.3 +cloud-sql-python-connector==1.18.4 crcmod==1.7 cryptography==45.0.6 -Cython==3.1.2 +Cython==3.1.3 dill==0.3.1.1 dnspython==2.7.0 docker==7.1.0 @@ -49,16 +49,16 @@ docopt==0.6.2 docstring_parser==0.17.0 execnet==2.1.1 fastavro==1.12.0 -fasteners==0.19 -freezegun==1.5.4 +fasteners==0.20 +freezegun==1.5.5 frozenlist==1.7.0 future==1.0.0 google-api-core==2.25.1 -google-api-python-client==2.178.0 +google-api-python-client==2.179.0 google-apitools==0.5.31 google-auth==2.40.3 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.108.0 +google-cloud-aiplatform==1.109.0 google-cloud-bigquery==3.35.1 google-cloud-bigquery-storage==2.32.0 google-cloud-bigtable==2.32.0 @@ -71,12 +71,12 @@ google-cloud-pubsub==2.31.1 google-cloud-pubsublite==1.12.0 google-cloud-recommendations-ai==0.10.18 google-cloud-resource-manager==1.14.2 -google-cloud-spanner==3.56.0 +google-cloud-spanner==3.57.0 google-cloud-storage==2.19.0 google-cloud-videointelligence==2.16.2 google-cloud-vision==3.10.2 google-crc32c==1.7.1 -google-genai==1.29.0 +google-genai==1.30.0 google-resumable-media==2.7.2 googleapis-common-protos==1.70.0 greenlet==3.2.4 @@ -90,7 +90,7 @@ hdfs==2.7.3 httpcore==1.0.9 httplib2==0.22.0 httpx==0.28.1 -hypothesis==6.137.1 +hypothesis==6.138.2 idna==3.10 importlib_metadata==8.7.0 iniconfig==2.1.0 @@ -110,7 +110,7 @@ milvus==2.3.5 mmh3==5.2.0 mock==5.2.0 more-itertools==10.7.0 -multidict==6.6.3 +multidict==6.6.4 mysql-connector-python==9.4.0 nltk==3.9.1 numpy==2.2.6 @@ -120,7 +120,7 @@ opentelemetry-api==1.36.0 opentelemetry-sdk==1.36.0 opentelemetry-semantic-conventions==0.57b0 oracledb==3.3.0 -orjson==3.11.1 +orjson==3.11.2 overrides==7.7.0 packaging==25.0 pandas==2.2.3 @@ -171,7 +171,7 @@ six==1.17.0 sniffio==1.3.1 sortedcontainers==2.4.0 soupsieve==2.7 -SQLAlchemy==2.0.42 +SQLAlchemy==2.0.43 sqlalchemy_pytds==1.0.2 sqlparse==0.5.3 tenacity==8.5.0 @@ -186,7 +186,7 @@ urllib3==2.5.0 virtualenv-clone==0.5.7 websockets==15.0.1 wheel==0.45.1 -wrapt==1.17.2 +wrapt==1.17.3 yarl==1.20.1 zipp==3.23.0 zstandard==0.23.0 diff --git a/sdks/python/container/py311/ml_image_requirements.txt b/sdks/python/container/py311/ml_image_requirements.txt index a08e10eeb14f..ea5312dcd05a 100644 --- a/sdks/python/container/py311/ml_image_requirements.txt +++ b/sdks/python/container/py311/ml_image_requirements.txt @@ -38,12 +38,12 @@ build==1.3.0 cachetools==5.5.2 certifi==2025.8.3 cffi==1.17.1 -charset-normalizer==3.4.2 +charset-normalizer==3.4.3 click==8.2.1 -cloud-sql-python-connector==1.18.3 +cloud-sql-python-connector==1.18.4 crcmod==1.7 cryptography==45.0.6 -Cython==3.1.2 +Cython==3.1.3 dill==0.3.1.1 dnspython==2.7.0 docker==7.1.0 @@ -51,20 +51,20 @@ docopt==0.6.2 docstring_parser==0.17.0 execnet==2.1.1 fastavro==1.12.0 -fasteners==0.19 -filelock==3.18.0 +fasteners==0.20 +filelock==3.19.1 flatbuffers==25.2.10 -freezegun==1.5.4 +freezegun==1.5.5 frozenlist==1.7.0 fsspec==2025.7.0 future==1.0.0 gast==0.6.0 google-api-core==2.25.1 -google-api-python-client==2.178.0 +google-api-python-client==2.179.0 google-apitools==0.5.31 google-auth==2.40.3 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.108.0 +google-cloud-aiplatform==1.109.0 google-cloud-bigquery==3.35.1 google-cloud-bigquery-storage==2.32.0 google-cloud-bigtable==2.32.0 @@ -77,12 +77,12 @@ google-cloud-pubsub==2.31.1 google-cloud-pubsublite==1.12.0 google-cloud-recommendations-ai==0.10.18 google-cloud-resource-manager==1.14.2 -google-cloud-spanner==3.56.0 +google-cloud-spanner==3.57.0 google-cloud-storage==2.19.0 google-cloud-videointelligence==2.16.2 google-cloud-vision==3.10.2 google-crc32c==1.7.1 -google-genai==1.29.0 +google-genai==1.30.0 google-pasta==0.2.0 google-resumable-media==2.7.2 googleapis-common-protos==1.70.0 @@ -100,7 +100,7 @@ httpcore==1.0.9 httplib2==0.22.0 httpx==0.28.1 huggingface-hub==0.34.4 -hypothesis==6.137.1 +hypothesis==6.138.2 idna==3.10 importlib_metadata==8.7.0 iniconfig==2.1.0 @@ -113,12 +113,12 @@ joblib==1.5.1 jsonpickle==3.4.2 jsonschema==4.25.0 jsonschema-specifications==2025.4.1 -keras==3.11.1 +keras==3.11.2 keyring==25.6.0 keyrings.google-artifactregistry-auth==1.1.2 libclang==18.1.1 Markdown==3.8.2 -markdown-it-py==3.0.0 +markdown-it-py==4.0.0 MarkupSafe==3.0.2 mdurl==0.1.2 milvus==2.3.5 @@ -127,7 +127,7 @@ mmh3==5.2.0 mock==5.2.0 more-itertools==10.7.0 mpmath==1.3.0 -multidict==6.6.3 +multidict==6.6.4 mysql-connector-python==9.4.0 namex==0.1.0 networkx==3.5 @@ -155,7 +155,7 @@ opentelemetry-semantic-conventions==0.57b0 opt_einsum==3.4.0 optree==0.17.0 oracledb==3.3.0 -orjson==3.11.1 +orjson==3.11.2 overrides==7.7.0 packaging==25.0 pandas==2.2.3 @@ -209,7 +209,7 @@ six==1.17.0 sniffio==1.3.1 sortedcontainers==2.4.0 soupsieve==2.7 -SQLAlchemy==2.0.42 +SQLAlchemy==2.0.43 sqlalchemy_pytds==1.0.2 sqlparse==0.5.3 sympy==1.14.0 @@ -225,7 +225,7 @@ threadpoolctl==3.6.0 tokenizers==0.21.4 torch==2.7.1 tqdm==4.67.1 -transformers==4.48.3 +transformers==4.55.2 triton==3.3.1 typing-inspection==0.4.1 typing_extensions==4.14.1 @@ -236,7 +236,7 @@ virtualenv-clone==0.5.7 websockets==15.0.1 Werkzeug==3.1.3 wheel==0.45.1 -wrapt==1.17.2 +wrapt==1.17.3 yarl==1.20.1 zipp==3.23.0 zstandard==0.23.0 diff --git a/sdks/python/container/py312/base_image_requirements.txt b/sdks/python/container/py312/base_image_requirements.txt index d657ca17a617..d03a7fd605d9 100644 --- a/sdks/python/container/py312/base_image_requirements.txt +++ b/sdks/python/container/py312/base_image_requirements.txt @@ -35,12 +35,12 @@ build==1.3.0 cachetools==5.5.2 certifi==2025.8.3 cffi==1.17.1 -charset-normalizer==3.4.2 +charset-normalizer==3.4.3 click==8.2.1 -cloud-sql-python-connector==1.18.3 +cloud-sql-python-connector==1.18.4 crcmod==1.7 cryptography==45.0.6 -Cython==3.1.2 +Cython==3.1.3 dill==0.3.1.1 dnspython==2.7.0 docker==7.1.0 @@ -48,16 +48,16 @@ docopt==0.6.2 docstring_parser==0.17.0 execnet==2.1.1 fastavro==1.12.0 -fasteners==0.19 -freezegun==1.5.4 +fasteners==0.20 +freezegun==1.5.5 frozenlist==1.7.0 future==1.0.0 google-api-core==2.25.1 -google-api-python-client==2.178.0 +google-api-python-client==2.179.0 google-apitools==0.5.31 google-auth==2.40.3 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.108.0 +google-cloud-aiplatform==1.109.0 google-cloud-bigquery==3.35.1 google-cloud-bigquery-storage==2.32.0 google-cloud-bigtable==2.32.0 @@ -70,12 +70,12 @@ google-cloud-pubsub==2.31.1 google-cloud-pubsublite==1.12.0 google-cloud-recommendations-ai==0.10.18 google-cloud-resource-manager==1.14.2 -google-cloud-spanner==3.56.0 +google-cloud-spanner==3.57.0 google-cloud-storage==2.19.0 google-cloud-videointelligence==2.16.2 google-cloud-vision==3.10.2 google-crc32c==1.7.1 -google-genai==1.29.0 +google-genai==1.30.0 google-resumable-media==2.7.2 googleapis-common-protos==1.70.0 greenlet==3.2.4 @@ -89,7 +89,7 @@ hdfs==2.7.3 httpcore==1.0.9 httplib2==0.22.0 httpx==0.28.1 -hypothesis==6.137.1 +hypothesis==6.138.2 idna==3.10 importlib_metadata==8.7.0 iniconfig==2.1.0 @@ -109,7 +109,7 @@ milvus==2.3.5 mmh3==5.2.0 mock==5.2.0 more-itertools==10.7.0 -multidict==6.6.3 +multidict==6.6.4 mysql-connector-python==9.4.0 nltk==3.9.1 numpy==2.2.6 @@ -119,7 +119,7 @@ opentelemetry-api==1.36.0 opentelemetry-sdk==1.36.0 opentelemetry-semantic-conventions==0.57b0 oracledb==3.3.0 -orjson==3.11.1 +orjson==3.11.2 overrides==7.7.0 packaging==25.0 pandas==2.2.3 @@ -170,7 +170,7 @@ six==1.17.0 sniffio==1.3.1 sortedcontainers==2.4.0 soupsieve==2.7 -SQLAlchemy==2.0.42 +SQLAlchemy==2.0.43 sqlalchemy_pytds==1.0.2 sqlparse==0.5.3 tenacity==8.5.0 @@ -185,7 +185,7 @@ urllib3==2.5.0 virtualenv-clone==0.5.7 websockets==15.0.1 wheel==0.45.1 -wrapt==1.17.2 +wrapt==1.17.3 yarl==1.20.1 zipp==3.23.0 zstandard==0.23.0 diff --git a/sdks/python/container/py312/ml_image_requirements.txt b/sdks/python/container/py312/ml_image_requirements.txt index 22e6617f3446..4c977e0bba68 100644 --- a/sdks/python/container/py312/ml_image_requirements.txt +++ b/sdks/python/container/py312/ml_image_requirements.txt @@ -37,12 +37,12 @@ build==1.3.0 cachetools==5.5.2 certifi==2025.8.3 cffi==1.17.1 -charset-normalizer==3.4.2 +charset-normalizer==3.4.3 click==8.2.1 -cloud-sql-python-connector==1.18.3 +cloud-sql-python-connector==1.18.4 crcmod==1.7 cryptography==45.0.6 -Cython==3.1.2 +Cython==3.1.3 dill==0.3.1.1 dnspython==2.7.0 docker==7.1.0 @@ -50,20 +50,20 @@ docopt==0.6.2 docstring_parser==0.17.0 execnet==2.1.1 fastavro==1.12.0 -fasteners==0.19 -filelock==3.18.0 +fasteners==0.20 +filelock==3.19.1 flatbuffers==25.2.10 -freezegun==1.5.4 +freezegun==1.5.5 frozenlist==1.7.0 fsspec==2025.7.0 future==1.0.0 gast==0.6.0 google-api-core==2.25.1 -google-api-python-client==2.178.0 +google-api-python-client==2.179.0 google-apitools==0.5.31 google-auth==2.40.3 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.108.0 +google-cloud-aiplatform==1.109.0 google-cloud-bigquery==3.35.1 google-cloud-bigquery-storage==2.32.0 google-cloud-bigtable==2.32.0 @@ -76,12 +76,12 @@ google-cloud-pubsub==2.31.1 google-cloud-pubsublite==1.12.0 google-cloud-recommendations-ai==0.10.18 google-cloud-resource-manager==1.14.2 -google-cloud-spanner==3.56.0 +google-cloud-spanner==3.57.0 google-cloud-storage==2.19.0 google-cloud-videointelligence==2.16.2 google-cloud-vision==3.10.2 google-crc32c==1.7.1 -google-genai==1.29.0 +google-genai==1.30.0 google-pasta==0.2.0 google-resumable-media==2.7.2 googleapis-common-protos==1.70.0 @@ -99,7 +99,7 @@ httpcore==1.0.9 httplib2==0.22.0 httpx==0.28.1 huggingface-hub==0.34.4 -hypothesis==6.137.1 +hypothesis==6.138.2 idna==3.10 importlib_metadata==8.7.0 iniconfig==2.1.0 @@ -112,12 +112,12 @@ joblib==1.5.1 jsonpickle==3.4.2 jsonschema==4.25.0 jsonschema-specifications==2025.4.1 -keras==3.11.1 +keras==3.11.2 keyring==25.6.0 keyrings.google-artifactregistry-auth==1.1.2 libclang==18.1.1 Markdown==3.8.2 -markdown-it-py==3.0.0 +markdown-it-py==4.0.0 MarkupSafe==3.0.2 mdurl==0.1.2 milvus==2.3.5 @@ -126,7 +126,7 @@ mmh3==5.2.0 mock==5.2.0 more-itertools==10.7.0 mpmath==1.3.0 -multidict==6.6.3 +multidict==6.6.4 mysql-connector-python==9.4.0 namex==0.1.0 networkx==3.5 @@ -154,7 +154,7 @@ opentelemetry-semantic-conventions==0.57b0 opt_einsum==3.4.0 optree==0.17.0 oracledb==3.3.0 -orjson==3.11.1 +orjson==3.11.2 overrides==7.7.0 packaging==25.0 pandas==2.2.3 @@ -208,7 +208,7 @@ six==1.17.0 sniffio==1.3.1 sortedcontainers==2.4.0 soupsieve==2.7 -SQLAlchemy==2.0.42 +SQLAlchemy==2.0.43 sqlalchemy_pytds==1.0.2 sqlparse==0.5.3 sympy==1.14.0 @@ -223,7 +223,7 @@ threadpoolctl==3.6.0 tokenizers==0.21.4 torch==2.7.1 tqdm==4.67.1 -transformers==4.48.3 +transformers==4.55.2 triton==3.3.1 typing-inspection==0.4.1 typing_extensions==4.14.1 @@ -234,7 +234,7 @@ virtualenv-clone==0.5.7 websockets==15.0.1 Werkzeug==3.1.3 wheel==0.45.1 -wrapt==1.17.2 +wrapt==1.17.3 yarl==1.20.1 zipp==3.23.0 zstandard==0.23.0 diff --git a/sdks/python/container/py313/base_image_requirements.txt b/sdks/python/container/py313/base_image_requirements.txt index c0d16468fbc5..7111f0d552c9 100644 --- a/sdks/python/container/py313/base_image_requirements.txt +++ b/sdks/python/container/py313/base_image_requirements.txt @@ -35,12 +35,12 @@ build==1.3.0 cachetools==5.5.2 certifi==2025.8.3 cffi==1.17.1 -charset-normalizer==3.4.2 +charset-normalizer==3.4.3 click==8.2.1 -cloud-sql-python-connector==1.18.3 +cloud-sql-python-connector==1.18.4 crcmod==1.7 cryptography==45.0.6 -Cython==3.1.2 +Cython==3.1.3 dill==0.3.1.1 dnspython==2.7.0 docker==7.1.0 @@ -48,15 +48,15 @@ docopt==0.6.2 docstring_parser==0.17.0 execnet==2.1.1 fastavro==1.12.0 -fasteners==0.19 -freezegun==1.5.4 +fasteners==0.20 +freezegun==1.5.5 frozenlist==1.7.0 future==1.0.0 google-api-core==2.25.1 google-apitools==0.5.32 google-auth==2.40.3 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.108.0 +google-cloud-aiplatform==1.109.0 google-cloud-bigquery==3.35.1 google-cloud-bigquery-storage==2.32.0 google-cloud-bigtable==2.32.0 @@ -68,12 +68,12 @@ google-cloud-pubsub==2.31.1 google-cloud-pubsublite==1.12.0 google-cloud-recommendations-ai==0.10.18 google-cloud-resource-manager==1.14.2 -google-cloud-spanner==3.56.0 +google-cloud-spanner==3.57.0 google-cloud-storage==2.19.0 google-cloud-videointelligence==2.16.2 google-cloud-vision==3.10.2 google-crc32c==1.7.1 -google-genai==1.29.0 +google-genai==1.30.0 google-resumable-media==2.7.2 googleapis-common-protos==1.70.0 greenlet==3.2.4 @@ -87,7 +87,7 @@ hdfs==2.7.3 httpcore==1.0.9 httplib2==0.22.0 httpx==0.28.1 -hypothesis==6.137.1 +hypothesis==6.138.2 idna==3.10 importlib_metadata==8.7.0 iniconfig==2.1.0 @@ -107,7 +107,7 @@ milvus==2.3.5 mmh3==5.2.0 mock==5.2.0 more-itertools==10.7.0 -multidict==6.6.3 +multidict==6.6.4 mysql-connector-python==9.4.0 nltk==3.9.1 numpy==2.2.6 @@ -117,7 +117,7 @@ opentelemetry-api==1.36.0 opentelemetry-sdk==1.36.0 opentelemetry-semantic-conventions==0.57b0 oracledb==3.3.0 -orjson==3.11.1 +orjson==3.11.2 overrides==7.7.0 packaging==25.0 pandas==2.2.3 @@ -168,7 +168,7 @@ six==1.17.0 sniffio==1.3.1 sortedcontainers==2.4.0 soupsieve==2.7 -SQLAlchemy==2.0.42 +SQLAlchemy==2.0.43 sqlalchemy_pytds==1.0.2 sqlparse==0.5.3 tenacity==8.5.0 @@ -182,7 +182,7 @@ urllib3==2.5.0 virtualenv-clone==0.5.7 websockets==15.0.1 wheel==0.45.1 -wrapt==1.17.2 +wrapt==1.17.3 yarl==1.20.1 zipp==3.23.0 zstandard==0.23.0 diff --git a/sdks/python/container/py39/base_image_requirements.txt b/sdks/python/container/py39/base_image_requirements.txt index b38ba013e02a..9be1605c850d 100644 --- a/sdks/python/container/py39/base_image_requirements.txt +++ b/sdks/python/container/py39/base_image_requirements.txt @@ -37,12 +37,12 @@ build==1.3.0 cachetools==5.5.2 certifi==2025.8.3 cffi==1.17.1 -charset-normalizer==3.4.2 +charset-normalizer==3.4.3 click==8.1.8 -cloud-sql-python-connector==1.18.3 +cloud-sql-python-connector==1.18.4 crcmod==1.7 cryptography==45.0.6 -Cython==3.1.2 +Cython==3.1.3 dill==0.3.1.1 dnspython==2.7.0 docker==7.1.0 @@ -51,16 +51,16 @@ docstring_parser==0.17.0 exceptiongroup==1.3.0 execnet==2.1.1 fastavro==1.12.0 -fasteners==0.19 -freezegun==1.5.4 +fasteners==0.20 +freezegun==1.5.5 frozenlist==1.7.0 future==1.0.0 google-api-core==2.25.1 -google-api-python-client==2.178.0 +google-api-python-client==2.179.0 google-apitools==0.5.31 google-auth==2.40.3 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.108.0 +google-cloud-aiplatform==1.109.0 google-cloud-bigquery==3.35.1 google-cloud-bigquery-storage==2.32.0 google-cloud-bigtable==2.32.0 @@ -73,12 +73,12 @@ google-cloud-pubsub==2.31.1 google-cloud-pubsublite==1.12.0 google-cloud-recommendations-ai==0.10.18 google-cloud-resource-manager==1.14.2 -google-cloud-spanner==3.56.0 +google-cloud-spanner==3.57.0 google-cloud-storage==2.19.0 google-cloud-videointelligence==2.16.2 google-cloud-vision==3.10.2 google-crc32c==1.7.1 -google-genai==1.29.0 +google-genai==1.30.0 google-resumable-media==2.7.2 googleapis-common-protos==1.70.0 greenlet==3.2.4 @@ -92,7 +92,7 @@ hdfs==2.7.3 httpcore==1.0.9 httplib2==0.22.0 httpx==0.28.1 -hypothesis==6.137.1 +hypothesis==6.138.2 idna==3.10 importlib_metadata==8.7.0 iniconfig==2.1.0 @@ -112,7 +112,7 @@ milvus==2.3.5 mmh3==5.2.0 mock==5.2.0 more-itertools==10.7.0 -multidict==6.6.3 +multidict==6.6.4 mysql-connector-python==9.4.0 nltk==3.9.1 numpy==2.0.2 @@ -122,7 +122,7 @@ opentelemetry-api==1.36.0 opentelemetry-sdk==1.36.0 opentelemetry-semantic-conventions==0.57b0 oracledb==3.3.0 -orjson==3.11.1 +orjson==3.11.2 overrides==7.7.0 packaging==25.0 pandas==2.2.3 @@ -173,7 +173,7 @@ six==1.17.0 sniffio==1.3.1 sortedcontainers==2.4.0 soupsieve==2.7 -SQLAlchemy==2.0.42 +SQLAlchemy==2.0.43 sqlalchemy_pytds==1.0.2 sqlparse==0.5.3 tenacity==8.5.0 @@ -189,7 +189,7 @@ urllib3==2.5.0 virtualenv-clone==0.5.7 websockets==15.0.1 wheel==0.45.1 -wrapt==1.17.2 +wrapt==1.17.3 yarl==1.20.1 zipp==3.23.0 zstandard==0.23.0 diff --git a/sdks/python/container/py39/ml_image_requirements.txt b/sdks/python/container/py39/ml_image_requirements.txt index f17254f8bedd..a6a77834c550 100644 --- a/sdks/python/container/py39/ml_image_requirements.txt +++ b/sdks/python/container/py39/ml_image_requirements.txt @@ -39,12 +39,12 @@ build==1.3.0 cachetools==5.5.2 certifi==2025.8.3 cffi==1.17.1 -charset-normalizer==3.4.2 +charset-normalizer==3.4.3 click==8.1.8 -cloud-sql-python-connector==1.18.3 +cloud-sql-python-connector==1.18.4 crcmod==1.7 cryptography==45.0.6 -Cython==3.1.2 +Cython==3.1.3 dill==0.3.1.1 dnspython==2.7.0 docker==7.1.0 @@ -53,20 +53,20 @@ docstring_parser==0.17.0 exceptiongroup==1.3.0 execnet==2.1.1 fastavro==1.12.0 -fasteners==0.19 -filelock==3.18.0 +fasteners==0.20 +filelock==3.19.1 flatbuffers==25.2.10 -freezegun==1.5.4 +freezegun==1.5.5 frozenlist==1.7.0 fsspec==2025.7.0 future==1.0.0 gast==0.6.0 google-api-core==2.25.1 -google-api-python-client==2.178.0 +google-api-python-client==2.179.0 google-apitools==0.5.31 google-auth==2.40.3 google-auth-httplib2==0.2.0 -google-cloud-aiplatform==1.108.0 +google-cloud-aiplatform==1.109.0 google-cloud-bigquery==3.35.1 google-cloud-bigquery-storage==2.32.0 google-cloud-bigtable==2.32.0 @@ -79,12 +79,12 @@ google-cloud-pubsub==2.31.1 google-cloud-pubsublite==1.12.0 google-cloud-recommendations-ai==0.10.18 google-cloud-resource-manager==1.14.2 -google-cloud-spanner==3.56.0 +google-cloud-spanner==3.57.0 google-cloud-storage==2.19.0 google-cloud-videointelligence==2.16.2 google-cloud-vision==3.10.2 google-crc32c==1.7.1 -google-genai==1.29.0 +google-genai==1.30.0 google-pasta==0.2.0 google-resumable-media==2.7.2 googleapis-common-protos==1.70.0 @@ -102,7 +102,7 @@ httpcore==1.0.9 httplib2==0.22.0 httpx==0.28.1 huggingface-hub==0.34.4 -hypothesis==6.137.1 +hypothesis==6.138.2 idna==3.10 importlib_metadata==8.7.0 iniconfig==2.1.0 @@ -129,7 +129,7 @@ mmh3==5.2.0 mock==5.2.0 more-itertools==10.7.0 mpmath==1.3.0 -multidict==6.6.3 +multidict==6.6.4 mysql-connector-python==9.4.0 namex==0.1.0 networkx==3.2.1 @@ -157,7 +157,7 @@ opentelemetry-semantic-conventions==0.57b0 opt_einsum==3.4.0 optree==0.17.0 oracledb==3.3.0 -orjson==3.11.1 +orjson==3.11.2 overrides==7.7.0 packaging==25.0 pandas==2.2.3 @@ -211,7 +211,7 @@ six==1.17.0 sniffio==1.3.1 sortedcontainers==2.4.0 soupsieve==2.7 -SQLAlchemy==2.0.42 +SQLAlchemy==2.0.43 sqlalchemy_pytds==1.0.2 sqlparse==0.5.3 sympy==1.14.0 @@ -228,7 +228,7 @@ tokenizers==0.21.4 tomli==2.2.1 torch==2.7.1 tqdm==4.67.1 -transformers==4.48.3 +transformers==4.55.2 triton==3.3.1 typing-inspection==0.4.1 typing_extensions==4.14.1 @@ -239,7 +239,7 @@ virtualenv-clone==0.5.7 websockets==15.0.1 Werkzeug==3.1.3 wheel==0.45.1 -wrapt==1.17.2 +wrapt==1.17.3 yarl==1.20.1 zipp==3.23.0 zstandard==0.23.0 diff --git a/sdks/python/setup.py b/sdks/python/setup.py index f7cd2a808a9a..9702ba4746ea 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -444,6 +444,8 @@ def get_portability_package_data(): 'mysql-connector-python>=9.3.0', 'python-tds>=1.16.1', 'sqlalchemy-pytds>=1.0.2', + 'pg8000>=1.31.1', + "PyMySQL>=1.1.0", 'oracledb>=3.1.1', 'milvus' ], @@ -474,6 +476,10 @@ def get_portability_package_data(): 'google-cloud-vision>=2,<4', 'google-cloud-recommendations-ai>=0.1.0,<0.11.0', 'google-cloud-aiplatform>=1.26.0, < 2.0', + 'cloud-sql-python-connector>=1.18.2,<2.0.0', + 'python-tds>=1.16.1', + 'pg8000>=1.31.1', + "PyMySQL>=1.1.0", # Authentication for Google Artifact Registry when using # --extra-index-url or --index-url in requirements.txt in # Dataflow, which allows installing python packages from private