From 0ebfdf8034f4fba233e7a381c7bb19b8e1ede451 Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Sun, 18 Jan 2026 12:01:37 +0000 Subject: [PATCH 1/7] embeddings: Fix OpenAI embeddings dimensions parameter type --- sdks/python/apache_beam/ml/transforms/embeddings/open_ai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/open_ai.py b/sdks/python/apache_beam/ml/transforms/embeddings/open_ai.py index 2092fa5cba9a..67d1b96db7c6 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/open_ai.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/open_ai.py @@ -109,7 +109,7 @@ def request( "input": batch, } if self.dimensions: - kwargs["dimensions"] = [str(self.dimensions)] + kwargs["dimensions"] = self.dimensions if self.user: kwargs["user"] = self.user From 69473af168ef9a3ff9381c418b4b5a5b8aa28df7 Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Sun, 18 Jan 2026 12:20:16 +0000 Subject: [PATCH 2/7] embeddings: use scale test validation --- .../ml/transforms/embeddings/open_ai_it_test.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/open_ai_it_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/open_ai_it_test.py index 118c656c33c3..b02a095ba67d 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/open_ai_it_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/open_ai_it_test.py @@ -24,14 +24,12 @@ from apache_beam.ml.transforms import base from apache_beam.ml.transforms.base import MLTransform -try: - from sdks.python.apache_beam.ml.transforms.embeddings.open_ai import OpenAITextEmbeddings -except ImportError: - OpenAITextEmbeddings = None +from apache_beam.ml.transforms.embeddings.open_ai import OpenAITextEmbeddings # pylint: disable=ungrouped-imports try: import tensorflow_transform as tft + from apache_beam.ml.transforms.tft import ScaleTo01 except ImportError: tft = None @@ -76,6 +74,7 @@ def test_embeddings_with_scale_to_0_1(self): columns=[test_query_column], api_key=self.api_key, ) + scale_config = ScaleTo01(columns=['embedding']) with beam.Pipeline() as pipeline: transformed_pcoll = ( pipeline @@ -84,10 +83,12 @@ def test_embeddings_with_scale_to_0_1(self): }]) | "MLTransform" >> MLTransform( write_artifact_location=self.artifact_location).with_transform( - embedding_config)) + embedding_config)).with_transform(scale_config) def assert_element(element): - assert max(element.feature_1) == 1 + embedding_values = element.embedding + assert 0 <= max(embedding_values) <= 1 + assert 0 <= min(embedding_values) <= 1 _ = (transformed_pcoll | beam.Map(assert_element)) @@ -186,7 +187,7 @@ def test_with_int_data_types(self): write_artifact_location=self.artifact_location).with_transform( embedding_config)) - def test_with_artifact_location(self): # pylint: disable=line-too-long + def test_with_artifact_location(self): """Local artifact location test""" secondary_artifact_location = tempfile.mkdtemp( prefix='_openai_secondary_test') @@ -231,7 +232,7 @@ def assert_element(element): # Clean up the temporary directory shutil.rmtree(secondary_artifact_location) - def test_mltransform_to_ptransform_with_openai(self): # pylint: disable=line-too-long + def test_mltransform_to_ptransform_with_openai(self): transforms = [ OpenAITextEmbeddings( columns=['x'], From ea5eb0ac9500fb451883154aaa0525d5aa12b6c7 Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Mon, 19 Jan 2026 02:35:26 +0000 Subject: [PATCH 3/7] rag: reuse `chunk_approximately_equals` util as prep for openai integration --- .../ml/rag/embeddings/huggingface_test.py | 17 +++-------------- .../ml/rag/embeddings/vertex_ai_test.py | 17 +++-------------- sdks/python/apache_beam/ml/rag/test_utils.py | 17 +++++++++++++---- 3 files changed, 19 insertions(+), 32 deletions(-) diff --git a/sdks/python/apache_beam/ml/rag/embeddings/huggingface_test.py b/sdks/python/apache_beam/ml/rag/embeddings/huggingface_test.py index f0b9316dcee8..4229b34d0937 100644 --- a/sdks/python/apache_beam/ml/rag/embeddings/huggingface_test.py +++ b/sdks/python/apache_beam/ml/rag/embeddings/huggingface_test.py @@ -27,6 +27,7 @@ from apache_beam.ml.rag.types import Chunk from apache_beam.ml.rag.types import Content from apache_beam.ml.rag.types import Embedding +from apache_beam.ml.rag.test_utils import TestHelpers from apache_beam.ml.transforms.base import MLTransform from apache_beam.testing.test_pipeline import TestPipeline from apache_beam.testing.util import assert_that @@ -40,19 +41,6 @@ SENTENCE_TRANSFORMERS_AVAILABLE = False -def chunk_approximately_equals(expected, actual): - """Compare embeddings allowing for numerical differences.""" - if not isinstance(expected, Chunk) or not isinstance(actual, Chunk): - return False - - return ( - expected.id == actual.id and expected.metadata == actual.metadata and - expected.content == actual.content and - len(expected.embedding.dense_embedding) == len( - actual.embedding.dense_embedding) and - all(isinstance(x, float) for x in actual.embedding.dense_embedding)) - - @pytest.mark.uses_transformers @unittest.skipIf( not SENTENCE_TRANSFORMERS_AVAILABLE, "sentence-transformers not available") @@ -105,7 +93,8 @@ def test_embedding_pipeline(self): with_transform(embedder)) assert_that( - embeddings, equal_to(expected, equals_fn=chunk_approximately_equals)) + embeddings, + equal_to(expected, equals_fn=TestHelpers.chunk_approximately_equals)) if __name__ == '__main__': diff --git a/sdks/python/apache_beam/ml/rag/embeddings/vertex_ai_test.py b/sdks/python/apache_beam/ml/rag/embeddings/vertex_ai_test.py index 435475ffb33b..96c346474944 100644 --- a/sdks/python/apache_beam/ml/rag/embeddings/vertex_ai_test.py +++ b/sdks/python/apache_beam/ml/rag/embeddings/vertex_ai_test.py @@ -21,6 +21,7 @@ import unittest import apache_beam as beam +from apache_beam.ml.rag.test_utils import TestHelpers from apache_beam.ml.rag.types import Chunk from apache_beam.ml.rag.types import Content from apache_beam.ml.rag.types import Embedding @@ -39,19 +40,6 @@ VERTEX_AI_AVAILABLE = False -def chunk_approximately_equals(expected, actual): - """Compare embeddings allowing for numerical differences.""" - if not isinstance(expected, Chunk) or not isinstance(actual, Chunk): - return False - - return ( - expected.id == actual.id and expected.metadata == actual.metadata and - expected.content == actual.content and - len(expected.embedding.dense_embedding) == len( - actual.embedding.dense_embedding) and - all(isinstance(x, float) for x in actual.embedding.dense_embedding)) - - @unittest.skipIf( not VERTEX_AI_AVAILABLE, "Vertex AI dependencies not available") class VertexAITextEmbeddingsTest(unittest.TestCase): @@ -104,7 +92,8 @@ def test_embedding_pipeline(self): with_transform(embedder)) assert_that( - embeddings, equal_to(expected, equals_fn=chunk_approximately_equals)) + embeddings, + equal_to(expected, equals_fn=TestHelpers.chunk_approximately_equals)) if __name__ == '__main__': diff --git a/sdks/python/apache_beam/ml/rag/test_utils.py b/sdks/python/apache_beam/ml/rag/test_utils.py index f4acb105892c..e12d936256f6 100644 --- a/sdks/python/apache_beam/ml/rag/test_utils.py +++ b/sdks/python/apache_beam/ml/rag/test_utils.py @@ -80,6 +80,19 @@ def find_free_port(): # Return the port number assigned by OS. return s.getsockname()[1] + @staticmethod + def chunk_approximately_equals(expected, actual): + """Compare embeddings allowing for numerical differences.""" + if not isinstance(expected, Chunk) or not isinstance(actual, Chunk): + return False + + return ( + expected.id == actual.id and expected.metadata == actual.metadata and + expected.content == actual.content and + len(expected.embedding.dense_embedding) == len( + actual.embedding.dense_embedding) and + all(isinstance(x, float) for x in actual.embedding.dense_embedding)) + class CustomMilvusContainer(MilvusContainer): """Custom Milvus container with configurable ports and environment setup. @@ -407,7 +420,3 @@ def assert_chunks_equivalent( # Validate field metadata. err_msg = f"Field Metadata doesn't match for chunk {actual.id}" assert a_f['metadata'] == e_f['metadata'], err_msg - - -if __name__ == '__main__': - unittest.main() From 43c4b515548280e92a47a7b04a1ae8608eed2262 Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Mon, 19 Jan 2026 02:38:26 +0000 Subject: [PATCH 4/7] rag: add openai integration --- .../apache_beam/ml/rag/embeddings/open_ai.py | 80 +++++++++++ .../ml/rag/embeddings/open_ai_test.py | 127 ++++++++++++++++++ sdks/python/setup.py | 6 +- 3 files changed, 211 insertions(+), 2 deletions(-) create mode 100644 sdks/python/apache_beam/ml/rag/embeddings/open_ai.py create mode 100644 sdks/python/apache_beam/ml/rag/embeddings/open_ai_test.py diff --git a/sdks/python/apache_beam/ml/rag/embeddings/open_ai.py b/sdks/python/apache_beam/ml/rag/embeddings/open_ai.py new file mode 100644 index 000000000000..1dbf168a3a02 --- /dev/null +++ b/sdks/python/apache_beam/ml/rag/embeddings/open_ai.py @@ -0,0 +1,80 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""RAG-specific embedding implementations using OpenAI models.""" + +from typing import Optional + +import apache_beam as beam +from apache_beam.ml.inference.base import RunInference +from apache_beam.ml.rag.embeddings.base import create_rag_adapter +from apache_beam.ml.rag.types import Chunk +from apache_beam.ml.transforms.base import EmbeddingsManager +from apache_beam.ml.transforms.base import _TextEmbeddingHandler +from apache_beam.ml.transforms.embeddings.open_ai import _OpenAITextEmbeddingHandler + +__all__ = ['OpenAITextEmbeddings'] + + +class OpenAITextEmbeddings(EmbeddingsManager): + def __init__( + self, + model_name: str, + *, + api_key: Optional[str] = None, + organization: Optional[str] = None, + dimensions: Optional[int] = None, + user: Optional[str] = None, + max_batch_size: Optional[int] = None, + **kwargs): + """Utilizes OpenAI text embeddings for semantic search and RAG pipelines. + + Args: + model_name: Name of the OpenAI embedding model + api_key: OpenAI API key + organization: OpenAI organization ID + dimensions: Specific embedding dimensions to use (if supported) + user: End-user identifier for tracking and rate limit calculations + max_batch_size: Maximum batch size for requests to OpenAI API + **kwargs: Additional arguments passed to EmbeddingsManager including + ModelHandler inference_args. + """ + super().__init__(type_adapter=create_rag_adapter(), **kwargs) + self.model_name = model_name + self.api_key = api_key + self.organization = organization + self.dimensions = dimensions + self.user = user + self.max_batch_size = max_batch_size + + def get_model_handler(self): + """Returns model handler configured with RAG adapter.""" + return _OpenAITextEmbeddingHandler( + model_name=self.model_name, + api_key=self.api_key, + organization=self.organization, + dimensions=self.dimensions, + user=self.user, + max_batch_size=self.max_batch_size, + ) + + def get_ptransform_for_processing( + self, **kwargs + ) -> beam.PTransform[beam.PCollection[Chunk], beam.PCollection[Chunk]]: + """Returns PTransform that uses the RAG adapter.""" + return RunInference( + model_handler=_TextEmbeddingHandler(self), + inference_args=self.inference_args).with_output_types(Chunk) diff --git a/sdks/python/apache_beam/ml/rag/embeddings/open_ai_test.py b/sdks/python/apache_beam/ml/rag/embeddings/open_ai_test.py new file mode 100644 index 000000000000..244bc396037b --- /dev/null +++ b/sdks/python/apache_beam/ml/rag/embeddings/open_ai_test.py @@ -0,0 +1,127 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import shutil +import tempfile +import unittest + +import apache_beam as beam +from apache_beam.ml.rag.embeddings.open_ai import OpenAITextEmbeddings +from apache_beam.ml.rag.test_utils import TestHelpers +from apache_beam.ml.rag.types import Chunk +from apache_beam.ml.rag.types import Content +from apache_beam.ml.rag.types import Embedding +from apache_beam.ml.transforms.base import MLTransform +from apache_beam.testing.test_pipeline import TestPipeline +from apache_beam.testing.util import assert_that +from apache_beam.testing.util import equal_to + + +@unittest.skipIf( + not os.environ.get('OPENAI_API_KEY'), + 'OPENAI_API_KEY environment variable is not set') +class OpenAITextEmbeddingsTest(unittest.TestCase): + def setUp(self): + self.artifact_location = tempfile.mkdtemp(prefix='openai_') + self.test_chunks = [ + Chunk( + content=Content(text="This is a test sentence."), + id="1", + metadata={ + "source": "test.txt", "language": "en" + }), + Chunk( + content=Content(text="Another example."), + id="2", + metadata={ + "source": "test.txt", "language": "en" + }) + ] + + def tearDown(self) -> None: + shutil.rmtree(self.artifact_location) + + def test_embedding_pipeline(self): + expected = [ + Chunk( + id="1", + embedding=Embedding(dense_embedding=[0.0] * 1536), + metadata={ + "source": "test.txt", "language": "en" + }, + content=Content(text="This is a test sentence.")), + Chunk( + id="2", + embedding=Embedding(dense_embedding=[0.0] * 1536), + metadata={ + "source": "test.txt", "language": "en" + }, + content=Content(text="Another example.")) + ] + + embedder = OpenAITextEmbeddings( + model_name="text-embedding-3-small", + dimensions=1536, + api_key=os.environ.get("OPENAI_API_KEY")) + + with TestPipeline() as p: + embeddings = ( + p + | beam.Create(self.test_chunks) + | MLTransform(write_artifact_location=self.artifact_location). + with_transform(embedder)) + + assert_that( + embeddings, + equal_to(expected, equals_fn=TestHelpers.chunk_approximately_equals)) + + def test_embedding_pipeline_with_dimensions(self): + expected = [ + Chunk( + id="1", + embedding=Embedding(dense_embedding=[0.0] * 512), + metadata={ + "source": "test.txt", "language": "en" + }, + content=Content(text="This is a test sentence.")), + Chunk( + id="2", + embedding=Embedding(dense_embedding=[0.0] * 512), + metadata={ + "source": "test.txt", "language": "en" + }, + content=Content(text="Another example.")) + ] + + embedder = OpenAITextEmbeddings( + model_name="text-embedding-3-small", + dimensions=512, + api_key=os.environ.get("OPENAI_API_KEY")) + + with TestPipeline() as p: + embeddings = ( + p + | beam.Create(self.test_chunks) + | MLTransform(write_artifact_location=self.artifact_location). + with_transform(embedder)) + + assert_that( + embeddings, + equal_to(expected, equals_fn=TestHelpers.chunk_approximately_equals)) + + +if __name__ == '__main__': + unittest.main() diff --git a/sdks/python/setup.py b/sdks/python/setup.py index 6af2c5caf638..7d2fa3a75279 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -161,6 +161,7 @@ def cythonize(*args, **kwargs): ] milvus_dependency = ['pymilvus>=2.5.10,<3.0.0'] +openai_dependency = ['openai>=1.107.1,<2.0.0'] ml_base = [ 'embeddings', @@ -466,7 +467,7 @@ def get_portability_package_data(): 'pg8000>=1.31.5', "PyMySQL>=1.1.0", 'oracledb>=3.1.1' - ] + milvus_dependency, + ] + milvus_dependency + openai_dependency, 'gcp': [ 'cachetools>=3.1.0,<7', 'google-api-core>=2.0.0,<3', @@ -614,7 +615,8 @@ def get_portability_package_data(): 'xgboost': ['xgboost>=1.6.0,<2.1.3', 'datatable==1.0.0'], 'tensorflow-hub': ['tensorflow-hub>=0.14.0,<0.16.0'], 'milvus': milvus_dependency, - 'vllm': ['openai==1.107.1', 'vllm==0.10.1.1', 'triton==3.3.1'] + 'openai': openai_dependency, + 'vllm': ['vllm==0.10.1.1', 'triton==3.3.1'] + openai_dependency, }, zip_safe=False, # PyPI package information. From a2eef41790907a5a43fde3fdc1ebacdd6bf57ebe Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Mon, 19 Jan 2026 05:59:03 +0000 Subject: [PATCH 5/7] sdks/python: fix proposed linting and formatting issues --- .../ml/rag/embeddings/huggingface_test.py | 113 ++-- .../transforms/embeddings/open_ai_it_test.py | 496 +++++++++--------- 2 files changed, 304 insertions(+), 305 deletions(-) diff --git a/sdks/python/apache_beam/ml/rag/embeddings/huggingface_test.py b/sdks/python/apache_beam/ml/rag/embeddings/huggingface_test.py index 4229b34d0937..051068318172 100644 --- a/sdks/python/apache_beam/ml/rag/embeddings/huggingface_test.py +++ b/sdks/python/apache_beam/ml/rag/embeddings/huggingface_test.py @@ -24,78 +24,77 @@ import apache_beam as beam from apache_beam.ml.rag.embeddings.huggingface import HuggingfaceTextEmbeddings -from apache_beam.ml.rag.types import Chunk -from apache_beam.ml.rag.types import Content -from apache_beam.ml.rag.types import Embedding from apache_beam.ml.rag.test_utils import TestHelpers +from apache_beam.ml.rag.types import Chunk, Content, Embedding from apache_beam.ml.transforms.base import MLTransform from apache_beam.testing.test_pipeline import TestPipeline -from apache_beam.testing.util import assert_that -from apache_beam.testing.util import equal_to +from apache_beam.testing.util import assert_that, equal_to # pylint: disable=unused-import try: - from sentence_transformers import SentenceTransformer - SENTENCE_TRANSFORMERS_AVAILABLE = True + from sentence_transformers import SentenceTransformer + + SENTENCE_TRANSFORMERS_AVAILABLE = True except ImportError: - SENTENCE_TRANSFORMERS_AVAILABLE = False + SENTENCE_TRANSFORMERS_AVAILABLE = False @pytest.mark.uses_transformers @unittest.skipIf( - not SENTENCE_TRANSFORMERS_AVAILABLE, "sentence-transformers not available") + not SENTENCE_TRANSFORMERS_AVAILABLE, "sentence-transformers not available" +) class HuggingfaceTextEmbeddingsTest(unittest.TestCase): - def setUp(self): - self.artifact_location = tempfile.mkdtemp(prefix='sentence_transformers_') - self.test_chunks = [ - Chunk( - content=Content(text="This is a test sentence."), - id="1", - metadata={ - "source": "test.txt", "language": "en" - }), - Chunk( - content=Content(text="Another example."), - id="2", - metadata={ - "source": "test.txt", "language": "en" - }) - ] + def setUp(self): + self.artifact_location = tempfile.mkdtemp(prefix="sentence_transformers_") + self.test_chunks = [ + Chunk( + content=Content(text="This is a test sentence."), + id="1", + metadata={"source": "test.txt", "language": "en"}, + ), + Chunk( + content=Content(text="Another example."), + id="2", + metadata={"source": "test.txt", "language": "en"}, + ), + ] - def tearDown(self) -> None: - shutil.rmtree(self.artifact_location) + def tearDown(self) -> None: + shutil.rmtree(self.artifact_location) - def test_embedding_pipeline(self): - expected = [ - Chunk( - id="1", - embedding=Embedding(dense_embedding=[0.0] * 384), - metadata={ - "source": "test.txt", "language": "en" - }, - content=Content(text="This is a test sentence.")), - Chunk( - id="2", - embedding=Embedding(dense_embedding=[0.0] * 384), - metadata={ - "source": "test.txt", "language": "en" - }, - content=Content(text="Another example.")) - ] - embedder = HuggingfaceTextEmbeddings( - model_name="sentence-transformers/all-MiniLM-L6-v2") + def test_embedding_pipeline(self): + expected = [ + Chunk( + id="1", + embedding=Embedding(dense_embedding=[0.0] * 384), + metadata={"source": "test.txt", "language": "en"}, + content=Content(text="This is a test sentence."), + ), + Chunk( + id="2", + embedding=Embedding(dense_embedding=[0.0] * 384), + metadata={"source": "test.txt", "language": "en"}, + content=Content(text="Another example."), + ), + ] + embedder = HuggingfaceTextEmbeddings( + model_name="sentence-transformers/all-MiniLM-L6-v2" + ) - with TestPipeline() as p: - embeddings = ( - p - | beam.Create(self.test_chunks) - | MLTransform(write_artifact_location=self.artifact_location). - with_transform(embedder)) + with TestPipeline() as p: + embeddings = ( + p + | beam.Create(self.test_chunks) + | MLTransform( + write_artifact_location=self.artifact_location + ).with_transform(embedder) + ) - assert_that( - embeddings, - equal_to(expected, equals_fn=TestHelpers.chunk_approximately_equals)) + assert_that( + embeddings, + equal_to(expected, equals_fn=TestHelpers.chunk_approximately_equals), + ) -if __name__ == '__main__': - unittest.main() +if __name__ == "__main__": + unittest.main() diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/open_ai_it_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/open_ai_it_test.py index b02a095ba67d..ea32684dd85e 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/open_ai_it_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/open_ai_it_test.py @@ -23,265 +23,265 @@ from apache_beam.ml.inference.base import RunInference from apache_beam.ml.transforms import base from apache_beam.ml.transforms.base import MLTransform - from apache_beam.ml.transforms.embeddings.open_ai import OpenAITextEmbeddings # pylint: disable=ungrouped-imports try: - import tensorflow_transform as tft - from apache_beam.ml.transforms.tft import ScaleTo01 + import tensorflow_transform as tft + + from apache_beam.ml.transforms.tft import ScaleTo01 except ImportError: - tft = None + tft = None test_query = "This is a test" test_query_column = "embedding" model_name: str = "text-embedding-3-small" -@unittest.skipIf( - OpenAITextEmbeddings is None, 'OpenAI Python SDK is not installed.') +@unittest.skipIf(OpenAITextEmbeddings is None, "OpenAI Python SDK is not installed.") class OpenAIEmbeddingsTest(unittest.TestCase): - def setUp(self) -> None: - self.artifact_location = tempfile.mkdtemp(prefix='_openai_test') - self.api_key = os.environ.get('OPENAI_API_KEY') - - def tearDown(self) -> None: - shutil.rmtree(self.artifact_location) - - def test_openai_text_embeddings(self): - embedding_config = OpenAITextEmbeddings( - model_name=model_name, columns=["embedding"], api_key=self.api_key) - with beam.Pipeline() as pipeline: - transformed_pcoll = ( - pipeline - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }]) - | "MLTransform" >> MLTransform( - write_artifact_location=self.artifact_location).with_transform( - embedding_config)) - - def assert_element(element): - # OpenAI text-embedding-3-small produces 1536-dimensional embeddings - assert len(element[test_query_column]) == 1536 - - _ = (transformed_pcoll | beam.Map(assert_element)) - - @unittest.skipIf(tft is None, 'Tensorflow Transform is not installed.') - def test_embeddings_with_scale_to_0_1(self): - embedding_config = OpenAITextEmbeddings( - model_name=model_name, - columns=[test_query_column], - api_key=self.api_key, - ) - scale_config = ScaleTo01(columns=['embedding']) - with beam.Pipeline() as pipeline: - transformed_pcoll = ( - pipeline - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }]) - | "MLTransform" >> MLTransform( - write_artifact_location=self.artifact_location).with_transform( - embedding_config)).with_transform(scale_config) - - def assert_element(element): - embedding_values = element.embedding - assert 0 <= max(embedding_values) <= 1 - assert 0 <= min(embedding_values) <= 1 - - _ = (transformed_pcoll | beam.Map(assert_element)) - - def test_embeddings_with_dimensions(self): - embedding_config = OpenAITextEmbeddings( - model_name=model_name, - columns=[test_query_column], - api_key=self.api_key, - dimensions=512) - with beam.Pipeline() as pipeline: - transformed_pcoll = ( - pipeline - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }]) - | "MLTransform" >> MLTransform( - write_artifact_location=self.artifact_location).with_transform( - embedding_config)) - - def assert_element(element): - # Check that we get 512-dimensional embeddings as requested - assert len(element[test_query_column]) == 512 - - _ = (transformed_pcoll | beam.Map(assert_element)) - - def pipeline_with_configurable_artifact_location( - self, - pipeline, - embedding_config=None, - read_artifact_location=None, - write_artifact_location=None): - if write_artifact_location: - return ( - pipeline - | MLTransform(write_artifact_location=write_artifact_location). - with_transform(embedding_config)) - elif read_artifact_location: - return ( - pipeline - | MLTransform(read_artifact_location=read_artifact_location)) - else: - raise NotImplementedError - - def test_embeddings_with_read_artifact_location(self): - embedding_config = OpenAITextEmbeddings( - model_name=model_name, - columns=[test_query_column], - api_key=self.api_key) - - with beam.Pipeline() as p: - data = ( - p - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }])) - _ = self.pipeline_with_configurable_artifact_location( - pipeline=data, - embedding_config=embedding_config, - write_artifact_location=self.artifact_location) - - with beam.Pipeline() as p: - data = ( - p - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }, { - test_query_column: test_query - }])) - result_pcoll = self.pipeline_with_configurable_artifact_location( - pipeline=data, read_artifact_location=self.artifact_location) - - # Since we don't know the exact values of the embeddings, - # we just check that they are within a reasonable range - def assert_element(element): - # Embeddings should be normalized and generally small values - assert -1 <= element <= 1 - - _ = ( - result_pcoll - | beam.Map(lambda x: max(x[test_query_column])) - | beam.Map(assert_element)) - - def test_with_int_data_types(self): - embedding_config = OpenAITextEmbeddings( - model_name=model_name, - columns=[test_query_column], - api_key=self.api_key) - with self.assertRaises(Exception): - with beam.Pipeline() as pipeline: - _ = ( - pipeline - | "CreateData" >> beam.Create([{ - test_query_column: 1 - }]) - | "MLTransform" >> MLTransform( - write_artifact_location=self.artifact_location).with_transform( - embedding_config)) - - def test_with_artifact_location(self): - """Local artifact location test""" - secondary_artifact_location = tempfile.mkdtemp( - prefix='_openai_secondary_test') - - try: - embedding_config = OpenAITextEmbeddings( - model_name=model_name, - columns=[test_query_column], - api_key=self.api_key) - - with beam.Pipeline() as p: - data = ( - p - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }])) - _ = self.pipeline_with_configurable_artifact_location( - pipeline=data, - embedding_config=embedding_config, - write_artifact_location=secondary_artifact_location) - - with beam.Pipeline() as p: - data = ( - p - | "CreateData" >> beam.Create([{ - test_query_column: test_query - }, { - test_query_column: test_query - }])) - result_pcoll = self.pipeline_with_configurable_artifact_location( - pipeline=data, read_artifact_location=secondary_artifact_location) - - def assert_element(element): - # Embeddings should be normalized and generally small values - assert -1 <= element <= 1 - - _ = ( - result_pcoll - | beam.Map(lambda x: max(x[test_query_column])) - | beam.Map(assert_element)) - finally: - # Clean up the temporary directory - shutil.rmtree(secondary_artifact_location) - - def test_mltransform_to_ptransform_with_openai(self): - transforms = [ - OpenAITextEmbeddings( - columns=['x'], + def setUp(self) -> None: + self.artifact_location = tempfile.mkdtemp(prefix="_openai_test") + self.api_key = os.environ.get("OPENAI_API_KEY") + + def tearDown(self) -> None: + shutil.rmtree(self.artifact_location) + + def test_openai_text_embeddings(self): + embedding_config = OpenAITextEmbeddings( + model_name=model_name, columns=["embedding"], api_key=self.api_key + ) + with beam.Pipeline() as pipeline: + transformed_pcoll = ( + pipeline + | "CreateData" >> beam.Create([{test_query_column: test_query}]) + | "MLTransform" + >> MLTransform( + write_artifact_location=self.artifact_location + ).with_transform(embedding_config) + ) + + def assert_element(element): + # OpenAI text-embedding-3-small produces 1536-dimensional embeddings + assert len(element[test_query_column]) == 1536 + + _ = transformed_pcoll | beam.Map(assert_element) + + @unittest.skipIf(tft is None, "Tensorflow Transform is not installed.") + def test_embeddings_with_scale_to_0_1(self): + embedding_config = OpenAITextEmbeddings( + model_name=model_name, + columns=[test_query_column], + api_key=self.api_key, + ) + scale_config = ScaleTo01(columns=["embedding"]) + with beam.Pipeline() as pipeline: + transformed_pcoll = ( + pipeline + | "CreateData" >> beam.Create([{test_query_column: test_query}]) + | "MLTransform" + >> MLTransform( + write_artifact_location=self.artifact_location + ).with_transform(embedding_config) + ).with_transform(scale_config) + + def assert_element(element): + embedding_values = element.embedding + assert 0 <= max(embedding_values) <= 1 + assert 0 <= min(embedding_values) <= 1 + + _ = transformed_pcoll | beam.Map(assert_element) + + def test_embeddings_with_dimensions(self): + embedding_config = OpenAITextEmbeddings( model_name=model_name, + columns=[test_query_column], api_key=self.api_key, - dimensions=512), - OpenAITextEmbeddings( - columns=['y', 'z'], model_name=model_name, api_key=self.api_key) - ] - ptransform_mapper = base._MLTransformToPTransformMapper( - transforms=transforms, - artifact_location=self.artifact_location, - artifact_mode=None) - - ptransform_list = ptransform_mapper.create_and_save_ptransform_list() - self.assertTrue(len(ptransform_list) == 2) - - self.assertEqual(type(ptransform_list[0]), RunInference) - expected_columns = [['x'], ['y', 'z']] - expected_dimensions = [512, None] - for i in range(len(ptransform_list)): - self.assertEqual(type(ptransform_list[i]), RunInference) - self.assertEqual( - type(ptransform_list[i]._model_handler), base._TextEmbeddingHandler) - self.assertEqual( - ptransform_list[i]._model_handler.columns, expected_columns[i]) - self.assertEqual( - ptransform_list[i]._model_handler._underlying.model_name, model_name) - if expected_dimensions[i]: - self.assertEqual( - ptransform_list[i]._model_handler._underlying.dimensions, - expected_dimensions[i]) - - ptransform_list = ( - base._MLTransformToPTransformMapper. - load_transforms_from_artifact_location(self.artifact_location)) - for i in range(len(ptransform_list)): - self.assertEqual(type(ptransform_list[i]), RunInference) - self.assertEqual( - type(ptransform_list[i]._model_handler), base._TextEmbeddingHandler) - self.assertEqual( - ptransform_list[i]._model_handler.columns, expected_columns[i]) - self.assertEqual( - ptransform_list[i]._model_handler._underlying.model_name, model_name) - if expected_dimensions[i]: - self.assertEqual( - ptransform_list[i]._model_handler._underlying.dimensions, - expected_dimensions[i]) - - -if __name__ == '__main__': - unittest.main() + dimensions=512, + ) + with beam.Pipeline() as pipeline: + transformed_pcoll = ( + pipeline + | "CreateData" >> beam.Create([{test_query_column: test_query}]) + | "MLTransform" + >> MLTransform( + write_artifact_location=self.artifact_location + ).with_transform(embedding_config) + ) + + def assert_element(element): + # Check that we get 512-dimensional embeddings as requested + assert len(element[test_query_column]) == 512 + + _ = transformed_pcoll | beam.Map(assert_element) + + def pipeline_with_configurable_artifact_location( + self, + pipeline, + embedding_config=None, + read_artifact_location=None, + write_artifact_location=None, + ): + if write_artifact_location: + return pipeline | MLTransform( + write_artifact_location=write_artifact_location + ).with_transform(embedding_config) + elif read_artifact_location: + return pipeline | MLTransform(read_artifact_location=read_artifact_location) + else: + raise NotImplementedError + + def test_embeddings_with_read_artifact_location(self): + embedding_config = OpenAITextEmbeddings( + model_name=model_name, columns=[test_query_column], api_key=self.api_key + ) + + with beam.Pipeline() as p: + data = p | "CreateData" >> beam.Create([{test_query_column: test_query}]) + _ = self.pipeline_with_configurable_artifact_location( + pipeline=data, + embedding_config=embedding_config, + write_artifact_location=self.artifact_location, + ) + + with beam.Pipeline() as p: + data = p | "CreateData" >> beam.Create( + [{test_query_column: test_query}, {test_query_column: test_query}] + ) + result_pcoll = self.pipeline_with_configurable_artifact_location( + pipeline=data, read_artifact_location=self.artifact_location + ) + + # Since we don't know the exact values of the embeddings, + # we just check that they are within a reasonable range + def assert_element(element): + # Embeddings should be normalized and generally small values + assert -1 <= element <= 1 + + _ = ( + result_pcoll + | beam.Map(lambda x: max(x[test_query_column])) + | beam.Map(assert_element) + ) + + def test_with_int_data_types(self): + embedding_config = OpenAITextEmbeddings( + model_name=model_name, columns=[test_query_column], api_key=self.api_key + ) + with self.assertRaises(Exception): + with beam.Pipeline() as pipeline: + _ = ( + pipeline + | "CreateData" >> beam.Create([{test_query_column: 1}]) + | "MLTransform" + >> MLTransform( + write_artifact_location=self.artifact_location + ).with_transform(embedding_config) + ) + + def test_with_artifact_location(self): + """Local artifact location test""" + secondary_artifact_location = tempfile.mkdtemp(prefix="_openai_secondary_test") + + try: + embedding_config = OpenAITextEmbeddings( + model_name=model_name, columns=[test_query_column], api_key=self.api_key + ) + + with beam.Pipeline() as p: + data = p | "CreateData" >> beam.Create( + [{test_query_column: test_query}] + ) + _ = self.pipeline_with_configurable_artifact_location( + pipeline=data, + embedding_config=embedding_config, + write_artifact_location=secondary_artifact_location, + ) + + with beam.Pipeline() as p: + data = p | "CreateData" >> beam.Create( + [{test_query_column: test_query}, {test_query_column: test_query}] + ) + result_pcoll = self.pipeline_with_configurable_artifact_location( + pipeline=data, read_artifact_location=secondary_artifact_location + ) + + def assert_element(element): + # Embeddings should be normalized and generally small values + assert -1 <= element <= 1 + + _ = ( + result_pcoll + | beam.Map(lambda x: max(x[test_query_column])) + | beam.Map(assert_element) + ) + finally: + # Clean up the temporary directory + shutil.rmtree(secondary_artifact_location) + + def test_mltransform_to_ptransform_with_openai(self): + transforms = [ + OpenAITextEmbeddings( + columns=["x"], + model_name=model_name, + api_key=self.api_key, + dimensions=512, + ), + OpenAITextEmbeddings( + columns=["y", "z"], model_name=model_name, api_key=self.api_key + ), + ] + ptransform_mapper = base._MLTransformToPTransformMapper( + transforms=transforms, + artifact_location=self.artifact_location, + artifact_mode=None, + ) + + ptransform_list = ptransform_mapper.create_and_save_ptransform_list() + self.assertTrue(len(ptransform_list) == 2) + + self.assertEqual(type(ptransform_list[0]), RunInference) + expected_columns = [["x"], ["y", "z"]] + expected_dimensions = [512, None] + for i in range(len(ptransform_list)): + self.assertEqual(type(ptransform_list[i]), RunInference) + self.assertEqual( + type(ptransform_list[i]._model_handler), base._TextEmbeddingHandler + ) + self.assertEqual( + ptransform_list[i]._model_handler.columns, expected_columns[i] + ) + self.assertEqual( + ptransform_list[i]._model_handler._underlying.model_name, model_name + ) + if expected_dimensions[i]: + self.assertEqual( + ptransform_list[i]._model_handler._underlying.dimensions, + expected_dimensions[i], + ) + + ptransform_list = ( + base._MLTransformToPTransformMapper.load_transforms_from_artifact_location( + self.artifact_location + ) + ) + for i in range(len(ptransform_list)): + self.assertEqual(type(ptransform_list[i]), RunInference) + self.assertEqual( + type(ptransform_list[i]._model_handler), base._TextEmbeddingHandler + ) + self.assertEqual( + ptransform_list[i]._model_handler.columns, expected_columns[i] + ) + self.assertEqual( + ptransform_list[i]._model_handler._underlying.model_name, model_name + ) + if expected_dimensions[i]: + self.assertEqual( + ptransform_list[i]._model_handler._underlying.dimensions, + expected_dimensions[i], + ) + + +if __name__ == "__main__": + unittest.main() From 2851d702bc9a79e893ec70f9886ede006804358e Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Mon, 19 Jan 2026 06:15:56 +0000 Subject: [PATCH 6/7] container: update python docker requirements --- .../ml/py310/base_image_requirements.txt | 38 ++++++++-------- .../ml/py310/gpu_image_requirements.txt | 44 +++++++++---------- .../ml/py311/base_image_requirements.txt | 36 ++++++++------- .../ml/py311/gpu_image_requirements.txt | 42 +++++++++--------- .../ml/py312/base_image_requirements.txt | 36 ++++++++------- .../ml/py312/gpu_image_requirements.txt | 42 +++++++++--------- .../ml/py313/base_image_requirements.txt | 36 ++++++++------- .../py310/base_image_requirements.txt | 36 ++++++++------- .../py311/base_image_requirements.txt | 34 +++++++------- .../py312/base_image_requirements.txt | 34 +++++++------- .../py313/base_image_requirements.txt | 34 +++++++------- 11 files changed, 214 insertions(+), 198 deletions(-) diff --git a/sdks/python/container/ml/py310/base_image_requirements.txt b/sdks/python/container/ml/py310/base_image_requirements.txt index 0bcb6529cc58..f45b286b94db 100644 --- a/sdks/python/container/ml/py310/base_image_requirements.txt +++ b/sdks/python/container/ml/py310/base_image_requirements.txt @@ -43,7 +43,7 @@ certifi==2026.1.4 cffi==2.0.0 charset-normalizer==3.4.4 click==8.3.1 -cloud-sql-python-connector==1.19.0 +cloud-sql-python-connector==1.20.0 crcmod==1.7 cryptography==46.0.3 Cython==3.2.4 @@ -65,7 +65,7 @@ fsspec==2026.1.0 future==1.0.0 gast==0.7.0 google-api-core==2.29.0 -google-api-python-client==2.187.0 +google-api-python-client==2.188.0 google-apitools==0.5.31 google-auth==2.47.0 google-auth-httplib2==0.2.1 @@ -75,22 +75,22 @@ google-cloud-bigquery-storage==2.36.0 google-cloud-bigtable==2.35.0 google-cloud-core==2.5.0 google-cloud-datastore==2.23.0 -google-cloud-dlp==3.33.0 -google-cloud-kms==3.7.0 -google-cloud-language==2.18.0 -google-cloud-monitoring==2.28.0 +google-cloud-dlp==3.34.0 +google-cloud-kms==3.9.0 +google-cloud-language==2.19.0 +google-cloud-monitoring==2.29.0 google-cloud-profiler==4.1.0 google-cloud-pubsub==2.34.0 google-cloud-pubsublite==1.13.0 google-cloud-recommendations-ai==0.10.18 -google-cloud-resource-manager==1.15.0 +google-cloud-resource-manager==1.16.0 google-cloud-secret-manager==2.26.0 -google-cloud-spanner==3.61.0 +google-cloud-spanner==3.62.0 google-cloud-storage==2.19.0 -google-cloud-videointelligence==2.17.0 -google-cloud-vision==3.11.0 +google-cloud-videointelligence==2.18.0 +google-cloud-vision==3.12.0 google-crc32c==1.8.0 -google-genai==1.57.0 +google-genai==1.59.0 google-pasta==0.2.0 google-resumable-media==2.8.0 googleapis-common-protos==1.72.0 @@ -107,7 +107,7 @@ h5py==3.15.1 hf-xet==1.2.0 hpack==4.1.0 httpcore==1.0.9 -httplib2==0.22.0 +httplib2==0.31.1 httpx==0.28.1 huggingface-hub==0.36.0 hyperframe==6.1.0 @@ -116,16 +116,17 @@ idna==3.11 importlib_metadata==8.7.1 iniconfig==2.3.0 jaraco.classes==3.4.0 -jaraco.context==6.0.2 +jaraco.context==6.1.0 jaraco.functools==4.4.0 jeepney==0.9.0 Jinja2==3.1.6 +jiter==0.12.0 joblib==1.5.3 Js2Py==0.74 jsonpickle==3.4.2 jsonschema==4.26.0 jsonschema-specifications==2025.9.1 -keras==3.13.1 +keras==3.12.0 keyring==25.7.0 keyrings.google-artifactregistry-auth==1.1.2 libclang==18.1.1 @@ -146,6 +147,7 @@ nltk==3.9.2 numpy==2.2.6 oauth2client==4.1.3 objsize==0.7.1 +openai==1.109.1 opentelemetry-api==1.39.1 opentelemetry-resourcedetector-gcp==1.11.0a0 opentelemetry-sdk==1.39.1 @@ -168,7 +170,7 @@ protobuf==5.29.5 psycopg2-binary==2.9.11 pyarrow==18.1.0 pyarrow-hotfix==0.7 -pyasn1==0.6.1 +pyasn1==0.6.2 pyasn1_modules==0.4.2 pycparser==2.23 pydantic==2.12.5 @@ -190,7 +192,7 @@ python-tds==1.17.1 pytz==2025.2 PyYAML==6.0.3 referencing==0.37.0 -regex==2025.11.3 +regex==2026.1.15 requests==2.32.5 requests-mock==1.12.1 rich==14.2.0 @@ -205,7 +207,7 @@ setuptools==80.9.0 six==1.17.0 sniffio==1.3.1 sortedcontainers==2.4.0 -soupsieve==2.8.1 +soupsieve==2.8.2 SQLAlchemy==2.0.45 sqlalchemy_pytds==1.0.2 sqlparse==0.5.5 @@ -219,7 +221,7 @@ termcolor==3.3.0 testcontainers==4.14.0 threadpoolctl==3.6.0 tokenizers==0.21.4 -tomli==2.3.0 +tomli==2.4.0 torch==2.8.0+cpu tqdm==4.67.1 transformers==4.55.4 diff --git a/sdks/python/container/ml/py310/gpu_image_requirements.txt b/sdks/python/container/ml/py310/gpu_image_requirements.txt index 2ddb0799e04a..f1599e6084d5 100644 --- a/sdks/python/container/ml/py310/gpu_image_requirements.txt +++ b/sdks/python/container/ml/py310/gpu_image_requirements.txt @@ -47,7 +47,7 @@ certifi==2026.1.4 cffi==2.0.0 charset-normalizer==3.4.4 click==8.3.1 -cloud-sql-python-connector==1.19.0 +cloud-sql-python-connector==1.20.0 cloudpickle==3.1.2 compressed-tensors==0.10.2 crcmod==1.7 @@ -68,7 +68,7 @@ exceptiongroup==1.3.1 execnet==2.1.2 fastapi==0.128.0 fastapi-cli==0.0.20 -fastapi-cloud-cli==0.9.0 +fastapi-cloud-cli==0.11.0 fastar==0.8.0 fastavro==1.12.1 fasteners==0.20 @@ -82,7 +82,7 @@ future==1.0.0 gast==0.7.0 gguf==0.17.1 google-api-core==2.29.0 -google-api-python-client==2.187.0 +google-api-python-client==2.188.0 google-apitools==0.5.31 google-auth==2.47.0 google-auth-httplib2==0.2.1 @@ -92,22 +92,22 @@ google-cloud-bigquery-storage==2.36.0 google-cloud-bigtable==2.35.0 google-cloud-core==2.5.0 google-cloud-datastore==2.23.0 -google-cloud-dlp==3.33.0 -google-cloud-kms==3.7.0 -google-cloud-language==2.18.0 -google-cloud-monitoring==2.28.0 +google-cloud-dlp==3.34.0 +google-cloud-kms==3.9.0 +google-cloud-language==2.19.0 +google-cloud-monitoring==2.29.0 google-cloud-profiler==4.1.0 google-cloud-pubsub==2.34.0 google-cloud-pubsublite==1.13.0 google-cloud-recommendations-ai==0.10.18 -google-cloud-resource-manager==1.15.0 +google-cloud-resource-manager==1.16.0 google-cloud-secret-manager==2.26.0 -google-cloud-spanner==3.61.0 +google-cloud-spanner==3.62.0 google-cloud-storage==2.19.0 -google-cloud-videointelligence==2.17.0 -google-cloud-vision==3.11.0 +google-cloud-videointelligence==2.18.0 +google-cloud-vision==3.12.0 google-crc32c==1.8.0 -google-genai==1.57.0 +google-genai==1.59.0 google-pasta==0.2.0 google-resumable-media==2.8.0 googleapis-common-protos==1.72.0 @@ -124,7 +124,7 @@ h5py==3.15.1 hf-xet==1.2.0 hpack==4.1.0 httpcore==1.0.9 -httplib2==0.22.0 +httplib2==0.31.1 httptools==0.7.1 httpx==0.28.1 huggingface-hub==0.36.0 @@ -135,7 +135,7 @@ importlib_metadata==8.7.1 iniconfig==2.3.0 interegular==0.3.3 jaraco.classes==3.4.0 -jaraco.context==6.0.2 +jaraco.context==6.1.0 jaraco.functools==4.4.0 jeepney==0.9.0 Jinja2==3.1.6 @@ -144,7 +144,7 @@ joblib==1.5.3 jsonpickle==3.4.2 jsonschema==4.26.0 jsonschema-specifications==2025.9.1 -keras==3.13.1 +keras==3.12.0 keyring==25.7.0 keyrings.google-artifactregistry-auth==1.1.2 lark==1.2.2 @@ -189,9 +189,9 @@ nvidia-nvshmem-cu12==3.3.20 nvidia-nvtx-cu12==12.6.77 oauth2client==4.1.3 objsize==0.7.1 -openai==1.107.1 +openai==1.109.1 openai-harmony==0.0.8 -opencv-python-headless==4.12.0.88 +opencv-python-headless==4.13.0.90 opentelemetry-api==1.39.1 opentelemetry-resourcedetector-gcp==1.11.0a0 opentelemetry-sdk==1.39.1 @@ -211,7 +211,7 @@ pillow==12.1.0 pip==25.3 pluggy==1.6.0 prometheus-fastapi-instrumentator==7.1.0 -prometheus_client==0.23.1 +prometheus_client==0.24.1 propcache==0.4.1 proto-plus==1.27.0 protobuf==5.29.5 @@ -220,7 +220,7 @@ psycopg2-binary==2.9.11 py-cpuinfo==9.0.0 pyarrow==18.1.0 pyarrow-hotfix==0.7 -pyasn1==0.6.1 +pyasn1==0.6.2 pyasn1_modules==0.4.2 pybase64==1.4.3 pycountry==24.6.1 @@ -249,7 +249,7 @@ PyYAML==6.0.3 pyzmq==27.1.0 ray==2.53.0 referencing==0.37.0 -regex==2025.11.3 +regex==2026.1.15 requests==2.32.5 requests-mock==1.12.1 rich==14.2.0 @@ -271,7 +271,7 @@ six==1.17.0 sniffio==1.3.1 sortedcontainers==2.4.0 soundfile==0.13.1 -soupsieve==2.8.1 +soupsieve==2.8.2 soxr==1.0.0 SQLAlchemy==2.0.45 sqlalchemy_pytds==1.0.2 @@ -288,7 +288,7 @@ testcontainers==4.14.0 threadpoolctl==3.6.0 tiktoken==0.12.0 tokenizers==0.21.4 -tomli==2.3.0 +tomli==2.4.0 torch==2.7.1 torchaudio==2.7.1 torchvision==0.22.1 diff --git a/sdks/python/container/ml/py311/base_image_requirements.txt b/sdks/python/container/ml/py311/base_image_requirements.txt index 33e462ff4b2e..ea560c243829 100644 --- a/sdks/python/container/ml/py311/base_image_requirements.txt +++ b/sdks/python/container/ml/py311/base_image_requirements.txt @@ -42,7 +42,7 @@ certifi==2026.1.4 cffi==2.0.0 charset-normalizer==3.4.4 click==8.3.1 -cloud-sql-python-connector==1.19.0 +cloud-sql-python-connector==1.20.0 crcmod==1.7 cryptography==46.0.3 Cython==3.2.4 @@ -63,7 +63,7 @@ fsspec==2026.1.0 future==1.0.0 gast==0.7.0 google-api-core==2.29.0 -google-api-python-client==2.187.0 +google-api-python-client==2.188.0 google-apitools==0.5.31 google-auth==2.47.0 google-auth-httplib2==0.2.1 @@ -73,22 +73,22 @@ google-cloud-bigquery-storage==2.36.0 google-cloud-bigtable==2.35.0 google-cloud-core==2.5.0 google-cloud-datastore==2.23.0 -google-cloud-dlp==3.33.0 -google-cloud-kms==3.7.0 -google-cloud-language==2.18.0 -google-cloud-monitoring==2.28.0 +google-cloud-dlp==3.34.0 +google-cloud-kms==3.9.0 +google-cloud-language==2.19.0 +google-cloud-monitoring==2.29.0 google-cloud-profiler==4.1.0 google-cloud-pubsub==2.34.0 google-cloud-pubsublite==1.13.0 google-cloud-recommendations-ai==0.10.18 -google-cloud-resource-manager==1.15.0 +google-cloud-resource-manager==1.16.0 google-cloud-secret-manager==2.26.0 -google-cloud-spanner==3.61.0 +google-cloud-spanner==3.62.0 google-cloud-storage==2.19.0 -google-cloud-videointelligence==2.17.0 -google-cloud-vision==3.11.0 +google-cloud-videointelligence==2.18.0 +google-cloud-vision==3.12.0 google-crc32c==1.8.0 -google-genai==1.57.0 +google-genai==1.59.0 google-pasta==0.2.0 google-resumable-media==2.8.0 googleapis-common-protos==1.72.0 @@ -105,7 +105,7 @@ h5py==3.15.1 hf-xet==1.2.0 hpack==4.1.0 httpcore==1.0.9 -httplib2==0.22.0 +httplib2==0.31.1 httpx==0.28.1 huggingface-hub==0.36.0 hyperframe==6.1.0 @@ -114,16 +114,17 @@ idna==3.11 importlib_metadata==8.7.1 iniconfig==2.3.0 jaraco.classes==3.4.0 -jaraco.context==6.0.2 +jaraco.context==6.1.0 jaraco.functools==4.4.0 jeepney==0.9.0 Jinja2==3.1.6 +jiter==0.12.0 joblib==1.5.3 Js2Py==0.74 jsonpickle==3.4.2 jsonschema==4.26.0 jsonschema-specifications==2025.9.1 -keras==3.13.0 +keras==3.13.1 keyring==25.7.0 keyrings.google-artifactregistry-auth==1.1.2 libclang==18.1.1 @@ -144,6 +145,7 @@ nltk==3.9.2 numpy==2.4.1 oauth2client==4.1.3 objsize==0.7.1 +openai==1.109.1 opentelemetry-api==1.39.1 opentelemetry-resourcedetector-gcp==1.11.0a0 opentelemetry-sdk==1.39.1 @@ -166,7 +168,7 @@ protobuf==5.29.5 psycopg2-binary==2.9.11 pyarrow==18.1.0 pyarrow-hotfix==0.7 -pyasn1==0.6.1 +pyasn1==0.6.2 pyasn1_modules==0.4.2 pycparser==2.23 pydantic==2.12.5 @@ -188,7 +190,7 @@ python-tds==1.17.1 pytz==2025.2 PyYAML==6.0.3 referencing==0.37.0 -regex==2025.11.3 +regex==2026.1.15 requests==2.32.5 requests-mock==1.12.1 rich==14.2.0 @@ -203,7 +205,7 @@ setuptools==80.9.0 six==1.17.0 sniffio==1.3.1 sortedcontainers==2.4.0 -soupsieve==2.8.1 +soupsieve==2.8.2 SQLAlchemy==2.0.45 sqlalchemy_pytds==1.0.2 sqlparse==0.5.5 diff --git a/sdks/python/container/ml/py311/gpu_image_requirements.txt b/sdks/python/container/ml/py311/gpu_image_requirements.txt index 124dd8622942..243c31bd91c6 100644 --- a/sdks/python/container/ml/py311/gpu_image_requirements.txt +++ b/sdks/python/container/ml/py311/gpu_image_requirements.txt @@ -46,7 +46,7 @@ certifi==2026.1.4 cffi==2.0.0 charset-normalizer==3.4.4 click==8.3.1 -cloud-sql-python-connector==1.19.0 +cloud-sql-python-connector==1.20.0 cloudpickle==3.1.2 compressed-tensors==0.10.2 crcmod==1.7 @@ -66,7 +66,7 @@ envoy-data-plane==0.2.6 execnet==2.1.2 fastapi==0.128.0 fastapi-cli==0.0.20 -fastapi-cloud-cli==0.9.0 +fastapi-cloud-cli==0.11.0 fastar==0.8.0 fastavro==1.12.1 fasteners==0.20 @@ -80,7 +80,7 @@ future==1.0.0 gast==0.7.0 gguf==0.17.1 google-api-core==2.29.0 -google-api-python-client==2.187.0 +google-api-python-client==2.188.0 google-apitools==0.5.31 google-auth==2.47.0 google-auth-httplib2==0.2.1 @@ -90,22 +90,22 @@ google-cloud-bigquery-storage==2.36.0 google-cloud-bigtable==2.35.0 google-cloud-core==2.5.0 google-cloud-datastore==2.23.0 -google-cloud-dlp==3.33.0 -google-cloud-kms==3.7.0 -google-cloud-language==2.18.0 -google-cloud-monitoring==2.28.0 +google-cloud-dlp==3.34.0 +google-cloud-kms==3.9.0 +google-cloud-language==2.19.0 +google-cloud-monitoring==2.29.0 google-cloud-profiler==4.1.0 google-cloud-pubsub==2.34.0 google-cloud-pubsublite==1.13.0 google-cloud-recommendations-ai==0.10.18 -google-cloud-resource-manager==1.15.0 +google-cloud-resource-manager==1.16.0 google-cloud-secret-manager==2.26.0 -google-cloud-spanner==3.61.0 +google-cloud-spanner==3.62.0 google-cloud-storage==2.19.0 -google-cloud-videointelligence==2.17.0 -google-cloud-vision==3.11.0 +google-cloud-videointelligence==2.18.0 +google-cloud-vision==3.12.0 google-crc32c==1.8.0 -google-genai==1.57.0 +google-genai==1.59.0 google-pasta==0.2.0 google-resumable-media==2.8.0 googleapis-common-protos==1.72.0 @@ -122,7 +122,7 @@ h5py==3.15.1 hf-xet==1.2.0 hpack==4.1.0 httpcore==1.0.9 -httplib2==0.22.0 +httplib2==0.31.1 httptools==0.7.1 httpx==0.28.1 huggingface-hub==0.36.0 @@ -133,7 +133,7 @@ importlib_metadata==8.7.1 iniconfig==2.3.0 interegular==0.3.3 jaraco.classes==3.4.0 -jaraco.context==6.0.2 +jaraco.context==6.1.0 jaraco.functools==4.4.0 jeepney==0.9.0 Jinja2==3.1.6 @@ -142,7 +142,7 @@ joblib==1.5.3 jsonpickle==3.4.2 jsonschema==4.26.0 jsonschema-specifications==2025.9.1 -keras==3.13.0 +keras==3.13.1 keyring==25.7.0 keyrings.google-artifactregistry-auth==1.1.2 lark==1.2.2 @@ -187,9 +187,9 @@ nvidia-nvshmem-cu12==3.3.20 nvidia-nvtx-cu12==12.6.77 oauth2client==4.1.3 objsize==0.7.1 -openai==1.107.1 +openai==1.109.1 openai-harmony==0.0.8 -opencv-python-headless==4.12.0.88 +opencv-python-headless==4.13.0.90 opentelemetry-api==1.39.1 opentelemetry-resourcedetector-gcp==1.11.0a0 opentelemetry-sdk==1.39.1 @@ -209,7 +209,7 @@ pillow==12.1.0 pip==25.3 pluggy==1.6.0 prometheus-fastapi-instrumentator==7.1.0 -prometheus_client==0.23.1 +prometheus_client==0.24.1 propcache==0.4.1 proto-plus==1.27.0 protobuf==5.29.5 @@ -218,7 +218,7 @@ psycopg2-binary==2.9.11 py-cpuinfo==9.0.0 pyarrow==18.1.0 pyarrow-hotfix==0.7 -pyasn1==0.6.1 +pyasn1==0.6.2 pyasn1_modules==0.4.2 pybase64==1.4.3 pycountry==24.6.1 @@ -247,7 +247,7 @@ PyYAML==6.0.3 pyzmq==27.1.0 ray==2.53.0 referencing==0.37.0 -regex==2025.11.3 +regex==2026.1.15 requests==2.32.5 requests-mock==1.12.1 rich==14.2.0 @@ -269,7 +269,7 @@ six==1.17.0 sniffio==1.3.1 sortedcontainers==2.4.0 soundfile==0.13.1 -soupsieve==2.8.1 +soupsieve==2.8.2 soxr==1.0.0 SQLAlchemy==2.0.45 sqlalchemy_pytds==1.0.2 diff --git a/sdks/python/container/ml/py312/base_image_requirements.txt b/sdks/python/container/ml/py312/base_image_requirements.txt index 15e9382b217e..cad445fedb8b 100644 --- a/sdks/python/container/ml/py312/base_image_requirements.txt +++ b/sdks/python/container/ml/py312/base_image_requirements.txt @@ -41,7 +41,7 @@ certifi==2026.1.4 cffi==2.0.0 charset-normalizer==3.4.4 click==8.3.1 -cloud-sql-python-connector==1.19.0 +cloud-sql-python-connector==1.20.0 crcmod==1.7 cryptography==46.0.3 Cython==3.2.4 @@ -62,7 +62,7 @@ fsspec==2026.1.0 future==1.0.0 gast==0.7.0 google-api-core==2.29.0 -google-api-python-client==2.187.0 +google-api-python-client==2.188.0 google-apitools==0.5.31 google-auth==2.47.0 google-auth-httplib2==0.2.1 @@ -72,22 +72,22 @@ google-cloud-bigquery-storage==2.36.0 google-cloud-bigtable==2.35.0 google-cloud-core==2.5.0 google-cloud-datastore==2.23.0 -google-cloud-dlp==3.33.0 -google-cloud-kms==3.7.0 -google-cloud-language==2.18.0 -google-cloud-monitoring==2.28.0 +google-cloud-dlp==3.34.0 +google-cloud-kms==3.9.0 +google-cloud-language==2.19.0 +google-cloud-monitoring==2.29.0 google-cloud-profiler==4.1.0 google-cloud-pubsub==2.34.0 google-cloud-pubsublite==1.13.0 google-cloud-recommendations-ai==0.10.18 -google-cloud-resource-manager==1.15.0 +google-cloud-resource-manager==1.16.0 google-cloud-secret-manager==2.26.0 -google-cloud-spanner==3.61.0 +google-cloud-spanner==3.62.0 google-cloud-storage==2.19.0 -google-cloud-videointelligence==2.17.0 -google-cloud-vision==3.11.0 +google-cloud-videointelligence==2.18.0 +google-cloud-vision==3.12.0 google-crc32c==1.8.0 -google-genai==1.57.0 +google-genai==1.59.0 google-pasta==0.2.0 google-resumable-media==2.8.0 googleapis-common-protos==1.72.0 @@ -104,7 +104,7 @@ h5py==3.15.1 hf-xet==1.2.0 hpack==4.1.0 httpcore==1.0.9 -httplib2==0.22.0 +httplib2==0.31.1 httpx==0.28.1 huggingface-hub==0.36.0 hyperframe==6.1.0 @@ -113,15 +113,16 @@ idna==3.11 importlib_metadata==8.7.1 iniconfig==2.3.0 jaraco.classes==3.4.0 -jaraco.context==6.0.2 +jaraco.context==6.1.0 jaraco.functools==4.4.0 jeepney==0.9.0 Jinja2==3.1.6 +jiter==0.12.0 joblib==1.5.3 jsonpickle==3.4.2 jsonschema==4.26.0 jsonschema-specifications==2025.9.1 -keras==3.13.0 +keras==3.13.1 keyring==25.7.0 keyrings.google-artifactregistry-auth==1.1.2 libclang==18.1.1 @@ -142,6 +143,7 @@ nltk==3.9.2 numpy==2.4.1 oauth2client==4.1.3 objsize==0.7.1 +openai==1.109.1 opentelemetry-api==1.39.1 opentelemetry-resourcedetector-gcp==1.11.0a0 opentelemetry-sdk==1.39.1 @@ -164,7 +166,7 @@ protobuf==5.29.5 psycopg2-binary==2.9.11 pyarrow==18.1.0 pyarrow-hotfix==0.7 -pyasn1==0.6.1 +pyasn1==0.6.2 pyasn1_modules==0.4.2 pycparser==2.23 pydantic==2.12.5 @@ -185,7 +187,7 @@ python-tds==1.17.1 pytz==2025.2 PyYAML==6.0.3 referencing==0.37.0 -regex==2025.11.3 +regex==2026.1.15 requests==2.32.5 requests-mock==1.12.1 rich==14.2.0 @@ -200,7 +202,7 @@ setuptools==80.9.0 six==1.17.0 sniffio==1.3.1 sortedcontainers==2.4.0 -soupsieve==2.8.1 +soupsieve==2.8.2 SQLAlchemy==2.0.45 sqlalchemy_pytds==1.0.2 sqlparse==0.5.5 diff --git a/sdks/python/container/ml/py312/gpu_image_requirements.txt b/sdks/python/container/ml/py312/gpu_image_requirements.txt index 6067f74349f0..d73a017401f9 100644 --- a/sdks/python/container/ml/py312/gpu_image_requirements.txt +++ b/sdks/python/container/ml/py312/gpu_image_requirements.txt @@ -45,7 +45,7 @@ certifi==2026.1.4 cffi==2.0.0 charset-normalizer==3.4.4 click==8.3.1 -cloud-sql-python-connector==1.19.0 +cloud-sql-python-connector==1.20.0 cloudpickle==3.1.2 compressed-tensors==0.10.2 crcmod==1.7 @@ -65,7 +65,7 @@ envoy-data-plane==0.2.6 execnet==2.1.2 fastapi==0.128.0 fastapi-cli==0.0.20 -fastapi-cloud-cli==0.9.0 +fastapi-cloud-cli==0.11.0 fastar==0.8.0 fastavro==1.12.1 fasteners==0.20 @@ -79,7 +79,7 @@ future==1.0.0 gast==0.7.0 gguf==0.17.1 google-api-core==2.29.0 -google-api-python-client==2.187.0 +google-api-python-client==2.188.0 google-apitools==0.5.31 google-auth==2.47.0 google-auth-httplib2==0.2.1 @@ -89,22 +89,22 @@ google-cloud-bigquery-storage==2.36.0 google-cloud-bigtable==2.35.0 google-cloud-core==2.5.0 google-cloud-datastore==2.23.0 -google-cloud-dlp==3.33.0 -google-cloud-kms==3.7.0 -google-cloud-language==2.18.0 -google-cloud-monitoring==2.28.0 +google-cloud-dlp==3.34.0 +google-cloud-kms==3.9.0 +google-cloud-language==2.19.0 +google-cloud-monitoring==2.29.0 google-cloud-profiler==4.1.0 google-cloud-pubsub==2.34.0 google-cloud-pubsublite==1.13.0 google-cloud-recommendations-ai==0.10.18 -google-cloud-resource-manager==1.15.0 +google-cloud-resource-manager==1.16.0 google-cloud-secret-manager==2.26.0 -google-cloud-spanner==3.61.0 +google-cloud-spanner==3.62.0 google-cloud-storage==2.19.0 -google-cloud-videointelligence==2.17.0 -google-cloud-vision==3.11.0 +google-cloud-videointelligence==2.18.0 +google-cloud-vision==3.12.0 google-crc32c==1.8.0 -google-genai==1.57.0 +google-genai==1.59.0 google-pasta==0.2.0 google-resumable-media==2.8.0 googleapis-common-protos==1.72.0 @@ -121,7 +121,7 @@ h5py==3.15.1 hf-xet==1.2.0 hpack==4.1.0 httpcore==1.0.9 -httplib2==0.22.0 +httplib2==0.31.1 httptools==0.7.1 httpx==0.28.1 huggingface-hub==0.36.0 @@ -132,7 +132,7 @@ importlib_metadata==8.7.1 iniconfig==2.3.0 interegular==0.3.3 jaraco.classes==3.4.0 -jaraco.context==6.0.2 +jaraco.context==6.1.0 jaraco.functools==4.4.0 jeepney==0.9.0 Jinja2==3.1.6 @@ -141,7 +141,7 @@ joblib==1.5.3 jsonpickle==3.4.2 jsonschema==4.26.0 jsonschema-specifications==2025.9.1 -keras==3.13.0 +keras==3.13.1 keyring==25.7.0 keyrings.google-artifactregistry-auth==1.1.2 lark==1.2.2 @@ -186,9 +186,9 @@ nvidia-nvshmem-cu12==3.3.20 nvidia-nvtx-cu12==12.6.77 oauth2client==4.1.3 objsize==0.7.1 -openai==1.107.1 +openai==1.109.1 openai-harmony==0.0.8 -opencv-python-headless==4.12.0.88 +opencv-python-headless==4.13.0.90 opentelemetry-api==1.39.1 opentelemetry-resourcedetector-gcp==1.11.0a0 opentelemetry-sdk==1.39.1 @@ -208,7 +208,7 @@ pillow==12.1.0 pip==25.3 pluggy==1.6.0 prometheus-fastapi-instrumentator==7.1.0 -prometheus_client==0.23.1 +prometheus_client==0.24.1 propcache==0.4.1 proto-plus==1.27.0 protobuf==5.29.5 @@ -217,7 +217,7 @@ psycopg2-binary==2.9.11 py-cpuinfo==9.0.0 pyarrow==18.1.0 pyarrow-hotfix==0.7 -pyasn1==0.6.1 +pyasn1==0.6.2 pyasn1_modules==0.4.2 pybase64==1.4.3 pycountry==24.6.1 @@ -246,7 +246,7 @@ PyYAML==6.0.3 pyzmq==27.1.0 ray==2.53.0 referencing==0.37.0 -regex==2025.11.3 +regex==2026.1.15 requests==2.32.5 requests-mock==1.12.1 rich==14.2.0 @@ -268,7 +268,7 @@ six==1.17.0 sniffio==1.3.1 sortedcontainers==2.4.0 soundfile==0.13.1 -soupsieve==2.8.1 +soupsieve==2.8.2 soxr==1.0.0 SQLAlchemy==2.0.45 sqlalchemy_pytds==1.0.2 diff --git a/sdks/python/container/ml/py313/base_image_requirements.txt b/sdks/python/container/ml/py313/base_image_requirements.txt index bf9589c73d05..f43e0dc9a658 100644 --- a/sdks/python/container/ml/py313/base_image_requirements.txt +++ b/sdks/python/container/ml/py313/base_image_requirements.txt @@ -41,7 +41,7 @@ certifi==2026.1.4 cffi==2.0.0 charset-normalizer==3.4.4 click==8.3.1 -cloud-sql-python-connector==1.19.0 +cloud-sql-python-connector==1.20.0 crcmod==1.7 cryptography==46.0.3 Cython==3.2.4 @@ -71,21 +71,21 @@ google-cloud-bigquery-storage==2.36.0 google-cloud-bigtable==2.35.0 google-cloud-core==2.5.0 google-cloud-datastore==2.23.0 -google-cloud-dlp==3.33.0 -google-cloud-kms==3.7.0 -google-cloud-language==2.18.0 -google-cloud-monitoring==2.28.0 +google-cloud-dlp==3.34.0 +google-cloud-kms==3.9.0 +google-cloud-language==2.19.0 +google-cloud-monitoring==2.29.0 google-cloud-pubsub==2.34.0 google-cloud-pubsublite==1.13.0 google-cloud-recommendations-ai==0.10.18 -google-cloud-resource-manager==1.15.0 +google-cloud-resource-manager==1.16.0 google-cloud-secret-manager==2.26.0 -google-cloud-spanner==3.61.0 +google-cloud-spanner==3.62.0 google-cloud-storage==2.19.0 -google-cloud-videointelligence==2.17.0 -google-cloud-vision==3.11.0 +google-cloud-videointelligence==2.18.0 +google-cloud-vision==3.12.0 google-crc32c==1.8.0 -google-genai==1.57.0 +google-genai==1.59.0 google-pasta==0.2.0 google-resumable-media==2.8.0 googleapis-common-protos==1.72.0 @@ -103,7 +103,7 @@ h5py==3.15.1 hf-xet==1.2.0 hpack==4.1.0 httpcore==1.0.9 -httplib2==0.22.0 +httplib2==0.31.1 httpx==0.28.1 huggingface-hub==0.36.0 hyperframe==6.1.0 @@ -112,15 +112,16 @@ idna==3.11 importlib_metadata==8.7.1 iniconfig==2.3.0 jaraco.classes==3.4.0 -jaraco.context==6.0.2 +jaraco.context==6.1.0 jaraco.functools==4.4.0 jeepney==0.9.0 Jinja2==3.1.6 +jiter==0.12.0 joblib==1.5.3 jsonpickle==3.4.2 jsonschema==4.26.0 jsonschema-specifications==2025.9.1 -keras==3.13.0 +keras==3.13.1 keyring==25.7.0 keyrings.google-artifactregistry-auth==1.1.2 libclang==18.1.1 @@ -140,6 +141,7 @@ nltk==3.9.2 numpy==2.4.1 oauth2client==4.1.3 objsize==0.7.1 +openai==1.109.1 opentelemetry-api==1.39.1 opentelemetry-resourcedetector-gcp==1.11.0a0 opentelemetry-sdk==1.39.1 @@ -158,11 +160,11 @@ pip==25.3 pluggy==1.6.0 propcache==0.4.1 proto-plus==1.27.0 -protobuf==6.33.3 +protobuf==6.33.4 psycopg2-binary==2.9.11 pyarrow==18.1.0 pyarrow-hotfix==0.7 -pyasn1==0.6.1 +pyasn1==0.6.2 pyasn1_modules==0.4.2 pycparser==2.23 pydantic==2.12.5 @@ -183,7 +185,7 @@ python-tds==1.17.1 pytz==2025.2 PyYAML==6.0.3 referencing==0.37.0 -regex==2025.11.3 +regex==2026.1.15 requests==2.32.5 requests-mock==1.12.1 rich==14.2.0 @@ -198,7 +200,7 @@ setuptools==80.9.0 six==1.17.0 sniffio==1.3.1 sortedcontainers==2.4.0 -soupsieve==2.8.1 +soupsieve==2.8.2 SQLAlchemy==2.0.45 sqlalchemy_pytds==1.0.2 sqlparse==0.5.5 diff --git a/sdks/python/container/py310/base_image_requirements.txt b/sdks/python/container/py310/base_image_requirements.txt index 76772ffbb79e..08a469647f18 100644 --- a/sdks/python/container/py310/base_image_requirements.txt +++ b/sdks/python/container/py310/base_image_requirements.txt @@ -41,7 +41,7 @@ certifi==2026.1.4 cffi==2.0.0 charset-normalizer==3.4.4 click==8.3.1 -cloud-sql-python-connector==1.19.0 +cloud-sql-python-connector==1.20.0 crcmod==1.7 cryptography==46.0.3 Cython==3.2.4 @@ -59,7 +59,7 @@ freezegun==1.5.5 frozenlist==1.8.0 future==1.0.0 google-api-core==2.29.0 -google-api-python-client==2.187.0 +google-api-python-client==2.188.0 google-apitools==0.5.31 google-auth==2.47.0 google-auth-httplib2==0.2.1 @@ -69,22 +69,22 @@ google-cloud-bigquery-storage==2.36.0 google-cloud-bigtable==2.35.0 google-cloud-core==2.5.0 google-cloud-datastore==2.23.0 -google-cloud-dlp==3.33.0 -google-cloud-kms==3.7.0 -google-cloud-language==2.18.0 -google-cloud-monitoring==2.28.0 +google-cloud-dlp==3.34.0 +google-cloud-kms==3.9.0 +google-cloud-language==2.19.0 +google-cloud-monitoring==2.29.0 google-cloud-profiler==4.1.0 google-cloud-pubsub==2.34.0 google-cloud-pubsublite==1.13.0 google-cloud-recommendations-ai==0.10.18 -google-cloud-resource-manager==1.15.0 +google-cloud-resource-manager==1.16.0 google-cloud-secret-manager==2.26.0 -google-cloud-spanner==3.61.0 +google-cloud-spanner==3.62.0 google-cloud-storage==2.19.0 -google-cloud-videointelligence==2.17.0 -google-cloud-vision==3.11.0 +google-cloud-videointelligence==2.18.0 +google-cloud-vision==3.12.0 google-crc32c==1.8.0 -google-genai==1.57.0 +google-genai==1.59.0 google-resumable-media==2.8.0 googleapis-common-protos==1.72.0 greenlet==3.3.0 @@ -98,7 +98,7 @@ h11==0.16.0 h2==4.3.0 hpack==4.1.0 httpcore==1.0.9 -httplib2==0.22.0 +httplib2==0.31.1 httpx==0.28.1 hyperframe==6.1.0 hypothesis==6.148.3 @@ -106,10 +106,11 @@ idna==3.11 importlib_metadata==8.7.1 iniconfig==2.3.0 jaraco.classes==3.4.0 -jaraco.context==6.0.2 +jaraco.context==6.1.0 jaraco.functools==4.4.0 jeepney==0.9.0 Jinja2==3.1.6 +jiter==0.12.0 joblib==1.5.3 Js2Py==0.74 jsonpickle==3.4.2 @@ -127,6 +128,7 @@ nltk==3.9.2 numpy==2.2.6 oauth2client==4.1.3 objsize==0.7.1 +openai==1.109.1 opentelemetry-api==1.39.1 opentelemetry-resourcedetector-gcp==1.11.0a0 opentelemetry-sdk==1.39.1 @@ -146,7 +148,7 @@ protobuf==5.29.5 psycopg2-binary==2.9.11 pyarrow==18.1.0 pyarrow-hotfix==0.7 -pyasn1==0.6.1 +pyasn1==0.6.2 pyasn1_modules==0.4.2 pycparser==2.23 pydantic==2.12.5 @@ -168,7 +170,7 @@ python-tds==1.17.1 pytz==2025.2 PyYAML==6.0.3 referencing==0.37.0 -regex==2025.11.3 +regex==2026.1.15 requests==2.32.5 requests-mock==1.12.1 rpds-py==0.30.0 @@ -181,14 +183,14 @@ setuptools==80.9.0 six==1.17.0 sniffio==1.3.1 sortedcontainers==2.4.0 -soupsieve==2.8.1 +soupsieve==2.8.2 SQLAlchemy==2.0.45 sqlalchemy_pytds==1.0.2 sqlparse==0.5.5 tenacity==8.5.0 testcontainers==4.14.0 threadpoolctl==3.6.0 -tomli==2.3.0 +tomli==2.4.0 tqdm==4.67.1 typing-inspection==0.4.2 typing_extensions==4.15.0 diff --git a/sdks/python/container/py311/base_image_requirements.txt b/sdks/python/container/py311/base_image_requirements.txt index 2a1b53d96ab4..99f03a4af305 100644 --- a/sdks/python/container/py311/base_image_requirements.txt +++ b/sdks/python/container/py311/base_image_requirements.txt @@ -40,7 +40,7 @@ certifi==2026.1.4 cffi==2.0.0 charset-normalizer==3.4.4 click==8.3.1 -cloud-sql-python-connector==1.19.0 +cloud-sql-python-connector==1.20.0 crcmod==1.7 cryptography==46.0.3 Cython==3.2.4 @@ -57,7 +57,7 @@ freezegun==1.5.5 frozenlist==1.8.0 future==1.0.0 google-api-core==2.29.0 -google-api-python-client==2.187.0 +google-api-python-client==2.188.0 google-apitools==0.5.31 google-auth==2.47.0 google-auth-httplib2==0.2.1 @@ -67,22 +67,22 @@ google-cloud-bigquery-storage==2.36.0 google-cloud-bigtable==2.35.0 google-cloud-core==2.5.0 google-cloud-datastore==2.23.0 -google-cloud-dlp==3.33.0 -google-cloud-kms==3.7.0 -google-cloud-language==2.18.0 -google-cloud-monitoring==2.28.0 +google-cloud-dlp==3.34.0 +google-cloud-kms==3.9.0 +google-cloud-language==2.19.0 +google-cloud-monitoring==2.29.0 google-cloud-profiler==4.1.0 google-cloud-pubsub==2.34.0 google-cloud-pubsublite==1.13.0 google-cloud-recommendations-ai==0.10.18 -google-cloud-resource-manager==1.15.0 +google-cloud-resource-manager==1.16.0 google-cloud-secret-manager==2.26.0 -google-cloud-spanner==3.61.0 +google-cloud-spanner==3.62.0 google-cloud-storage==2.19.0 -google-cloud-videointelligence==2.17.0 -google-cloud-vision==3.11.0 +google-cloud-videointelligence==2.18.0 +google-cloud-vision==3.12.0 google-crc32c==1.8.0 -google-genai==1.57.0 +google-genai==1.59.0 google-resumable-media==2.8.0 googleapis-common-protos==1.72.0 greenlet==3.3.0 @@ -96,7 +96,7 @@ h11==0.16.0 h2==4.3.0 hpack==4.1.0 httpcore==1.0.9 -httplib2==0.22.0 +httplib2==0.31.1 httpx==0.28.1 hyperframe==6.1.0 hypothesis==6.148.3 @@ -104,10 +104,11 @@ idna==3.11 importlib_metadata==8.7.1 iniconfig==2.3.0 jaraco.classes==3.4.0 -jaraco.context==6.0.2 +jaraco.context==6.1.0 jaraco.functools==4.4.0 jeepney==0.9.0 Jinja2==3.1.6 +jiter==0.12.0 joblib==1.5.3 Js2Py==0.74 jsonpickle==3.4.2 @@ -125,6 +126,7 @@ nltk==3.9.2 numpy==2.4.1 oauth2client==4.1.3 objsize==0.7.1 +openai==1.109.1 opentelemetry-api==1.39.1 opentelemetry-resourcedetector-gcp==1.11.0a0 opentelemetry-sdk==1.39.1 @@ -144,7 +146,7 @@ protobuf==5.29.5 psycopg2-binary==2.9.11 pyarrow==18.1.0 pyarrow-hotfix==0.7 -pyasn1==0.6.1 +pyasn1==0.6.2 pyasn1_modules==0.4.2 pycparser==2.23 pydantic==2.12.5 @@ -166,7 +168,7 @@ python-tds==1.17.1 pytz==2025.2 PyYAML==6.0.3 referencing==0.37.0 -regex==2025.11.3 +regex==2026.1.15 requests==2.32.5 requests-mock==1.12.1 rpds-py==0.30.0 @@ -179,7 +181,7 @@ setuptools==80.9.0 six==1.17.0 sniffio==1.3.1 sortedcontainers==2.4.0 -soupsieve==2.8.1 +soupsieve==2.8.2 SQLAlchemy==2.0.45 sqlalchemy_pytds==1.0.2 sqlparse==0.5.5 diff --git a/sdks/python/container/py312/base_image_requirements.txt b/sdks/python/container/py312/base_image_requirements.txt index 356f224d854a..119cf2daad2c 100644 --- a/sdks/python/container/py312/base_image_requirements.txt +++ b/sdks/python/container/py312/base_image_requirements.txt @@ -39,7 +39,7 @@ certifi==2026.1.4 cffi==2.0.0 charset-normalizer==3.4.4 click==8.3.1 -cloud-sql-python-connector==1.19.0 +cloud-sql-python-connector==1.20.0 crcmod==1.7 cryptography==46.0.3 Cython==3.2.4 @@ -56,7 +56,7 @@ freezegun==1.5.5 frozenlist==1.8.0 future==1.0.0 google-api-core==2.29.0 -google-api-python-client==2.187.0 +google-api-python-client==2.188.0 google-apitools==0.5.31 google-auth==2.47.0 google-auth-httplib2==0.2.1 @@ -66,22 +66,22 @@ google-cloud-bigquery-storage==2.36.0 google-cloud-bigtable==2.35.0 google-cloud-core==2.5.0 google-cloud-datastore==2.23.0 -google-cloud-dlp==3.33.0 -google-cloud-kms==3.7.0 -google-cloud-language==2.18.0 -google-cloud-monitoring==2.28.0 +google-cloud-dlp==3.34.0 +google-cloud-kms==3.9.0 +google-cloud-language==2.19.0 +google-cloud-monitoring==2.29.0 google-cloud-profiler==4.1.0 google-cloud-pubsub==2.34.0 google-cloud-pubsublite==1.13.0 google-cloud-recommendations-ai==0.10.18 -google-cloud-resource-manager==1.15.0 +google-cloud-resource-manager==1.16.0 google-cloud-secret-manager==2.26.0 -google-cloud-spanner==3.61.0 +google-cloud-spanner==3.62.0 google-cloud-storage==2.19.0 -google-cloud-videointelligence==2.17.0 -google-cloud-vision==3.11.0 +google-cloud-videointelligence==2.18.0 +google-cloud-vision==3.12.0 google-crc32c==1.8.0 -google-genai==1.57.0 +google-genai==1.59.0 google-resumable-media==2.8.0 googleapis-common-protos==1.72.0 greenlet==3.3.0 @@ -95,7 +95,7 @@ h11==0.16.0 h2==4.3.0 hpack==4.1.0 httpcore==1.0.9 -httplib2==0.22.0 +httplib2==0.31.1 httpx==0.28.1 hyperframe==6.1.0 hypothesis==6.148.3 @@ -103,10 +103,11 @@ idna==3.11 importlib_metadata==8.7.1 iniconfig==2.3.0 jaraco.classes==3.4.0 -jaraco.context==6.0.2 +jaraco.context==6.1.0 jaraco.functools==4.4.0 jeepney==0.9.0 Jinja2==3.1.6 +jiter==0.12.0 joblib==1.5.3 jsonpickle==3.4.2 jsonschema==4.26.0 @@ -123,6 +124,7 @@ nltk==3.9.2 numpy==2.4.1 oauth2client==4.1.3 objsize==0.7.1 +openai==1.109.1 opentelemetry-api==1.39.1 opentelemetry-resourcedetector-gcp==1.11.0a0 opentelemetry-sdk==1.39.1 @@ -142,7 +144,7 @@ protobuf==5.29.5 psycopg2-binary==2.9.11 pyarrow==18.1.0 pyarrow-hotfix==0.7 -pyasn1==0.6.1 +pyasn1==0.6.2 pyasn1_modules==0.4.2 pycparser==2.23 pydantic==2.12.5 @@ -163,7 +165,7 @@ python-tds==1.17.1 pytz==2025.2 PyYAML==6.0.3 referencing==0.37.0 -regex==2025.11.3 +regex==2026.1.15 requests==2.32.5 requests-mock==1.12.1 rpds-py==0.30.0 @@ -176,7 +178,7 @@ setuptools==80.9.0 six==1.17.0 sniffio==1.3.1 sortedcontainers==2.4.0 -soupsieve==2.8.1 +soupsieve==2.8.2 SQLAlchemy==2.0.45 sqlalchemy_pytds==1.0.2 sqlparse==0.5.5 diff --git a/sdks/python/container/py313/base_image_requirements.txt b/sdks/python/container/py313/base_image_requirements.txt index 04ec48e1efbc..b465cdeac37b 100644 --- a/sdks/python/container/py313/base_image_requirements.txt +++ b/sdks/python/container/py313/base_image_requirements.txt @@ -39,7 +39,7 @@ certifi==2026.1.4 cffi==2.0.0 charset-normalizer==3.4.4 click==8.3.1 -cloud-sql-python-connector==1.19.0 +cloud-sql-python-connector==1.20.0 crcmod==1.7 cryptography==46.0.3 Cython==3.2.4 @@ -65,21 +65,21 @@ google-cloud-bigquery-storage==2.36.0 google-cloud-bigtable==2.35.0 google-cloud-core==2.5.0 google-cloud-datastore==2.23.0 -google-cloud-dlp==3.33.0 -google-cloud-kms==3.7.0 -google-cloud-language==2.18.0 -google-cloud-monitoring==2.28.0 +google-cloud-dlp==3.34.0 +google-cloud-kms==3.9.0 +google-cloud-language==2.19.0 +google-cloud-monitoring==2.29.0 google-cloud-pubsub==2.34.0 google-cloud-pubsublite==1.13.0 google-cloud-recommendations-ai==0.10.18 -google-cloud-resource-manager==1.15.0 +google-cloud-resource-manager==1.16.0 google-cloud-secret-manager==2.26.0 -google-cloud-spanner==3.61.0 +google-cloud-spanner==3.62.0 google-cloud-storage==2.19.0 -google-cloud-videointelligence==2.17.0 -google-cloud-vision==3.11.0 +google-cloud-videointelligence==2.18.0 +google-cloud-vision==3.12.0 google-crc32c==1.8.0 -google-genai==1.57.0 +google-genai==1.59.0 google-resumable-media==2.8.0 googleapis-common-protos==1.72.0 greenlet==3.3.0 @@ -94,7 +94,7 @@ h11==0.16.0 h2==4.3.0 hpack==4.1.0 httpcore==1.0.9 -httplib2==0.22.0 +httplib2==0.31.1 httpx==0.28.1 hyperframe==6.1.0 hypothesis==6.148.3 @@ -102,10 +102,11 @@ idna==3.11 importlib_metadata==8.7.1 iniconfig==2.3.0 jaraco.classes==3.4.0 -jaraco.context==6.0.2 +jaraco.context==6.1.0 jaraco.functools==4.4.0 jeepney==0.9.0 Jinja2==3.1.6 +jiter==0.12.0 joblib==1.5.3 jsonpickle==3.4.2 jsonschema==4.26.0 @@ -121,6 +122,7 @@ nltk==3.9.2 numpy==2.4.1 oauth2client==4.1.3 objsize==0.7.1 +openai==1.109.1 opentelemetry-api==1.39.1 opentelemetry-resourcedetector-gcp==1.11.0a0 opentelemetry-sdk==1.39.1 @@ -136,11 +138,11 @@ pip==25.3 pluggy==1.6.0 propcache==0.4.1 proto-plus==1.27.0 -protobuf==6.33.3 +protobuf==6.33.4 psycopg2-binary==2.9.11 pyarrow==18.1.0 pyarrow-hotfix==0.7 -pyasn1==0.6.1 +pyasn1==0.6.2 pyasn1_modules==0.4.2 pycparser==2.23 pydantic==2.12.5 @@ -161,7 +163,7 @@ python-tds==1.17.1 pytz==2025.2 PyYAML==6.0.3 referencing==0.37.0 -regex==2025.11.3 +regex==2026.1.15 requests==2.32.5 requests-mock==1.12.1 rpds-py==0.30.0 @@ -174,7 +176,7 @@ setuptools==80.9.0 six==1.17.0 sniffio==1.3.1 sortedcontainers==2.4.0 -soupsieve==2.8.1 +soupsieve==2.8.2 SQLAlchemy==2.0.45 sqlalchemy_pytds==1.0.2 sqlparse==0.5.5 From f14f99648bc49c2d4be7c6e18efd33d686b698c6 Mon Sep 17 00:00:00 2001 From: Mohamed Awnallah Date: Mon, 19 Jan 2026 06:38:27 +0000 Subject: [PATCH 7/7] sdks/python: fix proposed linting issues --- .../ml/rag/embeddings/huggingface_test.py | 108 ++-- .../transforms/embeddings/open_ai_it_test.py | 474 +++++++++--------- 2 files changed, 291 insertions(+), 291 deletions(-) diff --git a/sdks/python/apache_beam/ml/rag/embeddings/huggingface_test.py b/sdks/python/apache_beam/ml/rag/embeddings/huggingface_test.py index 051068318172..f3509537ea71 100644 --- a/sdks/python/apache_beam/ml/rag/embeddings/huggingface_test.py +++ b/sdks/python/apache_beam/ml/rag/embeddings/huggingface_test.py @@ -32,69 +32,73 @@ # pylint: disable=unused-import try: - from sentence_transformers import SentenceTransformer + from sentence_transformers import SentenceTransformer - SENTENCE_TRANSFORMERS_AVAILABLE = True + SENTENCE_TRANSFORMERS_AVAILABLE = True except ImportError: - SENTENCE_TRANSFORMERS_AVAILABLE = False + SENTENCE_TRANSFORMERS_AVAILABLE = False @pytest.mark.uses_transformers @unittest.skipIf( - not SENTENCE_TRANSFORMERS_AVAILABLE, "sentence-transformers not available" -) + not SENTENCE_TRANSFORMERS_AVAILABLE, "sentence-transformers not available") class HuggingfaceTextEmbeddingsTest(unittest.TestCase): - def setUp(self): - self.artifact_location = tempfile.mkdtemp(prefix="sentence_transformers_") - self.test_chunks = [ - Chunk( - content=Content(text="This is a test sentence."), - id="1", - metadata={"source": "test.txt", "language": "en"}, - ), - Chunk( - content=Content(text="Another example."), - id="2", - metadata={"source": "test.txt", "language": "en"}, - ), - ] + def setUp(self): + self.artifact_location = tempfile.mkdtemp(prefix="sentence_transformers_") + self.test_chunks = [ + Chunk( + content=Content(text="This is a test sentence."), + id="1", + metadata={ + "source": "test.txt", "language": "en" + }, + ), + Chunk( + content=Content(text="Another example."), + id="2", + metadata={ + "source": "test.txt", "language": "en" + }, + ), + ] - def tearDown(self) -> None: - shutil.rmtree(self.artifact_location) + def tearDown(self) -> None: + shutil.rmtree(self.artifact_location) - def test_embedding_pipeline(self): - expected = [ - Chunk( - id="1", - embedding=Embedding(dense_embedding=[0.0] * 384), - metadata={"source": "test.txt", "language": "en"}, - content=Content(text="This is a test sentence."), - ), - Chunk( - id="2", - embedding=Embedding(dense_embedding=[0.0] * 384), - metadata={"source": "test.txt", "language": "en"}, - content=Content(text="Another example."), - ), - ] - embedder = HuggingfaceTextEmbeddings( - model_name="sentence-transformers/all-MiniLM-L6-v2" - ) + def test_embedding_pipeline(self): + expected = [ + Chunk( + id="1", + embedding=Embedding(dense_embedding=[0.0] * 384), + metadata={ + "source": "test.txt", "language": "en" + }, + content=Content(text="This is a test sentence."), + ), + Chunk( + id="2", + embedding=Embedding(dense_embedding=[0.0] * 384), + metadata={ + "source": "test.txt", "language": "en" + }, + content=Content(text="Another example."), + ), + ] + embedder = HuggingfaceTextEmbeddings( + model_name="sentence-transformers/all-MiniLM-L6-v2") - with TestPipeline() as p: - embeddings = ( - p - | beam.Create(self.test_chunks) - | MLTransform( - write_artifact_location=self.artifact_location - ).with_transform(embedder) - ) + with TestPipeline() as p: + embeddings = ( + p + | beam.Create(self.test_chunks) + | MLTransform(write_artifact_location=self.artifact_location). + with_transform(embedder)) - assert_that( - embeddings, - equal_to(expected, equals_fn=TestHelpers.chunk_approximately_equals), - ) + assert_that( + embeddings, + equal_to(expected, equals_fn=TestHelpers.chunk_approximately_equals), + ) if __name__ == "__main__": - unittest.main() + unittest.main() diff --git a/sdks/python/apache_beam/ml/transforms/embeddings/open_ai_it_test.py b/sdks/python/apache_beam/ml/transforms/embeddings/open_ai_it_test.py index ea32684dd85e..9bcc384d48a2 100644 --- a/sdks/python/apache_beam/ml/transforms/embeddings/open_ai_it_test.py +++ b/sdks/python/apache_beam/ml/transforms/embeddings/open_ai_it_test.py @@ -27,261 +27,257 @@ # pylint: disable=ungrouped-imports try: - import tensorflow_transform as tft + import tensorflow_transform as tft - from apache_beam.ml.transforms.tft import ScaleTo01 + from apache_beam.ml.transforms.tft import ScaleTo01 except ImportError: - tft = None + tft = None test_query = "This is a test" test_query_column = "embedding" model_name: str = "text-embedding-3-small" -@unittest.skipIf(OpenAITextEmbeddings is None, "OpenAI Python SDK is not installed.") +@unittest.skipIf( + OpenAITextEmbeddings is None, "OpenAI Python SDK is not installed.") class OpenAIEmbeddingsTest(unittest.TestCase): - def setUp(self) -> None: - self.artifact_location = tempfile.mkdtemp(prefix="_openai_test") - self.api_key = os.environ.get("OPENAI_API_KEY") - - def tearDown(self) -> None: - shutil.rmtree(self.artifact_location) - - def test_openai_text_embeddings(self): - embedding_config = OpenAITextEmbeddings( - model_name=model_name, columns=["embedding"], api_key=self.api_key - ) - with beam.Pipeline() as pipeline: - transformed_pcoll = ( - pipeline - | "CreateData" >> beam.Create([{test_query_column: test_query}]) - | "MLTransform" - >> MLTransform( - write_artifact_location=self.artifact_location - ).with_transform(embedding_config) - ) - - def assert_element(element): - # OpenAI text-embedding-3-small produces 1536-dimensional embeddings - assert len(element[test_query_column]) == 1536 - - _ = transformed_pcoll | beam.Map(assert_element) - - @unittest.skipIf(tft is None, "Tensorflow Transform is not installed.") - def test_embeddings_with_scale_to_0_1(self): - embedding_config = OpenAITextEmbeddings( - model_name=model_name, - columns=[test_query_column], - api_key=self.api_key, + def setUp(self) -> None: + self.artifact_location = tempfile.mkdtemp(prefix="_openai_test") + self.api_key = os.environ.get("OPENAI_API_KEY") + + def tearDown(self) -> None: + shutil.rmtree(self.artifact_location) + + def test_openai_text_embeddings(self): + embedding_config = OpenAITextEmbeddings( + model_name=model_name, columns=["embedding"], api_key=self.api_key) + with beam.Pipeline() as pipeline: + transformed_pcoll = ( + pipeline + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }]) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config)) + + def assert_element(element): + # OpenAI text-embedding-3-small produces 1536-dimensional embeddings + assert len(element[test_query_column]) == 1536 + + _ = transformed_pcoll | beam.Map(assert_element) + + @unittest.skipIf(tft is None, "Tensorflow Transform is not installed.") + def test_embeddings_with_scale_to_0_1(self): + embedding_config = OpenAITextEmbeddings( + model_name=model_name, + columns=[test_query_column], + api_key=self.api_key, + ) + scale_config = ScaleTo01(columns=["embedding"]) + with beam.Pipeline() as pipeline: + transformed_pcoll = ( + pipeline + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }]) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config)).with_transform(scale_config) + + def assert_element(element): + embedding_values = element.embedding + assert 0 <= max(embedding_values) <= 1 + assert 0 <= min(embedding_values) <= 1 + + _ = transformed_pcoll | beam.Map(assert_element) + + def test_embeddings_with_dimensions(self): + embedding_config = OpenAITextEmbeddings( + model_name=model_name, + columns=[test_query_column], + api_key=self.api_key, + dimensions=512, + ) + with beam.Pipeline() as pipeline: + transformed_pcoll = ( + pipeline + | "CreateData" >> beam.Create([{ + test_query_column: test_query + }]) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config)) + + def assert_element(element): + # Check that we get 512-dimensional embeddings as requested + assert len(element[test_query_column]) == 512 + + _ = transformed_pcoll | beam.Map(assert_element) + + def pipeline_with_configurable_artifact_location( + self, + pipeline, + embedding_config=None, + read_artifact_location=None, + write_artifact_location=None, + ): + if write_artifact_location: + return pipeline | MLTransform( + write_artifact_location=write_artifact_location).with_transform( + embedding_config) + elif read_artifact_location: + return pipeline | MLTransform( + read_artifact_location=read_artifact_location) + else: + raise NotImplementedError + + def test_embeddings_with_read_artifact_location(self): + embedding_config = OpenAITextEmbeddings( + model_name=model_name, + columns=[test_query_column], + api_key=self.api_key) + + with beam.Pipeline() as p: + data = p | "CreateData" >> beam.Create([{test_query_column: test_query}]) + _ = self.pipeline_with_configurable_artifact_location( + pipeline=data, + embedding_config=embedding_config, + write_artifact_location=self.artifact_location, + ) + + with beam.Pipeline() as p: + data = p | "CreateData" >> beam.Create([{ + test_query_column: test_query + }, { + test_query_column: test_query + }]) + result_pcoll = self.pipeline_with_configurable_artifact_location( + pipeline=data, read_artifact_location=self.artifact_location) + + # Since we don't know the exact values of the embeddings, + # we just check that they are within a reasonable range + def assert_element(element): + # Embeddings should be normalized and generally small values + assert -1 <= element <= 1 + + _ = ( + result_pcoll + | beam.Map(lambda x: max(x[test_query_column])) + | beam.Map(assert_element)) + + def test_with_int_data_types(self): + embedding_config = OpenAITextEmbeddings( + model_name=model_name, + columns=[test_query_column], + api_key=self.api_key) + with self.assertRaises(Exception): + with beam.Pipeline() as pipeline: + _ = ( + pipeline + | "CreateData" >> beam.Create([{ + test_query_column: 1 + }]) + | "MLTransform" >> MLTransform( + write_artifact_location=self.artifact_location).with_transform( + embedding_config)) + + def test_with_artifact_location(self): + """Local artifact location test""" + secondary_artifact_location = tempfile.mkdtemp( + prefix="_openai_secondary_test") + + try: + embedding_config = OpenAITextEmbeddings( + model_name=model_name, + columns=[test_query_column], + api_key=self.api_key) + + with beam.Pipeline() as p: + data = p | "CreateData" >> beam.Create([{ + test_query_column: test_query + }]) + _ = self.pipeline_with_configurable_artifact_location( + pipeline=data, + embedding_config=embedding_config, + write_artifact_location=secondary_artifact_location, ) - scale_config = ScaleTo01(columns=["embedding"]) - with beam.Pipeline() as pipeline: - transformed_pcoll = ( - pipeline - | "CreateData" >> beam.Create([{test_query_column: test_query}]) - | "MLTransform" - >> MLTransform( - write_artifact_location=self.artifact_location - ).with_transform(embedding_config) - ).with_transform(scale_config) - - def assert_element(element): - embedding_values = element.embedding - assert 0 <= max(embedding_values) <= 1 - assert 0 <= min(embedding_values) <= 1 - - _ = transformed_pcoll | beam.Map(assert_element) - - def test_embeddings_with_dimensions(self): - embedding_config = OpenAITextEmbeddings( + + with beam.Pipeline() as p: + data = p | "CreateData" >> beam.Create([{ + test_query_column: test_query + }, { + test_query_column: test_query + }]) + result_pcoll = self.pipeline_with_configurable_artifact_location( + pipeline=data, read_artifact_location=secondary_artifact_location) + + def assert_element(element): + # Embeddings should be normalized and generally small values + assert -1 <= element <= 1 + + _ = ( + result_pcoll + | beam.Map(lambda x: max(x[test_query_column])) + | beam.Map(assert_element)) + finally: + # Clean up the temporary directory + shutil.rmtree(secondary_artifact_location) + + def test_mltransform_to_ptransform_with_openai(self): + transforms = [ + OpenAITextEmbeddings( + columns=["x"], model_name=model_name, - columns=[test_query_column], api_key=self.api_key, dimensions=512, - ) - with beam.Pipeline() as pipeline: - transformed_pcoll = ( - pipeline - | "CreateData" >> beam.Create([{test_query_column: test_query}]) - | "MLTransform" - >> MLTransform( - write_artifact_location=self.artifact_location - ).with_transform(embedding_config) - ) - - def assert_element(element): - # Check that we get 512-dimensional embeddings as requested - assert len(element[test_query_column]) == 512 - - _ = transformed_pcoll | beam.Map(assert_element) - - def pipeline_with_configurable_artifact_location( - self, - pipeline, - embedding_config=None, - read_artifact_location=None, - write_artifact_location=None, - ): - if write_artifact_location: - return pipeline | MLTransform( - write_artifact_location=write_artifact_location - ).with_transform(embedding_config) - elif read_artifact_location: - return pipeline | MLTransform(read_artifact_location=read_artifact_location) - else: - raise NotImplementedError - - def test_embeddings_with_read_artifact_location(self): - embedding_config = OpenAITextEmbeddings( - model_name=model_name, columns=[test_query_column], api_key=self.api_key - ) - - with beam.Pipeline() as p: - data = p | "CreateData" >> beam.Create([{test_query_column: test_query}]) - _ = self.pipeline_with_configurable_artifact_location( - pipeline=data, - embedding_config=embedding_config, - write_artifact_location=self.artifact_location, - ) - - with beam.Pipeline() as p: - data = p | "CreateData" >> beam.Create( - [{test_query_column: test_query}, {test_query_column: test_query}] - ) - result_pcoll = self.pipeline_with_configurable_artifact_location( - pipeline=data, read_artifact_location=self.artifact_location - ) - - # Since we don't know the exact values of the embeddings, - # we just check that they are within a reasonable range - def assert_element(element): - # Embeddings should be normalized and generally small values - assert -1 <= element <= 1 - - _ = ( - result_pcoll - | beam.Map(lambda x: max(x[test_query_column])) - | beam.Map(assert_element) - ) - - def test_with_int_data_types(self): - embedding_config = OpenAITextEmbeddings( - model_name=model_name, columns=[test_query_column], api_key=self.api_key - ) - with self.assertRaises(Exception): - with beam.Pipeline() as pipeline: - _ = ( - pipeline - | "CreateData" >> beam.Create([{test_query_column: 1}]) - | "MLTransform" - >> MLTransform( - write_artifact_location=self.artifact_location - ).with_transform(embedding_config) - ) - - def test_with_artifact_location(self): - """Local artifact location test""" - secondary_artifact_location = tempfile.mkdtemp(prefix="_openai_secondary_test") - - try: - embedding_config = OpenAITextEmbeddings( - model_name=model_name, columns=[test_query_column], api_key=self.api_key - ) - - with beam.Pipeline() as p: - data = p | "CreateData" >> beam.Create( - [{test_query_column: test_query}] - ) - _ = self.pipeline_with_configurable_artifact_location( - pipeline=data, - embedding_config=embedding_config, - write_artifact_location=secondary_artifact_location, - ) - - with beam.Pipeline() as p: - data = p | "CreateData" >> beam.Create( - [{test_query_column: test_query}, {test_query_column: test_query}] - ) - result_pcoll = self.pipeline_with_configurable_artifact_location( - pipeline=data, read_artifact_location=secondary_artifact_location - ) - - def assert_element(element): - # Embeddings should be normalized and generally small values - assert -1 <= element <= 1 - - _ = ( - result_pcoll - | beam.Map(lambda x: max(x[test_query_column])) - | beam.Map(assert_element) - ) - finally: - # Clean up the temporary directory - shutil.rmtree(secondary_artifact_location) - - def test_mltransform_to_ptransform_with_openai(self): - transforms = [ - OpenAITextEmbeddings( - columns=["x"], - model_name=model_name, - api_key=self.api_key, - dimensions=512, - ), - OpenAITextEmbeddings( - columns=["y", "z"], model_name=model_name, api_key=self.api_key - ), - ] - ptransform_mapper = base._MLTransformToPTransformMapper( - transforms=transforms, - artifact_location=self.artifact_location, - artifact_mode=None, + ), + OpenAITextEmbeddings( + columns=["y", "z"], model_name=model_name, api_key=self.api_key), + ] + ptransform_mapper = base._MLTransformToPTransformMapper( + transforms=transforms, + artifact_location=self.artifact_location, + artifact_mode=None, + ) + + ptransform_list = ptransform_mapper.create_and_save_ptransform_list() + self.assertTrue(len(ptransform_list) == 2) + + self.assertEqual(type(ptransform_list[0]), RunInference) + expected_columns = [["x"], ["y", "z"]] + expected_dimensions = [512, None] + for i in range(len(ptransform_list)): + self.assertEqual(type(ptransform_list[i]), RunInference) + self.assertEqual( + type(ptransform_list[i]._model_handler), base._TextEmbeddingHandler) + self.assertEqual( + ptransform_list[i]._model_handler.columns, expected_columns[i]) + self.assertEqual( + ptransform_list[i]._model_handler._underlying.model_name, model_name) + if expected_dimensions[i]: + self.assertEqual( + ptransform_list[i]._model_handler._underlying.dimensions, + expected_dimensions[i], ) - ptransform_list = ptransform_mapper.create_and_save_ptransform_list() - self.assertTrue(len(ptransform_list) == 2) - - self.assertEqual(type(ptransform_list[0]), RunInference) - expected_columns = [["x"], ["y", "z"]] - expected_dimensions = [512, None] - for i in range(len(ptransform_list)): - self.assertEqual(type(ptransform_list[i]), RunInference) - self.assertEqual( - type(ptransform_list[i]._model_handler), base._TextEmbeddingHandler - ) - self.assertEqual( - ptransform_list[i]._model_handler.columns, expected_columns[i] - ) - self.assertEqual( - ptransform_list[i]._model_handler._underlying.model_name, model_name - ) - if expected_dimensions[i]: - self.assertEqual( - ptransform_list[i]._model_handler._underlying.dimensions, - expected_dimensions[i], - ) - - ptransform_list = ( - base._MLTransformToPTransformMapper.load_transforms_from_artifact_location( - self.artifact_location - ) + ptransform_list = ( + base._MLTransformToPTransformMapper. + load_transforms_from_artifact_location(self.artifact_location)) + for i in range(len(ptransform_list)): + self.assertEqual(type(ptransform_list[i]), RunInference) + self.assertEqual( + type(ptransform_list[i]._model_handler), base._TextEmbeddingHandler) + self.assertEqual( + ptransform_list[i]._model_handler.columns, expected_columns[i]) + self.assertEqual( + ptransform_list[i]._model_handler._underlying.model_name, model_name) + if expected_dimensions[i]: + self.assertEqual( + ptransform_list[i]._model_handler._underlying.dimensions, + expected_dimensions[i], ) - for i in range(len(ptransform_list)): - self.assertEqual(type(ptransform_list[i]), RunInference) - self.assertEqual( - type(ptransform_list[i]._model_handler), base._TextEmbeddingHandler - ) - self.assertEqual( - ptransform_list[i]._model_handler.columns, expected_columns[i] - ) - self.assertEqual( - ptransform_list[i]._model_handler._underlying.model_name, model_name - ) - if expected_dimensions[i]: - self.assertEqual( - ptransform_list[i]._model_handler._underlying.dimensions, - expected_dimensions[i], - ) if __name__ == "__main__": - unittest.main() + unittest.main()