From d332d02d4ac76ed9fe54866c1285940f740fe011 Mon Sep 17 00:00:00 2001 From: Danny McCormick Date: Fri, 24 Mar 2023 14:43:24 -0400 Subject: [PATCH 1/8] Force version of sklearn in examples --- .../sklearn_examples_requirements.txt | 18 ++++++++++++++++++ .../sklearn_japanese_housing_regression.py | 4 ++++ .../inference/sklearn_mnist_classification.py | 2 ++ 3 files changed, 24 insertions(+) create mode 100644 sdks/python/apache_beam/examples/inference/sklearn_examples_requirements.txt diff --git a/sdks/python/apache_beam/examples/inference/sklearn_examples_requirements.txt b/sdks/python/apache_beam/examples/inference/sklearn_examples_requirements.txt new file mode 100644 index 000000000000..256e6be3b567 --- /dev/null +++ b/sdks/python/apache_beam/examples/inference/sklearn_examples_requirements.txt @@ -0,0 +1,18 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +scikit-learn==1.0.2 \ No newline at end of file diff --git a/sdks/python/apache_beam/examples/inference/sklearn_japanese_housing_regression.py b/sdks/python/apache_beam/examples/inference/sklearn_japanese_housing_regression.py index 0d0adb0425a8..e6a544c6a325 100644 --- a/sdks/python/apache_beam/examples/inference/sklearn_japanese_housing_regression.py +++ b/sdks/python/apache_beam/examples/inference/sklearn_japanese_housing_regression.py @@ -32,6 +32,7 @@ import argparse from typing import Iterable +import os import pandas import apache_beam as beam @@ -137,6 +138,9 @@ def run( known_args, pipeline_args = parse_known_args(argv) pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = save_main_session + requirements_dir = os.path.dirname(os.path.realpath(__file__)) + # Pin to the version that we trained the model on. + pipeline_options.view_as(SetupOptions).requirements_file = f'{requirements_dir}/sklearn_examples_requirements.txt' pipeline = test_pipeline if not test_pipeline: diff --git a/sdks/python/apache_beam/examples/inference/sklearn_mnist_classification.py b/sdks/python/apache_beam/examples/inference/sklearn_mnist_classification.py index e748166e6fda..fe46b280df77 100644 --- a/sdks/python/apache_beam/examples/inference/sklearn_mnist_classification.py +++ b/sdks/python/apache_beam/examples/inference/sklearn_mnist_classification.py @@ -90,6 +90,8 @@ def run( known_args, pipeline_args = parse_known_args(argv) pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = save_main_session + # Pin to the version that we trained the model on. + pipeline_options.view_as(SetupOptions).requirements_file = f'{requirements_dir}/sklearn_examples_requirements.txt' # In this example we pass keyed inputs to RunInference transform. # Therefore, we use KeyedModelHandler wrapper over SklearnModelHandlerNumpy. From a4ccaf2940cf880d2c5ffd69eab37a269394a138 Mon Sep 17 00:00:00 2001 From: Danny McCormick Date: Fri, 24 Mar 2023 14:44:25 -0400 Subject: [PATCH 2/8] Add missing lines --- .../examples/inference/sklearn_mnist_classification.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sdks/python/apache_beam/examples/inference/sklearn_mnist_classification.py b/sdks/python/apache_beam/examples/inference/sklearn_mnist_classification.py index fe46b280df77..0bbabd75cbe2 100644 --- a/sdks/python/apache_beam/examples/inference/sklearn_mnist_classification.py +++ b/sdks/python/apache_beam/examples/inference/sklearn_mnist_classification.py @@ -26,6 +26,7 @@ import argparse import logging +import os from typing import Iterable from typing import List from typing import Tuple @@ -90,6 +91,7 @@ def run( known_args, pipeline_args = parse_known_args(argv) pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = save_main_session + requirements_dir = os.path.dirname(os.path.realpath(__file__)) # Pin to the version that we trained the model on. pipeline_options.view_as(SetupOptions).requirements_file = f'{requirements_dir}/sklearn_examples_requirements.txt' From c21348e68248504b37ad131e3f9596ac54be3a9c Mon Sep 17 00:00:00 2001 From: Danny McCormick Date: Fri, 24 Mar 2023 14:45:06 -0400 Subject: [PATCH 3/8] Comments --- .../examples/inference/sklearn_japanese_housing_regression.py | 2 +- .../examples/inference/sklearn_mnist_classification.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sdks/python/apache_beam/examples/inference/sklearn_japanese_housing_regression.py b/sdks/python/apache_beam/examples/inference/sklearn_japanese_housing_regression.py index e6a544c6a325..8b3fbdbfb671 100644 --- a/sdks/python/apache_beam/examples/inference/sklearn_japanese_housing_regression.py +++ b/sdks/python/apache_beam/examples/inference/sklearn_japanese_housing_regression.py @@ -139,7 +139,7 @@ def run( pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = save_main_session requirements_dir = os.path.dirname(os.path.realpath(__file__)) - # Pin to the version that we trained the model on. + # Pin to the version that we trained the model on. Sklearn doesn't guarantee compatability between versions. pipeline_options.view_as(SetupOptions).requirements_file = f'{requirements_dir}/sklearn_examples_requirements.txt' pipeline = test_pipeline diff --git a/sdks/python/apache_beam/examples/inference/sklearn_mnist_classification.py b/sdks/python/apache_beam/examples/inference/sklearn_mnist_classification.py index 0bbabd75cbe2..a5522f833456 100644 --- a/sdks/python/apache_beam/examples/inference/sklearn_mnist_classification.py +++ b/sdks/python/apache_beam/examples/inference/sklearn_mnist_classification.py @@ -92,7 +92,7 @@ def run( pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = save_main_session requirements_dir = os.path.dirname(os.path.realpath(__file__)) - # Pin to the version that we trained the model on. + # Pin to the version that we trained the model on. Sklearn doesn't guarantee compatability between versions. pipeline_options.view_as(SetupOptions).requirements_file = f'{requirements_dir}/sklearn_examples_requirements.txt' # In this example we pass keyed inputs to RunInference transform. From 078e9bb7bd49f78a6efc99d95debe2ed1fbfd67e Mon Sep 17 00:00:00 2001 From: Danny McCormick Date: Fri, 24 Mar 2023 14:48:36 -0400 Subject: [PATCH 4/8] Update readme --- sdks/python/apache_beam/examples/inference/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/sdks/python/apache_beam/examples/inference/README.md b/sdks/python/apache_beam/examples/inference/README.md index f769e82bf22f..2dc1247934d0 100644 --- a/sdks/python/apache_beam/examples/inference/README.md +++ b/sdks/python/apache_beam/examples/inference/README.md @@ -334,6 +334,7 @@ To use this transform, you need a dataset and model for language modeling. ... ``` 2. Create a file named `MODEL_PATH` that contains the pickled file of a scikit-learn model trained on MNIST data. Please refer to this scikit-learn [model persistence documentation](https://scikit-learn.org/stable/model_persistence.html) on how to serialize models. +3. Update sklearn_examples_requirements.txt to match the version of sklearn used to train the model. Sklearn doesn't guarantee model compatability between versions. ### Running `sklearn_mnist_classification.py` From db5f7d0baf9a262fbe834f2c318525a235f365e9 Mon Sep 17 00:00:00 2001 From: Danny McCormick Date: Sat, 25 Mar 2023 08:20:46 -0400 Subject: [PATCH 5/8] Lint --- .../inference/sklearn_japanese_housing_regression.py | 8 ++++++-- .../examples/inference/sklearn_mnist_classification.py | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/sdks/python/apache_beam/examples/inference/sklearn_japanese_housing_regression.py b/sdks/python/apache_beam/examples/inference/sklearn_japanese_housing_regression.py index 8b3fbdbfb671..690f5e67075a 100644 --- a/sdks/python/apache_beam/examples/inference/sklearn_japanese_housing_regression.py +++ b/sdks/python/apache_beam/examples/inference/sklearn_japanese_housing_regression.py @@ -88,6 +88,7 @@ def sort_by_features(dataframe, max_size): class LoadDataframe(beam.DoFn): + def process(self, file_name: str) -> Iterable[pandas.DataFrame]: """ Loads data files as a pandas dataframe.""" file = FileSystems.open(file_name, 'rb') @@ -139,8 +140,11 @@ def run( pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = save_main_session requirements_dir = os.path.dirname(os.path.realpath(__file__)) - # Pin to the version that we trained the model on. Sklearn doesn't guarantee compatability between versions. - pipeline_options.view_as(SetupOptions).requirements_file = f'{requirements_dir}/sklearn_examples_requirements.txt' + # Pin to the version that we trained the model on. + # Sklearn doesn't guarantee compatability between versions. + pipeline_options.view_as( + SetupOptions + ).requirements_file = f'{requirements_dir}/sklearn_examples_requirements.txt' pipeline = test_pipeline if not test_pipeline: diff --git a/sdks/python/apache_beam/examples/inference/sklearn_mnist_classification.py b/sdks/python/apache_beam/examples/inference/sklearn_mnist_classification.py index a5522f833456..e1c1a4fe78af 100644 --- a/sdks/python/apache_beam/examples/inference/sklearn_mnist_classification.py +++ b/sdks/python/apache_beam/examples/inference/sklearn_mnist_classification.py @@ -53,6 +53,7 @@ class PostProcessor(beam.DoFn): """Process the PredictionResult to get the predicted label. Returns a comma separated string with true label and predicted label. """ + def process(self, element: Tuple[int, PredictionResult]) -> Iterable[str]: label, prediction_result = element prediction = prediction_result.inference @@ -92,8 +93,11 @@ def run( pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = save_main_session requirements_dir = os.path.dirname(os.path.realpath(__file__)) - # Pin to the version that we trained the model on. Sklearn doesn't guarantee compatability between versions. - pipeline_options.view_as(SetupOptions).requirements_file = f'{requirements_dir}/sklearn_examples_requirements.txt' + # Pin to the version that we trained the model on. + # Sklearn doesn't guarantee compatability between versions. + pipeline_options.view_as( + SetupOptions + ).requirements_file = f'{requirements_dir}/sklearn_examples_requirements.txt' # In this example we pass keyed inputs to RunInference transform. # Therefore, we use KeyedModelHandler wrapper over SklearnModelHandlerNumpy. From 8cd2fbf06b7c93135aec6f949a68d0228869d2ea Mon Sep 17 00:00:00 2001 From: Danny McCormick Date: Sat, 25 Mar 2023 08:22:48 -0400 Subject: [PATCH 6/8] Add info in requirements.txt --- .../examples/inference/sklearn_examples_requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sdks/python/apache_beam/examples/inference/sklearn_examples_requirements.txt b/sdks/python/apache_beam/examples/inference/sklearn_examples_requirements.txt index 256e6be3b567..7c41e37e01b7 100644 --- a/sdks/python/apache_beam/examples/inference/sklearn_examples_requirements.txt +++ b/sdks/python/apache_beam/examples/inference/sklearn_examples_requirements.txt @@ -15,4 +15,6 @@ # limitations under the License. # +# This should match the saved version of your trained model. +# Beam's tests use sklearn 1.0.2 for their saved models. scikit-learn==1.0.2 \ No newline at end of file From 17959958ccbd5b2ba1f74929b7173fcdb596de16 Mon Sep 17 00:00:00 2001 From: Danny McCormick Date: Sat, 25 Mar 2023 08:26:56 -0400 Subject: [PATCH 7/8] Lint --- .../examples/inference/sklearn_japanese_housing_regression.py | 1 - .../examples/inference/sklearn_mnist_classification.py | 1 - 2 files changed, 2 deletions(-) diff --git a/sdks/python/apache_beam/examples/inference/sklearn_japanese_housing_regression.py b/sdks/python/apache_beam/examples/inference/sklearn_japanese_housing_regression.py index 690f5e67075a..f23412a82e94 100644 --- a/sdks/python/apache_beam/examples/inference/sklearn_japanese_housing_regression.py +++ b/sdks/python/apache_beam/examples/inference/sklearn_japanese_housing_regression.py @@ -88,7 +88,6 @@ def sort_by_features(dataframe, max_size): class LoadDataframe(beam.DoFn): - def process(self, file_name: str) -> Iterable[pandas.DataFrame]: """ Loads data files as a pandas dataframe.""" file = FileSystems.open(file_name, 'rb') diff --git a/sdks/python/apache_beam/examples/inference/sklearn_mnist_classification.py b/sdks/python/apache_beam/examples/inference/sklearn_mnist_classification.py index e1c1a4fe78af..6f8ea929bbb6 100644 --- a/sdks/python/apache_beam/examples/inference/sklearn_mnist_classification.py +++ b/sdks/python/apache_beam/examples/inference/sklearn_mnist_classification.py @@ -53,7 +53,6 @@ class PostProcessor(beam.DoFn): """Process the PredictionResult to get the predicted label. Returns a comma separated string with true label and predicted label. """ - def process(self, element: Tuple[int, PredictionResult]) -> Iterable[str]: label, prediction_result = element prediction = prediction_result.inference From 27d611a2f2f0ecf876573ee7afed563aaf0587ba Mon Sep 17 00:00:00 2001 From: Danny McCormick Date: Sat, 25 Mar 2023 08:33:46 -0400 Subject: [PATCH 8/8] Lint --- .../examples/inference/sklearn_japanese_housing_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/examples/inference/sklearn_japanese_housing_regression.py b/sdks/python/apache_beam/examples/inference/sklearn_japanese_housing_regression.py index f23412a82e94..3aa2f362fa64 100644 --- a/sdks/python/apache_beam/examples/inference/sklearn_japanese_housing_regression.py +++ b/sdks/python/apache_beam/examples/inference/sklearn_japanese_housing_regression.py @@ -30,9 +30,9 @@ """ import argparse +import os from typing import Iterable -import os import pandas import apache_beam as beam