-
Notifications
You must be signed in to change notification settings - Fork 4.5k
Sklearn Mnist example and IT test #21781
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
29767cf
f7a36a6
0be2a8b
e4cf3f3
6b6d6b1
0375d74
1ccdbf3
c472974
2c61681
3f527be
d70eee6
fa42b67
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,112 @@ | ||
| # | ||
| # Licensed to the Apache Software Foundation (ASF) under one or more | ||
| # contributor license agreements. See the NOTICE file distributed with | ||
| # this work for additional information regarding copyright ownership. | ||
| # The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| # (the "License"); you may not use this file except in compliance with | ||
| # the License. You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
| # | ||
|
|
||
| """A pipeline that uses RunInference API to classify MNIST data. | ||
|
|
||
| This pipeline reads a text file of comma separated ints. In each row, the | ||
| first column is the true label and the remaining columns are the pixel values. | ||
| The rows are preprocessed and passed to a model trained on the MNIST data to | ||
| run inference. The pipeline writes each true label and its prediction to an | ||
| output file so that users can compare the two. | ||
| """ | ||
|
|
||
| import argparse | ||
| from typing import Iterable | ||
| from typing import List | ||
| from typing import Tuple | ||
|
|
||
| import apache_beam as beam | ||
| from apache_beam.ml.inference.base import KeyedModelHandler | ||
| from apache_beam.ml.inference.base import PredictionResult | ||
| from apache_beam.ml.inference.base import RunInference | ||
| from apache_beam.ml.inference.sklearn_inference import ModelFileType | ||
| from apache_beam.ml.inference.sklearn_inference import SklearnModelHandlerNumpy | ||
| from apache_beam.options.pipeline_options import PipelineOptions | ||
| from apache_beam.options.pipeline_options import SetupOptions | ||
|
|
||
|
|
||
def process_input(row: str) -> Tuple[int, List[int]]:
  """Parse one comma separated row into a (label, pixels) pair.

  Args:
    row: A line of text made of comma separated integers. The first value is
      used as the label and every following value as a pixel.

  Returns:
    A tuple of the integer label and the list of integer pixel values.

  Raises:
    ValueError: If any of the comma separated values is not an integer.
  """
  label_text, *pixel_texts = row.split(',')
  return int(label_text), [int(pixel_text) for pixel_text in pixel_texts]
|
|
||
|
|
||
class PostProcessor(beam.DoFn):
  """Format a keyed PredictionResult as a "true label,prediction" string.

  Each input element is a (label, PredictionResult) tuple. The emitted string
  joins the label and the result's ``inference`` attribute with a comma.
  """
  def process(self, element: Tuple[int, PredictionResult]) -> Iterable[str]:
    true_label, result = element
    yield '{},{}'.format(true_label, result.inference)
|
|
||
|
|
||
def parse_known_args(argv):
  """Parse the flags this example needs and pass the rest through.

  Args:
    argv: List of command line arguments to parse.

  Returns:
    The (namespace, remaining_args) pair produced by
    ArgumentParser.parse_known_args. The namespace carries the ``input``,
    ``output`` and ``model_path`` values; remaining_args holds every argument
    the parser did not recognize.
  """
  parser = argparse.ArgumentParser()
  # (flag, destination attribute, help text); every flag is required.
  required_flags = (
      ('--input_file', 'input', 'text file with comma separated int values.'),
      ('--output', 'output', 'Path to save output predictions.'),
      (
          '--model_path',
          'model_path',
          'Path to load the Sklearn model for Inference.'),
  )
  for flag, dest, help_text in required_flags:
    parser.add_argument(flag, dest=dest, required=True, help=help_text)
  return parser.parse_known_args(argv)
|
|
||
|
|
||
def run(argv=None, save_main_session=True):
  """Build and run the MNIST classification pipeline.

  Args:
    argv: Command line arguments. The flags defined in parse_known_args are
      consumed here; all remaining arguments are handed to PipelineOptions.
    save_main_session: Value assigned to the ``save_main_session`` setup
      option of the pipeline.
  """
  known_args, pipeline_args = parse_known_args(argv)
  options = PipelineOptions(pipeline_args)
  options.view_as(SetupOptions).save_main_session = save_main_session

  # RunInference receives keyed (label, pixels) inputs in this example, so the
  # sklearn handler is wrapped in a KeyedModelHandler.
  sklearn_handler = SklearnModelHandlerNumpy(
      model_file_type=ModelFileType.PICKLE, model_uri=known_args.model_path)
  keyed_handler = KeyedModelHandler(sklearn_handler)

  with beam.Pipeline(options=options) as pipeline:
    # Read rows (skipping the header line), parse them, run inference, format
    # the results and write them to a single un-sharded output file.
    _ = (
        pipeline
        | "ReadFromInput" >> beam.io.ReadFromText(
            known_args.input, skip_header_lines=1)
        | "PreProcessInputs" >> beam.Map(process_input)
        | "RunInference" >> RunInference(keyed_handler)
        | "PostProcessOutputs" >> beam.ParDo(PostProcessor())
        | "WriteOutput" >> beam.io.WriteToText(
            known_args.output,
            shard_name_template='',
            append_trailing_newlines=True))
|
|
||
|
|
||
# Run the example pipeline when this file is executed as a script.
if __name__ == '__main__':
  run()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,76 @@ | ||
| # | ||
| # Licensed to the Apache Software Foundation (ASF) under one or more | ||
| # contributor license agreements. See the NOTICE file distributed with | ||
| # this work for additional information regarding copyright ownership. | ||
| # The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| # (the "License"); you may not use this file except in compliance with | ||
| # the License. You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
| # | ||
|
|
||
| """End-to-End test for Sklearn Inference""" | ||
|
|
||
| import logging | ||
| import unittest | ||
| import uuid | ||
|
|
||
| import pytest | ||
|
|
||
| from apache_beam.examples.inference import sklearn_mnist_classification | ||
| from apache_beam.io.filesystems import FileSystems | ||
| from apache_beam.testing.test_pipeline import TestPipeline | ||
|
|
||
|
|
||
def process_outputs(filepath):
  """Read a file through FileSystems and return its lines as strings.

  Args:
    filepath: Path of the file to read.

  Returns:
    A list with one UTF-8 decoded string per line of the file, with newline
    characters stripped from both ends of each string.
  """
  with FileSystems().open(filepath) as handle:
    raw_lines = handle.readlines()
  return [raw.decode('utf-8').strip('\n') for raw in raw_lines]
|
|
||
|
|
||
@pytest.mark.skip
@pytest.mark.uses_sklearn
@pytest.mark.it_postcommit
class SklearnInference(unittest.TestCase):
  """End-to-end test for the sklearn MNIST classification example."""
  def test_sklearn_mnist_classification(self):
    """Runs the example pipeline and compares its output to an expected file.

    The pipeline output and the expected file both hold "label,prediction"
    lines. The test passes when the two files contain the same lines with the
    same multiplicities, in any order.
    """
    test_pipeline = TestPipeline(is_integration_test=False)
    input_file = 'gs://apache-beam-ml/testing/inputs/it_mnist_data.csv'
    output_file_dir = 'gs://temp-storage-for-end-to-end-tests'
    # uuid4 gives every run its own output path.
    output_file = '/'.join([output_file_dir, str(uuid.uuid4()), 'result.txt'])
    model_path = 'gs://apache-beam-ml/models/mnist_model_svm.pickle'
    extra_opts = {
        'input': input_file,
        'output': output_file,
        'model_path': model_path,
    }
    sklearn_mnist_classification.run(
        test_pipeline.get_full_options_as_args(**extra_opts),
        save_main_session=False)
    self.assertTrue(FileSystems().exists(output_file))

    expected_output_filepath = 'gs://apache-beam-ml/testing/expected_outputs/test_sklearn_mnist_classification_actuals.txt'  # pylint: disable=line-too-long
    expected_outputs = process_outputs(expected_output_filepath)
    predicted_outputs = process_outputs(output_file)
    self.assertEqual(len(expected_outputs), len(predicted_outputs))

    # Compare the lines as multisets instead of through a dict keyed by the
    # true label: labels repeat across rows, so a dict would keep only the
    # last prediction per label and could reject output that matches the
    # expected file exactly. The comparison stays independent of line order.
    self.assertCountEqual(expected_outputs, predicted_outputs)
|
|
||
|
|
||
# When executed directly, set the root logger to DEBUG and hand control to the
# unittest runner.
if __name__ == '__main__':
  logging.getLogger().setLevel(logging.DEBUG)
  unittest.main()
Uh oh!
There was an error while loading. Please reload this page.