diff --git a/sdks/python/apache_beam/ml/anomaly/detectors/offline.py b/sdks/python/apache_beam/ml/anomaly/detectors/offline.py new file mode 100644 index 000000000000..6b8912575977 --- /dev/null +++ b/sdks/python/apache_beam/ml/anomaly/detectors/offline.py @@ -0,0 +1,59 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Any +from typing import Dict +from typing import Optional + +import apache_beam as beam +from apache_beam.ml.anomaly.base import AnomalyDetector +from apache_beam.ml.anomaly.specifiable import specifiable +from apache_beam.ml.inference.base import KeyedModelHandler + + +@specifiable +class OfflineDetector(AnomalyDetector): + """A offline anomaly detector that uses a provided model handler for scoring. + + Args: + keyed_model_handler: The model handler to use for inference. + Requires a `KeyModelHandler[Any, Row, float, Any]` instance. + run_inference_args: Optional arguments to pass to RunInference + **kwargs: Additional keyword arguments to pass to the base + AnomalyDetector class. + """ + def __init__( + self, + keyed_model_handler: KeyedModelHandler[Any, beam.Row, float, Any], + run_inference_args: Optional[Dict[str, Any]] = None, + **kwargs): + super().__init__(**kwargs) + + # TODO: validate the model handler type + self._keyed_model_handler = keyed_model_handler + self._run_inference_args = run_inference_args or {} + + # always override model_identifier with model_id from the detector + self._run_inference_args["model_identifier"] = self._model_id + + def learn_one(self, x: beam.Row) -> None: + """Not implemented since OfflineDetector invokes RunInference directly.""" + raise NotImplementedError + + def score_one(self, x: beam.Row) -> Optional[float]: + """Not implemented since OfflineDetector invokes RunInference directly.""" + raise NotImplementedError diff --git a/sdks/python/apache_beam/ml/anomaly/transforms.py b/sdks/python/apache_beam/ml/anomaly/transforms.py index 08b656072ac8..d0a3ade605bc 100644 --- a/sdks/python/apache_beam/ml/anomaly/transforms.py +++ b/sdks/python/apache_beam/ml/anomaly/transforms.py @@ -17,9 +17,11 @@ import dataclasses import uuid +from typing import Any from typing import Callable from typing import Dict from typing import Iterable +from typing import List from typing import Optional from typing import Tuple from typing import TypeVar @@ -33,8 +35,10 @@ from apache_beam.ml.anomaly.base import AnomalyResult from apache_beam.ml.anomaly.base import EnsembleAnomalyDetector from apache_beam.ml.anomaly.base import ThresholdFn +from apache_beam.ml.anomaly.detectors.offline import OfflineDetector from apache_beam.ml.anomaly.specifiable import Spec from apache_beam.ml.anomaly.specifiable import Specifiable +from apache_beam.ml.inference.base import RunInference from apache_beam.transforms.userstate import ReadModifyWriteStateSpec KeyT = TypeVar('KeyT') @@ -97,9 +101,11 @@ def process( yield k1, (k2, AnomalyResult( example=data, - predictions=[AnomalyPrediction( - model_id=self._underlying._model_id, - score=self.score_and_learn(data))])) + predictions=[ + AnomalyPrediction( + model_id=self._underlying._model_id, + score=self.score_and_learn(data)) + ])) model_state.write(self._underlying) @@ -325,7 +331,8 @@ def expand( if self._aggregation_fn is None: # simply put predictions into an iterable (list) ret = ( - post_gbk | beam.MapTuple( + post_gbk + | beam.MapTuple( lambda k, v: ( k[0], @@ -353,7 +360,8 @@ def expand( # We use (original_key, temp_key) as the key for GroupByKey() so that # scores from multiple detectors per data point are grouped. ret = ( - post_gbk | beam.MapTuple( + post_gbk + | beam.MapTuple( lambda k, v, agg=aggregation_fn: ( @@ -406,6 +414,76 @@ def expand( return ret +class RunOfflineDetector(beam.PTransform[beam.PCollection[KeyedInputT], + beam.PCollection[KeyedOutputT]]): + """Runs a offline anomaly detector on a PCollection of data. + + This PTransform applies a `OfflineDetector` to the input data, handling + custom input/output conversion and inference. + + Args: + offline_detector: The `OfflineDetector` to run. + """ + def __init__(self, offline_detector: OfflineDetector): + self._offline_detector = offline_detector + + def unnest_and_convert( + self, nested: Tuple[Tuple[Any, Any], dict[str, List]]) -> KeyedOutputT: + """Unnests and converts the model output to AnomalyResult. + + Args: + nested: A tuple containing the combined key (origin key, temp key) and + a dictionary of input and output from RunInference. + + Returns: + A tuple containing the original key and AnomalyResult. + """ + key, value_dict = nested + score = value_dict['output'][0] + result = AnomalyResult( + example=value_dict['input'][0], + predictions=[ + AnomalyPrediction( + model_id=self._offline_detector._model_id, score=score) + ]) + return key[0], (key[1], result) + + def expand( + self, + input: beam.PCollection[KeyedInputT]) -> beam.PCollection[KeyedOutputT]: + model_uuid = f"{self._offline_detector._model_id}:{uuid.uuid4().hex[:6]}" + + # Call RunInference Transform with the keyed model handler + run_inference = RunInference( + self._offline_detector._keyed_model_handler, + **self._offline_detector._run_inference_args) + + # ((orig_key, temp_key), beam.Row) + rekeyed_model_input = input | "Rekey" >> beam.Map( + lambda x: ((x[0], x[1][0]), x[1][1])) + + # ((orig_key, temp_key), float) + rekeyed_model_output = ( + rekeyed_model_input + | f"Call RunInference ({model_uuid})" >> run_inference) + + # ((orig_key, temp_key), {'input':[row], 'output:[float]}) + rekeyed_cogbk = { + 'input': rekeyed_model_input, 'output': rekeyed_model_output + } | beam.CoGroupByKey() + + ret = ( + rekeyed_cogbk | + "Unnest and convert model output" >> beam.Map(self.unnest_and_convert)) + + if self._offline_detector._threshold_criterion: + ret = ( + ret | f"Run Threshold Criterion ({model_uuid})" >> + RunThresholdCriterion(self._offline_detector._threshold_criterion)) + + return ret + + class RunEnsembleDetector(beam.PTransform[beam.PCollection[KeyedInputT], beam.PCollection[KeyedOutputT]]): """Runs an ensemble of anomaly detectors on a PCollection of data. @@ -432,8 +510,14 @@ def expand( for idx, detector in enumerate(self._ensemble_detector._sub_detectors): if isinstance(detector, EnsembleAnomalyDetector): results.append( - input | f"Run Ensemble Detector at index {idx} ({model_uuid})" >> + input + | f"Run Ensemble Detector at index {idx} ({model_uuid})" >> RunEnsembleDetector(detector)) + elif isinstance(detector, OfflineDetector): + results.append( + input + | f"Run Offline Detector at index {idx} ({model_uuid})" >> + RunOfflineDetector(detector)) else: results.append( input @@ -518,6 +602,8 @@ def expand( if isinstance(self._root_detector, EnsembleAnomalyDetector): keyed_output = (keyed_input | RunEnsembleDetector(self._root_detector)) + elif isinstance(self._root_detector, OfflineDetector): + keyed_output = (keyed_input | RunOfflineDetector(self._root_detector)) else: keyed_output = (keyed_input | RunOneDetector(self._root_detector)) diff --git a/sdks/python/apache_beam/ml/anomaly/transforms_test.py b/sdks/python/apache_beam/ml/anomaly/transforms_test.py index cf398728f372..b8ed7c7e5e19 100644 --- a/sdks/python/apache_beam/ml/anomaly/transforms_test.py +++ b/sdks/python/apache_beam/ml/anomaly/transforms_test.py @@ -17,20 +17,45 @@ import logging import math +import os +import pickle +import shutil +import tempfile import unittest +from typing import Any +from typing import Dict from typing import Iterable +from typing import Optional +from typing import Sequence +from typing import SupportsFloat +from typing import Tuple + +import mock +import numpy +from sklearn.base import BaseEstimator import apache_beam as beam from apache_beam.ml.anomaly.aggregations import AnyVote from apache_beam.ml.anomaly.base import AnomalyPrediction from apache_beam.ml.anomaly.base import AnomalyResult from apache_beam.ml.anomaly.base import EnsembleAnomalyDetector +from apache_beam.ml.anomaly.detectors.offline import OfflineDetector from apache_beam.ml.anomaly.detectors.zscore import ZScore +from apache_beam.ml.anomaly.specifiable import Spec +from apache_beam.ml.anomaly.specifiable import Specifiable +from apache_beam.ml.anomaly.specifiable import _spec_type_to_subspace +from apache_beam.ml.anomaly.specifiable import specifiable from apache_beam.ml.anomaly.thresholds import FixedThreshold from apache_beam.ml.anomaly.thresholds import QuantileThreshold from apache_beam.ml.anomaly.transforms import AnomalyDetection from apache_beam.ml.anomaly.transforms import _StatefulThresholdDoFn from apache_beam.ml.anomaly.transforms import _StatelessThresholdDoFn +from apache_beam.ml.inference.base import KeyedModelHandler +from apache_beam.ml.inference.base import PredictionResult +from apache_beam.ml.inference.base import RunInference +from apache_beam.ml.inference.base import _PostProcessingModelHandler +from apache_beam.ml.inference.base import _PreProcessingModelHandler +from apache_beam.ml.inference.sklearn_inference import SklearnModelHandlerNumpy from apache_beam.testing.test_pipeline import TestPipeline from apache_beam.testing.util import assert_that from apache_beam.testing.util import equal_to @@ -253,6 +278,178 @@ def test_multiple_sub_detectors_with_aggregation(self): prediction in zip(self._input, aggregated)])) +class FakeNumpyModel(): + def __init__(self): + self.total_predict_calls = 0 + + def predict(self, input_vector: numpy.ndarray): + self.total_predict_calls += 1 + return [input_vector[0][0] * 10 - input_vector[0][1]] + + +def alternate_numpy_inference_fn( + model: BaseEstimator, + batch: Sequence[numpy.ndarray], + inference_args: Optional[Dict[str, Any]] = None) -> Any: + return [0] + + +def _to_keyed_numpy_array(t: Tuple[Any, beam.Row]): + """Converts an Apache Beam Row to a NumPy array.""" + return t[0], numpy.array(list(t[1])) + + +def _from_keyed_numpy_array(t: Tuple[Any, PredictionResult]): + assert isinstance(t[1].inference, SupportsFloat) + return t[0], float(t[1].inference) + + +class TestOfflineDetector(unittest.TestCase): + def setUp(self): + global SklearnModelHandlerNumpy, KeyedModelHandler + global _PreProcessingModelHandler, _PostProcessingModelHandler + # Make model handlers into Specifiable + SklearnModelHandlerNumpy = specifiable(SklearnModelHandlerNumpy) + KeyedModelHandler = specifiable(KeyedModelHandler) + _PreProcessingModelHandler = specifiable(_PreProcessingModelHandler) + _PostProcessingModelHandler = specifiable(_PostProcessingModelHandler) + self.tmpdir = tempfile.mkdtemp() + + def tearDown(self): + shutil.rmtree(self.tmpdir) + # Make the model handlers back to normal + SklearnModelHandlerNumpy.unspecifiable() + KeyedModelHandler.unspecifiable() + _PreProcessingModelHandler.unspecifiable() + _PostProcessingModelHandler.unspecifiable() + + def test_default_inference_fn(self): + temp_file_name = self.tmpdir + os.sep + 'pickled_file' + with open(temp_file_name, 'wb') as file: + pickle.dump(FakeNumpyModel(), file) + + keyed_model_handler = KeyedModelHandler( + SklearnModelHandlerNumpy(model_uri=temp_file_name)).with_preprocess_fn( + _to_keyed_numpy_array).with_postprocess_fn(_from_keyed_numpy_array) + + detector = OfflineDetector(keyed_model_handler=keyed_model_handler) + detector_spec = detector.to_spec() + expected_spec = Spec( + type='OfflineDetector', + config={ + 'keyed_model_handler': Spec( + type='_PostProcessingModelHandler', + config={ + 'base': Spec( + type='_PreProcessingModelHandler', + config={ + 'base': Spec( + type='KeyedModelHandler', + config={ + 'unkeyed': Spec( + type='SklearnModelHandlerNumpy', + config={'model_uri': temp_file_name}) + }), + 'preprocess_fn': Spec( + type='_to_keyed_numpy_array', config=None) + }), + 'postprocess_fn': Spec( + type='_from_keyed_numpy_array', config=None) + }) + }) + self.assertEqual(detector_spec, expected_spec) + + self.assertEqual(_spec_type_to_subspace('SklearnModelHandlerNumpy'), '*') + self.assertEqual(_spec_type_to_subspace('_PreProcessingModelHandler'), '*') + self.assertEqual(_spec_type_to_subspace('_PostProcessingModelHandler'), '*') + self.assertEqual(_spec_type_to_subspace('_to_keyed_numpy_array'), '*') + self.assertEqual(_spec_type_to_subspace('_from_keyed_numpy_array'), '*') + + # Make sure the spec from the detector can be used to reconstruct the same + # detector + detector_new = Specifiable.from_spec(detector_spec) + + input = [ + (1, beam.Row(x=1, y=2)), + (1, beam.Row(x=2, y=4)), + (1, beam.Row(x=3, y=6)), + ] + expected_predictions = [ + AnomalyPrediction( + model_id='OfflineDetector', + score=8.0, + label=None, + threshold=None, + info='', + source_predictions=None), + AnomalyPrediction( + model_id='OfflineDetector', + score=16.0, + label=None, + threshold=None, + info='', + source_predictions=None), + AnomalyPrediction( + model_id='OfflineDetector', + score=24.0, + label=None, + threshold=None, + info='', + source_predictions=None), + ] + with TestPipeline() as p: + result = ( + p | beam.Create(input) + # TODO: get rid of this conversion between BeamSchema to beam.Row. + | beam.Map(lambda t: (t[0], beam.Row(**t[1]._asdict()))) + | AnomalyDetection(detector_new)) + + assert_that( + result, + equal_to([( + input[0], + AnomalyResult(example=input[1], predictions=[prediction])) + for input, + prediction in zip(input, expected_predictions)])) + + def test_run_inference_args(self): + model_handler = SklearnModelHandlerNumpy(model_uri="unused") + detector = OfflineDetector( + keyed_model_handler=model_handler, + run_inference_args={"inference_args": { + "multiplier": 10 + }}) + + p = TestPipeline() + + input = [ + (1, beam.Row(x=1, y=2)), + (1, beam.Row(x=2, y=4)), + (1, beam.Row(x=3, y=6)), + ] + + # patch the RunInference in "apache_beam.ml.anomaly.transforms" where + # it is imported and call + with mock.patch('apache_beam.ml.anomaly.transforms.RunInference') as mock_run_inference: # pylint: disable=line-too-long + # make the actual RunInference as the sideeffect, so we record the call + # information but also create the true RunInference instance. + mock_run_inference.side_effect = RunInference + try: + p = TestPipeline() + _ = (p | beam.Create(input) | AnomalyDetection(detector)) + except: # pylint: disable=bare-except + pass + call_args = mock_run_inference.call_args[1] + self.assertEqual( + call_args, + { + 'inference_args': { + 'multiplier': 10 + }, + 'model_identifier': 'OfflineDetector' + }) + + R = beam.Row(x=10, y=20)