diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index f41f3b59..dcf64b91 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -189,7 +189,7 @@ def summarize( Returns: pd.DataFrame: metric dataframe for all metrics and aggregations """ - self.summary_df = pd.concat( + summary_df = pd.concat( [ self._summarize( self.forecast_df, @@ -202,4 +202,21 @@ def summarize( ] ) + # add Metric Hub metadata columns + summary_df["metric_alias"] = self.metric_hub.alias.lower() + summary_df["metric_hub_app_name"] = self.metric_hub.app_name.lower() + summary_df["metric_hub_slug"] = self.metric_hub.slug.lower() + summary_df["metric_start_date"] = pd.to_datetime(self.metric_hub.min_date) + summary_df["metric_end_date"] = pd.to_datetime(self.metric_hub.max_date) + summary_df["metric_collected_at"] = self.collected_at + + # add forecast model metadata columns + summary_df["forecast_start_date"] = self.start_date + summary_df["forecast_end_date"] = self.end_date + summary_df["forecast_trained_at"] = self.trained_at + summary_df["forecast_predicted_at"] = self.predicted_at + summary_df["forecast_parameters"] = self.metadata_params + + self.summary_df = summary_df + return self.summary_df diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py index 60b8982a..19f57e1d 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py @@ -20,15 +20,20 @@ class ProphetForecast(BaseForecast): def column_names_map(self) -> Dict[str, str]: return {"submission_date": "ds", "value": "y"} - def _fit(self, observed_df) -> None: - self.model = prophet.Prophet( - **self.parameters, + def _build_model(self, parameter_dict): + model = prophet.Prophet( + 
**parameter_dict, uncertainty_samples=self.number_of_simulations, mcmc_samples=0, ) if self.use_holidays: - self.model.add_country_holidays(country_name="US") + model.add_country_holidays(country_name="US") + + return model + + def _fit(self, observed_df) -> None: + self.model = self._build_model(self.parameters) # Modify observed data to have column names that Prophet expects, and fit # the model @@ -235,24 +240,6 @@ def _summarize( # add summary metadata columns df["aggregation_period"] = period.lower() - # reorder columns to make interpretation easier - df = df[["submission_date", "aggregation_period", "source", "measure", "value"]] - - # add Metric Hub metadata columns - df["metric_alias"] = self.metric_hub.alias.lower() - df["metric_hub_app_name"] = self.metric_hub.app_name.lower() - df["metric_hub_slug"] = self.metric_hub.slug.lower() - df["metric_start_date"] = pd.to_datetime(self.metric_hub.min_date) - df["metric_end_date"] = pd.to_datetime(self.metric_hub.max_date) - df["metric_collected_at"] = self.collected_at - - # add forecast model metadata columns - df["forecast_start_date"] = self.start_date - df["forecast_end_date"] = self.end_date - df["forecast_trained_at"] = self.trained_at - df["forecast_predicted_at"] = self.predicted_at - df["forecast_parameters"] = self.metadata_params - return df def _summarize_legacy(self) -> pd.DataFrame: diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py index 6a731560..19a2db9d 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py @@ -1,14 +1,25 @@ from typing import List +import collections +from datetime import date, datetime +from dateutil.relativedelta import relativedelta import pytest import pandas as pd from dotmap import DotMap import numpy as np -from datetime import datetime, timedelta, timezone +from datetime import 
timedelta, timezone from kpi_forecasting.models.base_forecast import BaseForecast +# Arbitrarily choose some date to use for the tests +TEST_DATE = date(2024, 1, 1) +TEST_DATE_STR = TEST_DATE.strftime("%Y-%m-%d") +TEST_DATE_NEXT_DAY = date(2024, 1, 2) +TEST_DATE_NEXT_DAY_STR = TEST_DATE_NEXT_DAY.strftime("%Y-%m-%d") +TEST_PREDICT_END = TEST_DATE + relativedelta(months=2) +TEST_PREDICT_END_STR = TEST_PREDICT_END.strftime("%Y-%m-%d") + class BadClass(BaseForecast): pass @@ -29,8 +40,9 @@ def _get_observed_data(self): self.observed_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2020-01-01"), - pd.to_datetime("1990-01-01"), + TEST_DATE, + TEST_DATE + - relativedelta(years=1), # just an arbitrary date in the past ] } ) @@ -76,8 +88,8 @@ def test_not_implemented(): def test_post_init(good_class): - start_date = "2124-01-01" - end_date = "2124-02-02" + start_date = TEST_DATE_STR + end_date = TEST_PREDICT_END_STR good_class = good_class( model_type="test", parameters=DotMap(), @@ -108,7 +120,7 @@ def test_post_init_default_dates(good_class): ) # this is the max date of the self.observed_data['submission_date'] plus one day # from the object definion - start_date = pd.to_datetime("2020-01-02") + start_date = TEST_DATE_NEXT_DAY end_date = ( datetime.now(timezone.utc).replace(tzinfo=None) + timedelta(weeks=78) ).date() @@ -123,15 +135,15 @@ def test_fit(good_class): model_type="test", parameters=DotMap(), use_holidays=None, - start_date="2124-01-01", - end_date="2124-02-02", + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, metric_hub=None, ) good_class.fit() assert good_class.model - # - assert good_class.model.is_fit == pd.to_datetime("2020-01-01") + # model sets is_fit to the largest day in the observed data + assert good_class.model.is_fit == TEST_DATE def test_predict_and_validate(good_class): @@ -139,8 +151,8 @@ def test_predict_and_validate(good_class): model_type="test", parameters=DotMap(), use_holidays=None, - start_date="2124-01-01", - 
end_date="2124-02-02", + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, metric_hub=None, ) # overwrite date range set in __post_init__ @@ -154,12 +166,24 @@ def test_summarize(good_class): model_type="test", parameters=DotMap(), use_holidays=None, - start_date="2124-01-01", - end_date="2124-02-02", + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, metric_hub=None, ) good_class.forecast_df = np.array([1, 2]) good_class.observed_df = np.array([3, 4]) + MetricHub = collections.namedtuple( + "MetricHub", + ["alias", "app_name", "slug", "min_date", "max_date"], + ) + + dummy_metric_hub = MetricHub("", "", "", TEST_DATE_STR, TEST_DATE_STR) + + # add it here rather than in __init__ so it doesn't try to load data + good_class.metric_hub = dummy_metric_hub + good_class.trained_at = "" + good_class.predicted_at = "" + number_val = 10 output = good_class.summarize( periods=["a", "b", "c"], numpy_aggregations=["sum"], percentiles=["percentiles"] @@ -170,5 +194,27 @@ def test_summarize(good_class): for el in ["a", "b", "c"] ] ) - assert output.reset_index(drop=True).equals(expected_output) - assert good_class.summary_df.reset_index(drop=True).equals(expected_output) + # not going to check all the metadata columns + # in assert_frame_equal. 
Just make sure they're there + metadata_columns = { + "metric_alias", + "metric_hub_app_name", + "metric_hub_slug", + "metric_start_date", + "metric_end_date", + "metric_collected_at", + "forecast_start_date", + "forecast_end_date", + "forecast_trained_at", + "forecast_predicted_at", + "forecast_parameters", + } + assert set(expected_output.columns) | metadata_columns == set(output.columns) + + pd.testing.assert_frame_equal( + output[expected_output.columns].reset_index(drop=True), expected_output + ) + pd.testing.assert_frame_equal( + good_class.summary_df[expected_output.columns].reset_index(drop=True), + expected_output, + ) diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py index aea1eb65..52121b7c 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py @@ -1,6 +1,8 @@ """tests for the funnel forecast module""" import collections +from datetime import date, datetime +from dateutil.relativedelta import relativedelta import pandas as pd from dotmap import DotMap @@ -11,13 +13,21 @@ from kpi_forecasting.configs.model_inputs import ProphetRegressor, ProphetHoliday from kpi_forecasting.models.funnel_forecast import SegmentModelSettings, FunnelForecast +# Arbitrarily choose some date to use for the tests +TEST_DATE = date(2024, 1, 1) +TEST_DATE_STR = TEST_DATE.strftime("%Y-%m-%d") +TEST_DATE_NEXT_DAY = date(2024, 1, 2) +TEST_DATE_NEXT_DAY_STR = TEST_DATE_NEXT_DAY.strftime("%Y-%m-%d") +TEST_PREDICT_END = TEST_DATE + relativedelta(months=2) +TEST_PREDICT_END_STR = TEST_PREDICT_END.strftime("%Y-%m-%d") + @pytest.fixture() def forecast(): """This mocks a generic forecast object""" # 2024-01-01 is arbitarily chosen as a future date - predict_start_date = "2124-01-01" - predict_end_date = "2124-03-01" + predict_start_date = TEST_DATE_STR + predict_end_date = TEST_PREDICT_END_STR 
forecast = FunnelForecast( model_type="test", @@ -37,8 +47,8 @@ def segment_info_fit_tests(): in the functions that test fit methods""" # 2024-01-01 is arbitarily chosen as a future date - A1_start_date = "2124-01-01" - A2_start_date = "2124-01-02" + A1_start_date = TEST_DATE_STR + A2_start_date = TEST_DATE_NEXT_DAY_STR segment_info_dict = { "A1": { @@ -83,9 +93,8 @@ def funnel_forecast_for_fit_tests(segment_info_fit_tests, mocker): } parameter_dotmap = DotMap(parameter_dict) - predict_start_date = "2124-01-01" - predict_end_date = "2124-01-02" - + predict_start_date = TEST_DATE_STR + predict_end_date = TEST_DATE_NEXT_DAY_STR forecast = FunnelForecast( model_type="test", parameters=parameter_dotmap, @@ -178,8 +187,8 @@ def test_combine_forecast_observed(mocker, forecast): forecast_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -187,8 +196,8 @@ def test_combine_forecast_observed(mocker, forecast): observed_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "a": ["A1", "A1"], "value": [5, 6], @@ -238,8 +247,8 @@ def test_under_summarize(mocker, forecast): forecast_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -249,11 +258,11 @@ def test_under_summarize(mocker, forecast): observed_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2123-01-01").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "a": ["A1", "A1", "A1", "A2", "A2"], "value": [10, 20, 30, 40, 50], @@ -265,7 +274,7 @@ def 
test_under_summarize(mocker, forecast): ["start_date", "forecast_df", "segment", "trained_parameters"], ) dummy_segment_settings = SegmentSettings( - start_date="2124-01-01", + start_date=TEST_DATE_STR, forecast_df=forecast_df.copy(), segment={"a": "A1"}, trained_parameters={"trained_parameters": "yes"}, @@ -288,8 +297,8 @@ def test_under_summarize(mocker, forecast): observed_expected_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "a": ["A1", "A1"], "value": [20, 30], @@ -334,7 +343,7 @@ def test_summarize(mocker, forecast): ["alias", "app_name", "slug", "min_date", "max_date"], ) - dummy_metric_hub = MetricHub("", "", "", "2124-01-01", "2124-01-01") + dummy_metric_hub = MetricHub("", "", "", TEST_DATE_STR, TEST_DATE_STR) # forecast predictions are set with the # mock_aggregate_forecast_observed function so they @@ -342,8 +351,8 @@ def test_summarize(mocker, forecast): forecast_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -353,11 +362,11 @@ def test_summarize(mocker, forecast): observed_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2123-01-01").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "a": ["A1", "A1", "A1", "A2", "A2"], "value": [10, 20, 30, 40, 50], @@ -373,7 +382,7 @@ def test_summarize(mocker, forecast): # we're only testing that it is concatenated properly # with the segment data added dummy_segment_settings_A1 = SegmentSettings( - start_date="2124-01-01", + start_date=TEST_DATE_STR, forecast_df=forecast_df.copy(), segment={"a": "A1"}, trained_parameters={"trained_parameters": "yes"}, @@ 
-381,7 +390,7 @@ def test_summarize(mocker, forecast): ) dummy_segment_settings_A2 = SegmentSettings( - start_date="2124-01-01", + start_date=TEST_DATE_STR, forecast_df=forecast_df.copy(), segment={"a": "A2"}, trained_parameters={"trained_parameters": "yes"}, @@ -418,10 +427,10 @@ def test_summarize(mocker, forecast): observed_expected_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "a": ["A1", "A1", "A2", "A2"], "value": [20, 30, 40, 50], @@ -491,8 +500,8 @@ def test_summarize(mocker, forecast): def test_under_predict(mocker): """testing _predict""" # set segment models - # 2124-01-01 chosen as a artibrary date to center tests on - A1_start_date = "2124-01-01" + + A1_start_date = TEST_DATE_STR parameter_dict = { "model_setting_split_dim": "a", "segment_settings": { @@ -508,8 +517,8 @@ def test_under_predict(mocker): } parameter_dotmap = DotMap(parameter_dict) - predict_start_date = "2124-01-02" - predict_end_date = "2124-03-01" + predict_start_date = TEST_DATE_NEXT_DAY_STR + predict_end_date = TEST_PREDICT_END_STR forecast = FunnelForecast( model_type="test", @@ -535,8 +544,8 @@ def test_under_predict(mocker): "b": ["B1", "B2"], "y": [0, 1], "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -557,8 +566,8 @@ def test_under_predict(mocker): dates_to_predict = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ] } ) @@ -574,8 +583,8 @@ def test_under_predict(mocker): { 0: [0, model_value], "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -623,10 +632,10 @@ def 
test_predict(funnel_forecast_for_fit_tests, segment_info_fit_tests): "b": ["B1", "B2", "B1", "B2"], "y": [-1, 1, -1, 1], "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -652,8 +661,8 @@ def test_predict(funnel_forecast_for_fit_tests, segment_info_fit_tests): { 0: [0, model_value], "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -664,7 +673,7 @@ def test_predict(funnel_forecast_for_fit_tests, segment_info_fit_tests): expected_raw["submission_date"] >= pd.to_datetime(funnel_forecast_for_fit_tests.start_date).date() ) - expected = expected_raw[expected_time_filter] + expected = expected_raw[expected_time_filter].reset_index(drop=True) forecast_df = segment.forecast_df pd.testing.assert_frame_equal(forecast_df, expected) @@ -717,8 +726,8 @@ def test_auto_tuning(forecast, mocker): # set one segment with two sets of grid parameters segment_settings = SegmentModelSettings( segment={"a": "A1"}, - start_date="2124-01-01", - end_date="2124-03-01", + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, holidays=[], regressors=[], grid_parameters={"param1": [1, 2], "param2": [20, 10]}, @@ -738,8 +747,8 @@ def test_auto_tuning(forecast, mocker): "a": ["A1", "A1"], "b": ["B1", "B2"], "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-01").date(), + TEST_DATE, + TEST_DATE, ], } ) @@ -760,10 +769,10 @@ def test_under_fit(funnel_forecast_for_fit_tests, segment_info_fit_tests): "a": ["A1", "A1", "A2", "A2"], "b": ["B1", "B2", "B1", "B2"], "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + 
TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -807,10 +816,10 @@ def test_fit(funnel_forecast_for_fit_tests, segment_info_fit_tests): "a": ["A1", "A1", "A2", "A2"], "b": ["B1", "B2", "B1", "B2"], "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -872,8 +881,8 @@ def test_set_segment_models(): } parameter_dotmap = DotMap(parameter_dict) - predict_start_date = "2124-01-01" - predict_end_date = "2124-03-01" + predict_start_date = TEST_DATE_STR + predict_end_date = TEST_PREDICT_END_STR forecast = FunnelForecast( model_type="test", @@ -951,8 +960,8 @@ def test_set_segment_models_exception(): } parameter_dotmap = DotMap(parameter_dict) - predict_start_date = "2124-01-01" - predict_end_date = "2124-03-01" + predict_start_date = TEST_DATE_STR + predict_end_date = TEST_PREDICT_END_STR forecast = FunnelForecast( model_type="test", @@ -982,6 +991,14 @@ def test_fill_regressor_dates(forecast): """test _fill_regressor_dates the name in the regressor info indicates which case is being tested Dates are chosen arbitrarily""" + # get the set start and end dates for the forecast fixture + # as datetime objects + default_start_datetime = datetime(TEST_DATE.year, TEST_DATE.month, TEST_DATE.day) + default_end_datetime = datetime( + TEST_PREDICT_END.year, TEST_PREDICT_END.month, TEST_PREDICT_END.day + ) + + # set the start date with an arbitrary date regressor_info = { "name": "only_start", "description": "only has a start", @@ -990,8 +1007,11 @@ def test_fill_regressor_dates(forecast): regressor = ProphetRegressor(**regressor_info) forecast._fill_regressor_dates(regressor) assert regressor.start_date == pd.to_datetime("2020-08-15") - assert regressor.end_date == pd.to_datetime("2124-03-01") + # this is the end date for the forecast fixture + assert 
regressor.end_date == default_end_datetime + + # set the end date with an arbitrary date regressor_info = { "name": "only_end", "description": "only has a end", @@ -999,9 +1019,11 @@ def test_fill_regressor_dates(forecast): } regressor = ProphetRegressor(**regressor_info) forecast._fill_regressor_dates(regressor) - assert regressor.start_date == pd.to_datetime("2124-01-01") + # the start date for the forecast fixture is TEST_DATE + assert regressor.start_date == default_start_datetime assert regressor.end_date == pd.to_datetime("2125-08-15") + # set both the start and end dates to arbitrary dates regressor_info = { "name": "both", "description": "only has a start", @@ -1013,15 +1035,17 @@ def test_fill_regressor_dates(forecast): assert regressor.start_date == pd.to_datetime("2020-08-15") assert regressor.end_date == pd.to_datetime("2020-09-15") + # use the defaults for both regressor_info = { "name": "neither", "description": "nothin to see here", } regressor = ProphetRegressor(**regressor_info) forecast._fill_regressor_dates(regressor) - assert regressor.start_date == pd.to_datetime("2124-01-01") - assert regressor.end_date == pd.to_datetime("2124-03-01") + assert regressor.start_date == default_start_datetime + assert regressor.end_date == default_end_datetime + # use arbitrary out of order dates to set regressor_info = { "name": "out_of_order", "description": "best better break", @@ -1039,6 +1063,11 @@ def test_fill_regressor_dates(forecast): def test_add_regressors(forecast): """test add regressors test case for each element of regressor_list_raw is indicated in name""" + + # choose arbitrary dates for dates + # name indicates the relationship of the window + # to the timeframe of the data as defined in the ds + # column of df below regressor_list_raw = [ { "name": "all_in", @@ -1120,8 +1149,8 @@ def test_build_train_dataframe_no_regressors(forecast): } segment_settings = SegmentModelSettings( segment={"a": 1, "b": 2}, - start_date="2124-01-01", - 
end_date="2124-02-01", + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, holidays=[], regressors=[ProphetRegressor(**r) for r in regressor_list], grid_parameters=grid_parameters, @@ -1134,12 +1163,12 @@ def test_build_train_dataframe_no_regressors(forecast): "b": [1, 1, 2, 2, 2, 2], "y": [1, 2, 3, 4, 5, 6], "submission_date": [ - pd.to_datetime("2124-12-01").date(), - pd.to_datetime("2124-12-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2123-01-01").date(), - pd.to_datetime("2123-01-02").date(), + TEST_DATE - relativedelta(months=1), + TEST_DATE_NEXT_DAY - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE + relativedelta(months=1), + TEST_DATE_NEXT_DAY + relativedelta(months=1), ], } ) @@ -1153,8 +1182,8 @@ def test_build_train_dataframe_no_regressors(forecast): "b": [2, 2], "y": [3, 4], "ds": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -1172,8 +1201,8 @@ def test_build_train_dataframe_no_regressors(forecast): "b": [2, 2], "y": [3, 4], "ds": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "floor": [1.5, 1.5], "cap": [6.0, 6.0], @@ -1193,20 +1222,24 @@ def test_build_train_dataframe(forecast): { "name": "all_in", "description": "it's all in", - "start_date": "2124-01-01", - "end_date": "2124-01-06", + "start_date": TEST_DATE_STR, + "end_date": (TEST_DATE + relativedelta(days=6)).strftime("%Y-%m-%d"), }, { "name": "all_out", "description": "it's all in", - "start_date": "2124-02-01", - "end_date": "2124-02-06", + "start_date": (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), }, { "name": "just_end", "description": "just the second one", - "start_date": "2124-01-02", - "end_date": "2124-02-06", + "start_date": (TEST_DATE + 
relativedelta(days=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), }, ] @@ -1226,8 +1259,8 @@ def test_build_train_dataframe(forecast): } segment_settings = SegmentModelSettings( segment={"a": 1, "b": 2}, - start_date="2124-01-01", - end_date="2124-02-01", + start_date=TEST_DATE_STR, + end_date=(TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d"), holidays=[], regressors=[ProphetRegressor(**r) for r in regressor_list], grid_parameters=grid_parameters, @@ -1240,12 +1273,12 @@ def test_build_train_dataframe(forecast): "b": [1, 1, 2, 2, 2, 2], "y": [1, 2, 3, 4, 5, 6], "submission_date": [ - pd.to_datetime("2124-12-01").date(), - pd.to_datetime("2124-12-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2123-01-01").date(), - pd.to_datetime("2123-01-02").date(), + TEST_DATE - relativedelta(months=1), + TEST_DATE_NEXT_DAY - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE + relativedelta(months=1), + TEST_DATE_NEXT_DAY + relativedelta(months=1), ], } ) @@ -1258,8 +1291,8 @@ def test_build_train_dataframe(forecast): "b": [2, 2], "y": [3, 4], "ds": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "all_in": [0, 0], "all_out": [ @@ -1282,8 +1315,8 @@ def test_build_train_dataframe(forecast): "b": [2, 2], "y": [3, 4], "ds": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "all_in": [0, 0], "all_out": [1, 1], @@ -1320,8 +1353,8 @@ def test_build_predict_dataframe_no_regressors(forecast): } segment_settings = SegmentModelSettings( segment={"a": 1, "b": 2}, - start_date="2124-01-01", - end_date="2124-02-01", + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, holidays=[], regressors=[ProphetRegressor(**r) for r in regressor_list], grid_parameters=grid_parameters, @@ -1334,12 +1367,12 
@@ def test_build_predict_dataframe_no_regressors(forecast): dates_to_predict = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-12-01").date(), - pd.to_datetime("2124-12-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2123-01-01").date(), - pd.to_datetime("2123-01-02").date(), + TEST_DATE - relativedelta(months=1), + TEST_DATE_NEXT_DAY - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -1350,12 +1383,12 @@ def test_build_predict_dataframe_no_regressors(forecast): expected_predict_df = pd.DataFrame( { "ds": [ - pd.to_datetime("2124-12-01").date(), - pd.to_datetime("2124-12-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2123-01-01").date(), - pd.to_datetime("2123-01-02").date(), + TEST_DATE - relativedelta(months=1), + TEST_DATE_NEXT_DAY - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -1372,12 +1405,12 @@ def test_build_predict_dataframe_no_regressors(forecast): expected_predict_wlog_df = pd.DataFrame( { "ds": [ - pd.to_datetime("2124-12-01").date(), - pd.to_datetime("2124-12-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2123-01-01").date(), - pd.to_datetime("2123-01-02").date(), + TEST_DATE - relativedelta(months=1), + TEST_DATE_NEXT_DAY - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "floor": [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0], "cap": [10.0, 10.0, 10.0, 10.0, 10.0, 10.0], @@ -1397,20 +1430,24 @@ def test_build_predict_dataframe(forecast): { "name": "all_in", "description": "it's all in", - "start_date": "2124-01-01", - "end_date": "2124-01-06", + "start_date": TEST_DATE_STR, + "end_date": (TEST_DATE + relativedelta(days=6)).strftime("%Y-%m-%d"), }, { "name": "all_out", "description": "it's all in", - 
"start_date": "2124-02-01", - "end_date": "2124-02-06", + "start_date": (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), }, { "name": "just_end", "description": "just the second one", - "start_date": "2124-01-02", - "end_date": "2124-02-06", + "start_date": (TEST_DATE + relativedelta(days=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), }, ] @@ -1430,8 +1467,8 @@ def test_build_predict_dataframe(forecast): } segment_settings = SegmentModelSettings( segment={"a": 1, "b": 2}, - start_date="2124-01-01", - end_date="2124-02-01", + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, holidays=[], regressors=[ProphetRegressor(**r) for r in regressor_list], grid_parameters=grid_parameters, @@ -1443,10 +1480,7 @@ def test_build_predict_dataframe(forecast): dates_to_predict = pd.DataFrame( { - "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - ], + "submission_date": [TEST_DATE, TEST_DATE_NEXT_DAY], } ) @@ -1456,10 +1490,7 @@ def test_build_predict_dataframe(forecast): ) expected_train_df = pd.DataFrame( { - "ds": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - ], + "ds": [TEST_DATE, TEST_DATE_NEXT_DAY], "all_in": [0, 0], "all_out": [1, 1], "just_end": [1, 0], @@ -1477,10 +1508,7 @@ def test_build_predict_dataframe(forecast): ) expected_train_wlog_df = pd.DataFrame( { - "ds": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - ], + "ds": [TEST_DATE, TEST_DATE_NEXT_DAY], "all_in": [0, 0], "all_out": [1, 1], "just_end": [1, 0], @@ -1503,23 +1531,28 @@ def test_build_model(forecast): { "name": "all_in", "description": "it's all in", - "start_date": "2124-01-01", - "end_date": "2124-01-06", + "start_date": TEST_DATE_STR, + "end_date": (TEST_DATE + relativedelta(days=6)).strftime("%Y-%m-%d"), }, { 
"name": "all_out", "description": "it's all in", - "start_date": "2124-02-01", - "end_date": "2124-02-06", + "start_date": (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), }, { "name": "just_end", "description": "just the second one", - "start_date": "2124-01-02", - "end_date": "2124-02-06", + "start_date": (TEST_DATE + relativedelta(days=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), }, ] + # use holidays from holiday config file holiday_list = { "easter": { "name": "easter", @@ -1568,8 +1601,8 @@ def test_build_model(forecast): } segment_settings = SegmentModelSettings( segment={"a": 1, "b": 2}, - start_date="2124-01-01", - end_date="2124-02-01", + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, holidays=[ProphetHoliday(**h) for h in holiday_list.values()], regressors=[ProphetRegressor(**r) for r in regressor_list], grid_parameters=grid_parameters, diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py index 18d3df67..1e211375 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py @@ -1,17 +1,552 @@ +from datetime import date +from dateutil.relativedelta import relativedelta + import pandas as pd from dotmap import DotMap import numpy as np +import pytest +import collections from kpi_forecasting.models.prophet_forecast import ProphetForecast +# Arbitrarily choose some date to use for the tests +TEST_DATE = date(2024, 1, 1) +TEST_DATE_STR = TEST_DATE.strftime("%Y-%m-%d") +TEST_DATE_NEXT_DAY = date(2024, 1, 2) +TEST_DATE_NEXT_DAY_STR = TEST_DATE_NEXT_DAY.strftime("%Y-%m-%d") + + +@pytest.fixture +def forecast(): + A1_start_date = TEST_DATE_STR + parameter_dict = { + "model_setting_split_dim": "a", + 
"segment_settings": { + "A1": { + "start_date": A1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {"param1": [1, 2], "param2": [20, 10]}, + "cv_settings": {}, + }, + }, + } + + parameter_dotmap = DotMap(parameter_dict) + predict_start_date = TEST_DATE_NEXT_DAY_STR + # arbitrarily set it a couple months in the future + predict_end_date = (TEST_DATE + relativedelta(months=2)).strftime("%Y-%m-%d") + return ProphetForecast( + model_type="test", + parameters=parameter_dotmap, + use_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) + + +class MockModel: + """Used in place of prophet.Prophet for testing purposes""" + + def __init__(self, param1=0, param2=0, **kwargs): + self.value = param1 * param2 + self.history = None + + def fit(self, df, *args, **kwargs): + self.history = df + return None + + def predict(self, dates_to_predict): + output = dates_to_predict.copy() + + output[ + [ + "yhat", + "trend", + "trend_upper", + "trend_lower", + "weekly", + "weekly_upper", + "weekly_lower", + "yearly", + "yearly_upper", + "yearly_lower", + ] + ] = 0 # some dummy value so it has the right shape + + return output + + def predictive_samples(self, dates_to_predict): + # prophet function outputs dict of numpy arrays + # only element we care about is `yhat` + output = np.arange(len(dates_to_predict)) * self.value + return {"yhat": {0: output}} + + +def mock_build_model(parameters): + """mocks the ProphetForecast _build_model method""" + return MockModel( + **parameters, + ) + + +def mock_aggregate_forecast_observed( + forecast_df, observed_df, period, numpy_aggregations, percentiles +): + """Mocks the _aggregate_forecast_observed function defined in ProphetForecast + and inherited in FunnelForecast. 
+ This function is tested extensively in test_prophet_forecast + so we can make dummy outputs for tests related to it""" + + # add dummy columns where aggregated metrics woudl go + percentile_columns = [f"p{el}" for el in percentiles] + output_forecast_df = forecast_df.copy() + output_forecast_df[numpy_aggregations + percentile_columns] = 0 + return output_forecast_df, observed_df.copy() + + +def test_under_fit(forecast, mocker): + """test the _fit method""" + + observed_data = pd.DataFrame( + { + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + } + ) + mocker.patch.object(forecast, "_build_model", mock_build_model) + + forecast._fit(observed_data) + + # checking that history is set in the mocked Model ensures fit was called on it + pd.testing.assert_frame_equal( + observed_data.rename(columns={"submission_date": "ds"}), forecast.model.history + ) + + +def test_fit(forecast, mocker): + """test the fit function. It is inherited from BaseForecast + and calls _fit with the proper object attributes. 
Test looks very + similar to that for _fit""" + observed_data = pd.DataFrame( + { + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + } + ) + mocker.patch.object(forecast, "_build_model", mock_build_model) + + forecast.observed_df = observed_data + forecast.fit() + + # checking that history is set in the mocked Model ensures fit was called on it + pd.testing.assert_frame_equal( + observed_data.rename(columns={"submission_date": "ds"}), forecast.model.history + ) + + assert forecast.trained_at is not None + + +def test_combine_forecast_observed(mocker, forecast): + """tests the _combine_forecast_observed method""" + # forecast predictions are set with the + # mock_aggregate_forecast_observed function so they + # can be ommited here + forecast_df = pd.DataFrame( + { + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + } + ) + + # rows with negative values are those expected to be removed + # by filters in summarize + observed_df = pd.DataFrame( + { + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + "value": [10, 20], + } + ) + + mocker.patch.object( + forecast, "_aggregate_forecast_observed", mock_aggregate_forecast_observed + ) + + numpy_aggregations = ["mean"] + percentiles = [10, 50, 90] + output_df = forecast._combine_forecast_observed( + forecast_df, + observed_df, + period="period", + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, + ) + observed_expected_df = pd.DataFrame( + { + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + "value": [10, 20], + "measure": ["observed", "observed"], + "source": ["historical", "historical"], + } + ) + + # 4x2 columns, 4 metrics (mean, p10, p50, p90) + forecast_expected_df = pd.DataFrame( + { + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + "measure": ["mean", "mean", "p10", "p10", "p50", "p50", "p90", 
"p90"], + "value": [0] * 8, + "source": ["forecast"] * 8, + } + ) + + # concat in same order to make our lives easier + expected = pd.concat([observed_expected_df, forecast_expected_df]).sort_values( + ["submission_date", "measure"] + ) + assert set(expected.columns) == set(output_df.columns) + # force value columns to be floats in both cases to make check easier + numeric_cols = ["value", "value_low", "value_mid", "value_high"] + pd.testing.assert_frame_equal( + output_df.sort_values(["submission_date", "measure"]).reset_index(drop=True), + expected[output_df.columns].reset_index(drop=True), + ) + + # should not be any nulls outside the metric column + non_metric_columns = [el for el in output_df.columns if el not in numeric_cols] + assert not pd.isna(output_df[non_metric_columns]).any(axis=None) + + +def test_under_summarize(mocker, forecast): + """testing _summarize""" + # forecast predictions are set with the + # mock_aggregate_forecast_observed function so they + # can be ommited here + forecast_df = pd.DataFrame( + { + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + } + ) + + # rows with negative values are those expected to be removed + # by filters in summarize + observed_df = pd.DataFrame( + { + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + "value": [10, 20], + } + ) + + mocker.patch.object( + forecast, "_aggregate_forecast_observed", mock_aggregate_forecast_observed + ) + + numpy_aggregations = ["mean"] + percentiles = [10, 50, 90] + output_df = forecast._summarize( + forecast_df, + observed_df, + period="period", + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, + ) + observed_expected_df = pd.DataFrame( + { + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + "value": [10, 20], + "measure": ["observed", "observed"], + "source": ["historical", "historical"], + } + ) + + # 4x2 columns, 4 metrics (mean, p10, p50, p90) + forecast_expected_df = pd.DataFrame( + { + "submission_date": [ + TEST_DATE, 
+ TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + "measure": ["mean", "mean", "p10", "p10", "p50", "p50", "p90", "p90"], + "value": [0] * 8, + "source": ["forecast"] * 8, + } + ) + + # concat in same order to make our lives easier + expected = pd.concat([observed_expected_df, forecast_expected_df]).sort_values( + ["submission_date", "measure"] + ) + expected["aggregation_period"] = "period" + + assert set(expected.columns) == set(output_df.columns) + # force value columns to be floats in both cases to make check easier + numeric_cols = ["value", "value_low", "value_mid", "value_high"] + pd.testing.assert_frame_equal( + output_df.sort_values(["submission_date", "measure"]).reset_index(drop=True), + expected[output_df.columns].reset_index(drop=True), + ) + + # should not be any nulls outside the metric column + non_metric_columns = [el for el in output_df.columns if el not in numeric_cols] + assert not pd.isna(output_df[non_metric_columns]).any(axis=None) + + +def test_summarize(mocker, forecast): + """testing summarize""" + # create dummy metric hub object to when meta data from + # it is added we don't get an error + MetricHub = collections.namedtuple( + "MetricHub", + ["alias", "app_name", "slug", "min_date", "max_date"], + ) + + dummy_metric_hub = MetricHub("", "", "", TEST_DATE_STR, TEST_DATE_STR) + + # forecast predictions are set with the + # mock_aggregate_forecast_observed function so they + # can be ommited here + forecast_df = pd.DataFrame( + { + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + } + ) + + # rows with negative values are those expected to be removed + # by filters in summarize + observed_df = pd.DataFrame( + { + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + "value": [10, 20], + } + ) + + mocker.patch.object( + forecast, "_aggregate_forecast_observed", mock_aggregate_forecast_observed + ) + + numpy_aggregations = ["mean"] + 
percentiles = [10, 50, 90] + + forecast.observed_df = observed_df + forecast.forecast_df = forecast_df + forecast.metric_hub = dummy_metric_hub + + # timestamp attributes created by fit and predict + # must be added manuall + forecast.collected_at = "" + forecast.trained_at = "" + forecast.predicted_at = "" + forecast.metadata_params = "" + + numpy_aggregations = ["mean"] + percentiles = [10, 50, 90] + forecast.summarize( + periods=["period1", "period2"], + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, + ) + + output_df = forecast.summary_df + + observed_expected_df = pd.DataFrame( + { + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + "value": [10, 20], + "measure": ["observed", "observed"], + "source": ["historical", "historical"], + } + ) + + # 4x2 columns, 4 metrics (mean, p10, p50, p90) + forecast_expected_df = pd.DataFrame( + { + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + "measure": ["mean", "mean", "p10", "p10", "p50", "p50", "p90", "p90"], + "value": [0] * 8, + "source": ["forecast"] * 8, + } + ) + + # concat in same order to make our lives easier + expected = pd.concat([observed_expected_df, forecast_expected_df]).sort_values( + ["submission_date", "measure"] + ) + expected1 = expected.copy() + expected2 = expected.copy() + expected1["aggregation_period"] = "period1" + expected2["aggregation_period"] = "period2" + + expected = pd.concat([expected1, expected2]) + + # not going to check all the metadata columns + # in assert_frame_equal. 
Just make sure they're there + metadata_columns = { + "metric_alias", + "metric_hub_app_name", + "metric_hub_slug", + "metric_start_date", + "metric_end_date", + "metric_collected_at", + "forecast_start_date", + "forecast_end_date", + "forecast_trained_at", + "forecast_predicted_at", + "forecast_parameters", + } + assert set(expected.columns) | metadata_columns == set(output_df.columns) + # force value columns to be floats in both cases to make check easier + numeric_cols = ["value", "value_low", "value_mid", "value_high"] + pd.testing.assert_frame_equal( + output_df.sort_values(["submission_date", "aggregation_period", "measure"])[ + expected.columns + ].reset_index(drop=True), + expected.sort_values( + ["submission_date", "aggregation_period", "measure"] + ).reset_index(drop=True), + ) + + # should not be any nulls outside the metric column + non_metric_columns = [el for el in output_df.columns if el not in numeric_cols] + assert not pd.isna(output_df[non_metric_columns]).any(axis=None) + + +def test_under_predict(mocker, forecast): + """testing _predict""" + # this ensures forecast is using MockModel + mocker.patch.object(forecast, "_build_model", mock_build_model) + + observed_df = pd.DataFrame( + { + "y": [0, 1], + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + } + ) + + dates_to_predict = pd.DataFrame( + { + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ] + } + ) + forecast.observed_df = observed_df + forecast.parameters = {"param1": 1, "param2": 2} + forecast.fit() + out = forecast._predict(dates_to_predict).reset_index(drop=True) + + # in MockModel, the predictive_samples method sets the output to + # np.arange(len(dates_to_predict)) * self.value for one column called 0 + # this helps ensure the forecast_df in segment_models is set properly + expected = pd.DataFrame( + { + 0: [0, 2], + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + } + ) + + pd.testing.assert_frame_equal(out, expected) + + # test predict while 
we're here + + forecast.dates_to_predict = dates_to_predict + forecast.number_of_simulations = 1 # so that _validate doesn't break + forecast.predict() + + out = forecast.forecast_df + + # in MockModel, the predictive_samples method sets the output to + # np.arange(len(dates_to_predict)) * self.value for one column called 0 + # this helps ensure the forecast_df in segment_models is set properly + expected = pd.DataFrame( + { + 0: [0, 2], + "submission_date": [ + TEST_DATE, + TEST_DATE_NEXT_DAY, + ], + } + ) + + pd.testing.assert_frame_equal(out, expected) + assert forecast.predicted_at is not None + def test_summarize_non_overlapping_day(): - observed_start_date = "2124-01-01" - observed_end_date = "2124-02-01" + observed_start_date = TEST_DATE_STR + observed_end_date = (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d") - predict_start_date = "2124-02-02" - predict_end_date = "2124-03-01" + predict_start_date = (TEST_DATE + relativedelta(months=1, days=1)).strftime( + "%Y-%m-%d" + ) + predict_end_date = (TEST_DATE + relativedelta(months=2)).strftime("%Y-%m-%d") forecast = ProphetForecast( model_type="test", @@ -33,10 +568,15 @@ def test_summarize_non_overlapping_day(): } ) + # there are the samples generated + # the mean and median are the aggregates used test_samples = np.array([1, 1, 2, 3, 5, 8, 13]) test_mean = np.mean(test_samples) test_median = np.median(test_samples) + # mean and median scale with a factor + # so a factor is multiplied on to make sure the aggregation is working + # across rows properly forecast_array = np.stack( [test_samples * i for i in range(1, 1 + len(predict_submission_dates))], axis=0, @@ -110,12 +650,22 @@ def test_summarize_non_overlapping_day(): def test_summarize_non_overlapping_month(): + # choose arbitrary year for the start and end dates + # two full months (Jan and Feb ) + # are in the observed data, the number of days (31 and 28 days respectively) + # in each month is used in the checks observed_start_date = 
"2124-01-01" observed_end_date = "2124-02-28" + # two full months (April and May ) + # are in the observed data, the number of days (28 and 31 days respectively) + # in each month is used in the checks predict_start_date = "2124-04-01" predict_end_date = "2124-05-31" + print(observed_start_date, observed_end_date) + print(predict_start_date, predict_end_date) + forecast = ProphetForecast( model_type="test", parameters=DotMap(), @@ -229,11 +779,11 @@ def test_summarize_non_overlapping_month(): def test_summarize_overlapping_day(): - observed_start_date = "2124-01-01" - observed_end_date = "2124-02-01" + observed_start_date = TEST_DATE_STR + observed_end_date = (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d") - predict_start_date = "2124-01-01" - predict_end_date = "2124-02-01" + predict_start_date = TEST_DATE_STR + predict_end_date = (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d") forecast = ProphetForecast( model_type="test", @@ -255,10 +805,15 @@ def test_summarize_overlapping_day(): } ) + # there are the samples generated + # the mean and median are the aggregates used test_samples = np.array([1, 1, 2, 3, 5, 8, 13]) test_mean = np.mean(test_samples) test_median = np.median(test_samples) + # mean and median scale with a factor + # so a factor is multiplied on to make sure the aggregation is working + # across rows properly forecast_array = np.stack( [test_samples * i for i in range(1, 1 + len(predict_submission_dates))], axis=0, @@ -334,6 +889,10 @@ def test_summarize_overlapping_day(): def test_summarize_overlapping_month(): + # choose arbitrary year for the start and end dates + # two full months (Jan and Feb ) + # are in the observed data, the number of days (31 and 28 days respectively) + # in each month is used in the checks observed_start_date = "2124-01-01" observed_end_date = "2124-02-28" @@ -360,10 +919,15 @@ def test_summarize_overlapping_month(): } ) + # there are the samples generated + # the mean and median are the aggregates used 
test_samples = np.array([1, 1, 2, 3, 5, 8, 13]) test_mean = np.mean(test_samples) test_median = np.median(test_samples) + # mean and median scale with a factor + # so a factor is multiplied on to make sure the aggregation is working + # across rows properly forecast_array = np.stack( [test_samples] * len(predict_submission_dates), axis=0,