From 2a60eefdce558ee4c302f1576e92655f6420af91 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Tue, 9 Jul 2024 15:56:06 -0500 Subject: [PATCH 01/33] refactored base_forecast and prophet_forecast to enable easier testing --- .../kpi_forecasting/models/base_forecast.py | 359 ++++-------------- .../models/prophet_forecast.py | 286 +++++++++++++- .../tests/test_base_forecast.py | 15 + 3 files changed, 361 insertions(+), 299 deletions(-) create mode 100644 jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index 13385ccd..647bb160 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -1,19 +1,18 @@ import json import numpy as np import pandas as pd +import abc + -from google.cloud import bigquery -from google.cloud.bigquery.enums import SqlTypeNames as bq_types from dataclasses import dataclass -from datetime import datetime, timedelta -from kpi_forecasting import pandas_extras as pdx +from datetime import datetime, timedelta, timezone from kpi_forecasting.metric_hub import MetricHub from pandas.api import types as pd_types from typing import Dict, List @dataclass -class BaseForecast: +class BaseForecast(abc.ABC): """ A base class for fitting, forecasting, and summarizing forecasts. This class should not be invoked directly; it should be inherited by a child class. 
The @@ -46,7 +45,7 @@ class BaseForecast: def __post_init__(self) -> None: # fetch observed observed data - self.collected_at = datetime.utcnow() + self.collected_at = datetime.now(timezone.utc) self.observed_df = self.metric_hub.fetch() # use default start/end dates if the user doesn't specify them @@ -70,26 +69,55 @@ def __post_init__(self) -> None: } ) - def _fit(self) -> None: - """ - Fit a forecasting model using `self.observed_df` that was generated using - Metric Hub data. This method should update `self.model`. + @abc.abstractmethod + def _fit(self, observed_df: pd.DataFrame) -> None: + """Fit a forecasting model using `observed_df.` This will typically + be the data that was generated using + Metric Hub in `__post_init__`. + This method should update `self.model`. + + Args: + observed_df (pd.DataFrame): observed data used to fit the model """ raise NotImplementedError - def _predict(self) -> pd.DataFrame: - """ - Forecast using `self.model`. This method should return a dataframe that will + @abc.abstractmethod + def _predict(self, dates_to_predict: pd.DataFrame) -> pd.DataFrame: + """Forecast using `self.model` on dates in `dates_to_predict`. + This method should return a dataframe that will be validated by `_validate_forecast_df`. + + Args: + dates_to_predict (pd.DataFrame): dataframe of dates to forecast for + + Returns: + pd.DataFrame: dataframe of predictions """ raise NotImplementedError - def _predict_legacy(self) -> pd.DataFrame: - """ - Forecast using `self.model`, adhering to the legacy data format. + @abc.abstractmethod + def _summarize( + self, + forecast_df: pd.DataFrame, + observed_df: pd.DataFrame, + period: str, + numpy_aggregations: List[str], + percentiles: List[int], + ) -> pd.DataFrame: + """Calculate summary metrics for `forecast_df` over a given period, and + add metadata. 
+ + Args: + forecast_df (pd.DataFrame): forecast dataframe created by `predict` + observed_df (pd.DataFrame): observed data used to generate prediction + period (str): aggregation period up to which metrics are aggregated + numpy_aggregations (List[str]): List of numpy aggregation names + percentiles (List[int]): List of percentiles to aggregate up to + + Returns: + pd.DataFrame: dataframe containing metrics listed in numpy_aggregations + and percentiles """ - # TODO: This method should be removed once the forecasting data model is updated: - # https://mozilla-hub.atlassian.net/browse/DS-2676 raise NotImplementedError @property @@ -100,7 +128,7 @@ def _default_start_date(self) -> str: @property def _default_end_date(self) -> str: """78 weeks (18 months) ahead of the current UTC date.""" - return (datetime.utcnow() + timedelta(weeks=78)).date() + return (datetime.now(timezone.utc) + timedelta(weeks=78)).date() def _set_seed(self) -> None: """Set random seed to ensure that fits and predictions are reproducible.""" @@ -133,205 +161,30 @@ def _validate_forecast_df(self) -> None: f" but column {i} has type {df[i].dtypes}." ) - def _summarize( - self, - period: str, - numpy_aggregations: List[str], - percentiles: List[int], - ) -> pd.DataFrame: - """ - Calculate summary metrics for `self.forecast_df` over a given period, and - add metadata. 
- """ - # build a list of all functions that we'll summarize the data by - aggregations = [getattr(np, i) for i in numpy_aggregations] - aggregations.extend([pdx.percentile(i) for i in percentiles]) - - # aggregate metric to the correct date period (day, month, year) - observed_summarized = pdx.aggregate_to_period(self.observed_df, period) - forecast_agg = pdx.aggregate_to_period(self.forecast_df, period) - - # find periods of overlap between observed and forecasted data - overlap = forecast_agg.merge( - observed_summarized, - on="submission_date", - how="left", - ).fillna(0) - - forecast_summarized = ( - forecast_agg.set_index("submission_date") - # Add observed data samples to any overlapping forecasted period. This - # ensures that any forecast made partway through a period accounts for - # previously observed data within the period. For example, when a monthly - # forecast is generated in the middle of the month. - .add(overlap[["value"]].values) - # calculate summary values, aggregating by submission_date, - .agg(aggregations, axis=1) - .reset_index() - # "melt" the df from wide-format to long-format. 
- .melt(id_vars="submission_date", var_name="measure") - ) - - # add datasource-specific metadata columns - forecast_summarized["source"] = "forecast" - observed_summarized["source"] = "historical" - observed_summarized["measure"] = "observed" - - # create a single dataframe that contains observed and forecasted data - df = pd.concat([observed_summarized, forecast_summarized]) - - # add summary metadata columns - df["aggregation_period"] = period.lower() - - # reorder columns to make interpretation easier - df = df[["submission_date", "aggregation_period", "source", "measure", "value"]] - - # add Metric Hub metadata columns - df["metric_alias"] = self.metric_hub.alias.lower() - df["metric_hub_app_name"] = self.metric_hub.app_name.lower() - df["metric_hub_slug"] = self.metric_hub.slug.lower() - df["metric_start_date"] = pd.to_datetime(self.metric_hub.min_date) - df["metric_end_date"] = pd.to_datetime(self.metric_hub.max_date) - df["metric_collected_at"] = self.collected_at - - # add forecast model metadata columns - df["forecast_start_date"] = self.start_date - df["forecast_end_date"] = self.end_date - df["forecast_trained_at"] = self.trained_at - df["forecast_predicted_at"] = self.predicted_at - df["forecast_parameters"] = self.metadata_params - - return df - - def _summarize_legacy(self) -> pd.DataFrame: - """ - Converts a `self.summary_df` to the legacy format used in - `moz-fx-data-shared-prod.telemetry_derived.kpi_automated_forecast_confidences_v1` - """ - # TODO: This method should be removed once the forecasting data model is updated: - # https://mozilla-hub.atlassian.net/browse/DS-2676 - - df = self.summary_df.copy(deep=True) - - # rename columns to legacy values - df.rename( - columns={ - "forecast_end_date": "asofdate", - "submission_date": "date", - "metric_alias": "target", - "aggregation_period": "unit", - }, - inplace=True, - ) - df["forecast_date"] = df["forecast_predicted_at"].dt.date - df["type"] = df["source"].replace("historical", "actual") - df = 
df.replace( - { - "measure": { - "observed": "value", - "p05": "yhat_p5", - "p10": "yhat_p10", - "p20": "yhat_p20", - "p30": "yhat_p30", - "p40": "yhat_p40", - "p50": "yhat_p50", - "p60": "yhat_p60", - "p70": "yhat_p70", - "p80": "yhat_p80", - "p90": "yhat_p90", - "p95": "yhat_p95", - }, - "target": { - "desktop_dau": "desktop", - "mobile_dau": "mobile", - }, - } - ) - - # pivot the df from "long" to "wide" format - index_columns = [ - "asofdate", - "date", - "target", - "unit", - "forecast_parameters", - "forecast_date", - ] - df = ( - df[index_columns + ["measure", "value"]] - .pivot( - index=index_columns, - columns="measure", - values="value", - ) - .reset_index() - ) - - # pivot sets the "name" attribute of the columns for some reason. It's - # None by default, so we just reset that here. - df.columns.name = None - - # When there's an overlap in the observed and forecasted period -- for - # example, when a monthly forecast is generated mid-month -- the legacy - # format only records the forecasted value, not the observed value. To - # account for this, we'll just find the max of the "mean" (forecasted) and - # "value" (observed) data. In all non-overlapping observed periods, the - # forecasted value will be NULL. In all non-overlapping forecasted periods, - # the observed value will be NULL. In overlapping periods, the forecasted - # value will always be larger because it is the sum of the observed and forecasted - # values. 
Below is a query that demonstrates the legacy behavior: - # - # SELECT * - # FROM `moz-fx-data-shared-prod.telemetry_derived.kpi_automated_forecast_confidences_v1` - # WHERE asofdate = "2023-12-31" - # AND target = "mobile" - # AND unit = "month" - # AND forecast_date = "2022-06-04" - # AND date BETWEEN "2022-05-01" AND "2022-06-01" - # ORDER BY date - df["value"] = df[["mean", "value"]].max(axis=1) - df.drop(columns=["mean"], inplace=True) - - # non-numeric columns are represented in the legacy bq schema as strings - string_cols = [ - "asofdate", - "date", - "target", - "unit", - "forecast_parameters", - "forecast_date", - ] - df[string_cols] = df[string_cols].astype(str) - - return df - def fit(self) -> None: """Fit a model using historic metric data provided by `metric_hub`.""" print(f"Fitting {self.model_type} model.", flush=True) self._set_seed() - self.trained_at = datetime.utcnow() - self._fit() + self.trained_at = datetime.now(timezone.utc) + self._fit(self.observed_df) def predict(self) -> None: - """Generate a forecast from `start_date` to `end_date`.""" + """Generate a forecast from `start_date` to `end_date`. + Result is set to `self.forecast_df`""" print(f"Forecasting from {self.start_date} to {self.end_date}.", flush=True) self._set_seed() - self.predicted_at = datetime.utcnow() - self.forecast_df = self._predict() + self.predicted_at = datetime.now(timezone.utc) + self.forecast_df = self._predict(self.dates_to_predict) self._validate_forecast_df() - # TODO: This line should be removed once the forecasting data model is updated: - # https://mozilla-hub.atlassian.net/browse/DS-2676 - self.forecast_df_legacy = self._predict_legacy() - def summarize( self, periods: List[str] = ["day", "month"], numpy_aggregations: List[str] = ["mean"], percentiles: List[int] = [10, 50, 90], - ) -> None: + ) -> pd.DataFrame: """ - Calculate summary metrics for `self.forecast_df` and add metadata. + Calculate summary metrics for `forecast_df` and add metadata. 
The dataframe returned here will be reported in Big Query when `write_results` is called. @@ -342,95 +195,21 @@ def summarize( be applied to summarize numeric values in a numpy dataframe. For example, ["mean"]. percentiles (List[int]): A list of integers representing the percentiles that should be reported in the summary. For example [50] would calculate the 50th percentile (i.e. the median). + + Returns: + pd.DataFrame: metric dataframe for all metrics and aggregations """ self.summary_df = pd.concat( - [self._summarize(i, numpy_aggregations, percentiles) for i in periods] + [ + self._summarize( + self.forecast_df, + self.observed_df, + i, + numpy_aggregations, + percentiles, + ) + for i in periods + ] ) - # TODO: remove this once the forecasting data model is updated: - # https://mozilla-hub.atlassian.net/browse/DS-2676 - self.summary_df_legacy = self._summarize_legacy() - - def write_results( - self, - project: str, - dataset: str, - table: str, - project_legacy: str, - dataset_legacy: str, - write_disposition: str = "WRITE_APPEND", - forecast_table_legacy: str = "kpi_automated_forecast_v1", - confidences_table_legacy: str = "kpi_automated_forecast_confidences_v1", - ) -> None: - """ - Write `self.summary_df` to Big Query. - - Args: - project (str): The Big Query project that the data should be written to. - dataset (str): The Big Query dataset that the data should be written to. - table (str): The Big Query table that the data should be written to. - write_disposition (str): In the event that the destination table exists, - should the table be overwritten ("WRITE_TRUNCATE") or appended to - ("WRITE_APPEND")? 
- """ - print(f"Writing results to `{project}.{dataset}.{table}`.", flush=True) - client = bigquery.Client(project=project) - schema = [ - bigquery.SchemaField("submission_date", bq_types.DATE), - bigquery.SchemaField("aggregation_period", bq_types.STRING), - bigquery.SchemaField("source", bq_types.STRING), - bigquery.SchemaField("measure", bq_types.STRING), - bigquery.SchemaField("value", bq_types.FLOAT), - bigquery.SchemaField("metric_alias", bq_types.STRING), - bigquery.SchemaField("metric_hub_app_name", bq_types.STRING), - bigquery.SchemaField("metric_hub_slug", bq_types.STRING), - bigquery.SchemaField("metric_start_date", bq_types.DATE), - bigquery.SchemaField("metric_end_date", bq_types.DATE), - bigquery.SchemaField("metric_collected_at", bq_types.TIMESTAMP), - bigquery.SchemaField("forecast_start_date", bq_types.DATE), - bigquery.SchemaField("forecast_end_date", bq_types.DATE), - bigquery.SchemaField("forecast_trained_at", bq_types.TIMESTAMP), - bigquery.SchemaField("forecast_predicted_at", bq_types.TIMESTAMP), - bigquery.SchemaField("forecast_parameters", bq_types.STRING), - ] - job = client.load_table_from_dataframe( - dataframe=self.summary_df, - destination=f"{project}.{dataset}.{table}", - job_config=bigquery.LoadJobConfig( - schema=schema, - autodetect=False, - write_disposition=write_disposition, - ), - ) - # Wait for the job to complete. 
- job.result() - - # TODO: remove the below jobs once the forecasting data model is updated: - # https://mozilla-hub.atlassian.net/browse/DS-2676 - - job = client.load_table_from_dataframe( - dataframe=self.forecast_df_legacy, - destination=f"{project_legacy}.{dataset_legacy}.{forecast_table_legacy}", - job_config=bigquery.LoadJobConfig( - write_disposition=write_disposition, - schema=[ - bigquery.SchemaField("ds", bq_types.TIMESTAMP), - bigquery.SchemaField("forecast_date", bq_types.STRING), - bigquery.SchemaField("forecast_parameters", bq_types.STRING), - ], - ), - ) - job.result() - - job = client.load_table_from_dataframe( - dataframe=self.summary_df_legacy, - destination=f"{project_legacy}.{dataset_legacy}.{confidences_table_legacy}", - job_config=bigquery.LoadJobConfig( - write_disposition=write_disposition, - schema=[ - bigquery.SchemaField("asofdate", bq_types.STRING), - bigquery.SchemaField("date", bq_types.STRING), - ], - ), - ) - job.result() + return self.summary_df diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py index 8402dda4..abc3a4f5 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py @@ -1,11 +1,16 @@ import json import pandas as pd import prophet +import numpy as np +from typing import Dict, List -from datetime import datetime + +from datetime import datetime, timezone from dataclasses import dataclass from kpi_forecasting.models.base_forecast import BaseForecast -from typing import Dict +from kpi_forecasting import pandas_extras as pdx +from google.cloud import bigquery +from google.cloud.bigquery.enums import SqlTypeNames as bq_types @dataclass @@ -14,7 +19,7 @@ class ProphetForecast(BaseForecast): def column_names_map(self) -> Dict[str, str]: return {"submission_date": "ds", "value": "y"} - def _fit(self) -> None: + def _fit(self, observed_df) -> None: 
self.model = prophet.Prophet( **self.parameters, uncertainty_samples=self.number_of_simulations, @@ -26,16 +31,15 @@ def _fit(self) -> None: # Modify observed data to have column names that Prophet expects, and fit # the model - self.model.fit(self.observed_df.rename(columns=self.column_names_map)) + self.model.fit(observed_df.rename(columns=self.column_names_map)) - def _predict(self) -> pd.DataFrame: + def _predict(self, dates_to_predict) -> pd.DataFrame: # generate the forecast samples samples = self.model.predictive_samples( - self.dates_to_predict.rename(columns=self.column_names_map) + dates_to_predict.rename(columns=self.column_names_map) ) df = pd.DataFrame(samples["yhat"]) - df["submission_date"] = self.dates_to_predict - + df["submission_date"] = dates_to_predict return df def _predict_legacy(self) -> pd.DataFrame: @@ -56,7 +60,7 @@ def _predict_legacy(self) -> pd.DataFrame: else: df["metric"] = self.metric_hub.alias - df["forecast_date"] = str(datetime.utcnow().date()) + df["forecast_date"] = str(datetime.now(timezone.utc).date()) df["forecast_parameters"] = str( json.dumps({**self.parameters, "holidays": self.use_holidays}) ) @@ -116,3 +120,267 @@ def _predict_legacy(self) -> pd.DataFrame: df[column] = 0.0 return df[columns] + + def _summarize( + self, + forecast_df, + observed_df, + period: str, + numpy_aggregations: List[str], + percentiles: List[int], + ) -> pd.DataFrame: + """ + Calculate summary metrics for `self.forecast_df` over a given period, and + add metadata. 
+ """ + # build a list of all functions that we'll summarize the data by + aggregations = [getattr(np, i) for i in numpy_aggregations] + aggregations.extend([pdx.percentile(i) for i in percentiles]) + + # aggregate metric to the correct date period (day, month, year) + observed_summarized = pdx.aggregate_to_period(observed_df, period) + forecast_agg = pdx.aggregate_to_period(forecast_df, period) + + # find periods of overlap between observed and forecasted data + overlap = forecast_agg.merge( + observed_summarized, + on="submission_date", + how="left", + ).fillna(0) + + forecast_summarized = ( + forecast_agg.set_index("submission_date") + # Add observed data samples to any overlapping forecasted period. This + # ensures that any forecast made partway through a period accounts for + # previously observed data within the period. For example, when a monthly + # forecast is generated in the middle of the month. + .add(overlap[["value"]].values) + # calculate summary values, aggregating by submission_date, + .agg(aggregations, axis=1) + .reset_index() + # "melt" the df from wide-format to long-format. 
+ .melt(id_vars="submission_date", var_name="measure") + ) + + # add datasource-specific metadata columns + forecast_summarized["source"] = "forecast" + observed_summarized["source"] = "historical" + observed_summarized["measure"] = "observed" + + # create a single dataframe that contains observed and forecasted data + df = pd.concat([observed_summarized, forecast_summarized]) + + # add summary metadata columns + df["aggregation_period"] = period.lower() + + # reorder columns to make interpretation easier + df = df[["submission_date", "aggregation_period", "source", "measure", "value"]] + + # add Metric Hub metadata columns + df["metric_alias"] = self.metric_hub.alias.lower() + df["metric_hub_app_name"] = self.metric_hub.app_name.lower() + df["metric_hub_slug"] = self.metric_hub.slug.lower() + df["metric_start_date"] = pd.to_datetime(self.metric_hub.min_date) + df["metric_end_date"] = pd.to_datetime(self.metric_hub.max_date) + df["metric_collected_at"] = self.collected_at + + # add forecast model metadata columns + df["forecast_start_date"] = self.start_date + df["forecast_end_date"] = self.end_date + df["forecast_trained_at"] = self.trained_at + df["forecast_predicted_at"] = self.predicted_at + df["forecast_parameters"] = self.metadata_params + + return df + + def _summarize_legacy(self) -> pd.DataFrame: + """ + Converts a `self.summary_df` to the legacy format used in + `moz-fx-data-shared-prod.telemetry_derived.kpi_automated_forecast_confidences_v1` + """ + # TODO: This method should be removed once the forecasting data model is updated: + # https://mozilla-hub.atlassian.net/browse/DS-2676 + + df = self.summary_df.copy(deep=True) + + # rename columns to legacy values + df.rename( + columns={ + "forecast_end_date": "asofdate", + "submission_date": "date", + "metric_alias": "target", + "aggregation_period": "unit", + }, + inplace=True, + ) + df["forecast_date"] = df["forecast_predicted_at"].dt.date + df["type"] = df["source"].replace("historical", "actual") + df = 
df.replace( + { + "measure": { + "observed": "value", + "p05": "yhat_p5", + "p10": "yhat_p10", + "p20": "yhat_p20", + "p30": "yhat_p30", + "p40": "yhat_p40", + "p50": "yhat_p50", + "p60": "yhat_p60", + "p70": "yhat_p70", + "p80": "yhat_p80", + "p90": "yhat_p90", + "p95": "yhat_p95", + }, + "target": { + "desktop_dau": "desktop", + "mobile_dau": "mobile", + }, + } + ) + + # pivot the df from "long" to "wide" format + index_columns = [ + "asofdate", + "date", + "target", + "unit", + "forecast_parameters", + "forecast_date", + ] + df = ( + df[index_columns + ["measure", "value"]] + .pivot( + index=index_columns, + columns="measure", + values="value", + ) + .reset_index() + ) + + # pivot sets the "name" attribute of the columns for some reason. It's + # None by default, so we just reset that here. + df.columns.name = None + + # When there's an overlap in the observed and forecasted period -- for + # example, when a monthly forecast is generated mid-month -- the legacy + # format only records the forecasted value, not the observed value. To + # account for this, we'll just find the max of the "mean" (forecasted) and + # "value" (observed) data. In all non-overlapping observed periods, the + # forecasted value will be NULL. In all non-overlapping forecasted periods, + # the observed value will be NULL. In overlapping periods, the forecasted + # value will always be larger because it is the sum of the observed and forecasted + # values. 
Below is a query that demonstrates the legacy behavior: + # + # SELECT * + # FROM `moz-fx-data-shared-prod.telemetry_derived.kpi_automated_forecast_confidences_v1` + # WHERE asofdate = "2023-12-31" + # AND target = "mobile" + # AND unit = "month" + # AND forecast_date = "2022-06-04" + # AND date BETWEEN "2022-05-01" AND "2022-06-01" + # ORDER BY date + df["value"] = df[["mean", "value"]].max(axis=1) + df.drop(columns=["mean"], inplace=True) + + # non-numeric columns are represented in the legacy bq schema as strings + string_cols = [ + "asofdate", + "date", + "target", + "unit", + "forecast_parameters", + "forecast_date", + ] + df[string_cols] = df[string_cols].astype(str) + + return df + + def write_results( + self, + project: str, + dataset: str, + table: str, + project_legacy: str, + dataset_legacy: str, + write_disposition: str = "WRITE_APPEND", + forecast_table_legacy: str = "kpi_automated_forecast_v1", + confidences_table_legacy: str = "kpi_automated_forecast_confidences_v1", + ) -> None: + """ + Write `self.summary_df` to Big Query. + + Args: + project (str): The Big Query project that the data should be written to. + dataset (str): The Big Query dataset that the data should be written to. + table (str): The Big Query table that the data should be written to. + write_disposition (str): In the event that the destination table exists, + should the table be overwritten ("WRITE_TRUNCATE") or appended to + ("WRITE_APPEND")? 
+ """ + # get legacy tables + # TODO: remove this once the forecasting data model is updated: + # https://mozilla-hub.atlassian.net/browse/DS-2676 + self.forecast_df_legacy = self._predict_legacy() + self.summary_df_legacy = self._summarize_legacy() + + print(f"Writing results to `{project}.{dataset}.{table}`.", flush=True) + client = bigquery.Client(project=project) + schema = [ + bigquery.SchemaField("submission_date", bq_types.DATE), + bigquery.SchemaField("aggregation_period", bq_types.STRING), + bigquery.SchemaField("source", bq_types.STRING), + bigquery.SchemaField("measure", bq_types.STRING), + bigquery.SchemaField("value", bq_types.FLOAT), + bigquery.SchemaField("metric_alias", bq_types.STRING), + bigquery.SchemaField("metric_hub_app_name", bq_types.STRING), + bigquery.SchemaField("metric_hub_slug", bq_types.STRING), + bigquery.SchemaField("metric_start_date", bq_types.DATE), + bigquery.SchemaField("metric_end_date", bq_types.DATE), + bigquery.SchemaField("metric_collected_at", bq_types.TIMESTAMP), + bigquery.SchemaField("forecast_start_date", bq_types.DATE), + bigquery.SchemaField("forecast_end_date", bq_types.DATE), + bigquery.SchemaField("forecast_trained_at", bq_types.TIMESTAMP), + bigquery.SchemaField("forecast_predicted_at", bq_types.TIMESTAMP), + bigquery.SchemaField("forecast_parameters", bq_types.STRING), + ] + job = client.load_table_from_dataframe( + dataframe=self.summary_df, + destination=f"{project}.{dataset}.{table}", + job_config=bigquery.LoadJobConfig( + schema=schema, + autodetect=False, + write_disposition=write_disposition, + ), + ) + # Wait for the job to complete. 
+ job.result() + + # TODO: remove the below jobs once the forecasting data model is updated: + # https://mozilla-hub.atlassian.net/browse/DS-2676 + + job = client.load_table_from_dataframe( + dataframe=self.forecast_df_legacy, + destination=f"{project_legacy}.{dataset_legacy}.{forecast_table_legacy}", + job_config=bigquery.LoadJobConfig( + write_disposition=write_disposition, + schema=[ + bigquery.SchemaField("ds", bq_types.TIMESTAMP), + bigquery.SchemaField("forecast_date", bq_types.STRING), + bigquery.SchemaField("forecast_parameters", bq_types.STRING), + ], + ), + ) + job.result() + + job = client.load_table_from_dataframe( + dataframe=self.summary_df_legacy, + destination=f"{project_legacy}.{dataset_legacy}.{confidences_table_legacy}", + job_config=bigquery.LoadJobConfig( + write_disposition=write_disposition, + schema=[ + bigquery.SchemaField("asofdate", bq_types.STRING), + bigquery.SchemaField("date", bq_types.STRING), + ], + ), + ) + job.result() diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py new file mode 100644 index 00000000..a0385c81 --- /dev/null +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py @@ -0,0 +1,15 @@ +import pytest + +from kpi_forecasting.models.base_forecast import BaseForecast + + +class BadClass(BaseForecast): + pass + + +def test_fit_not_implemented(): + with pytest.raises( + TypeError, + match="Can't instantiate abstract class BadClass with abstract methods _fit, _predict, _summarize", + ): + _ = BadClass() From 340fabf8e0564ce7a871cabf0c8ac89029632c8a Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Wed, 10 Jul 2024 15:06:12 -0500 Subject: [PATCH 02/33] Apply suggestions from code review change signatures of `fit` and `predict` to take arguments that default to attributes Co-authored-by: Brad Ochocki Szasz --- .../kpi_forecasting/models/base_forecast.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff 
--git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index 647bb160..452dac35 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -161,20 +161,20 @@ def _validate_forecast_df(self) -> None: f" but column {i} has type {df[i].dtypes}." ) - def fit(self) -> None: + def fit(self, observed_df: pd.DataFrame = self.observed_df) -> None: """Fit a model using historic metric data provided by `metric_hub`.""" print(f"Fitting {self.model_type} model.", flush=True) self._set_seed() self.trained_at = datetime.now(timezone.utc) - self._fit(self.observed_df) + self._fit(observed_df) - def predict(self) -> None: + def predict(self, dates_to_predict: pd.DataFrame = self.dates_to_predict) -> None: """Generate a forecast from `start_date` to `end_date`. Result is set to `self.forecast_df`""" print(f"Forecasting from {self.start_date} to {self.end_date}.", flush=True) self._set_seed() self.predicted_at = datetime.now(timezone.utc) - self.forecast_df = self._predict(self.dates_to_predict) + self.forecast_df = self._predict(dates_to_predict) self._validate_forecast_df() def summarize( From 6c7d3f2d06454fffde10a1f0d2b7e1ad2d6ff0da Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Wed, 10 Jul 2024 15:04:06 -0500 Subject: [PATCH 03/33] add test for fit --- .../kpi_forecasting/models/base_forecast.py | 26 +++++++- .../models/prophet_forecast.py | 27 ++++++++ .../tests/test_base_forecast.py | 62 ++++++++++++++++++- 3 files changed, 110 insertions(+), 5 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index 452dac35..5787ad61 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -7,7 +7,6 @@ from dataclasses import dataclass from 
datetime import datetime, timedelta, timezone from kpi_forecasting.metric_hub import MetricHub -from pandas.api import types as pd_types from typing import Dict, List @@ -43,10 +42,14 @@ class BaseForecast(abc.ABC): metric_hub: MetricHub number_of_simulations: int = 1000 + def _get_observed_data(self): + if self.metric_hub: + self.observed_df = self.metric_hub.fetch() + def __post_init__(self) -> None: # fetch observed observed data self.collected_at = datetime.now(timezone.utc) - self.observed_df = self.metric_hub.fetch() + self._get_observed_data() # use default start/end dates if the user doesn't specify them self.start_date = pd.to_datetime(self.start_date or self._default_start_date) @@ -74,7 +77,7 @@ def _fit(self, observed_df: pd.DataFrame) -> None: """Fit a forecasting model using `observed_df.` This will typically be the data that was generated using Metric Hub in `__post_init__`. - This method should update `self.model`. + This method should update (and potentially set) `self.model`. 
Args: observed_df (pd.DataFrame): observed data used to fit the model @@ -95,6 +98,14 @@ def _predict(self, dates_to_predict: pd.DataFrame) -> pd.DataFrame: """ raise NotImplementedError + @abc.abstractmethod + def _validate_forecast_df(self, forecast_df: pd.DataFrame) -> None: + """Method to validate reults produced by _predict + + Args: + forecast_df (pd.DataFrame): dataframe produced by `_predict`""" + raise NotImplementedError + @abc.abstractmethod def _summarize( self, @@ -134,6 +145,7 @@ def _set_seed(self) -> None: """Set random seed to ensure that fits and predictions are reproducible.""" np.random.seed(42) +<<<<<<< HEAD def _validate_forecast_df(self) -> None: """Validate that `self.forecast_df` has been generated correctly.""" df = self.forecast_df @@ -162,6 +174,9 @@ def _validate_forecast_df(self) -> None: ) def fit(self, observed_df: pd.DataFrame = self.observed_df) -> None: +======= + def fit(self) -> None: +>>>>>>> 590d1ad (add test for fit) """Fit a model using historic metric data provided by `metric_hub`.""" print(f"Fitting {self.model_type} model.", flush=True) self._set_seed() @@ -174,8 +189,13 @@ def predict(self, dates_to_predict: pd.DataFrame = self.dates_to_predict) -> Non print(f"Forecasting from {self.start_date} to {self.end_date}.", flush=True) self._set_seed() self.predicted_at = datetime.now(timezone.utc) +<<<<<<< HEAD self.forecast_df = self._predict(dates_to_predict) self._validate_forecast_df() +======= + self.forecast_df = self._predict(self.dates_to_predict) + self._validate_forecast_df(self.forecast_df) +>>>>>>> 590d1ad (add test for fit) def summarize( self, diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py index abc3a4f5..ecd4c66a 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py @@ -1,5 +1,6 @@ import json import pandas as pd +from 
pandas.api import types as pd_types import prophet import numpy as np from typing import Dict, List @@ -42,6 +43,32 @@ def _predict(self, dates_to_predict) -> pd.DataFrame: df["submission_date"] = dates_to_predict return df + def _validate_forecast_df(self, df) -> None: + """Validate that `self.forecast_df` has been generated correctly.""" + columns = df.columns + expected_shape = (len(self.dates_to_predict), 1 + self.number_of_simulations) + numeric_columns = df.drop(columns="submission_date").columns + + if "submission_date" not in columns: + raise ValueError("forecast_df must contain a 'submission_date' column.") + + if df.shape != expected_shape: + raise ValueError( + f"Expected forecast_df to have shape {expected_shape}, but it has shape {df.shape}." + ) + + if not df["submission_date"].equals(self.dates_to_predict["submission_date"]): + raise ValueError( + "forecast_df['submission_date'] does not match dates_to_predict['submission_date']." + ) + + for i in numeric_columns: + if not pd_types.is_numeric_dtype(self.forecast_df[i]): + raise ValueError( + "All forecast_df columns except 'submission_date' must be numeric," + f" but column {i} has type {df[i].dtypes}." 
+ ) + def _predict_legacy(self) -> pd.DataFrame: """ Recreate the legacy format used in diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py index a0385c81..30c5e06b 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py @@ -1,4 +1,9 @@ +from typing import Dict, List + import pytest +import pandas as pd +from dotmap import DotMap + from kpi_forecasting.models.base_forecast import BaseForecast @@ -7,9 +12,62 @@ class BadClass(BaseForecast): pass -def test_fit_not_implemented(): +@pytest.fixture() +def good_class(): + class GoodModel: + def __init__(self): + self.is_fit = False + + def fit(self, observed_data): + self.is_fit = max(observed_data) + + class GoodClass(BaseForecast): + # overwrite _get_observed_data + def _get_observed_data(self): + self.observed_df = range(10) + + def _fit(self, observed_df: pd.DataFrame) -> None: + self.model = GoodModel() + self.model.fit(observed_df) + + def _predict(self, dates_to_predict: pd.DataFrame) -> pd.DataFrame: + pass + + def _validate_forecast_df(self, forecast_df: pd.DataFrame) -> None: + pass + + def _summarize( + self, + forecast_df: pd.DataFrame, + observed_df: pd.DataFrame, + period: str, + numpy_aggregations: List[str], + percentiles: List[int], + ) -> pd.DataFrame: + pass + + return GoodClass + + +def test_not_implemented(): with pytest.raises( TypeError, - match="Can't instantiate abstract class BadClass with abstract methods _fit, _predict, _summarize", + match="Can't instantiate abstract class BadClass with abstract methods _fit, _predict, _summarize, _validate_forecast_df", ): _ = BadClass() + + +def test_fit(good_class): + good_class = good_class( + model_type="test", + parameters=DotMap(), + use_holidays=None, + start_date="2124-01-01", + end_date="2124-02-02", + metric_hub=None, + ) + good_class.fit() + assert good_class.model 
+ + # + assert good_class.model.is_fit == 9 From 38e721db89be9d82e5d645c0b18da04126ee8ef7 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Wed, 10 Jul 2024 15:13:40 -0500 Subject: [PATCH 04/33] revert signatures --- .../kpi_forecasting/models/base_forecast.py | 40 +------------------ 1 file changed, 2 insertions(+), 38 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index 5787ad61..0f619113 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -145,57 +145,21 @@ def _set_seed(self) -> None: """Set random seed to ensure that fits and predictions are reproducible.""" np.random.seed(42) -<<<<<<< HEAD - def _validate_forecast_df(self) -> None: - """Validate that `self.forecast_df` has been generated correctly.""" - df = self.forecast_df - columns = df.columns - expected_shape = (len(self.dates_to_predict), 1 + self.number_of_simulations) - numeric_columns = df.drop(columns="submission_date").columns - - if "submission_date" not in columns: - raise ValueError("forecast_df must contain a 'submission_date' column.") - - if df.shape != expected_shape: - raise ValueError( - f"Expected forecast_df to have shape {expected_shape}, but it has shape {df.shape}." - ) - - if not df["submission_date"].equals(self.dates_to_predict["submission_date"]): - raise ValueError( - "forecast_df['submission_date'] does not match dates_to_predict['submission_date']." - ) - - for i in numeric_columns: - if not pd_types.is_numeric_dtype(self.forecast_df[i]): - raise ValueError( - "All forecast_df columns except 'submission_date' must be numeric," - f" but column {i} has type {df[i].dtypes}." 
- ) - - def fit(self, observed_df: pd.DataFrame = self.observed_df) -> None: -======= def fit(self) -> None: ->>>>>>> 590d1ad (add test for fit) """Fit a model using historic metric data provided by `metric_hub`.""" print(f"Fitting {self.model_type} model.", flush=True) self._set_seed() self.trained_at = datetime.now(timezone.utc) - self._fit(observed_df) + self._fit(self.observed_df) - def predict(self, dates_to_predict: pd.DataFrame = self.dates_to_predict) -> None: + def predict(self) -> None: """Generate a forecast from `start_date` to `end_date`. Result is set to `self.forecast_df`""" print(f"Forecasting from {self.start_date} to {self.end_date}.", flush=True) self._set_seed() self.predicted_at = datetime.now(timezone.utc) -<<<<<<< HEAD - self.forecast_df = self._predict(dates_to_predict) - self._validate_forecast_df() -======= self.forecast_df = self._predict(self.dates_to_predict) self._validate_forecast_df(self.forecast_df) ->>>>>>> 590d1ad (add test for fit) def summarize( self, From 9b173370e6de383e3bd827f721dc9c2f78210862 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Wed, 10 Jul 2024 15:15:52 -0500 Subject: [PATCH 05/33] made timezone-aware stamps naive --- .../kpi_forecasting/models/base_forecast.py | 10 ++++++---- .../kpi_forecasting/models/prophet_forecast.py | 4 +++- .../kpi_forecasting/tests/test_metric_hub.py | 6 +++--- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index 0f619113..b0a84eb2 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -48,7 +48,7 @@ def _get_observed_data(self): def __post_init__(self) -> None: # fetch observed observed data - self.collected_at = datetime.now(timezone.utc) + self.collected_at = datetime.now(timezone.utc).replace(tzinfo=None) self._get_observed_data() # use default start/end 
dates if the user doesn't specify them @@ -139,7 +139,9 @@ def _default_start_date(self) -> str: @property def _default_end_date(self) -> str: """78 weeks (18 months) ahead of the current UTC date.""" - return (datetime.now(timezone.utc) + timedelta(weeks=78)).date() + return ( + datetime.now(timezone.utc).replace(tzinfo=None) + timedelta(weeks=78) + ).date() def _set_seed(self) -> None: """Set random seed to ensure that fits and predictions are reproducible.""" @@ -149,7 +151,7 @@ def fit(self) -> None: """Fit a model using historic metric data provided by `metric_hub`.""" print(f"Fitting {self.model_type} model.", flush=True) self._set_seed() - self.trained_at = datetime.now(timezone.utc) + self.trained_at = datetime.now(timezone.utc).replace(tzinfo=None) self._fit(self.observed_df) def predict(self) -> None: @@ -157,7 +159,7 @@ def predict(self) -> None: Result is set to `self.forecast_df`""" print(f"Forecasting from {self.start_date} to {self.end_date}.", flush=True) self._set_seed() - self.predicted_at = datetime.now(timezone.utc) + self.predicted_at = datetime.now(timezone.utc).replace(tzinfo=None) self.forecast_df = self._predict(self.dates_to_predict) self._validate_forecast_df(self.forecast_df) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py index ecd4c66a..652ced30 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py @@ -87,7 +87,9 @@ def _predict_legacy(self) -> pd.DataFrame: else: df["metric"] = self.metric_hub.alias - df["forecast_date"] = str(datetime.now(timezone.utc).date()) + df["forecast_date"] = str( + datetime.now(timezone.utc).replace(tzinfo=None).date() + ) df["forecast_parameters"] = str( json.dumps({**self.parameters, "holidays": self.use_holidays}) ) diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_metric_hub.py 
b/jobs/kpi-forecasting/kpi_forecasting/tests/test_metric_hub.py index 45d55948..4c58d436 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_metric_hub.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_metric_hub.py @@ -12,7 +12,7 @@ def test_metrichub_for_dau_kpi(): slug="mobile_daily_active_users_v1", start_date="2024-01-01", ) - now = to_datetime(datetime.now(timezone.utc)).date() + now = to_datetime(datetime.now(timezone.utc).replace(tzinfo=None)).date() query = test_metric_hub.query() query_where = f"WHERE submission_date BETWEEN '2024-01-01' AND '{now}'\nGROUP BY" @@ -76,7 +76,7 @@ def test_metrichub_no_end_date(): slug="mobile_daily_active_users_v1", start_date="2024-01-01", ) - now = to_datetime(datetime.now(timezone.utc)).date() + now = to_datetime(datetime.now(timezone.utc).replace(tzinfo=None)).date() assert test_metric_hub.end_date == now @@ -88,7 +88,7 @@ def test_metrichub_last_complete_month(): start_date="2024-01-01", end_date="last complete month", ) - now = to_datetime(datetime.now(timezone.utc)).date() + now = to_datetime(datetime.now(timezone.utc).replace(tzinfo=None)).date() prev_date = previous_period_last_date("last complete month", now) assert test_metric_hub.end_date == to_datetime(prev_date).date() From 90a822edaf2b0ae49298a418bf2faeafc0ea21b4 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Wed, 10 Jul 2024 16:06:12 -0500 Subject: [PATCH 06/33] finished base_forecast tests --- .../tests/test_base_forecast.py | 126 ++++++++++++++++-- 1 file changed, 113 insertions(+), 13 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py index 30c5e06b..b53c56b2 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py @@ -3,6 +3,8 @@ import pytest import pandas as pd from dotmap import DotMap +import numpy as np +from datetime import datetime, 
timedelta, timezone from kpi_forecasting.models.base_forecast import BaseForecast @@ -19,32 +21,47 @@ def __init__(self): self.is_fit = False def fit(self, observed_data): - self.is_fit = max(observed_data) + self.is_fit = max(observed_data["submission_date"]) class GoodClass(BaseForecast): # overwrite _get_observed_data def _get_observed_data(self): - self.observed_df = range(10) - - def _fit(self, observed_df: pd.DataFrame) -> None: + self.observed_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2020-01-01"), + pd.to_datetime("1990-01-01"), + ] + } + ) + + def _fit(self, observed_df: np.array) -> None: + # takes array as input to simplify tests self.model = GoodModel() self.model.fit(observed_df) - def _predict(self, dates_to_predict: pd.DataFrame) -> pd.DataFrame: - pass + def _predict(self, dates_to_predict: np.array) -> pd.DataFrame: + # takes array as input to simplify tests + return dates_to_predict * 2 - def _validate_forecast_df(self, forecast_df: pd.DataFrame) -> None: - pass + def _validate_forecast_df(self, forecast_df: np.array) -> None: + # takes array as input to simplify tests + assert np.all(forecast_df // 0 == 0) def _summarize( self, - forecast_df: pd.DataFrame, - observed_df: pd.DataFrame, + forecast_df: np.array, + observed_df: np.array, period: str, numpy_aggregations: List[str], - percentiles: List[int], + percentiles: List[str], ) -> pd.DataFrame: - pass + # input types changes to simplify test + np_func = getattr(np, numpy_aggregations[0]) + agg_val = np_func(forecast_df + observed_df) + return pd.DataFrame( + [{"number": agg_val, "period": period, "percentiles": percentiles[0]}] + ) return GoodClass @@ -57,6 +74,49 @@ def test_not_implemented(): _ = BadClass() +def test_post_init(good_class): + start_date = "2124-01-01" + end_date = "2124-02-02" + good_class = good_class( + model_type="test", + parameters=DotMap(), + use_holidays=None, + start_date=start_date, + end_date=end_date, + metric_hub=None, + ) + 
dates_to_predict_expected = pd.DataFrame( + { + "submission_date": pd.date_range( + pd.to_datetime(start_date), pd.to_datetime(end_date) + ).date + } + ) + assert good_class.dates_to_predict.equals(dates_to_predict_expected) + + +def test_post_init_default_dates(good_class): + # check default start and end time + good_class = good_class( + model_type="test", + parameters=DotMap(), + use_holidays=None, + start_date="", + end_date="", + metric_hub=None, + ) + # this is the max date of the self.observed_data['submission_date'] plus one day + # from the object definion + start_date = pd.to_datetime("2020-01-02") + end_date = ( + datetime.now(timezone.utc).replace(tzinfo=None) + timedelta(weeks=78) + ).date() + dates_to_predict_expected = pd.DataFrame( + {"submission_date": pd.date_range(start_date, end_date).date} + ) + assert good_class.dates_to_predict.equals(dates_to_predict_expected) + + def test_fit(good_class): good_class = good_class( model_type="test", @@ -70,4 +130,44 @@ def test_fit(good_class): assert good_class.model # - assert good_class.model.is_fit == 9 + assert good_class.model.is_fit == pd.to_datetime("2020-01-01") + + +def test_predict_and_validate(good_class): + good_class = good_class( + model_type="test", + parameters=DotMap(), + use_holidays=None, + start_date="2124-01-01", + end_date="2124-02-02", + metric_hub=None, + ) + # overwrite date range set in __post_init__ + good_class.dates_to_predict = np.arange(10) + good_class.predict() + assert np.all(good_class.forecast_df == good_class.dates_to_predict * 2) + + +def test_summarize(good_class): + good_class = good_class( + model_type="test", + parameters=DotMap(), + use_holidays=None, + start_date="2124-01-01", + end_date="2124-02-02", + metric_hub=None, + ) + good_class.forecast_df = np.array([1, 2]) + good_class.observed_df = np.array([3, 4]) + number_val = 10 + output = good_class.summarize( + periods=["a", "b", "c"], numpy_aggregations=["sum"], percentiles=["percentiles"] + ) + expected_output 
= pd.DataFrame( + [ + {"number": number_val, "period": el, "percentiles": "percentiles"} + for el in ["a", "b", "c"] + ] + ) + assert output.reset_index(drop=True).equals(expected_output) + assert good_class.summary_df.reset_index(drop=True).equals(expected_output) From 72fabefe8bcf03a66ba809668e4a73cc4bb6b5ee Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Thu, 11 Jul 2024 15:38:43 -0500 Subject: [PATCH 07/33] added tests for prophet class --- .../kpi_forecasting/models/base_forecast.py | 4 + .../models/prophet_forecast.py | 30 +- .../tests/test_base_forecast.py | 2 +- .../tests/test_prophet_forecast.py | 462 ++++++++++++++++++ 4 files changed, 490 insertions(+), 8 deletions(-) create mode 100644 jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index b0a84eb2..f41f3b59 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -44,6 +44,10 @@ class BaseForecast(abc.ABC): def _get_observed_data(self): if self.metric_hub: + # the columns in this dataframe + # are "value" for the metric, submission_date + # and any segments where the column name + # is the name of the segment self.observed_df = self.metric_hub.fetch() def __post_init__(self) -> None: diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py index 652ced30..b8539dab 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py @@ -150,27 +150,26 @@ def _predict_legacy(self) -> pd.DataFrame: return df[columns] - def _summarize( + def _combine_forecast_observed( self, forecast_df, observed_df, period: str, numpy_aggregations: List[str], percentiles: List[int], - ) -> pd.DataFrame: - """ - Calculate 
summary metrics for `self.forecast_df` over a given period, and - add metadata. - """ + ): # build a list of all functions that we'll summarize the data by aggregations = [getattr(np, i) for i in numpy_aggregations] aggregations.extend([pdx.percentile(i) for i in percentiles]) # aggregate metric to the correct date period (day, month, year) observed_summarized = pdx.aggregate_to_period(observed_df, period) - forecast_agg = pdx.aggregate_to_period(forecast_df, period) + forecast_agg = pdx.aggregate_to_period(forecast_df, period).sort_values( + "submission_date" + ) # find periods of overlap between observed and forecasted data + # merge preserves key order so overlap will be sorted by submission_date overlap = forecast_agg.merge( observed_summarized, on="submission_date", @@ -198,7 +197,24 @@ def _summarize( # create a single dataframe that contains observed and forecasted data df = pd.concat([observed_summarized, forecast_summarized]) + return df + + def _summarize( + self, + forecast_df, + observed_df, + period: str, + numpy_aggregations: List[str], + percentiles: List[int], + ) -> pd.DataFrame: + """ + Calculate summary metrics for `self.forecast_df` over a given period, and + add metadata. 
+ """ + df = self._combine_forecast_observed( + forecast_df, observed_df, period, numpy_aggregations, percentiles + ) # add summary metadata columns df["aggregation_period"] = period.lower() diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py index b53c56b2..dc3a7156 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py @@ -1,4 +1,4 @@ -from typing import Dict, List +from typing import List import pytest import pandas as pd diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py new file mode 100644 index 00000000..db2d2a3b --- /dev/null +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py @@ -0,0 +1,462 @@ +from typing import List + +import pytest +import pandas as pd +from dotmap import DotMap +import numpy as np +from datetime import datetime, timedelta, timezone + + +from kpi_forecasting.models.prophet_forecast import ProphetForecast + + +def test_summarize_non_overlapping_day(): + observed_start_date = "2124-01-01" + observed_end_date = "2124-02-01" + + predict_start_date = "2124-02-02" + predict_end_date = "2124-03-01" + + forecast = ProphetForecast( + model_type="test", + parameters=DotMap(), + use_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) + observed_submission_dates = pd.date_range( + pd.to_datetime(observed_start_date), pd.to_datetime(observed_end_date) + ).date + predict_submission_dates = forecast.dates_to_predict["submission_date"].values + + observed_df = pd.DataFrame( + { + "submission_date": observed_submission_dates, + "value": range(len(observed_submission_dates)), + } + ) + + test_samples = np.array([1, 1, 2, 3, 5, 8, 13]) + test_mean = np.mean(test_samples) + test_median = 
np.median(test_samples) + + forecast_array = np.stack( + [test_samples * i for i in range(1, 1 + len(predict_submission_dates))], + axis=0, + ) + forecast_data = {str(i): forecast_array[:, i] for i in range(len(test_samples))} + forecast_df = pd.DataFrame( + dict(**{"submission_date": predict_submission_dates}, **forecast_data) + ) + + output_df = forecast._combine_forecast_observed( + forecast_df, observed_df, "day", ["mean", "median"], [50] + ) + + expected_observed_df = observed_df.copy() + expected_observed_df["source"] = "historical" + expected_observed_df["measure"] = "observed" + expected_observed_df["submission_date"] = ( + pd.to_datetime(expected_observed_df["submission_date"].values) + .to_period("d") + .to_timestamp() + ) + + forecast_mean_df = pd.DataFrame( + { + "submission_date": pd.to_datetime(forecast_df["submission_date"].values) + .to_period("d") + .to_timestamp(), + "value": [ + test_mean * i for i in range(1, 1 + len(predict_submission_dates)) + ], + "source": ["forecast"] * len(predict_submission_dates), + "measure": ["mean"] * len(predict_submission_dates), + } + ) + + forecast_median_df = pd.DataFrame( + { + "submission_date": pd.to_datetime(forecast_df["submission_date"].values) + .to_period("d") + .to_timestamp(), + "value": [ + test_median * i for i in range(1, 1 + len(predict_submission_dates)) + ], + "source": ["forecast"] * len(predict_submission_dates), + "measure": ["median"] * len(predict_submission_dates), + } + ) + + forecast_p50_df = forecast_median_df.copy() + forecast_p50_df["measure"] = "p50" + + expected_df = pd.concat( + [expected_observed_df, forecast_mean_df, forecast_median_df, forecast_p50_df] + ) + + assert set(expected_df.columns) == set(output_df.columns) + columns = expected_df.columns + expected_df_compare = ( + expected_df[columns] + .sort_values(["submission_date", "source", "measure"]) + .reset_index(drop=True) + ) + output_df_compare = ( + output_df[columns] + .sort_values(["submission_date", "source", 
"measure"]) + .reset_index(drop=True) + ) + pd.testing.assert_frame_equal( + expected_df_compare, output_df_compare, check_exact=False + ) + + +def test_summarize_non_overlapping_month(): + observed_start_date = "2124-01-01" + observed_end_date = "2124-02-28" + + predict_start_date = "2124-04-01" + predict_end_date = "2124-05-31" + + forecast = ProphetForecast( + model_type="test", + parameters=DotMap(), + use_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) + observed_submission_dates = pd.date_range( + pd.to_datetime(observed_start_date), pd.to_datetime(observed_end_date) + ).date + predict_submission_dates = forecast.dates_to_predict["submission_date"].values + + observed_df = pd.DataFrame( + { + "submission_date": observed_submission_dates, + "value": [1] * len(observed_submission_dates), + } + ) + + test_samples = np.array([1, 1, 2, 3, 5, 8, 13]) + test_mean = np.mean(test_samples) + test_median = np.median(test_samples) + + forecast_array = np.stack( + [test_samples] * len(predict_submission_dates), + axis=0, + ) + forecast_data = {str(i): forecast_array[:, i] for i in range(len(test_samples))} + forecast_df = pd.DataFrame( + dict(**{"submission_date": predict_submission_dates}, **forecast_data) + ) + + output_df = forecast._combine_forecast_observed( + forecast_df, observed_df, "month", ["mean", "median"], [50] + ) + + expected_observed_dates = sorted( + pd.to_datetime(observed_df["submission_date"].values) + .to_period("m") + .to_timestamp() + .unique() + ) + expected_observed_df = pd.DataFrame( + { + "submission_date": expected_observed_dates, + "source": ["historical", "historical"], + "measure": ["observed", "observed"], + "value": [31, 28], # number of days in each month + } + ) + + forecast_observed_dates = sorted( + pd.to_datetime(forecast_df["submission_date"].values) + .to_period("m") + .to_timestamp() + .unique() + ) + forecast_mean_df = pd.DataFrame( + { + "submission_date": 
forecast_observed_dates, + "source": ["forecast", "forecast"], + "measure": ["mean", "mean"], + "value": [test_mean * 30, test_mean * 31], # number of days in each month + } + ) + + forecast_median_df = pd.DataFrame( + { + "submission_date": forecast_observed_dates, + "source": ["forecast", "forecast"], + "measure": ["median", "median"], + "value": [ + test_median * 30, + test_median * 31, + ], # number of days in each month + } + ) + + forecast_p50_df = pd.DataFrame( + { + "submission_date": forecast_observed_dates, + "source": ["forecast", "forecast"], + "measure": ["p50", "p50"], + "value": [ + test_median * 30, + test_median * 31, + ], # number of days in each month + } + ) + + expected_df = pd.concat( + [expected_observed_df, forecast_mean_df, forecast_median_df, forecast_p50_df] + ) + + assert set(expected_df.columns) == set(output_df.columns) + columns = expected_df.columns + expected_df_compare = ( + expected_df[columns] + .sort_values(["submission_date", "source", "measure"]) + .reset_index(drop=True) + ) + output_df_compare = ( + output_df[columns] + .sort_values(["submission_date", "source", "measure"]) + .reset_index(drop=True) + ) + pd.testing.assert_frame_equal( + expected_df_compare, output_df_compare, check_exact=False + ) + + +def test_summarize_overlapping_day(): + observed_start_date = "2124-01-01" + observed_end_date = "2124-02-01" + + predict_start_date = "2124-01-01" + predict_end_date = "2124-02-01" + + forecast = ProphetForecast( + model_type="test", + parameters=DotMap(), + use_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) + observed_submission_dates = pd.date_range( + pd.to_datetime(observed_start_date), pd.to_datetime(observed_end_date) + ).date + predict_submission_dates = forecast.dates_to_predict["submission_date"].values + + observed_df = pd.DataFrame( + { + "submission_date": observed_submission_dates, + "value": [1] * len(observed_submission_dates), + } + ) + + test_samples = 
np.array([1, 1, 2, 3, 5, 8, 13]) + test_mean = np.mean(test_samples) + test_median = np.median(test_samples) + + forecast_array = np.stack( + [test_samples * i for i in range(1, 1 + len(predict_submission_dates))], + axis=0, + ) + forecast_data = {str(i): forecast_array[:, i] for i in range(len(test_samples))} + forecast_df = pd.DataFrame( + dict(**{"submission_date": predict_submission_dates}, **forecast_data) + ) + + output_df = forecast._combine_forecast_observed( + forecast_df, observed_df, "day", ["mean", "median"], [50] + ) + + expected_observed_df = observed_df.copy() + expected_observed_df["source"] = "historical" + expected_observed_df["measure"] = "observed" + expected_observed_df["submission_date"] = ( + pd.to_datetime(expected_observed_df["submission_date"].values) + .to_period("d") + .to_timestamp() + ) + + # value has + 1 due to observed (which has value=1) being added + # due to overlap + forecast_mean_df = pd.DataFrame( + { + "submission_date": pd.to_datetime(forecast_df["submission_date"].values) + .to_period("d") + .to_timestamp(), + "value": [ + test_mean * i + 1 for i in range(1, 1 + len(predict_submission_dates)) + ], + "source": ["forecast"] * len(predict_submission_dates), + "measure": ["mean"] * len(predict_submission_dates), + } + ) + + forecast_median_df = pd.DataFrame( + { + "submission_date": pd.to_datetime(forecast_df["submission_date"].values) + .to_period("d") + .to_timestamp(), + "value": [ + test_median * i + 1 for i in range(1, 1 + len(predict_submission_dates)) + ], + "source": ["forecast"] * len(predict_submission_dates), + "measure": ["median"] * len(predict_submission_dates), + } + ) + + forecast_p50_df = forecast_median_df.copy() + forecast_p50_df["measure"] = "p50" + + expected_df = pd.concat( + [expected_observed_df, forecast_mean_df, forecast_median_df, forecast_p50_df] + ) + + assert set(expected_df.columns) == set(output_df.columns) + columns = expected_df.columns + expected_df_compare = ( + expected_df[columns] + 
.sort_values(["submission_date", "source", "measure"]) + .reset_index(drop=True) + ) + output_df_compare = ( + output_df[columns] + .sort_values(["submission_date", "source", "measure"]) + .reset_index(drop=True) + ) + pd.testing.assert_frame_equal( + expected_df_compare, output_df_compare, check_exact=False + ) + + +def test_summarize_overlapping_month(): + observed_start_date = "2124-01-01" + observed_end_date = "2124-02-28" + + predict_start_date = "2124-01-01" + predict_end_date = "2124-02-28" + + forecast = ProphetForecast( + model_type="test", + parameters=DotMap(), + use_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) + observed_submission_dates = pd.date_range( + pd.to_datetime(observed_start_date), pd.to_datetime(observed_end_date) + ).date + predict_submission_dates = forecast.dates_to_predict["submission_date"].values + + observed_df = pd.DataFrame( + { + "submission_date": observed_submission_dates, + "value": [1] * len(observed_submission_dates), + } + ) + + test_samples = np.array([1, 1, 2, 3, 5, 8, 13]) + test_mean = np.mean(test_samples) + test_median = np.median(test_samples) + + forecast_array = np.stack( + [test_samples] * len(predict_submission_dates), + axis=0, + ) + forecast_data = {str(i): forecast_array[:, i] for i in range(len(test_samples))} + forecast_df = pd.DataFrame( + dict(**{"submission_date": predict_submission_dates}, **forecast_data) + ) + + output_df = forecast._combine_forecast_observed( + forecast_df, observed_df, "month", ["mean", "median"], [50] + ) + + expected_observed_dates = sorted( + pd.to_datetime(observed_df["submission_date"].values) + .to_period("m") + .to_timestamp() + .unique() + ) + expected_observed_df = pd.DataFrame( + { + "submission_date": expected_observed_dates, + "source": ["historical", "historical"], + "measure": ["observed", "observed"], + "value": [31, 28], # number of days in each month + } + ) + + forecast_observed_dates = sorted( + 
pd.to_datetime(forecast_df["submission_date"].values) + .to_period("m") + .to_timestamp() + .unique() + ) + + # add extra length of month for aggregated value column that gets added + # due to overlap + forecast_mean_df = pd.DataFrame( + { + "submission_date": forecast_observed_dates, + "source": ["forecast", "forecast"], + "measure": ["mean", "mean"], + "value": [ + test_mean * 31 + 31, + test_mean * 28 + 28, + ], # number of days in each month + } + ) + + forecast_median_df = pd.DataFrame( + { + "submission_date": forecast_observed_dates, + "source": ["forecast", "forecast"], + "measure": ["median", "median"], + "value": [ + test_median * 31 + 31, + test_median * 28 + 28, + ], # number of days in each month + } + ) + + forecast_p50_df = pd.DataFrame( + { + "submission_date": forecast_observed_dates, + "source": ["forecast", "forecast"], + "measure": ["p50", "p50"], + "value": [ + test_median * 31 + 31, + test_median * 28 + 28, + ], # number of days in each month + } + ) + + expected_df = pd.concat( + [expected_observed_df, forecast_mean_df, forecast_median_df, forecast_p50_df] + ) + + assert set(expected_df.columns) == set(output_df.columns) + columns = expected_df.columns + expected_df_compare = ( + expected_df[columns] + .sort_values(["submission_date", "source", "measure"]) + .reset_index(drop=True) + ) + output_df_compare = ( + output_df[columns] + .sort_values(["submission_date", "source", "measure"]) + .reset_index(drop=True) + ) + pd.testing.assert_frame_equal( + expected_df_compare, output_df_compare, check_exact=False + ) From 1ece1dd3295cb87797cc9de33988b5318381f4df Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Thu, 11 Jul 2024 15:39:07 -0500 Subject: [PATCH 08/33] linting --- .../kpi_forecasting/tests/test_prophet_forecast.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py index db2d2a3b..dea754a3 100644 --- 
a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py @@ -1,10 +1,7 @@ -from typing import List -import pytest import pandas as pd from dotmap import DotMap import numpy as np -from datetime import datetime, timedelta, timezone from kpi_forecasting.models.prophet_forecast import ProphetForecast From 606e2e4dce2e61d58027de636f57692e35b5d7a1 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Thu, 11 Jul 2024 15:42:42 -0500 Subject: [PATCH 09/33] fixed divide by zero --- .../kpi_forecasting/tests/test_base_forecast.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py index dc3a7156..6a731560 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py @@ -46,7 +46,8 @@ def _predict(self, dates_to_predict: np.array) -> pd.DataFrame: def _validate_forecast_df(self, forecast_df: np.array) -> None: # takes array as input to simplify tests - assert np.all(forecast_df // 0 == 0) + # check that all are even after _predict runs + assert np.all(forecast_df % 2 == 0) def _summarize( self, From 585f2ca204e9920f089b8e3e6656de6d6a7b1aff Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Thu, 11 Jul 2024 15:44:29 -0500 Subject: [PATCH 10/33] linting again --- .../kpi_forecasting/tests/test_prophet_forecast.py | 1 - 1 file changed, 1 deletion(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py index dea754a3..18d3df67 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py @@ -1,4 +1,3 @@ - import pandas as pd from dotmap import DotMap import numpy as np From 
97bd46c63a8913f6213e23cb7acf75f556dda647 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Tue, 23 Jul 2024 12:03:02 -0500 Subject: [PATCH 11/33] adding tests to funnel_forecast --- .../kpi_forecasting/models/funnel_forecast.py | 25 +- .../tests/test_funnel_forecast.py | 700 ++++++++++++++++++ 2 files changed, 716 insertions(+), 9 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py index c5d4a980..35b42c1b 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py @@ -64,6 +64,10 @@ def __post_init__(self) -> None: """ super().__post_init__() + if self.metric_hub is None: + # this is used to avoid the code below for testing purposes + return + # Overwrite dates_to_predict to provide historical date forecasts self.dates_to_predict = pd.DataFrame( { @@ -155,6 +159,11 @@ def _fill_regressor_dates(self, regressor: ProphetRegressor) -> ProphetRegressor setattr(regressor, date, getattr(self, date)) elif isinstance(getattr(regressor, date), str): setattr(regressor, date, pd.to_datetime(getattr(regressor, date))) + + if regressor.end_date < regressor.start_date: + raise Exception( + f"Regressor {regressor.name} start date comes after end date" + ) return regressor def _build_model( @@ -252,7 +261,7 @@ def _build_model_dataframe( df["floor"] = segment_settings.trained_parameters["floor"] df["cap"] = segment_settings.trained_parameters["cap"] else: - raise ValueError("task not in ['train','predict']") + raise ValueError(f"task set to {task}, must be train or predict") if segment_settings.regressors: df = self._add_regressors(df, segment_settings.regressors) @@ -333,26 +342,24 @@ def _auto_tuning(self, segment_settings: SegmentModelSettings) -> Dict[str, floa return param_grid[min_abs_bias_index] - def _add_regressors(self, dat: pd.DataFrame, regressors: List[ProphetRegressor]): + def 
_add_regressors(self, df: pd.DataFrame, regressors: List[ProphetRegressor]): """ Add regressor columns to the dataframe for training or prediction. Args: - dat (pd.DataFrame): The input dataframe. + df (pd.DataFrame): The input dataframe. regressors (List[ProphetRegressor]): The list of regressors to add. Returns: pd.DataFrame: The dataframe with regressors added. """ - df = dat.copy().rename(columns=self.column_names_map) - df["ds"] = pd.to_datetime(df["ds"]) for regressor in regressors: regressor = self._fill_regressor_dates(regressor) # finds rows where date is in regressor date ranges and sets that regressor ## value to 1, else 0 df[regressor.name] = np.where( - (df["ds"] >= pd.to_datetime(regressor.start_date)) - & (df["ds"] <= pd.to_datetime(regressor.end_date)), + (df["ds"] >= pd.to_datetime(regressor.start_date).date()) + & (df["ds"] <= pd.to_datetime(regressor.end_date).date()), 0, 1, ) @@ -693,10 +700,10 @@ def write_results( if components_table: numeric_cols = self.components_df.dtypes[ - self.components_df.dtypes == float + self.components_df.dtypes is float ].index.tolist() string_cols = self.components_df.dtypes[ - self.components_df.dtypes == object + self.components_df.dtypes is object ].index.tolist() self.components_df["metric_slug"] = self.metric_hub.slug self.components_df["forecast_trained_at"] = self.trained_at diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py index e69de29b..bf8342ea 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py @@ -0,0 +1,700 @@ +import re + +import pandas as pd +from dotmap import DotMap +import pytest + + +from kpi_forecasting.configs.model_inputs import ProphetRegressor, ProphetHoliday +from kpi_forecasting.models.funnel_forecast import SegmentModelSettings, FunnelForecast + + +@pytest.fixture() +def forecast(): + 
predict_start_date = "2124-01-01" + predict_end_date = "2124-03-01" + + forecast = FunnelForecast( + model_type="test", + parameters=DotMap(), + use_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) + return forecast + + +def test_fill_regressor_dates(forecast): + regressor_info = { + "name": "only_start", + "description": "only has a start", + "start_date": "2020-08-15", + } + regressor = ProphetRegressor(**regressor_info) + forecast._fill_regressor_dates(regressor) + assert regressor.start_date == pd.to_datetime("2020-08-15") + assert regressor.end_date == pd.to_datetime("2124-03-01") + + regressor_info = { + "name": "only_end", + "description": "only has a end", + "end_date": "2125-08-15", + } + regressor = ProphetRegressor(**regressor_info) + forecast._fill_regressor_dates(regressor) + assert regressor.start_date == pd.to_datetime("2124-01-01") + assert regressor.end_date == pd.to_datetime("2125-08-15") + + regressor_info = { + "name": "both", + "description": "only has a start", + "start_date": "2020-08-15", + "end_date": "2020-09-15", + } + regressor = ProphetRegressor(**regressor_info) + forecast._fill_regressor_dates(regressor) + assert regressor.start_date == pd.to_datetime("2020-08-15") + assert regressor.end_date == pd.to_datetime("2020-09-15") + + regressor_info = { + "name": "neither", + "description": "nothin to see here", + } + regressor = ProphetRegressor(**regressor_info) + forecast._fill_regressor_dates(regressor) + assert regressor.start_date == pd.to_datetime("2124-01-01") + assert regressor.end_date == pd.to_datetime("2124-03-01") + + regressor_info = { + "name": "out_of_order", + "description": "best better break", + "start_date": "2020-08-15", + "end_date": "2000-09-15", + } + regressor = ProphetRegressor(**regressor_info) + with pytest.raises( + Exception, + match="Regressor out_of_order start date comes after end date", + ): + forecast._fill_regressor_dates(regressor) + + +def 
test_add_regressors(forecast): + regressor_list_raw = [ + { + "name": "all_in", + "description": "it's all in", + "start_date": "2124-01-01", + "end_date": "2124-01-06", + }, + { + "name": "all_out", + "description": "it's all in", + "start_date": "2124-02-01", + "end_date": "2124-02-06", + }, + { + "name": "just_end", + "description": "just the second half", + "start_date": "2124-01-03", + "end_date": "2124-02-06", + }, + { + "name": "just_middle", + "description": "just the middle two", + "start_date": "2124-01-02", + "end_date": "2124-01-03", + }, + ] + + regressor_list = [ProphetRegressor(**r) for r in regressor_list_raw] + + df = pd.DataFrame( + { + "ds": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-03").date(), + pd.to_datetime("2124-01-04").date(), + ], + } + ) + + output_df = forecast._add_regressors(df, regressors=regressor_list) + + expected_df = pd.DataFrame( + { + "ds": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-03").date(), + pd.to_datetime("2124-01-04").date(), + ], + "all_in": [0, 0, 0, 0], + "all_out": [1, 1, 1, 1], + "just_end": [1, 1, 0, 0], + "just_middle": [1, 0, 0, 1], + } + ) + + assert set(output_df.columns) == set(expected_df.columns) + pd.testing.assert_frame_equal(output_df, expected_df[output_df.columns]) + + +def test_build_model_dataframe_exception(forecast): + regressor_list = [] + + grid_parameters = { + "changepoint_prior_scale": [0.01, 0.1, 0.15, 0.2], + "changepoint_range": [0.8, 0.9, 1], + "n_changepoints": [30], + "weekly_seasonality": True, + "yearly_seasonality": True, + "growth": "logistic", + } + cv_settings = { + "initial": "366 days", + "period": "30 days", + "horizon": "30 days", + "parallel": "processes", + } + segment_settings = SegmentModelSettings( + segment={"a": 1, "b": 2}, + start_date="2124-01-01", + end_date="2124-02-01", + holidays=[], + regressors=[ProphetRegressor(**r) for r in 
regressor_list], + grid_parameters=grid_parameters, + cv_settings=cv_settings, + ) + + observed_df = pd.DataFrame( + { + "a": [1, 1, 1, 1, 3, 3], + "b": [1, 1, 2, 2, 2, 2], + "y": [1, 2, 3, 4, 5, 6], + "submission_date": [ + pd.to_datetime("2124-12-01").date(), + pd.to_datetime("2124-12-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2123-01-01").date(), + pd.to_datetime("2123-01-02").date(), + ], + } + ) + + forecast.observed_df = observed_df + + with pytest.raises(ValueError, match="task set to test, must be train or predict"): + _ = forecast._build_model_dataframe( + segment_settings=segment_settings, task="test" + ) + + +def test_build_model_dataframe_no_regressors_train(forecast): + regressor_list = [] + + grid_parameters = { + "changepoint_prior_scale": [0.01, 0.1, 0.15, 0.2], + "changepoint_range": [0.8, 0.9, 1], + "n_changepoints": [30], + "weekly_seasonality": True, + "yearly_seasonality": True, + "growth": "logistic", + } + cv_settings = { + "initial": "366 days", + "period": "30 days", + "horizon": "30 days", + "parallel": "processes", + } + segment_settings = SegmentModelSettings( + segment={"a": 1, "b": 2}, + start_date="2124-01-01", + end_date="2124-02-01", + holidays=[], + regressors=[ProphetRegressor(**r) for r in regressor_list], + grid_parameters=grid_parameters, + cv_settings=cv_settings, + ) + + observed_df = pd.DataFrame( + { + "a": [1, 1, 1, 1, 3, 3], + "b": [1, 1, 2, 2, 2, 2], + "y": [1, 2, 3, 4, 5, 6], + "submission_date": [ + pd.to_datetime("2124-12-01").date(), + pd.to_datetime("2124-12-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2123-01-01").date(), + pd.to_datetime("2123-01-02").date(), + ], + } + ) + + forecast.observed_df = observed_df + + output_train_df = forecast._build_model_dataframe( + segment_settings=segment_settings, task="train" + ) + expected_train_df = pd.DataFrame( + { + "a": [1, 1], + "b": [2, 2], 
+ "y": [3, 4], + "ds": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + pd.testing.assert_frame_equal( + output_train_df.reset_index(drop=True), expected_train_df + ) + + output_train_wlog_df = forecast._build_model_dataframe( + segment_settings=segment_settings, task="train", add_logistic_growth_cols=True + ) + expected_train_wlog_df = pd.DataFrame( + { + "a": [1, 1], + "b": [2, 2], + "y": [3, 4], + "ds": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "floor": [1.5, 1.5], + "cap": [6.0, 6.0], + } + ) + + assert set(output_train_wlog_df.columns) == set(expected_train_wlog_df.columns) + pd.testing.assert_frame_equal( + output_train_wlog_df.reset_index(drop=True), + expected_train_wlog_df[output_train_wlog_df.columns], + ) + + +def test_build_model_dataframe_train(forecast): + regressor_list = [ + { + "name": "all_in", + "description": "it's all in", + "start_date": "2124-01-01", + "end_date": "2124-01-06", + }, + { + "name": "all_out", + "description": "it's all in", + "start_date": "2124-02-01", + "end_date": "2124-02-06", + }, + { + "name": "just_end", + "description": "just the second one", + "start_date": "2124-01-02", + "end_date": "2124-02-06", + }, + ] + + grid_parameters = { + "changepoint_prior_scale": [0.01, 0.1, 0.15, 0.2], + "changepoint_range": [0.8, 0.9, 1], + "n_changepoints": [30], + "weekly_seasonality": True, + "yearly_seasonality": True, + "growth": "logistic", + } + cv_settings = { + "initial": "366 days", + "period": "30 days", + "horizon": "30 days", + "parallel": "processes", + } + segment_settings = SegmentModelSettings( + segment={"a": 1, "b": 2}, + start_date="2124-01-01", + end_date="2124-02-01", + holidays=[], + regressors=[ProphetRegressor(**r) for r in regressor_list], + grid_parameters=grid_parameters, + cv_settings=cv_settings, + ) + + observed_df = pd.DataFrame( + { + "a": [1, 1, 1, 1, 3, 3], + "b": [1, 1, 2, 2, 2, 2], + "y": [1, 2, 3, 4, 5, 6], + 
"submission_date": [ + pd.to_datetime("2124-12-01").date(), + pd.to_datetime("2124-12-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2123-01-01").date(), + pd.to_datetime("2123-01-02").date(), + ], + } + ) + + forecast.observed_df = observed_df + + output_train_df = forecast._build_model_dataframe( + segment_settings=segment_settings, task="train" + ) + expected_train_df = pd.DataFrame( + { + "a": [1, 1], + "b": [2, 2], + "y": [3, 4], + "ds": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "all_in": [0, 0], + "all_out": [1, 1], + "just_end": [1, 0], + } + ) + pd.testing.assert_frame_equal( + output_train_df.reset_index(drop=True), expected_train_df + ) + + output_train_wlog_df = forecast._build_model_dataframe( + segment_settings=segment_settings, task="train", add_logistic_growth_cols=True + ) + expected_train_wlog_df = pd.DataFrame( + { + "a": [1, 1], + "b": [2, 2], + "y": [3, 4], + "ds": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "all_in": [0, 0], + "all_out": [1, 1], + "just_end": [1, 0], + "floor": [1.5, 1.5], + "cap": [6.0, 6.0], + } + ) + + assert set(output_train_wlog_df.columns) == set(expected_train_wlog_df.columns) + pd.testing.assert_frame_equal( + output_train_wlog_df.reset_index(drop=True), + expected_train_wlog_df[output_train_wlog_df.columns], + ) + + +def test_build_model_dataframe_no_regressors_predict(forecast): + regressor_list = [] + + grid_parameters = { + "changepoint_prior_scale": [0.01, 0.1, 0.15, 0.2], + "changepoint_range": [0.8, 0.9, 1], + "n_changepoints": [30], + "weekly_seasonality": True, + "yearly_seasonality": True, + "growth": "logistic", + } + cv_settings = { + "initial": "366 days", + "period": "30 days", + "horizon": "30 days", + "parallel": "processes", + } + segment_settings = SegmentModelSettings( + segment={"a": 1, "b": 2}, + start_date="2124-01-01", + end_date="2124-02-01", + 
holidays=[], + regressors=[ProphetRegressor(**r) for r in regressor_list], + grid_parameters=grid_parameters, + cv_settings=cv_settings, + ) + + segment_settings.trained_parameters = {"floor": -1.0, "cap": 10.0} + + dates_to_predict = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-12-01").date(), + pd.to_datetime("2124-12-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2123-01-01").date(), + pd.to_datetime("2123-01-02").date(), + ], + } + ) + + forecast.dates_to_predict = dates_to_predict + + output_predict_df = forecast._build_model_dataframe( + segment_settings=segment_settings, task="predict" + ) + expected_predict_df = pd.DataFrame( + { + "ds": [ + pd.to_datetime("2124-12-01").date(), + pd.to_datetime("2124-12-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2123-01-01").date(), + pd.to_datetime("2123-01-02").date(), + ], + } + ) + pd.testing.assert_frame_equal( + output_predict_df.reset_index(drop=True), expected_predict_df + ) + + output_predict_wlog_df = forecast._build_model_dataframe( + segment_settings=segment_settings, task="predict", add_logistic_growth_cols=True + ) + expected_predict_wlog_df = pd.DataFrame( + { + "ds": [ + pd.to_datetime("2124-12-01").date(), + pd.to_datetime("2124-12-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2123-01-01").date(), + pd.to_datetime("2123-01-02").date(), + ], + "floor": [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0], + "cap": [10.0, 10.0, 10.0, 10.0, 10.0, 10.0], + } + ) + + assert set(output_predict_wlog_df.columns) == set(expected_predict_wlog_df.columns) + pd.testing.assert_frame_equal( + output_predict_wlog_df.reset_index(drop=True), + expected_predict_wlog_df[output_predict_wlog_df.columns], + ) + + +def test_build_model_dataframe_predict(forecast): + regressor_list = [ + { + "name": "all_in", + "description": "it's all in", 
+ "start_date": "2124-01-01", + "end_date": "2124-01-06", + }, + { + "name": "all_out", + "description": "it's all in", + "start_date": "2124-02-01", + "end_date": "2124-02-06", + }, + { + "name": "just_end", + "description": "just the second one", + "start_date": "2124-01-02", + "end_date": "2124-02-06", + }, + ] + + grid_parameters = { + "changepoint_prior_scale": [0.01, 0.1, 0.15, 0.2], + "changepoint_range": [0.8, 0.9, 1], + "n_changepoints": [30], + "weekly_seasonality": True, + "yearly_seasonality": True, + "growth": "logistic", + } + cv_settings = { + "initial": "366 days", + "period": "30 days", + "horizon": "30 days", + "parallel": "processes", + } + segment_settings = SegmentModelSettings( + segment={"a": 1, "b": 2}, + start_date="2124-01-01", + end_date="2124-02-01", + holidays=[], + regressors=[ProphetRegressor(**r) for r in regressor_list], + grid_parameters=grid_parameters, + cv_settings=cv_settings, + ) + + observed_df = pd.DataFrame( + { + "a": [1, 1, 1, 1, 3, 3], + "b": [1, 1, 2, 2, 2, 2], + "y": [1, 2, 3, 4, 5, 6], + "submission_date": [ + pd.to_datetime("2124-12-01").date(), + pd.to_datetime("2124-12-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2123-01-01").date(), + pd.to_datetime("2123-01-02").date(), + ], + } + ) + + forecast.observed_df = observed_df + + output_train_df = forecast._build_model_dataframe( + segment_settings=segment_settings, task="train" + ) + expected_train_df = pd.DataFrame( + { + "a": [1, 1], + "b": [2, 2], + "y": [3, 4], + "ds": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "all_in": [0, 0], + "all_out": [1, 1], + "just_end": [1, 0], + } + ) + pd.testing.assert_frame_equal( + output_train_df.reset_index(drop=True), expected_train_df + ) + + output_train_wlog_df = forecast._build_model_dataframe( + segment_settings=segment_settings, task="train", add_logistic_growth_cols=True + ) + expected_train_wlog_df = pd.DataFrame( 
+ { + "a": [1, 1], + "b": [2, 2], + "y": [3, 4], + "ds": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "all_in": [0, 0], + "all_out": [1, 1], + "just_end": [1, 0], + "floor": [1.5, 1.5], + "cap": [6.0, 6.0], + } + ) + + assert set(output_train_wlog_df.columns) == set(expected_train_wlog_df.columns) + pd.testing.assert_frame_equal( + output_train_wlog_df.reset_index(drop=True), + expected_train_wlog_df[output_train_wlog_df.columns], + ) + + +def test_build_model(forecast): + regressor_list = [ + { + "name": "all_in", + "description": "it's all in", + "start_date": "2124-01-01", + "end_date": "2124-01-06", + }, + { + "name": "all_out", + "description": "it's all in", + "start_date": "2124-02-01", + "end_date": "2124-02-06", + }, + { + "name": "just_end", + "description": "just the second one", + "start_date": "2124-01-02", + "end_date": "2124-02-06", + }, + ] + + holiday_list = { + "easter": { + "name": "easter", + "ds": [ + "2016-03-27", + "2017-04-16", + "2018-04-01", + "2019-04-21", + "2020-04-12", + "2021-04-04", + "2022-04-17", + "2023-04-09", + "2024-03-31", + "2025-04-20", + ], + "lower_window": -2, + "upper_window": 1, + }, + "covid_sip1": { + "name": "covid_sip1", + "ds": ["2020-03-14"], + "lower_window": 0, + "upper_window": 45, + }, + "covid_sip11": { + "name": "covid_sip11", + "ds": ["2020-03-14"], + "lower_window": -14, + "upper_window": 30, + }, + } + + grid_parameters = { + "changepoint_prior_scale": [0.01, 0.1, 0.15, 0.2], + "changepoint_range": [0.8, 0.9, 1], + "n_changepoints": [30], + "weekly_seasonality": True, + "yearly_seasonality": True, + "growth": "logistic", + } + cv_settings = { + "initial": "366 days", + "period": "30 days", + "horizon": "30 days", + "parallel": "processes", + } + segment_settings = SegmentModelSettings( + segment={"a": 1, "b": 2}, + start_date="2124-01-01", + end_date="2124-02-01", + holidays=[ProphetHoliday(**h) for h in holiday_list.values()], + regressors=[ProphetRegressor(**r) 
for r in regressor_list], + grid_parameters=grid_parameters, + cv_settings=cv_settings, + ) + + model = forecast._build_model( + segment_settings=segment_settings, + parameters={ + "changepoint_prior_scale": 0.01, + "changepoint_range": 0.8, + "n_changepoints": 30, + "weekly_seasonality": True, + "yearly_seasonality": True, + "growth": "logistic", + }, + ) + + holiday_df = model.holidays + expected_holidays = pd.concat( + [ + pd.DataFrame( + { + "holiday": h["name"], + "ds": pd.to_datetime(h["ds"]), + "lower_window": h["lower_window"], + "upper_window": h["upper_window"], + } + ) + for h in holiday_list.values() + ], + ignore_index=True, + ) + pd.testing.assert_frame_equal(holiday_df, expected_holidays) From c35247dfca7e877983535880e744af44f251cd3e Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Mon, 29 Jul 2024 09:58:44 -0500 Subject: [PATCH 12/33] added tests for funnel_forecast --- .../kpi_forecasting/models/funnel_forecast.py | 360 +++--- .../models/prophet_forecast.py | 29 +- .../tests/test_funnel_forecast.py | 1131 +++++++++++++++-- .../tests/test_performance_analysis.py | 1 - jobs/kpi-forecasting/requirements.txt | 1 + 5 files changed, 1256 insertions(+), 266 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py index 35b42c1b..c4683f16 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py @@ -18,8 +18,7 @@ holiday_collection, regressor_collection, ) -from kpi_forecasting.models.base_forecast import BaseForecast -from kpi_forecasting import pandas_extras as pdx +from kpi_forecasting.models.prophet_forecast import ProphetForecast @dataclass @@ -45,7 +44,7 @@ class SegmentModelSettings: @dataclass -class FunnelForecast(BaseForecast): +class FunnelForecast(ProphetForecast): """ FunnelForecast class for generating and managing forecast models. 
The class handles cases where forecasts for a combination of dimensions are required for a metric. @@ -77,11 +76,28 @@ def __post_init__(self) -> None: } ) + self._set_segment_models(self.observed_df, self.metric_hub.segments.keys()) + + # initialize unset attributes + self.components_df = None + + def _set_segment_models( + self, observed_df: pd.DataFrame, segment_column_list: list + ) -> None: + """Creates a SegmentSettings object for each segment specified in the + metric_hub.segments section of the config. These objects are stored in a list + in the segment_models attribute + Parameters can be specified independently for at most one dimension column + set using model_setting_split_dim in self.parameters + + Args: + observed_df (pd.DataFrame): dataframe containing observed data used to model + must contain columns specified in the keys of the segments section of the config + segment_column_list (list): list of columns of observed_df to use to determine segments + """ # Construct a DataFrame containing all combination of segment values ## in the observed_df - combination_df = self.observed_df[ - self.metric_hub.segments.keys() - ].drop_duplicates() + combination_df = observed_df[segment_column_list].drop_duplicates() # Construct dictionaries from those combinations segment_combinations = combination_df.to_dict("records") @@ -90,6 +106,13 @@ def __post_init__(self) -> None: ## populate the list with segments and parameters for the segment split_dim = self.parameters["model_setting_split_dim"] + # check to make sure split_dim is one of the columns set in segment_column_list + if split_dim not in segment_column_list: + columns_str = ",".join(segment_column_list) + raise ValueError( + f"model_setting_split_dim set to {split_dim} which is not among segment columns: {columns_str}" + ) + # For each segment combinination, get the model parameters from the config ## file. Parse the holidays and regressors specified in the config file. 
segment_models = [] @@ -126,9 +149,6 @@ def __post_init__(self) -> None: ) self.segment_models = segment_models - # initialize unset attributes - self.components_df = None - @property def column_names_map(self) -> Dict[str, str]: """ @@ -211,81 +231,102 @@ def _build_model( return m - def _build_model_dataframe( + def _build_train_dataframe( self, + observed_df, segment_settings: SegmentModelSettings, - task: str, add_logistic_growth_cols: bool = False, ) -> pd.DataFrame: """ - Build the model dataframe for training or prediction. + Build the model dataframe for training Args: + observed_df: dataframe of observed data segment_settings (SegmentModelSettings): The settings for the segment. - task (str): The task, either 'train' or 'predict'. add_logistic_growth_cols (bool, optional): Whether to add logistic growth columns. Defaults to False. Returns: pd.DataFrame: The dataframe for the model. """ - # build training dataframe - if task == "train": - # find indices in observed_df for rows that exactly match segment dict - segment_historical_indices = ( - self.observed_df[list(segment_settings.segment)] - == pd.Series(segment_settings.segment) - ).all(axis=1) - df = ( - self.observed_df.loc[ - (segment_historical_indices) - & ( # filter observed_df if segment start date > metric_hub start date - self.observed_df["submission_date"] - >= datetime.strptime( - segment_settings.start_date, "%Y-%m-%d" - ).date() - ) - ] - .rename(columns=self.column_names_map) - .copy() - ) - # define limits for logistic growth - if add_logistic_growth_cols: - df["floor"] = df["y"].min() * 0.5 - df["cap"] = df["y"].max() * 1.5 + # find indices in observed_df for rows that exactly match segment dict + segment_historical_indices = ( + observed_df[list(segment_settings.segment)] + == pd.Series(segment_settings.segment) + ).all(axis=1) + df = ( + observed_df.loc[ + (segment_historical_indices) + & ( # filter observed_df if segment start date > metric_hub start date + 
observed_df["submission_date"] + >= datetime.strptime(segment_settings.start_date, "%Y-%m-%d").date() + ) + ] + .rename(columns=self.column_names_map) + .copy() + ) + # define limits for logistic growth + if add_logistic_growth_cols: + df["floor"] = df["y"].min() * 0.5 + df["cap"] = df["y"].max() * 1.5 + + if segment_settings.regressors: + df = self._add_regressors(df, segment_settings.regressors) + return df + + def _build_predict_dataframe( + self, + dates_to_predict: pd.DataFrame, + segment_settings: SegmentModelSettings, + add_logistic_growth_cols: bool = False, + ) -> pd.DataFrame: + """creates dataframe used for prediction + + Args: + dates_to_predict (pd.DataFrame): dataframe of dates to predict + segment_settings (SegmentModelSettings): settings related to the segment + add_logistic_growth_cols (bool): Whether to add logistic growth columns. Defaults to False. + + Returns: + pd.DataFrame: dataframe to use used in prediction + """ # predict dataframe only needs dates to predict, logistic growth limits, and regressors - elif task == "predict": - df = self.dates_to_predict.rename(columns=self.column_names_map).copy() - if add_logistic_growth_cols: - df["floor"] = segment_settings.trained_parameters["floor"] - df["cap"] = segment_settings.trained_parameters["cap"] - else: - raise ValueError(f"task set to {task}, must be train or predict") + df = dates_to_predict.rename(columns=self.column_names_map).copy() + if add_logistic_growth_cols: + df["floor"] = segment_settings.trained_parameters["floor"] + df["cap"] = segment_settings.trained_parameters["cap"] if segment_settings.regressors: df = self._add_regressors(df, segment_settings.regressors) return df - def _fit(self) -> None: + def _fit(self, observed_df: pd.DataFrame) -> None: """ Fit and save a Prophet model for each segment combination. + + Args: + observed_df (pd.DataFrame): dataframe of observations. 
Expected to have columns + specified in the segments section of the config, + submission_date column with unique dates corresponding to each observation and + y column containing values of observations """ for segment_settings in self.segment_models: - parameters = self._auto_tuning(segment_settings) + parameters = self._auto_tuning(observed_df, segment_settings) # Initialize model; build model dataframe add_log_growth_cols = ( "growth" in parameters.keys() and parameters["growth"] == "logistic" ) - test_dat = self._build_model_dataframe( - segment_settings, "train", add_log_growth_cols + test_dat = self._build_train_dataframe( + observed_df, segment_settings, add_log_growth_cols ) model = self._build_model(segment_settings, parameters) model.fit(test_dat) if add_log_growth_cols: + # all values in these colunns are the same parameters["floor"] = test_dat["floor"].values[0] parameters["cap"] = test_dat["cap"].values[0] @@ -296,11 +337,39 @@ def _fit(self) -> None: segment_settings.trained_parameters = parameters segment_settings.segment_model = model - def _auto_tuning(self, segment_settings: SegmentModelSettings) -> Dict[str, float]: + def _get_crossvalidation_metric( + self, m: prophet.Prophet, cv_settings: dict + ) -> float: + """function for calculated the metric used for crossvalidation + + Args: + m (prophet.Prophet): Prophet model for crossvalidation + cv_settings (dict): settings set by segment in the config file + + Returns: + float: Metric where closer to zero means a better model + """ + df_cv = cross_validation(m, **cv_settings) + + df_bias = df_cv.groupby("cutoff")[["yhat", "y"]].sum().reset_index() + df_bias["pcnt_bias"] = df_bias["yhat"] / df_bias["y"] - 1 + # Prophet splits the historical data when doing cross validation using + # cutoffs. The `.tail(3)` limits the periods we consider for the best + # parameters to the 3 most recent cutoff periods. 
+ return df_bias.tail(3)["pcnt_bias"].mean() + + def _auto_tuning( + self, observed_df, segment_settings: SegmentModelSettings + ) -> Dict[str, float]: """ Perform automatic tuning of model parameters. Args: + observed_df (pd.DataFrame): dataframe of observed data + Expected to have columns: + specified in the segments section of the config, + submission_date column with unique dates corresponding to each observation and + y column containing values of observations segment_settings (SegmentModelSettings): The settings for the segment. Returns: @@ -320,8 +389,8 @@ def _auto_tuning(self, segment_settings: SegmentModelSettings) -> Dict[str, floa for v in itertools.product(*segment_settings.grid_parameters.values()) ] - test_dat = self._build_model_dataframe( - segment_settings, "train", add_log_growth_cols + test_dat = self._build_train_dataframe( + observed_df, segment_settings, add_log_growth_cols ) bias = [] @@ -329,14 +398,10 @@ def _auto_tuning(self, segment_settings: SegmentModelSettings) -> Dict[str, floa m = self._build_model(segment_settings, params) m.fit(test_dat) - df_cv = cross_validation(m, **segment_settings.cv_settings) - - df_bias = df_cv.groupby("cutoff")[["yhat", "y"]].sum().reset_index() - df_bias["pcnt_bias"] = df_bias["yhat"] / df_bias["y"] - 1 - # Prophet splits the historical data when doing cross validation using - # cutoffs. The `.tail(3)` limits the periods we consider for the best - # parameters to the 3 most recent cutoff periods. 
-        bias.append(df_bias.tail(3)["pcnt_bias"].mean())
+        crossval_metric = self._get_crossvalidation_metric(
+            m, segment_settings.cv_settings
+        )
+        bias.append(crossval_metric)
 
         min_abs_bias_index = np.argmin(np.abs(bias))
 
@@ -357,19 +422,20 @@ def _add_regressors(self, df: pd.DataFrame, regressors: List[ProphetRegressor]):
             regressor = self._fill_regressor_dates(regressor)
             # finds rows where date is in regressor date ranges and sets that regressor
             ## value to 1, else 0
-            df[regressor.name] = np.where(
+            df[regressor.name] = (
                 (df["ds"] >= pd.to_datetime(regressor.start_date).date())
-                & (df["ds"] <= pd.to_datetime(regressor.end_date).date()),
-                0,
-                1,
-            )
+                & (df["ds"] <= pd.to_datetime(regressor.end_date).date())
+            ).astype(int)
         return df
 
-    def _predict(self, segment_settings: SegmentModelSettings) -> pd.DataFrame:
+    def _predict(
+        self, dates_to_predict_raw: pd.DataFrame, segment_settings: SegmentModelSettings
+    ) -> pd.DataFrame:
         """
         Generate forecast samples for a segment.
 
         Args:
+            dates_to_predict_raw (pd.DataFrame): dataframe of dates to predict
             segment_settings (SegmentModelSettings): The settings for the segment.
Returns: @@ -380,14 +446,14 @@ def _predict(self, segment_settings: SegmentModelSettings) -> pd.DataFrame: and segment_settings.trained_parameters["growth"] == "logistic" ) # add regressors, logistic growth limits (if applicable) to predict dataframe - dates_to_predict = self._build_model_dataframe( - segment_settings, "predict", add_log_growth_cols + dates_to_predict = self._build_predict_dataframe( + dates_to_predict_raw, segment_settings, add_log_growth_cols ) # draws samples from Prophet posterior distribution, to provide percentile predictions samples = segment_settings.segment_model.predictive_samples(dates_to_predict) df = pd.DataFrame(samples["yhat"]) - df["submission_date"] = self.dates_to_predict + df["submission_date"] = dates_to_predict_raw component_cols = [ "ds", @@ -467,6 +533,54 @@ def _percentile_name_map(self, percentiles: List[int]) -> Dict[str, str]: "mean": "value", } + def _combine_forecast_observed( + self, + forecast_df: pd.DataFrame, + observed_df: pd.DataFrame, + period: str, + numpy_aggregations: List, + percentiles, + segment: dict, + ) -> pd.DataFrame: + """Calculate aggregates over the forecase and observed data + and concatenate the two dataframes + Args: + forecast_df (pd.DataFrame): forecast dataframe + observed_df (pd.DataFrame): observed dataframe + period (str): period to aggregate up to, must be in (day, month, year) + numpy_aggregations (List): List of aggregation functions to apply across samples from the + posterior-predictive distribution. 
Must take + in a numpy array and return a single value + percentiles: 3-element list of percentiles to calculate across samples from the posterior-predictive distribution + segment (dict): dictionary that lists columns and values corresponding to the segment + keys are the column name used to segment and values are the values + of that column corresponding to the current segment + + Returns: + pd.DataFrame: combined dataframe containing aggregated values from observed and forecast + """ + forecast_summarized, observed_summarized = self._aggregate_forecast_observed( + forecast_df, observed_df, period, numpy_aggregations, percentiles + ) + + # add datasource-specific metadata columns + forecast_summarized["source"] = "forecast" + observed_summarized["source"] = "historical" + + # add segment columns to forecast table + for dim, value in segment.items(): + forecast_summarized[dim] = value + + # rename forecast percentile to low, middle, high + # rename mean to value + forecast_summarized = forecast_summarized.rename( + columns=self._percentile_name_map(percentiles) + ) + + # create a single dataframe that contains observed and forecasted data + df = pd.concat([observed_summarized, forecast_summarized]) + return df + def _summarize( self, segment_settings: SegmentModelSettings, @@ -475,7 +589,8 @@ def _summarize( percentiles: List[int] = [10, 50, 90], ) -> pd.DataFrame: """ - Calculate summary metrics for `forecast_df` over a given period, and add metadata. + Calculate summary metrics on a specific segment + for `forecast_df` over a given period, and add metadata. Args: segment_settings (SegmentModelSettings): The settings for the segment. @@ -492,9 +607,6 @@ def _summarize( Can only pass a list of length 3 as percentiles, for lower, mid, and upper values. 
""" ) - # build a list of all functions that we'll summarize the data by - aggregations = [getattr(np, i) for i in numpy_aggregations] - aggregations.extend([pdx.percentile(i) for i in percentiles]) # the start date for this segment's historical data, in cases where the full time series ## of historical data is not used for model training @@ -508,82 +620,24 @@ def _summarize( == pd.Series(segment_settings.segment) ).all(axis=1) - # aggregate metric to the correct date period (day, month, year) - observed_summarized = pdx.aggregate_to_period( - ( - self.observed_df.loc[ - (segment_historical_indices) - & ( - self.observed_df["submission_date"] - >= segment_observed_start_date - ) - ].copy() - ), + segment_observed_df = self.observed_df.loc[ + (segment_historical_indices) + & (self.observed_df["submission_date"] >= segment_observed_start_date) + ].copy() + + df = self._combine_forecast_observed( + segment_settings.forecast_df, + segment_observed_df, period, + numpy_aggregations, + percentiles, + segment_settings.segment, ) - forecast_agg = pdx.aggregate_to_period(segment_settings.forecast_df, period) - - # find periods of overlap between observed and forecasted data - overlap = forecast_agg.merge( - observed_summarized, - on="submission_date", - how="left", - ).fillna(0) - - forecast_summarized = ( - forecast_agg.set_index("submission_date") - # Add observed data samples to any overlapping forecasted period. This - # ensures that any forecast made partway through a period accounts for - # previously observed data within the period. For example, when a monthly - # forecast is generated in the middle of the month. 
- .add(overlap[["value"]].values) - # calculate summary values, aggregating by submission_date, - .agg(aggregations, axis=1) - .reset_index() - ).rename(columns=self._percentile_name_map(percentiles)) - # add datasource-specific metadata columns - forecast_summarized["source"] = "forecast" - observed_summarized["source"] = "historical" - - # add segment columns to forecast table - for dim, value in segment_settings.segment.items(): - forecast_summarized[dim] = value - - # create a single dataframe that contains observed and forecasted data - df = pd.concat([observed_summarized, forecast_summarized]) + df["forecast_parameters"] = json.dumps(segment_settings.trained_parameters) # add summary metadata columns df["aggregation_period"] = period.lower() - - # reorder columns to make interpretation easier - df = df[ - [ - "submission_date", - "aggregation_period", - "source", - "value", - "value_low", - "value_mid", - "value_high", - ] - ] - - # add Metric Hub metadata columns - df["metric_alias"] = self.metric_hub.alias.lower() - df["metric_hub_app_name"] = self.metric_hub.app_name.lower() - df["metric_hub_slug"] = self.metric_hub.slug.lower() - df["metric_start_date"] = pd.to_datetime(self.metric_hub.min_date) - df["metric_end_date"] = pd.to_datetime(self.metric_hub.max_date) - df["metric_collected_at"] = self.collected_at - - # add forecast model metadata columns - df["forecast_start_date"] = self.start_date - df["forecast_end_date"] = self.end_date - df["forecast_trained_at"] = self.trained_at - df["forecast_predicted_at"] = self.predicted_at - df["forecast_parameters"] = json.dumps(segment_settings.trained_parameters) - return df def predict(self) -> None: @@ -593,7 +647,7 @@ def predict(self) -> None: self.predicted_at = datetime.utcnow() for segment_settings in self.segment_models: - forecast_df = self._predict(segment_settings) + forecast_df = self._predict(self.dates_to_predict, segment_settings) self._validate_forecast_df(forecast_df) 
segment_settings.forecast_df = forecast_df @@ -627,13 +681,29 @@ def summarize( ] ) for dim, dim_value in segment.segment.items(): - summary_df[dim] = dim_value segment.components_df[dim] = dim_value summary_df_list.append(summary_df.copy(deep=True)) components_df_list.append(segment.components_df) del summary_df - self.summary_df = pd.concat(summary_df_list, ignore_index=True) + df = pd.concat(summary_df_list, ignore_index=True) + + # add Metric Hub metadata columns + df["metric_alias"] = self.metric_hub.alias.lower() + df["metric_hub_app_name"] = self.metric_hub.app_name.lower() + df["metric_hub_slug"] = self.metric_hub.slug.lower() + df["metric_start_date"] = pd.to_datetime(self.metric_hub.min_date) + df["metric_end_date"] = pd.to_datetime(self.metric_hub.max_date) + df["metric_collected_at"] = self.collected_at + + # add forecast model metadata columns + df["forecast_start_date"] = self.start_date + df["forecast_end_date"] = self.end_date + df["forecast_trained_at"] = self.trained_at + df["forecast_predicted_at"] = self.predicted_at + + self.summary_df = df + self.components_df = pd.concat(components_df_list, ignore_index=True) def write_results( diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py index b8539dab..60b8982a 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py @@ -150,7 +150,7 @@ def _predict_legacy(self) -> pd.DataFrame: return df[columns] - def _combine_forecast_observed( + def _aggregate_forecast_observed( self, forecast_df, observed_df, @@ -186,17 +186,34 @@ def _combine_forecast_observed( # calculate summary values, aggregating by submission_date, .agg(aggregations, axis=1) .reset_index() - # "melt" the df from wide-format to long-format. 
- .melt(id_vars="submission_date", var_name="measure") ) + return forecast_summarized, observed_summarized + + def _combine_forecast_observed( + self, + forecast_df, + observed_df, + period: str, + numpy_aggregations: List[str], + percentiles: List[int], + ): + forecast_summarized, observed_summarized = self._aggregate_forecast_observed( + forecast_df, observed_df, period, numpy_aggregations, percentiles + ) + + # remaining column of metric values get the column name 'value' + forecast_summarized = forecast_summarized.melt( + id_vars="submission_date", var_name="measure" + ) + observed_summarized["measure"] = "observed" + # add datasource-specific metadata columns forecast_summarized["source"] = "forecast" observed_summarized["source"] = "historical" - observed_summarized["measure"] = "observed" - # create a single dataframe that contains observed and forecasted data - df = pd.concat([observed_summarized, forecast_summarized]) + df = pd.concat([forecast_summarized, observed_summarized]) + return df def _summarize( diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py index bf8342ea..c792db67 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py @@ -1,8 +1,11 @@ -import re +"""tests for the funnel forecast module""" + +import collections import pandas as pd from dotmap import DotMap import pytest +import numpy as np from kpi_forecasting.configs.model_inputs import ProphetRegressor, ProphetHoliday @@ -11,6 +14,8 @@ @pytest.fixture() def forecast(): + """This mocks a generic forecast object""" + # 2024-01-01 is arbitarily chosen as a future date predict_start_date = "2124-01-01" predict_end_date = "2124-03-01" @@ -25,7 +30,958 @@ def forecast(): return forecast +@pytest.fixture() +def segment_info_fit_tests(): + """This fixture creates segment info dictionaries + that mimic the 
content of the config file and are used + in the functions that test fit methods""" + + # 2024-01-01 is arbitarily chosen as a future date + A1_start_date = "2124-01-01" + A2_start_date = "2124-01-02" + + segment_info_dict = { + "A1": { + "start_date": A1_start_date, + "grid_parameters": {"param1": [1, 2], "param2": [20, 10]}, + "min_param_value": 10, + }, + "A2": { + "start_date": A2_start_date, + "grid_parameters": {"param1": [-1, -2], "param2": [3, 4]}, + "min_param_value": -3, # closest to zero + }, + } + return segment_info_dict + + +@pytest.fixture() +def funnel_forecast_for_fit_tests(segment_info_fit_tests, mocker): + """This method creates a forecast object from the segment dict + created in the segment_info_fit_tests fixture. It also + mocks some of the object methods to enable easier testing""" + parameter_dict = { + "model_setting_split_dim": "a", + "segment_settings": { + "A1": { + "start_date": segment_info_fit_tests["A1"]["start_date"], + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": segment_info_fit_tests["A1"]["grid_parameters"], + "cv_settings": {}, + }, + "A2": { + "start_date": segment_info_fit_tests["A2"]["start_date"], + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": segment_info_fit_tests["A2"]["grid_parameters"], + "cv_settings": {}, + }, + }, + } + + parameter_dotmap = DotMap(parameter_dict) + predict_start_date = "2124-01-01" + predict_end_date = "2124-01-02" + + forecast = FunnelForecast( + model_type="test", + parameters=parameter_dotmap, + use_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) + + mocker.patch.object(forecast, "_build_model", mock_build_model) + mocker.patch.object( + forecast, "_get_crossvalidation_metric", mock_get_crossvalidation_metric + ) + + return forecast + + +class MockModel: + """Used in place of prophet.Prophet for testing purposes""" + + def __init__(self, param1=0, param2=0): + self.value = param1 * 
param2 + self.history = None + + def fit(self, df, *args, **kwargs): + self.history = df + return None + + def predict(self, dates_to_predict): + output = dates_to_predict.copy() + + output[ + [ + "yhat", + "trend", + "trend_upper", + "trend_lower", + "weekly", + "weekly_upper", + "weekly_lower", + "yearly", + "yearly_upper", + "yearly_lower", + ] + ] = 0 # some dummy value so it has the right shape + + return output + + def predictive_samples(self, dates_to_predict): + # prophet function outputs dict of numpy arrays + # only element we care about is `yhat` + output = np.arange(len(dates_to_predict)) * self.value + return {"yhat": {0: output}} + + +def mock_build_model(segment_settings, parameters): + """mocks the FunnelForecast build_model method""" + return MockModel( + **parameters, + ) + + +def mock_get_crossvalidation_metric(m, *args, **kwargs): + """mocks the FunnelForecast get_crossvalidation_metric + method, meant to be used with MockModel""" + return m.value # value atrribute in MockModel + + +def mock_aggregate_forecast_observed( + forecast_df, observed_df, period, numpy_aggregations, percentiles +): + """Mocks the aggregate_forecast_observed function defined in ProphetForecast + and inherited in FunnelForecast. 
+ This function is tested extensively in test_prophet_forecast + so we can make dummy outputs for tests related to it""" + + # add dummy columns where aggregated metrics woudl go + percentile_columns = [f"p{el}" for el in percentiles] + output_forecast_df = forecast_df.copy() + output_forecast_df[numpy_aggregations + percentile_columns] = 0 + return output_forecast_df, observed_df.copy() + + +def test_combine_forecast_observed(mocker, forecast): + """tests the _combine_forecast_observed method""" + mocker.patch.object( + forecast, "_aggregate_forecast_observed", mock_aggregate_forecast_observed + ) + + forecast_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + observed_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "a": ["A1", "A1"], + "value": [5, 6], + } + ) + + numpy_aggregations = ["mean"] + percentiles = [10, 50, 90] + + output_df = forecast._combine_forecast_observed( + forecast_df=forecast_df, + observed_df=observed_df, + period="period", + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, + segment={"a": "A1"}, + ) + + # mean was renamed to value, percentiles to high, medium, low + forecast_df[["value", "value_low", "value_mid", "value_high"]] = 0 + forecast_df["a"] = "A1" # this column is already present in observed + + forecast_df["source"] = "forecast" + observed_df["source"] = "historical" + + # concat in same order to make our lives easier + expected = pd.concat([observed_df, forecast_df]) + assert set(expected.columns) == set(output_df.columns) + pd.testing.assert_frame_equal(output_df, expected[output_df.columns]) + + # should not be any nulls outside the metric column + non_metric_columns = [ + el + for el in output_df.columns + if el not in ["value", "value_low", "value_mid", "value_high"] + ] + assert not pd.isna(output_df[non_metric_columns]).any(axis=None) + 
+ +def test_under_summarize(mocker, forecast): + """testing _summarize""" + # 2024-01-01 is chosen as an arbitrary date to center the tests around + + # forecast predictions are set with the + # mock_aggregate_forecast_observed function so they + # can be ommited here + forecast_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + # rows with negative values are those expected to be removed + # by filters in summarize + observed_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2123-01-01").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "a": ["A1", "A1", "A1", "A2", "A2"], + "value": [10, 20, 30, 40, 50], + } + ) + + SegmentSettings = collections.namedtuple( + "SegmentSettings", + ["start_date", "forecast_df", "segment", "trained_parameters"], + ) + dummy_segment_settings = SegmentSettings( + start_date="2124-01-01", + forecast_df=forecast_df.copy(), + segment={"a": "A1"}, + trained_parameters={"trained_parameters": "yes"}, + ) + + mocker.patch.object( + forecast, "_aggregate_forecast_observed", mock_aggregate_forecast_observed + ) + + forecast.observed_df = observed_df + + numpy_aggregations = ["mean"] + percentiles = [10, 50, 90] + output_df = forecast._summarize( + segment_settings=dummy_segment_settings, + period="period", + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, + ) + observed_expected_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "a": ["A1", "A1"], + "value": [20, 30], + } + ) + + # percentile numeric values changed to names + # mean gets mapped to value + forecast_df[["value", "value_low", "value_mid", "value_high"]] = 0 + + forecast_df["a"] = "A1" # this column is already present in observed + + forecast_df["source"] = 
"forecast" + observed_expected_df["source"] = "historical" + + # concat in same order to make our lives easier + expected = pd.concat([observed_expected_df, forecast_df]) + expected["forecast_parameters"] = '{"trained_parameters": "yes"}' + expected["aggregation_period"] = "period" + + assert set(expected.columns) == set(output_df.columns) + # force value columns to be floats in both cases to make check easier + numeric_cols = ["value", "value_low", "value_mid", "value_high"] + expected[numeric_cols] = expected[numeric_cols].astype(float) + output_df[numeric_cols] = output_df[numeric_cols].astype(float) + pd.testing.assert_frame_equal( + output_df.reset_index(drop=True), + expected[output_df.columns].reset_index(drop=True), + ) + + # should not be any nulls outside the metric column + non_metric_columns = [el for el in output_df.columns if el not in numeric_cols] + assert not pd.isna(output_df[non_metric_columns]).any(axis=None) + + +def test_summarize(mocker, forecast): + """testing summarize""" + # create dummy metric hub object to when meta data from + # it is added we don't get an error + MetricHub = collections.namedtuple( + "MetricHub", + ["alias", "app_name", "slug", "min_date", "max_date"], + ) + + dummy_metric_hub = MetricHub("", "", "", "2124-01-01", "2124-01-01") + + # forecast predictions are set with the + # mock_aggregate_forecast_observed function so they + # can be ommited here + forecast_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + # rows with negative values are those expected to be removed + # by filters in summarize + observed_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2123-01-01").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "a": ["A1", "A1", "A1", "A2", "A2"], + "value": [10, 20, 30, 40, 50], + } + ) + + 
SegmentSettings = collections.namedtuple( + "SegmentSettings", + ["start_date", "forecast_df", "segment", "trained_parameters", "components_df"], + ) + + # for the components_df the contents aren't important here + # we're only testing that it is concatenated properly + # with the segment data added + dummy_segment_settings_A1 = SegmentSettings( + start_date="2124-01-01", + forecast_df=forecast_df.copy(), + segment={"a": "A1"}, + trained_parameters={"trained_parameters": "yes"}, + components_df=pd.DataFrame({"testcol": [1]}), + ) + + dummy_segment_settings_A2 = SegmentSettings( + start_date="2124-01-01", + forecast_df=forecast_df.copy(), + segment={"a": "A2"}, + trained_parameters={"trained_parameters": "yes"}, + components_df=pd.DataFrame({"testcol": [2]}), + ) + + segment_models = [dummy_segment_settings_A1, dummy_segment_settings_A2] + + mocker.patch.object( + forecast, "_aggregate_forecast_observed", mock_aggregate_forecast_observed + ) + + forecast.observed_df = observed_df + forecast.segment_models = segment_models + forecast.metric_hub = dummy_metric_hub + + # timestamp attributes created by fit and predict + # must be added manuall + forecast.collected_at = "" + forecast.trained_at = "" + forecast.predicted_at = "" + + numpy_aggregations = ["mean"] + percentiles = [10, 50, 90] + forecast.summarize( + periods=["period"], + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, + ) + + output_df = forecast.summary_df + + # time filter removes first element of observed_df + observed_expected_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "a": ["A1", "A1", "A2", "A2"], + "value": [20, 30, 40, 50], + } + ) + + # doubled because there are two segments in the observed data + forecast_df = pd.concat([forecast_df, forecast_df]) + + forecast_df[["value", "value_low", "value_mid", "value_high"]] = 
0 + forecast_df["source"] = "forecast" + + # segment data column is already present in observed + # needs to be added manually for forecast + forecast_df["a"] = [ + "A1", + "A1", + "A2", + "A2", + ] + + observed_expected_df["source"] = "historical" + + # concat in same order to make our lives easier + expected = pd.concat([observed_expected_df, forecast_df]) + expected["forecast_parameters"] = '{"trained_parameters": "yes"}' + expected["aggregation_period"] = "period" + + # not going to check all the metadata columns + # in assert_frame_equal. Just make sure they're there + metadata_columns = { + "metric_alias", + "metric_hub_app_name", + "metric_hub_slug", + "metric_start_date", + "metric_end_date", + "metric_collected_at", + "forecast_start_date", + "forecast_end_date", + "forecast_trained_at", + "forecast_predicted_at", + } + assert set(expected.columns) | metadata_columns == set(output_df.columns) + # force value columns to be floats in both cases to make check easier + numeric_cols = ["value", "value_low", "value_mid", "value_high"] + expected[numeric_cols] = expected[numeric_cols].astype(float) + output_df[numeric_cols] = output_df[numeric_cols].astype(float) + pd.testing.assert_frame_equal( + output_df.sort_values(["a", "submission_date"])[expected.columns].reset_index( + drop=True + ), + expected.sort_values(["a", "submission_date"]).reset_index(drop=True), + ) + + # should not be any nulls outside the metric column + non_metric_columns = [el for el in output_df.columns if el not in numeric_cols] + assert not pd.isna(output_df[non_metric_columns]).any(axis=None) + + # check components + # only checking that concatenation happened properly + # with segment data added + output_components = forecast.components_df + expected_components = pd.DataFrame({"testcol": [1, 2], "a": ["A1", "A2"]}) + pd.testing.assert_frame_equal(expected_components, output_components) + + +def test_under_predict(mocker): + """testing _predict""" + # set segment models + # 2124-01-01 
chosen as a artibrary date to center tests on + A1_start_date = "2124-01-01" + parameter_dict = { + "model_setting_split_dim": "a", + "segment_settings": { + "A1": { + "start_date": A1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {"param1": [1, 2], "param2": [20, 10]}, + "cv_settings": {}, + }, + }, + } + + parameter_dotmap = DotMap(parameter_dict) + predict_start_date = "2124-01-02" + predict_end_date = "2124-03-01" + + forecast = FunnelForecast( + model_type="test", + parameters=parameter_dotmap, + use_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) + + # this ensures forecast is using MockModel + mocker.patch.object(forecast, "_build_model", mock_build_model) + # the optimization is just using the value attribute of MockModel, + # which is the product of the parameteres passed. The crossvalidation + # will choose the parameters where the absolute value of the product is smallest + mocker.patch.object( + forecast, "_get_crossvalidation_metric", mock_get_crossvalidation_metric + ) + + observed_df = pd.DataFrame( + { + "a": ["A1", "A1"], + "b": ["B1", "B2"], + "y": [0, 1], + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + segment_list = ["a"] + + # manually set segment_models attribute here instead of in __post_init__ + # which is bypassed to avoid a metric hub call + forecast._set_segment_models( + observed_df=observed_df, segment_column_list=segment_list + ) + # check that we only have one element here + assert len(forecast.segment_models) == 1 + # because of the check above we can use the first element + # and know that's all the segments present + segment_settings = forecast.segment_models[0] + + dates_to_predict = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ] + } + ) + forecast.observed_df = observed_df + forecast.fit() + 
out = forecast._predict(dates_to_predict, segment_settings).reset_index(drop=True) + + # in MockModel, the predictive_samples method sets the output to + # np.arange(len(dates_to_predict)) * self.value for one column called 0 + # this helps ensure the forecast_df in segment_models is set properly + model_value = forecast.segment_models[0].segment_model.value + expected = pd.DataFrame( + { + 0: [0, model_value], + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + # time filter corresponds to the start time of the object + # as opposed to the segment + expected_time_filter = ( + expected["submission_date"] >= pd.to_datetime(forecast.start_date).date() + ) + expected = expected[expected_time_filter].reset_index(drop=True) + + pd.testing.assert_frame_equal(out, expected) + + # check the components + expected_components = observed_df[["submission_date", "y"]].copy() + expected_components[ + [ + "yhat", + "trend", + "trend_upper", + "trend_lower", + "weekly", + "weekly_upper", + "weekly_lower", + "yearly", + "yearly_upper", + "yearly_lower", + ] + ] = 0 + + components_df = forecast.segment_models[0].components_df + assert set(expected_components.columns) == set(components_df.columns) + pd.testing.assert_frame_equal( + components_df, expected_components[components_df.columns] + ) + + +def test_predict(funnel_forecast_for_fit_tests, segment_info_fit_tests): + """test the predict method. 
This is similar to test_under_predict + but multiple segments are acted upon""" + + observed_data = pd.DataFrame( + { + "a": ["A1", "A1", "A2", "A2"], + "b": ["B1", "B2", "B1", "B2"], + "y": [-1, 1, -1, 1], + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + segment_list = ["a"] + + funnel_forecast_for_fit_tests._set_segment_models( + observed_df=observed_data, segment_column_list=segment_list + ) + funnel_forecast_for_fit_tests.observed_df = observed_data + funnel_forecast_for_fit_tests.fit() + funnel_forecast_for_fit_tests.predict() + + for segment in funnel_forecast_for_fit_tests.segment_models: + key = segment.segment["a"] + + model_value = segment_info_fit_tests[key]["min_param_value"] + + # in MockModel, the predictive_samples method sets the output to + # np.arange(len(dates_to_predict)) * self.value for one column called 0 + # this helps ensure the forecast_df in segment_models is set properly + expected_raw = pd.DataFrame( + { + 0: [0, model_value], + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + # filter in predict happens against object start_date not + # segment start_date + expected_time_filter = ( + expected_raw["submission_date"] + >= pd.to_datetime(funnel_forecast_for_fit_tests.start_date).date() + ) + expected = expected_raw[expected_time_filter] + + forecast_df = segment.forecast_df + pd.testing.assert_frame_equal(forecast_df, expected) + + # check the components + expected_components = expected_raw[["submission_date"]].copy() + expected_components[ + [ + "yhat", + "trend", + "trend_upper", + "trend_lower", + "weekly", + "weekly_upper", + "weekly_lower", + "yearly", + "yearly_upper", + "yearly_lower", + ] + ] = 0 + + # because of time filtereing of training data, if the history has one + # element, y will but [0, 1]. 
The first element is turned into a NULL + # and then becomes a 0 because of fillna(0) + # if it has two it will have both elements and be [-1,1] + + if len(segment.segment_model.history) == 2: + expected_components["y"] = [-1, 1] + else: + expected_components["y"] = [0, 1] + + components_df = segment.components_df + + # there is weird stuff going on with the types but it shouldn't matter + # so coerce the type + expected_components["y"] = expected_components["y"].astype( + components_df["y"].dtype + ) + assert set(expected_components.columns) == set(components_df.columns) + pd.testing.assert_frame_equal( + components_df, + expected_components[components_df.columns], + check_column_type=False, + ) + + +def test_auto_tuning(forecast, mocker): + """test the auto_tuning function""" + + # set one segment with two sets of grid parameters + segment_settings = SegmentModelSettings( + segment={"a": "A1"}, + start_date="2124-01-01", + end_date="2124-03-01", + holidays=[], + regressors=[], + grid_parameters={"param1": [1, 2], "param2": [20, 10]}, + cv_settings={}, + ) + + mocker.patch.object(forecast, "_build_model", mock_build_model) + + # mock_get_crossvalidation_metric will choose the parameters that + # have the lowest absolute product + mocker.patch.object( + forecast, "_get_crossvalidation_metric", mock_get_crossvalidation_metric + ) + + observed_df = pd.DataFrame( + { + "a": ["A1", "A1"], + "b": ["B1", "B2"], + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-01").date(), + ], + } + ) + + forecast.segment_models = [segment_settings] + + best_params = forecast._auto_tuning(observed_df, segment_settings) + + # in the mocked class the two params get multiplied and the lowest combo gets select + assert best_params == {"param1": 1, "param2": 10} + + +def test_under_fit(funnel_forecast_for_fit_tests, segment_info_fit_tests): + """test the _fit method""" + + observed_data = pd.DataFrame( + { + "a": ["A1", "A1", "A2", "A2"], + "b": ["B1", 
"B2", "B1", "B2"], + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + segment_list = ["a"] + + funnel_forecast_for_fit_tests._set_segment_models( + observed_df=observed_data, segment_column_list=segment_list + ) + funnel_forecast_for_fit_tests._fit(observed_data) + + # _fit iterates though all the segments in segment_modles + # iterate through them and check based on the value in + # segment_info_fit_tests defined in the fixture of the same name + for segment in funnel_forecast_for_fit_tests.segment_models: + key = segment.segment["a"] + + assert segment.start_date == segment_info_fit_tests[key]["start_date"] + assert segment.grid_parameters == segment_info_fit_tests[key]["grid_parameters"] + segment_model = segment.segment_model + assert segment_model.value == segment_info_fit_tests[key]["min_param_value"] + + # the history attribute is used in the components output so check it is set properly + expected_training = observed_data[ + (observed_data["a"] == key) + & ( + observed_data["submission_date"] + >= pd.to_datetime(segment_info_fit_tests[key]["start_date"]).date() + ) + ].rename(columns={"submission_date": "ds"}) + + pd.testing.assert_frame_equal(segment_model.history, expected_training) + + +def test_fit(funnel_forecast_for_fit_tests, segment_info_fit_tests): + """test the fit function. It is inherited from BaseForecast + and calls _fit with the proper object attributes. 
Test looks very + similar to that for _fit""" + observed_data = pd.DataFrame( + { + "a": ["A1", "A1", "A2", "A2"], + "b": ["B1", "B2", "B1", "B2"], + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + segment_list = ["a"] + + funnel_forecast_for_fit_tests._set_segment_models( + observed_df=observed_data, segment_column_list=segment_list + ) + funnel_forecast_for_fit_tests.observed_df = observed_data + funnel_forecast_for_fit_tests.fit() + + # _fit is called by fit and iterates though all the segments in segment_modles + # iterate through them and check based on the value in + # segment_info_fit_tests defined in the fixture of the same name + for segment in funnel_forecast_for_fit_tests.segment_models: + key = segment.segment["a"] + + assert segment.start_date == segment_info_fit_tests[key]["start_date"] + assert segment.grid_parameters == segment_info_fit_tests[key]["grid_parameters"] + segment_model = segment.segment_model + assert segment_model.value == segment_info_fit_tests[key]["min_param_value"] + + # check history attribute + expected_training = observed_data[ + (observed_data["a"] == key) + & ( + observed_data["submission_date"] + >= pd.to_datetime(segment_info_fit_tests[key]["start_date"]).date() + ) + ].rename(columns={"submission_date": "ds"}) + pd.testing.assert_frame_equal(segment_model.history, expected_training) + + +def test_set_segment_models(): + """test the set_segment_models method""" + A1_start_date = "2018-01-01" + A2_start_date = "2020-02-02" + parameter_dict = { + "model_setting_split_dim": "a", + "segment_settings": { + "A1": { + "start_date": A1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, + }, + "A2": { + "start_date": A2_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + 
"cv_settings": {}, + }, + }, + } + + parameter_dotmap = DotMap(parameter_dict) + predict_start_date = "2124-01-01" + predict_end_date = "2124-03-01" + + forecast = FunnelForecast( + model_type="test", + parameters=parameter_dotmap, + use_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) + + observed_data = pd.DataFrame( + {"a": ["A1", "A1", "A2", "A2", "A2"], "b": ["B1", "B2", "B1", "B2", "B2"]} + ) + + segment_list = ["a", "b"] + + forecast._set_segment_models( + observed_df=observed_data, segment_column_list=segment_list + ) + + # put the segments and the start date in the same dictionary to make + # comparison easier + # the important things to check is that all possible combinations + # of segments are present and that each has the parameters set properly + # start_date is a stand-in for these parameters and + # is determined by the value of a as specified in parameter_dict + check_segment_models = [ + dict(**el.segment, **{"start_date": el.start_date}) + for el in forecast.segment_models + ] + expected = [ + {"a": "A1", "b": "B1", "start_date": A1_start_date}, + {"a": "A1", "b": "B2", "start_date": A1_start_date}, + {"a": "A2", "b": "B1", "start_date": A2_start_date}, + {"a": "A2", "b": "B2", "start_date": A2_start_date}, + ] + + # can't make a set of dicts for comparison + # so sort the lists and compare each element + compare_sorted = zip( + sorted(check_segment_models, key=lambda x: (x["a"], x["b"])), + sorted(expected, key=lambda x: (x["a"], x["b"])), + ) + + for checkval, expectedval in compare_sorted: + assert checkval == expectedval + + +def test_set_segment_models_exception(): + """test the exception for segment_models where + and exception is raised if a model_setting_split_dim + is specified that isn't in the data""" + A1_start_date = "2018-01-01" + A2_start_date = "2020-02-02" + parameter_dict = { + "model_setting_split_dim": "c", # not in data + "segment_settings": { + "A1": { + "start_date": 
A1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, + }, + "A2": { + "start_date": A2_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, + }, + }, + } + + parameter_dotmap = DotMap(parameter_dict) + predict_start_date = "2124-01-01" + predict_end_date = "2124-03-01" + + forecast = FunnelForecast( + model_type="test", + parameters=parameter_dotmap, + use_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) + + observed_data = pd.DataFrame( + {"a": ["A1", "A1", "A2", "A2", "A2"], "b": ["B1", "B2", "B1", "B2", "B2"]} + ) + + segment_list = ["a", "b"] + + with pytest.raises( + ValueError, + match="model_setting_split_dim set to c which is not among segment columns: a,b", + ): + forecast._set_segment_models( + observed_df=observed_data, segment_column_list=segment_list + ) + + def test_fill_regressor_dates(forecast): + """test _fill_regressor_dates + the name in the regressor info indicates which case is being tested + Dates are chosen arbitrarily""" regressor_info = { "name": "only_start", "description": "only has a start", @@ -81,6 +1037,8 @@ def test_fill_regressor_dates(forecast): def test_add_regressors(forecast): + """test add regressors + test case for each element of regressor_list_raw is indicated in name""" regressor_list_raw = [ { "name": "all_in", @@ -90,7 +1048,7 @@ def test_add_regressors(forecast): }, { "name": "all_out", - "description": "it's all in", + "description": "it's all out", "start_date": "2124-02-01", "end_date": "2124-02-06", }, @@ -131,10 +1089,10 @@ def test_add_regressors(forecast): pd.to_datetime("2124-01-03").date(), pd.to_datetime("2124-01-04").date(), ], - "all_in": [0, 0, 0, 0], - "all_out": [1, 1, 1, 1], - "just_end": [1, 1, 0, 0], - "just_middle": [1, 0, 0, 1], + "all_in": [1, 1, 1, 1], + "all_out": [0, 0, 0, 0], + "just_end": [0, 0, 1, 1], + "just_middle": 
[0, 1, 1, 0], } ) @@ -142,7 +1100,8 @@ def test_add_regressors(forecast): pd.testing.assert_frame_equal(output_df, expected_df[output_df.columns]) -def test_build_model_dataframe_exception(forecast): +def test_build_train_dataframe_no_regressors(forecast): + """test _build_train_dataframe with no regressors""" regressor_list = [] grid_parameters = { @@ -185,61 +1144,8 @@ def test_build_model_dataframe_exception(forecast): } ) - forecast.observed_df = observed_df - - with pytest.raises(ValueError, match="task set to test, must be train or predict"): - _ = forecast._build_model_dataframe( - segment_settings=segment_settings, task="test" - ) - - -def test_build_model_dataframe_no_regressors_train(forecast): - regressor_list = [] - - grid_parameters = { - "changepoint_prior_scale": [0.01, 0.1, 0.15, 0.2], - "changepoint_range": [0.8, 0.9, 1], - "n_changepoints": [30], - "weekly_seasonality": True, - "yearly_seasonality": True, - "growth": "logistic", - } - cv_settings = { - "initial": "366 days", - "period": "30 days", - "horizon": "30 days", - "parallel": "processes", - } - segment_settings = SegmentModelSettings( - segment={"a": 1, "b": 2}, - start_date="2124-01-01", - end_date="2124-02-01", - holidays=[], - regressors=[ProphetRegressor(**r) for r in regressor_list], - grid_parameters=grid_parameters, - cv_settings=cv_settings, - ) - - observed_df = pd.DataFrame( - { - "a": [1, 1, 1, 1, 3, 3], - "b": [1, 1, 2, 2, 2, 2], - "y": [1, 2, 3, 4, 5, 6], - "submission_date": [ - pd.to_datetime("2124-12-01").date(), - pd.to_datetime("2124-12-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2123-01-01").date(), - pd.to_datetime("2123-01-02").date(), - ], - } - ) - - forecast.observed_df = observed_df - - output_train_df = forecast._build_model_dataframe( - segment_settings=segment_settings, task="train" + output_train_df = forecast._build_train_dataframe( + observed_df, segment_settings=segment_settings ) 
expected_train_df = pd.DataFrame( { @@ -256,8 +1162,9 @@ def test_build_model_dataframe_no_regressors_train(forecast): output_train_df.reset_index(drop=True), expected_train_df ) - output_train_wlog_df = forecast._build_model_dataframe( - segment_settings=segment_settings, task="train", add_logistic_growth_cols=True + # test again but with add_logistic_growth_cols set to true + output_train_wlog_df = forecast._build_train_dataframe( + observed_df, segment_settings=segment_settings, add_logistic_growth_cols=True ) expected_train_wlog_df = pd.DataFrame( { @@ -280,7 +1187,8 @@ def test_build_model_dataframe_no_regressors_train(forecast): ) -def test_build_model_dataframe_train(forecast): +def test_build_train_dataframe(forecast): + """test _build_train_dataframe and include regressors""" regressor_list = [ { "name": "all_in", @@ -341,11 +1249,8 @@ def test_build_model_dataframe_train(forecast): ], } ) - - forecast.observed_df = observed_df - - output_train_df = forecast._build_model_dataframe( - segment_settings=segment_settings, task="train" + output_train_df = forecast._build_train_dataframe( + observed_df, segment_settings=segment_settings ) expected_train_df = pd.DataFrame( { @@ -356,17 +1261,17 @@ def test_build_model_dataframe_train(forecast): pd.to_datetime("2124-01-01").date(), pd.to_datetime("2124-01-02").date(), ], - "all_in": [0, 0], - "all_out": [1, 1], - "just_end": [1, 0], + "all_in": [1, 1], + "all_out": [0, 0], + "just_end": [0, 1], } ) pd.testing.assert_frame_equal( output_train_df.reset_index(drop=True), expected_train_df ) - output_train_wlog_df = forecast._build_model_dataframe( - segment_settings=segment_settings, task="train", add_logistic_growth_cols=True + output_train_wlog_df = forecast._build_train_dataframe( + observed_df, segment_settings=segment_settings, add_logistic_growth_cols=True ) expected_train_wlog_df = pd.DataFrame( { @@ -377,9 +1282,9 @@ def test_build_model_dataframe_train(forecast): pd.to_datetime("2124-01-01").date(), 
pd.to_datetime("2124-01-02").date(), ], - "all_in": [0, 0], - "all_out": [1, 1], - "just_end": [1, 0], + "all_in": [1, 1], + "all_out": [0, 0], + "just_end": [0, 1], "floor": [1.5, 1.5], "cap": [6.0, 6.0], } @@ -392,7 +1297,8 @@ def test_build_model_dataframe_train(forecast): ) -def test_build_model_dataframe_no_regressors_predict(forecast): +def test_build_predict_dataframe_no_regressors(forecast): + """test _build_predict with no regressors""" regressor_list = [] grid_parameters = { @@ -419,6 +1325,7 @@ def test_build_model_dataframe_no_regressors_predict(forecast): cv_settings=cv_settings, ) + # manually set trained_parameters, normally this would happen during training segment_settings.trained_parameters = {"floor": -1.0, "cap": 10.0} dates_to_predict = pd.DataFrame( @@ -434,10 +1341,8 @@ def test_build_model_dataframe_no_regressors_predict(forecast): } ) - forecast.dates_to_predict = dates_to_predict - - output_predict_df = forecast._build_model_dataframe( - segment_settings=segment_settings, task="predict" + output_predict_df = forecast._build_predict_dataframe( + dates_to_predict, segment_settings=segment_settings ) expected_predict_df = pd.DataFrame( { @@ -455,8 +1360,11 @@ def test_build_model_dataframe_no_regressors_predict(forecast): output_predict_df.reset_index(drop=True), expected_predict_df ) - output_predict_wlog_df = forecast._build_model_dataframe( - segment_settings=segment_settings, task="predict", add_logistic_growth_cols=True + # test against but with add_logistic_growth_cols set to true + output_predict_wlog_df = forecast._build_predict_dataframe( + dates_to_predict, + segment_settings=segment_settings, + add_logistic_growth_cols=True, ) expected_predict_wlog_df = pd.DataFrame( { @@ -480,7 +1388,8 @@ def test_build_model_dataframe_no_regressors_predict(forecast): ) -def test_build_model_dataframe_predict(forecast): +def test_build_predict_dataframe(forecast): + """test _build_predict_dataframe including regressors""" regressor_list = [ { 
"name": "all_in", @@ -526,62 +1435,54 @@ def test_build_model_dataframe_predict(forecast): cv_settings=cv_settings, ) - observed_df = pd.DataFrame( + # set training_parameters, which is usually done in the fit method + segment_settings.trained_parameters = {"floor": -1.0, "cap": 10.0} + + dates_to_predict = pd.DataFrame( { - "a": [1, 1, 1, 1, 3, 3], - "b": [1, 1, 2, 2, 2, 2], - "y": [1, 2, 3, 4, 5, 6], "submission_date": [ - pd.to_datetime("2124-12-01").date(), - pd.to_datetime("2124-12-02").date(), pd.to_datetime("2124-01-01").date(), pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2123-01-01").date(), - pd.to_datetime("2123-01-02").date(), ], } ) - forecast.observed_df = observed_df - - output_train_df = forecast._build_model_dataframe( - segment_settings=segment_settings, task="train" + output_train_df = forecast._build_predict_dataframe( + dates_to_predict, + segment_settings=segment_settings, ) expected_train_df = pd.DataFrame( { - "a": [1, 1], - "b": [2, 2], - "y": [3, 4], "ds": [ pd.to_datetime("2124-01-01").date(), pd.to_datetime("2124-01-02").date(), ], - "all_in": [0, 0], - "all_out": [1, 1], - "just_end": [1, 0], + "all_in": [1, 1], + "all_out": [0, 0], + "just_end": [0, 1], } ) pd.testing.assert_frame_equal( output_train_df.reset_index(drop=True), expected_train_df ) - output_train_wlog_df = forecast._build_model_dataframe( - segment_settings=segment_settings, task="train", add_logistic_growth_cols=True + # test again but with add_logistic_growth_cols set to true + output_train_wlog_df = forecast._build_predict_dataframe( + dates_to_predict, + segment_settings=segment_settings, + add_logistic_growth_cols=True, ) expected_train_wlog_df = pd.DataFrame( { - "a": [1, 1], - "b": [2, 2], - "y": [3, 4], "ds": [ pd.to_datetime("2124-01-01").date(), pd.to_datetime("2124-01-02").date(), ], - "all_in": [0, 0], - "all_out": [1, 1], - "just_end": [1, 0], - "floor": [1.5, 1.5], - "cap": [6.0, 6.0], + "all_in": [1, 1], + "all_out": [0, 0], + "just_end": [0, 1], 
+ "floor": [-1.0, -1.0], + "cap": [10.0, 10.0], } ) @@ -593,6 +1494,8 @@ def test_build_model_dataframe_predict(forecast): def test_build_model(forecast): + """test build_model + just runs the function and ensures no error is raised""" regressor_list = [ { "name": "all_in", diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_performance_analysis.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_performance_analysis.py index e6dc10b4..edbc2cbb 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_performance_analysis.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_performance_analysis.py @@ -1,6 +1,5 @@ import pytest import yaml -import cmath import pandas as pd diff --git a/jobs/kpi-forecasting/requirements.txt b/jobs/kpi-forecasting/requirements.txt index 498823ef..218d688a 100644 --- a/jobs/kpi-forecasting/requirements.txt +++ b/jobs/kpi-forecasting/requirements.txt @@ -56,6 +56,7 @@ pyasn1-modules==0.3.0 PyMeeus==0.5.12 pyparsing==3.0.9 pytest==7.3.2 +pytest-mock==3.14.0 pytest-ruff==0.3.2 python-dateutil==2.8.2 pytz==2023.3 From 6ab0527667a59d26111c9bc17f97f7d012c423df Mon Sep 17 00:00:00 2001 From: JCMOSCON1976 <167822375+JCMOSCON1976@users.noreply.github.com> Date: Mon, 29 Jul 2024 16:35:12 -0400 Subject: [PATCH 13/33] feat(workday):remove unwanted fields (#249) Co-authored-by: Julio Cezar Moscon --- .../scripts/api/Workday/Workday.py | 16 +++++++++++++--- .../scripts/api/XMatters/XMatters.py | 2 +- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/jobs/eam-integrations/scripts/api/Workday/Workday.py b/jobs/eam-integrations/scripts/api/Workday/Workday.py index 2d7287c8..ccb4a8a6 100644 --- a/jobs/eam-integrations/scripts/api/Workday/Workday.py +++ b/jobs/eam-integrations/scripts/api/Workday/Workday.py @@ -115,11 +115,21 @@ def get_users(): proxies=_config.proxies, ) results = json.loads(r.text) - return [user for user in results["Report_Entry"] + users = [user for user in results["Report_Entry"] if not 
(user.get("User_Home_Country", "") == "" and user.get("User_Home_Postal_Code", "") == "")] - - # return results["Report_Entry"] + for user in users: + user['User_Cost_Center'] = '' + user['User_Manager_Email_Address'] = '' + user['User_Functional_Group'] = '' + user['User_Work_Location'] = '' + user['User_Manager_Preferred_First_Name'] = '' + user['User_Manager_Preferred_Last_Name'] = '' + user["Worker_s_Manager"][0]["User_Manager_Preferred_First_Name"] = '' + user["Worker_s_Manager"][0]["User_Manager_Preferred_Last_Name"] = '' + + return users + except Exception: logger.critical(sys.exc_info()[0]) raise diff --git a/jobs/eam-integrations/scripts/api/XMatters/XMatters.py b/jobs/eam-integrations/scripts/api/XMatters/XMatters.py index 48ddb89a..e0b962a5 100644 --- a/jobs/eam-integrations/scripts/api/XMatters/XMatters.py +++ b/jobs/eam-integrations/scripts/api/XMatters/XMatters.py @@ -496,7 +496,7 @@ def delete_sites(xm_sites, xm_sites_in_wd): logger.info("\n") logger.info("Deleting empty sites from XMatters") for site in xm_sites: - if site not in xm_sites_in_wd and site != "Mountain View Office": + if site not in xm_sites_in_wd and site not in ["Default Site", "Mountain View Office"]: logger.info( "Site %s not in WorkDay. 
INACTIVATING %s from XMatters" % (site, xm_sites[site]) From 07e538891ef024812c3b0b17799b38d213f77f3d Mon Sep 17 00:00:00 2001 From: JCMOSCON1976 <167822375+JCMOSCON1976@users.noreply.github.com> Date: Tue, 30 Jul 2024 11:25:36 -0400 Subject: [PATCH 14/33] fix(exit):Added sys.exit() call (#250) Co-authored-by: Julio Cezar Moscon --- .../scripts/workday_everfi_integration.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/jobs/eam-integrations/scripts/workday_everfi_integration.py b/jobs/eam-integrations/scripts/workday_everfi_integration.py index 4aae9332..e7eb18fe 100644 --- a/jobs/eam-integrations/scripts/workday_everfi_integration.py +++ b/jobs/eam-integrations/scripts/workday_everfi_integration.py @@ -341,7 +341,7 @@ def run(self, force): except (APIAdaptorException, Exception) as e: self.logger.error(str(e)) self.logger.critical("Failed while Getting Workday users...") - + sys.exit(1) # ======================================================== # Getting Everfi users... @@ -353,7 +353,8 @@ def run(self, force): except (APIAdaptorException, Exception) as e: self.logger.error(str(e)) self.logger.critical("Failed while Getting Everfi users...") - + sys.exit(1) + # ======================================================== # Comparing users... # ======================================================== @@ -369,7 +370,8 @@ def run(self, force): except (Exception) as e: self.logger.error(str(e)) self.logger.critical("Failed while Comparing users...") - + sys.exit(1) + # ======================================================== # Deleting Everfi users ... # ======================================================== @@ -380,6 +382,7 @@ def run(self, force): except (APIAdaptorException, Exception) as e: self.logger.error(str(e)) self.logger.critical("Faile while Deleting Everfi users ...") + sys.exit(1) # ======================================================== # Adding Everfi users ... 
@@ -393,7 +396,7 @@ def run(self, force): except (APIAdaptorException, Exception) as e: self.logger.error(str(e)) self.logger.critical("Failed while Adding Everfi users ...") - + sys.exit(1) # ======================================================== # Updating Everfi users ... # ======================================================== @@ -413,6 +416,7 @@ def run(self, force): except (APIAdaptorException, Exception) as e: self.logger.error(str(e)) self.logger.critical("Failed while Updating Everfi users ...") + sys.exit(1) self.logger.info("End of integration") From b102a7a29e02ae1a7cf76be209d4e7ec6653f10b Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Tue, 30 Jul 2024 15:37:38 -0500 Subject: [PATCH 15/33] fix issue with call to _get_crossvalidation_metric --- jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py index c4683f16..f557a0d7 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py @@ -399,7 +399,7 @@ def _auto_tuning( m.fit(test_dat) crossval_metric = self._get_crossvalidation_metric( - m, **segment_settings.cv_settings + m, segment_settings.cv_settings ) bias.append(crossval_metric) From 0726287e4bf03efb62f6822c9bfe331bbfe5a226 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Mon, 5 Aug 2024 10:22:14 -0500 Subject: [PATCH 16/33] fixed type check --- .../kpi_forecasting/models/funnel_forecast.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py index f557a0d7..9652ce02 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py @@ 
-769,12 +769,8 @@ def write_results( job.result() if components_table: - numeric_cols = self.components_df.dtypes[ - self.components_df.dtypes is float - ].index.tolist() - string_cols = self.components_df.dtypes[ - self.components_df.dtypes is object - ].index.tolist() + numeric_cols = list(self.components_df.select_dtypes(include=float).columns) + string_cols = list(self.components_df.select_dtypes(include=object).columns) self.components_df["metric_slug"] = self.metric_hub.slug self.components_df["forecast_trained_at"] = self.trained_at From d8db825704de0542d9312591599bd83e917f0509 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Tue, 6 Aug 2024 13:18:08 -0500 Subject: [PATCH 17/33] added string case to aggregate_to_period and added tests --- .../kpi_forecasting/pandas_extras.py | 30 ++- .../tests/test_pandas_extras.py | 219 ++++++++++++++++++ 2 files changed, 248 insertions(+), 1 deletion(-) create mode 100644 jobs/kpi-forecasting/kpi_forecasting/tests/test_pandas_extras.py diff --git a/jobs/kpi-forecasting/kpi_forecasting/pandas_extras.py b/jobs/kpi-forecasting/kpi_forecasting/pandas_extras.py index e54fa60a..8ae622bf 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/pandas_extras.py +++ b/jobs/kpi-forecasting/kpi_forecasting/pandas_extras.py @@ -26,4 +26,32 @@ def aggregate_to_period( x = df.copy(deep=True) x[date_col] = pd.to_datetime(x[date_col]).dt.to_period(period[0]).dt.to_timestamp() - return x.groupby(date_col).agg(aggregation).reset_index() + + # treat numeric and string types separately + x_string = x.select_dtypes(include=["datetime64", object]) + x_numeric = x.select_dtypes(include=["float", "int", "datetime64"]) + + if set(x_string.columns) | set(x_numeric.columns) != set(x.columns): + missing_columns = set(x.columns) - ( + set(x_string.columns) | set(x_numeric.columns) + ) + missing_columns_str = ",".join(missing_columns) + raise ValueError( + f"Columns do not have string or numeric type: {missing_columns_str}" + ) + + x_numeric_agg = 
x_numeric.groupby(date_col).agg(aggregation).reset_index() + + # all values of x_string should be the same because it is just the dimensions + x_string_agg = x_string.drop_duplicates().reset_index(drop=True) + + if len(x_string_agg) != len(x_numeric_agg): + raise ValueError( + "String and Numeric dataframes have different length, likely due to strings not being unique up to aggregation" + ) + + # unique preseves order so we should be fine to concat + output_df = pd.concat( + [x_numeric_agg, x_string_agg.drop(columns=[date_col])], axis=1 + ) + return output_df diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_pandas_extras.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_pandas_extras.py new file mode 100644 index 00000000..c512e0c9 --- /dev/null +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_pandas_extras.py @@ -0,0 +1,219 @@ +import pandas as pd +import pytest + +from kpi_forecasting.pandas_extras import aggregate_to_period + + +def test_only_numeric(): + df = pd.DataFrame( + { + "submission_date": [ + "2020-01-01", + "2020-01-01", + "2020-01-02", + "2020-02-01", + "2020-02-02", + ], + "ints": [1, 2, 3, 4, 5], + "floats": [10.0, 20.0, 30.0, 40.0, 50.0], + } + ) + + day_output = aggregate_to_period(df, "day") + + expected_day = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2020-01-01"), + pd.to_datetime("2020-01-02"), + pd.to_datetime("2020-02-01"), + pd.to_datetime("2020-02-02"), + ], + "ints": [3, 3, 4, 5], + "floats": [30.0, 30.0, 40.0, 50.0], + } + ) + + pd.testing.assert_frame_equal(day_output, expected_day) + + month_output = aggregate_to_period(df, "month") + + expected_month = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2020-01-01"), + pd.to_datetime("2020-02-01"), + ], + "ints": [6, 9], + "floats": [60.0, 90.0], + } + ) + + pd.testing.assert_frame_equal(month_output, expected_month) + + +def test_with_string_and_numeric(): + df = pd.DataFrame( + { + "submission_date": [ + "2020-01-01", + "2020-01-01", + 
"2020-01-02", + "2020-02-01", + "2020-02-02", + ], + "ints": [1, 2, 3, 4, 5], + "floats": [10.0, 20.0, 30.0, 40.0, 50.0], + "string": ["jan", "jan", "jan", "feb", "feb"], + } + ) + + day_output = aggregate_to_period(df, "day") + + expected_day = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2020-01-01"), + pd.to_datetime("2020-01-02"), + pd.to_datetime("2020-02-01"), + pd.to_datetime("2020-02-02"), + ], + "ints": [3, 3, 4, 5], + "floats": [30.0, 30.0, 40.0, 50.0], + "string": ["jan", "jan", "feb", "feb"], + } + ) + + pd.testing.assert_frame_equal(day_output, expected_day) + + month_output = aggregate_to_period(df, "month") + + expected_month = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2020-01-01"), + pd.to_datetime("2020-02-01"), + ], + "ints": [6, 9], + "floats": [60.0, 90.0], + "string": ["jan", "feb"], + } + ) + + pd.testing.assert_frame_equal(month_output, expected_month) + + +def test_only_string(): + df = pd.DataFrame( + { + "submission_date": [ + "2020-01-01", + "2020-01-01", + "2020-01-02", + "2020-02-01", + "2020-02-02", + ], + "string": ["jan", "jan", "jan", "feb", "feb"], + } + ) + + day_output = aggregate_to_period(df, "day") + + expected_day = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2020-01-01"), + pd.to_datetime("2020-01-02"), + pd.to_datetime("2020-02-01"), + pd.to_datetime("2020-02-02"), + ], + "string": ["jan", "jan", "feb", "feb"], + } + ) + + pd.testing.assert_frame_equal(day_output, expected_day) + + month_output = aggregate_to_period(df, "month") + + expected_month = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2020-01-01"), + pd.to_datetime("2020-02-01"), + ], + "string": ["jan", "feb"], + } + ) + + pd.testing.assert_frame_equal(month_output, expected_month) + + +def test_non_unique_string_exception(): + df = pd.DataFrame( + { + "submission_date": [ + "2020-01-01", + "2020-01-01", + "2020-01-02", + "2020-02-01", + "2020-02-02", + ], + "ints": [1, 2, 3, 4, 5], + "floats": [10.0, 
20.0, 30.0, 40.0, 50.0], + "string": ["jan", "jane", "yan", "fev", "feb"], + } + ) + + with pytest.raises( + ValueError, + match="String and Numeric dataframes have different length, likely due to strings not being unique up to aggregation", + ): + _ = aggregate_to_period(df, "day") + + +def test_column_type_exception(): + df = pd.DataFrame( + { + "submission_date": [ + "2020-01-01", + "2020-01-01", + "2020-01-02", + "2020-02-01", + "2020-02-02", + ], + "ints": [1, 2, 3, 4, 5], + "floats": [10.0, 20.0, 30.0, 40.0, 50.0], + "string": ["jan", "jane", "yan", "fev", "feb"], + "bool": [True, True, True, False, False], + } + ) + + with pytest.raises( + ValueError, + match="Columns do not have string or numeric type: bool", + ): + _ = aggregate_to_period(df, "day") + + +def test_agg_exception(): + df = pd.DataFrame( + { + "submission_date": [ + "2020-01-01", + "2020-01-01", + "2020-01-02", + "2020-02-01", + "2020-02-02", + ], + "ints": [1, 2, 3, 4, 5], + "floats": [10.0, 20.0, 30.0, 40.0, 50.0], + "string": ["jan", "jane", "yan", "fev", "feb"], + "bool": [True, True, True, False, False], + } + ) + + with pytest.raises( + ValueError, + match="Don't know how to floor dates by hamburger. 
Please use 'day', 'month', or 'year'.", + ): + _ = aggregate_to_period(df, "hamburger") From 83aa2298051c2e439aca37c61974155efbb1aba9 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Wed, 7 Aug 2024 08:42:33 -0500 Subject: [PATCH 18/33] revert file --- .../scripts/workday_everfi_integration.py | 199 ++++++++---------- 1 file changed, 86 insertions(+), 113 deletions(-) diff --git a/jobs/eam-integrations/scripts/workday_everfi_integration.py b/jobs/eam-integrations/scripts/workday_everfi_integration.py index 59e51114..39cf41d0 100644 --- a/jobs/eam-integrations/scripts/workday_everfi_integration.py +++ b/jobs/eam-integrations/scripts/workday_everfi_integration.py @@ -1,3 +1,4 @@ + from workday_everfi.api.Workday import WorkdayAPI from workday_everfi.api.Everfi import EverfiAPI from api.util import Util, APIAdaptorException @@ -5,7 +6,6 @@ import logging import sys - def cal_user_location(wd_user, locs, loc_map_table): loc = "" location_country = wd_user.get("location_country", "") @@ -37,23 +37,19 @@ class Everfi: def __init__(self) -> None: self.everfi_api = EverfiAPI() self.logger = logging.getLogger(self.__class__.__name__) - + def get_everfi_users(self, locs, loc_map_table, hire_dates): - filter = {"filter[active]": "true"} - fields = { - "fields[users]": "email,first_name,last_name,sso_id,employee_id,student_id,location_id,active,user_rule_set_roles,category_labels" - } - return self.everfi_api.get_users( - fields, filter, locs, loc_map_table, hire_dates - ) + filter = {'filter[active]': 'true'} + fields = {'fields[users]': 'email,first_name,last_name,sso_id,employee_id,student_id,location_id,active,user_rule_set_roles,category_labels'} + return self.everfi_api.get_users(fields, filter, locs, loc_map_table, hire_dates) def deactivate_users(self, del_list, everfi_users): count = 0 - + for email in del_list: - id = everfi_users[email].get("id") + id = everfi_users[email].get('id') self.everfi_api.deactivate_user(id) - if "@" in email: + if '@' in email: n = 
email.split("@")[0] else: n = email @@ -62,10 +58,10 @@ def deactivate_users(self, del_list, everfi_users): if count % 20 == 0: self.logger.info(f"[{count} of {len(del_list)}] users deactivated.") return count - + def activate_user(self, id): - self.everfi_api.set_active(id, True) - + self.everfi_api.set_active(id,True) + def get_locations_mapping_table(self): return self.everfi_api.get_locations_mapping_table() @@ -81,21 +77,17 @@ def upd_everfi_users( ): errors_list = [] count_upd = 0 - loc_id_dict = { - x.get("id"): x.get("attributes").get("name") for x in locs.values() - } - + loc_id_dict = {x.get('id'):x.get('attributes').get('name') for x in locs.values()} + for email in upd_list_keys: wd_user = wd_users[email][1] - loc_id = cal_user_location(wd_user, locs, loc_map_table) - if int(loc_id) != everfi_users[email].get("attributes").get("location_id"): - if "@" in email: + loc_id = cal_user_location(wd_user, locs, loc_map_table) + if int(loc_id) != everfi_users[email].get('attributes').get('location_id'): + if '@' in email: n = email.split("@")[0] else: n = email - self.logger.info( - f"User {n[:4]} .. {n[-1]} changed location from {loc_id_dict[str(everfi_users[email].get('attributes').get('location_id'))]} to {loc_id_dict[loc_id]}" - ) + self.logger.info(f"User {n[:4]} .. 
{n[-1]} changed location from {loc_id_dict[str(everfi_users[email].get('attributes').get('location_id'))]} to {loc_id_dict[loc_id]}") json_data = { "data": { "type": "registration_sets", @@ -125,14 +117,12 @@ def upd_everfi_users( except Exception as e: self.logger.exception(e) errors_list.append(e) - - cat_label_user_id = self.get_category_label_user_id( - everfi_users[email]["id"] - ) + + cat_label_user_id = self.get_category_label_user_id(everfi_users[email]["id"]) if cat_label_user_id: self.delete_category_label_user(cat_label_user_id) - # wd_users[email][1]["hire_date"] = '2024-07-10' + #wd_users[email][1]["hire_date"] = '2024-07-10' hire_date_id = self.get_hire_date_id( wd_users[email][1]["hire_date"], hire_date_category_id, hire_dates ) @@ -144,29 +134,27 @@ def upd_everfi_users( except Exception as e: self.logger.exception(e) errors_list.append(e) - + if count_upd % 20 == 0: - self.logger.info( - f"[{count_upd} of {len(upd_list_keys)}] users updated." - ) - + self.logger.info(f"[{count_upd} of {len(upd_list_keys)}] users updated.") + count_upd += 1 - + return count_upd - + def get_category_label_user_id(self, id): - ret = self.everfi_api.get_category_label_user_id(id) - if len(ret.data.get("data", "")) > 0: - return ret.data.get("data", "")[0].get("id", "") + ret = self.everfi_api.get_category_label_user_id(id) + if len(ret.data.get('data',''))>0: + return ret.data.get('data','')[0].get('id','') else: return None - + def delete_category_label_user(self, id): - ret = self.everfi_api.delete_category_label_user(id) + ret = self.everfi_api.delete_category_label_user(id) return ret - - def bulk_clear_category_id(self, ids, category_id, category_label): - return self.everfi_api.bulk_clear_category_id(ids, category_id, category_label) + + def bulk_clear_category_id(self, ids, category_id,category_label): + return self.everfi_api.bulk_clear_category_id(ids, category_id,category_label) def get_hire_date_id(self, wd_hire_date, hire_date_category_id, hire_dates): 
wd_hire_date = wd_hire_date.split("-") @@ -225,34 +213,33 @@ def add_everfi_users( except Exception as e: self.logger.exception(e) self.logger.info("Trying to activate user and update ") - if e.args[0][0].get("id", "") == "user_rule_set": + if (e.args[0][0].get('id','')=='user_rule_set'): # try to active user # find user by email and then update the user with current data - filter = {"filter[email]": wd_user.get("primary_work_email", "")} - fields = {"fields[users]": "id,email"} - # find user id + filter = {'filter[email]': wd_user.get("primary_work_email", "")} + fields = {'fields[users]': 'id,email'} + #find user id user = self.everfi_api.search_user(fields, filter) - id = user.get(email, "").get("id", "") + id = user.get(email,'').get('id', '') if id: - # self.activate_user(id) - json_data["data"]["id"] = id - json_data["data"]["attributes"]["registrations"][0][ - "active" - ] = True - # active user and update fields - r = self.everfi_api.upd_user(id, json_data) - # remove hire date custom field - - # hd = wd_users[email][1]["hire_date"].split('-') + #self.activate_user(id) + json_data['data']['id'] = id + json_data['data']['attributes']['registrations'][0]['active'] = True + #active user and update fields + r = self.everfi_api.upd_user(id, json_data) + #remove hire date custom field + + #hd = wd_users[email][1]["hire_date"].split('-') cat_label_user_id = self.get_category_label_user_id(id) if cat_label_user_id: self.delete_category_label_user(cat_label_user_id) - # self.bulk_clear_category_id([id], hire_date_category_id, hd[1] + '-' + hd[0]) - else: + #self.bulk_clear_category_id([id], hire_date_category_id, hd[1] + '-' + hd[0]) + else: errors.append(e) continue - # wd_users[email][1]["hire_date"] = '2024-07-10' + + #wd_users[email][1]["hire_date"] = '2024-07-10' hire_date_id = self.get_hire_date_id( wd_users[email][1]["hire_date"], hire_date_category_id, hire_dates ) @@ -266,24 +253,25 @@ def add_everfi_users( errors.append(e) count_add += 1 - - if "@" in 
email: + + if '@' in email: n = email.split("@")[0] else: n = email self.logger.info(f"{n[:4]} .. {n[-1]} added") - + if count_add % 20 == 0: self.logger.info(f"[{count_add} of {len(add_list_keys)}] users added.") - + + + return count_add - class Workday: def build_comparison_string(self, wd_row, locs, loc_map_table): loc_id = cal_user_location(wd_row, locs, loc_map_table) - hire_date = wd_row["hire_date"].split("-") - + hire_date = wd_row['hire_date'].split('-') + is_manager = "supervisor" if wd_row.get("is_manager", "") else "non_supervisor" return ( wd_row["primary_work_email"] @@ -298,9 +286,7 @@ def build_comparison_string(self, wd_row, locs, loc_map_table): + "|" + is_manager + "|" - + hire_date[1] - + "-" - + hire_date[0] + + hire_date[1] + "-" + hire_date[0] + "|" + wd_row["primary_work_email"] ) @@ -319,7 +305,7 @@ def get_wd_users(self, locs, loc_map_table): (df["currently_active"] == True) & (df["moco_or_mofo"] == "MoCo") & (df["worker_type"] == "Employee") - | (df["primary_work_email"] == "jmoscon@mozilla.com") + | (df['primary_work_email'] == "jmoscon@mozilla.com") ] comp = { @@ -353,15 +339,16 @@ def compare_users(self, wd_comp, everfi_comp, wd_users, everfi_users): if everfi_comp[upd_email] != wd_comp[upd_email]: upd_list.append(upd_email) + return add_list, del_list, upd_list def run(self, limit): # ======================================================== # Getting Everfi hire dates, locations and locations mapping table ... 
- # ======================================================== + # ======================================================== try: self.logger.info("Getting everfi hire dates") - hire_date_category_id, hire_dates = self.everfi.everfi_api.get_hire_dates() + hire_date_category_id, hire_dates = self.everfi.everfi_api.get_hire_dates() self.logger.info(f"Number of hire dates: {len(hire_dates)}") self.logger.info("Getting everfi locations") @@ -374,9 +361,7 @@ def run(self, limit): except (APIAdaptorException, Exception) as e: self.logger.error(str(e)) - self.logger.critical( - "Failed while Getting Everfi hire dates,locations and locations mapping table ..." - ) + self.logger.critical("Failed while Getting Everfi hire dates,locations and locations mapping table ...") sys.exit(1) # ======================================================== @@ -388,7 +373,7 @@ def run(self, limit): self.logger.info(f"Number of wd users: {len(wd_users)}") except (APIAdaptorException, Exception) as e: self.logger.error(str(e)) - self.logger.critical("Failed while Getting Workday users...") + self.logger.critical("Failed while Getting Workday users...") sys.exit(1) # ======================================================== @@ -396,15 +381,13 @@ def run(self, limit): # ======================================================== self.logger.info("Getting Everfi users...") try: - everfi_comp, everfi_users = self.everfi.get_everfi_users( - locs, loc_map_table, hire_dates - ) + everfi_comp, everfi_users = self.everfi.get_everfi_users(locs, loc_map_table, hire_dates) self.logger.info(f"Number of Everfi users: {len(everfi_users)}") except (APIAdaptorException, Exception) as e: self.logger.error(str(e)) self.logger.critical("Failed while Getting Everfi users...") sys.exit(1) - + # ======================================================== # Comparing users... 
# ======================================================== @@ -414,51 +397,42 @@ def run(self, limit): wd_comp, everfi_comp, wd_users, everfi_users ) - self.logger.info( - f"Number of users to delete w/o limit={len(del_list)} with limit={len(del_list[:limit])}" - ) - self.logger.info( - f"Number of users to add w/o limit={len(add_list)} with limit={len(add_list[:limit])}" - ) - self.logger.info( - f"Number of users to update w/o limit={len(upd_list)} with limit={len(upd_list[:limit])}" - ) + self.logger.info(f"Number of users to delete w/o limit={len(del_list)} with limit={len(del_list[:limit])}") + self.logger.info(f"Number of users to add w/o limit={len(add_list)} with limit={len(add_list[:limit])}") + self.logger.info(f"Number of users to update w/o limit={len(upd_list)} with limit={len(upd_list[:limit])}") del_list = del_list[:limit] add_list = add_list[:limit] upd_list = upd_list[:limit] - - except Exception as e: + + except (Exception) as e: self.logger.error(str(e)) self.logger.critical("Failed while Comparing users...") sys.exit(1) - + + # ======================================================== # Deleting Everfi users ... # ======================================================== - self.logger.info("Deleting Everfi users ...") + self.logger.info("Deleting Everfi users ...") try: + count_dels = self.everfi.deactivate_users(del_list, everfi_users) self.logger.info(f"Number of users deleted {count_dels}") except (APIAdaptorException, Exception) as e: self.logger.error(str(e)) self.logger.critical("Faile while Deleting Everfi users ...") sys.exit(1) - + # ======================================================== # Adding Everfi users ... 
# ======================================================== self.logger.info("Adding Everfi users ...") try: count_add = self.everfi.add_everfi_users( - hire_date_category_id, - hire_dates, - locs, - add_list, - wd_users, - loc_map_table, + hire_date_category_id, hire_dates, locs, add_list, wd_users, loc_map_table ) - self.logger.info(f"Number of users added {count_add}") + self.logger.info(f"Number of users added {count_add}") except (APIAdaptorException, Exception) as e: self.logger.error(str(e)) self.logger.critical("Failed while Adding Everfi users ...") @@ -467,7 +441,7 @@ def run(self, limit): # Updating Everfi users ... # ======================================================== self.logger.info("Updating Everfi users ...") - + try: count_upd = self.everfi.upd_everfi_users( hire_date_category_id, @@ -483,10 +457,9 @@ def run(self, limit): self.logger.error(str(e)) self.logger.critical("Failed while Updating Everfi users ...") sys.exit(1) - + self.logger.info("End of integration") - if __name__ == "__main__": parser = argparse.ArgumentParser(description="Sync up XMatters with Workday") @@ -498,18 +471,18 @@ def run(self, limit): type=str, default="info", ) - + parser.add_argument( "-m", - "--max_limit", + "--max_limit", action="store", type=int, - help="limit the number of changes in Everfi", - default=10, + help="limit the number of changes in Everfi", + default=10 ) args = None args = parser.parse_args() - + log_level = Util.set_up_logging(args.level) logger = logging.getLogger(__name__) From d5a0e63437c4f5704a97f12094e12a0b12efc2d1 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Thu, 8 Aug 2024 11:59:24 -0500 Subject: [PATCH 19/33] added more tests to prophet_forecast --- .../kpi_forecasting/models/base_forecast.py | 20 +- .../models/prophet_forecast.py | 31 +- .../tests/test_base_forecast.py | 39 +- .../tests/test_prophet_forecast.py | 533 ++++++++++++++++++ 4 files changed, 598 insertions(+), 25 deletions(-) diff --git 
a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index f41f3b59..ed958518 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -189,7 +189,7 @@ def summarize( Returns: pd.DataFrame: metric dataframe for all metrics and aggregations """ - self.summary_df = pd.concat( + summary_df = pd.concat( [ self._summarize( self.forecast_df, @@ -202,4 +202,22 @@ def summarize( ] ) + # add Metric Hub metadata columns + summary_df["metric_alias"] = self.metric_hub.alias.lower() + summary_df["metric_hub_app_name"] = self.metric_hub.app_name.lower() + summary_df["metric_hub_slug"] = self.metric_hub.slug.lower() + summary_df["metric_start_date"] = pd.to_datetime(self.metric_hub.min_date) + summary_df["metric_end_date"] = pd.to_datetime(self.metric_hub.max_date) + summary_df["metric_collected_at"] = self.collected_at + + # add forecast model metadata columns + summary_df["forecast_start_date"] = self.start_date + summary_df["forecast_end_date"] = self.end_date + summary_df["forecast_trained_at"] = self.trained_at + summary_df["forecast_predicted_at"] = self.predicted_at + + summary_df["forecast_parameters"] = self.metadata_params + + self.summary_df = summary_df + return self.summary_df diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py index 60b8982a..19f57e1d 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py @@ -20,15 +20,20 @@ class ProphetForecast(BaseForecast): def column_names_map(self) -> Dict[str, str]: return {"submission_date": "ds", "value": "y"} - def _fit(self, observed_df) -> None: - self.model = prophet.Prophet( - **self.parameters, + def _build_model(self, parameter_dict): + model = prophet.Prophet( + **parameter_dict, 
uncertainty_samples=self.number_of_simulations, mcmc_samples=0, ) if self.use_holidays: - self.model.add_country_holidays(country_name="US") + model.add_country_holidays(country_name="US") + + return model + + def _fit(self, observed_df) -> None: + self.model = self._build_model(self.parameters) # Modify observed data to have column names that Prophet expects, and fit # the model @@ -235,24 +240,6 @@ def _summarize( # add summary metadata columns df["aggregation_period"] = period.lower() - # reorder columns to make interpretation easier - df = df[["submission_date", "aggregation_period", "source", "measure", "value"]] - - # add Metric Hub metadata columns - df["metric_alias"] = self.metric_hub.alias.lower() - df["metric_hub_app_name"] = self.metric_hub.app_name.lower() - df["metric_hub_slug"] = self.metric_hub.slug.lower() - df["metric_start_date"] = pd.to_datetime(self.metric_hub.min_date) - df["metric_end_date"] = pd.to_datetime(self.metric_hub.max_date) - df["metric_collected_at"] = self.collected_at - - # add forecast model metadata columns - df["forecast_start_date"] = self.start_date - df["forecast_end_date"] = self.end_date - df["forecast_trained_at"] = self.trained_at - df["forecast_predicted_at"] = self.predicted_at - df["forecast_parameters"] = self.metadata_params - return df def _summarize_legacy(self) -> pd.DataFrame: diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py index 6a731560..17ce4d27 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py @@ -1,4 +1,5 @@ from typing import List +import collections import pytest import pandas as pd @@ -160,6 +161,18 @@ def test_summarize(good_class): ) good_class.forecast_df = np.array([1, 2]) good_class.observed_df = np.array([3, 4]) + MetricHub = collections.namedtuple( + "MetricHub", + ["alias", "app_name", "slug", 
"min_date", "max_date"], + ) + + dummy_metric_hub = MetricHub("", "", "", "2124-01-01", "2124-01-01") + + # add it here rather than in __init__ so it doesn't try to load data + good_class.metric_hub = dummy_metric_hub + good_class.trained_at = "" + good_class.predicted_at = "" + number_val = 10 output = good_class.summarize( periods=["a", "b", "c"], numpy_aggregations=["sum"], percentiles=["percentiles"] @@ -170,5 +183,27 @@ def test_summarize(good_class): for el in ["a", "b", "c"] ] ) - assert output.reset_index(drop=True).equals(expected_output) - assert good_class.summary_df.reset_index(drop=True).equals(expected_output) + # not going to check all the metadata columns + # in assert_frame_equal. Just make sure they're there + metadata_columns = { + "metric_alias", + "metric_hub_app_name", + "metric_hub_slug", + "metric_start_date", + "metric_end_date", + "metric_collected_at", + "forecast_start_date", + "forecast_end_date", + "forecast_trained_at", + "forecast_predicted_at", + "forecast_parameters", + } + assert set(expected_output.columns) | metadata_columns == set(output.columns) + + pd.testing.assert_frame_equal( + output[expected_output.columns].reset_index(drop=True), expected_output + ) + pd.testing.assert_frame_equal( + good_class.summary_df[expected_output.columns].reset_index(drop=True), + expected_output, + ) diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py index 18d3df67..ce372cf6 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py @@ -1,11 +1,544 @@ import pandas as pd from dotmap import DotMap import numpy as np +import pytest +import collections from kpi_forecasting.models.prophet_forecast import ProphetForecast +@pytest.fixture +def forecast(): + A1_start_date = "2124-01-01" + parameter_dict = { + "model_setting_split_dim": "a", + 
"segment_settings": { + "A1": { + "start_date": A1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {"param1": [1, 2], "param2": [20, 10]}, + "cv_settings": {}, + }, + }, + } + + parameter_dotmap = DotMap(parameter_dict) + predict_start_date = "2124-01-02" + predict_end_date = "2124-03-01" + return ProphetForecast( + model_type="test", + parameters=parameter_dotmap, + use_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) + + +class MockModel: + """Used in place of prophet.Prophet for testing purposes""" + + def __init__(self, param1=0, param2=0, **kwargs): + self.value = param1 * param2 + self.history = None + + def fit(self, df, *args, **kwargs): + self.history = df + return None + + def predict(self, dates_to_predict): + output = dates_to_predict.copy() + + output[ + [ + "yhat", + "trend", + "trend_upper", + "trend_lower", + "weekly", + "weekly_upper", + "weekly_lower", + "yearly", + "yearly_upper", + "yearly_lower", + ] + ] = 0 # some dummy value so it has the right shape + + return output + + def predictive_samples(self, dates_to_predict): + # prophet function outputs dict of numpy arrays + # only element we care about is `yhat` + output = np.arange(len(dates_to_predict)) * self.value + return {"yhat": {0: output}} + + +def mock_build_model(parameters): + """mocks the FunnelForecast build_model method""" + return MockModel( + **parameters, + ) + + +def mock_aggregate_forecast_observed( + forecast_df, observed_df, period, numpy_aggregations, percentiles +): + """Mocks the aggregate_forecast_observed function defined in ProphetForecast + and inherited in FunnelForecast. 
+ This function is tested extensively in test_prophet_forecast + so we can make dummy outputs for tests related to it""" + + # add dummy columns where aggregated metrics woudl go + percentile_columns = [f"p{el}" for el in percentiles] + output_forecast_df = forecast_df.copy() + output_forecast_df[numpy_aggregations + percentile_columns] = 0 + return output_forecast_df, observed_df.copy() + + +def test_under_fit(forecast, mocker): + """test the _fit method""" + + observed_data = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + mocker.patch.object(forecast, "_build_model", mock_build_model) + + forecast._fit(observed_data) + + # checking that history is set in the mocked Model ensures fit was called on it + pd.testing.assert_frame_equal( + observed_data.rename(columns={"submission_date": "ds"}), forecast.model.history + ) + + +def test_fit(forecast, mocker): + """test the fit function. It is inherited from BaseForecast + and calls _fit with the proper object attributes. 
Test looks very + similar to that for _fit""" + observed_data = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + mocker.patch.object(forecast, "_build_model", mock_build_model) + + forecast.observed_df = observed_data + forecast.fit() + + # checking that history is set in the mocked Model ensures fit was called on it + pd.testing.assert_frame_equal( + observed_data.rename(columns={"submission_date": "ds"}), forecast.model.history + ) + + assert forecast.trained_at is not None + + +def test_combine_forecast_observed(mocker, forecast): + """tests the _combine_forecast_observed method""" + # 2024-01-01 is chosen as an arbitrary date to center the tests around + + # forecast predictions are set with the + # mock_aggregate_forecast_observed function so they + # can be ommited here + forecast_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + # rows with negative values are those expected to be removed + # by filters in summarize + observed_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "value": [10, 20], + } + ) + + mocker.patch.object( + forecast, "_aggregate_forecast_observed", mock_aggregate_forecast_observed + ) + + numpy_aggregations = ["mean"] + percentiles = [10, 50, 90] + output_df = forecast._combine_forecast_observed( + forecast_df, + observed_df, + period="period", + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, + ) + observed_expected_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "value": [10, 20], + "measure": ["observed", "observed"], + "source": ["historical", "historical"], + } + ) + + # 4x2 columns, 4 metrics (mean, 
p10, p50, p90) + forecast_expected_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "measure": ["mean", "mean", "p10", "p10", "p50", "p50", "p90", "p90"], + "value": [0] * 8, + "source": ["forecast"] * 8, + } + ) + + # concat in same order to make our lives easier + expected = pd.concat([observed_expected_df, forecast_expected_df]).sort_values( + ["submission_date", "measure"] + ) + assert set(expected.columns) == set(output_df.columns) + # force value columns to be floats in both cases to make check easier + numeric_cols = ["value", "value_low", "value_mid", "value_high"] + # expected[numeric_cols] = expected[numeric_cols].astype(float) + # output_df[numeric_cols] = output_df[numeric_cols].astype(float) + pd.testing.assert_frame_equal( + output_df.sort_values(["submission_date", "measure"]).reset_index(drop=True), + expected[output_df.columns].reset_index(drop=True), + ) + + # should not be any nulls outside the metric column + non_metric_columns = [el for el in output_df.columns if el not in numeric_cols] + assert not pd.isna(output_df[non_metric_columns]).any(axis=None) + + +def test_under_summarize(mocker, forecast): + """testing _summarize""" + # 2024-01-01 is chosen as an arbitrary date to center the tests around + + # forecast predictions are set with the + # mock_aggregate_forecast_observed function so they + # can be ommited here + forecast_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + # rows with negative values are those expected to be removed + # by filters in summarize + observed_df = pd.DataFrame( + { + "submission_date": [ + 
pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "value": [10, 20], + } + ) + + mocker.patch.object( + forecast, "_aggregate_forecast_observed", mock_aggregate_forecast_observed + ) + + numpy_aggregations = ["mean"] + percentiles = [10, 50, 90] + output_df = forecast._summarize( + forecast_df, + observed_df, + period="period", + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, + ) + observed_expected_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "value": [10, 20], + "measure": ["observed", "observed"], + "source": ["historical", "historical"], + } + ) + + # 4x2 columns, 4 metrics (mean, p10, p50, p90) + forecast_expected_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "measure": ["mean", "mean", "p10", "p10", "p50", "p50", "p90", "p90"], + "value": [0] * 8, + "source": ["forecast"] * 8, + } + ) + + # concat in same order to make our lives easier + expected = pd.concat([observed_expected_df, forecast_expected_df]).sort_values( + ["submission_date", "measure"] + ) + expected["aggregation_period"] = "period" + + assert set(expected.columns) == set(output_df.columns) + # force value columns to be floats in both cases to make check easier + numeric_cols = ["value", "value_low", "value_mid", "value_high"] + # expected[numeric_cols] = expected[numeric_cols].astype(float) + # output_df[numeric_cols] = output_df[numeric_cols].astype(float) + pd.testing.assert_frame_equal( + output_df.sort_values(["submission_date", "measure"]).reset_index(drop=True), + expected[output_df.columns].reset_index(drop=True), + ) + + # should not be 
any nulls outside the metric column + non_metric_columns = [el for el in output_df.columns if el not in numeric_cols] + assert not pd.isna(output_df[non_metric_columns]).any(axis=None) + + +def test_summarize(mocker, forecast): + """testing summarize""" + # create dummy metric hub object to when meta data from + # it is added we don't get an error + MetricHub = collections.namedtuple( + "MetricHub", + ["alias", "app_name", "slug", "min_date", "max_date"], + ) + + dummy_metric_hub = MetricHub("", "", "", "2124-01-01", "2124-01-01") + + # 2024-01-01 is chosen as an arbitrary date to center the tests around + + # forecast predictions are set with the + # mock_aggregate_forecast_observed function so they + # can be ommited here + forecast_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + # rows with negative values are those expected to be removed + # by filters in summarize + observed_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "value": [10, 20], + } + ) + + mocker.patch.object( + forecast, "_aggregate_forecast_observed", mock_aggregate_forecast_observed + ) + + numpy_aggregations = ["mean"] + percentiles = [10, 50, 90] + + forecast.observed_df = observed_df + forecast.forecast_df = forecast_df + forecast.metric_hub = dummy_metric_hub + + # timestamp attributes created by fit and predict + # must be added manuall + forecast.collected_at = "" + forecast.trained_at = "" + forecast.predicted_at = "" + forecast.metadata_params = "" + + numpy_aggregations = ["mean"] + percentiles = [10, 50, 90] + forecast.summarize( + periods=["period1", "period2"], + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, + ) + + output_df = forecast.summary_df + + observed_expected_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), 
+ ], + "value": [10, 20], + "measure": ["observed", "observed"], + "source": ["historical", "historical"], + } + ) + + # 4x2 columns, 4 metrics (mean, p10, p50, p90) + forecast_expected_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "measure": ["mean", "mean", "p10", "p10", "p50", "p50", "p90", "p90"], + "value": [0] * 8, + "source": ["forecast"] * 8, + } + ) + + # concat in same order to make our lives easier + expected = pd.concat([observed_expected_df, forecast_expected_df]).sort_values( + ["submission_date", "measure"] + ) + expected1 = expected.copy() + expected2 = expected.copy() + expected1["aggregation_period"] = "period1" + expected2["aggregation_period"] = "period2" + + expected = pd.concat([expected1, expected2]) + + # not going to check all the metadata columns + # in assert_frame_equal. 
Just make sure they're there + metadata_columns = { + "metric_alias", + "metric_hub_app_name", + "metric_hub_slug", + "metric_start_date", + "metric_end_date", + "metric_collected_at", + "forecast_start_date", + "forecast_end_date", + "forecast_trained_at", + "forecast_predicted_at", + "forecast_parameters", + } + assert set(expected.columns) | metadata_columns == set(output_df.columns) + # force value columns to be floats in both cases to make check easier + numeric_cols = ["value", "value_low", "value_mid", "value_high"] + pd.testing.assert_frame_equal( + output_df.sort_values(["submission_date", "aggregation_period", "measure"])[ + expected.columns + ].reset_index(drop=True), + expected.sort_values( + ["submission_date", "aggregation_period", "measure"] + ).reset_index(drop=True), + ) + + # should not be any nulls outside the metric column + non_metric_columns = [el for el in output_df.columns if el not in numeric_cols] + assert not pd.isna(output_df[non_metric_columns]).any(axis=None) + + +def test_under_predict(mocker, forecast): + """testing _predict""" + # this ensures forecast is using MockModel + mocker.patch.object(forecast, "_build_model", mock_build_model) + + observed_df = pd.DataFrame( + { + "y": [0, 1], + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + dates_to_predict = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ] + } + ) + forecast.observed_df = observed_df + forecast.parameters = {"param1": 1, "param2": 2} + forecast.fit() + out = forecast._predict(dates_to_predict).reset_index(drop=True) + + # in MockModel, the predictive_samples method sets the output to + # np.arange(len(dates_to_predict)) * self.value for one column called 0 + # this helps ensure the forecast_df in segment_models is set properly + expected = pd.DataFrame( + { + 0: [0, 2], + "submission_date": [ + pd.to_datetime("2124-01-01").date(), 
+ pd.to_datetime("2124-01-02").date(), + ], + } + ) + + pd.testing.assert_frame_equal(out, expected) + + # test predict while we're here + + forecast.dates_to_predict = dates_to_predict + forecast.number_of_simulations = 1 # so that _validate doesn't break + forecast.predict() + + out = forecast.forecast_df + + # in MockModel, the predictive_samples method sets the output to + # np.arange(len(dates_to_predict)) * self.value for one column called 0 + # this helps ensure the forecast_df in segment_models is set properly + expected = pd.DataFrame( + { + 0: [0, 2], + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + pd.testing.assert_frame_equal(out, expected) + assert forecast.predicted_at is not None + + def test_summarize_non_overlapping_day(): observed_start_date = "2124-01-01" observed_end_date = "2124-02-01" From b3edd109c0764bcf0d429cc0ea3549032505af89 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Fri, 9 Aug 2024 09:28:19 -0500 Subject: [PATCH 20/33] removed DotMap --- jobs/kpi-forecasting/kpi_forecasting.py | 14 +++++++------- .../configs/model_inputs/__init__.py | 6 +++--- .../kpi-forecasting/kpi_forecasting/inputs.py | 19 +++++-------------- .../kpi_forecasting/metric_hub.py | 3 +-- .../kpi_forecasting/models/base_forecast.py | 2 +- .../kpi_forecasting/models/funnel_forecast.py | 12 +++--------- .../kpi_forecasting/results_processing.py | 11 ++++++----- .../tests/test_base_forecast.py | 11 +++++------ .../tests/test_funnel_forecast.py | 15 +++++---------- .../tests/test_performance_analysis.py | 3 +++ .../tests/test_prophet_forecast.py | 12 +++++------- jobs/kpi-forecasting/requirements.txt | 1 - 12 files changed, 44 insertions(+), 65 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting.py b/jobs/kpi-forecasting/kpi_forecasting.py index e7dcca7c..d8c3f04c 100644 --- a/jobs/kpi-forecasting/kpi_forecasting.py +++ b/jobs/kpi-forecasting/kpi_forecasting.py @@ -1,4 +1,4 @@ -from 
kpi_forecasting.inputs import CLI, YAML +from kpi_forecasting.inputs import CLI, load_yaml from kpi_forecasting.models.prophet_forecast import ProphetForecast from kpi_forecasting.models.funnel_forecast import FunnelForecast from kpi_forecasting.metric_hub import MetricHub @@ -13,17 +13,17 @@ def main() -> None: # Load the config - config = YAML(filepath=CLI().args.config).data - model_type = config.forecast_model.model_type + config = load_yaml(filepath=CLI().args.config) + model_type = config["forecast_model"]["model_type"] if model_type in MODELS: - metric_hub = MetricHub(**config.metric_hub) - model = MODELS[model_type](metric_hub=metric_hub, **config.forecast_model) + metric_hub = MetricHub(**config["metric_hub"]) + model = MODELS[model_type](metric_hub=metric_hub, **config["forecast_model"]) model.fit() model.predict() - model.summarize(**config.summarize) - model.write_results(**config.write_results) + model.summarize(**config["summarize"]) + model.write_results(**config["write_results"]) else: raise ValueError(f"Don't know how to forecast using {model_type}.") diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/model_inputs/__init__.py b/jobs/kpi-forecasting/kpi_forecasting/configs/model_inputs/__init__.py index 1ebd482e..caacc611 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/model_inputs/__init__.py +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/model_inputs/__init__.py @@ -3,15 +3,15 @@ from pathlib import Path -from kpi_forecasting.inputs import YAML +from kpi_forecasting.inputs import load_yaml PARENT_PATH = Path(__file__).parent HOLIDAY_PATH = PARENT_PATH / "holidays.yaml" REGRESSOR_PATH = PARENT_PATH / "regressors.yaml" -holiday_collection = YAML(HOLIDAY_PATH) -regressor_collection = YAML(REGRESSOR_PATH) +holiday_collection = load_yaml(HOLIDAY_PATH) +regressor_collection = load_yaml(REGRESSOR_PATH) @attr.s(auto_attribs=True, frozen=False) diff --git a/jobs/kpi-forecasting/kpi_forecasting/inputs.py 
b/jobs/kpi-forecasting/kpi_forecasting/inputs.py index 034af27a..14da5545 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/inputs.py +++ b/jobs/kpi-forecasting/kpi_forecasting/inputs.py @@ -2,7 +2,6 @@ import yaml from dataclasses import dataclass -from dotmap import DotMap @dataclass @@ -20,18 +19,10 @@ def __post_init__(self) -> None: self.args = self.parser.parse_args() -@dataclass -class YAML: +def load_yaml(filepath: str) -> dict: """ - Create a data structure from a YAML config filepath. Instead of loading the - YAML as a dictionary, which requires verbose code to access nested dictionary - values, this class loads YAML as a dot map. Nested values can be accessed using - dot notation, like `YAML().data.section.subsection.value`. + Create a data structure from a YAML config filepath. """ - - filepath: str - - def __post_init__(self) -> None: - with open(self.filepath, "r") as f: - data = yaml.safe_load(f) - self.data = DotMap(data) + with open(filepath, "r") as f: + data = yaml.safe_load(f) + return data diff --git a/jobs/kpi-forecasting/kpi_forecasting/metric_hub.py b/jobs/kpi-forecasting/kpi_forecasting/metric_hub.py index 64cf9d42..e0a86c83 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/metric_hub.py +++ b/jobs/kpi-forecasting/kpi_forecasting/metric_hub.py @@ -1,7 +1,6 @@ import pandas as pd from dataclasses import dataclass -from dotmap import DotMap from google.cloud import bigquery from mozanalysis.config import ConfigLoader from textwrap import dedent @@ -36,7 +35,7 @@ class MetricHub: app_name: str slug: str start_date: str - segments: DotMap = None + segments: dict = None where: str = None end_date: str = None alias: str = None diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index ed958518..45c567d2 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -71,7 +71,7 @@ def 
__post_init__(self) -> None: self.metadata_params = json.dumps( { "model_type": self.model_type.lower(), - "model_params": self.parameters.toDict(), + "model_params": self.parameters, "use_holidays": self.use_holidays, } ) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py index 9652ce02..52aa9cc8 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py @@ -117,22 +117,16 @@ def _set_segment_models( ## file. Parse the holidays and regressors specified in the config file. segment_models = [] for segment in segment_combinations: - model_params = getattr( - self.parameters["segment_settings"], segment[split_dim] - ) + model_params = self.parameters["segment_settings"][segment[split_dim]] holiday_list = [] regressor_list = [] if model_params["holidays"]: - holiday_list = [ - getattr(holiday_collection.data, h) - for h in model_params["holidays"] - ] + holiday_list = [holiday_collection[h] for h in model_params["holidays"]] if model_params["regressors"]: regressor_list = [ - getattr(regressor_collection.data, r) - for r in model_params["regressors"] + regressor_collection[r] for r in model_params["regressors"] ] # Create a SegmentModelSettings object for each segment combination diff --git a/jobs/kpi-forecasting/kpi_forecasting/results_processing.py b/jobs/kpi-forecasting/kpi_forecasting/results_processing.py index f7e8ab88..1cb8a9d1 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/results_processing.py +++ b/jobs/kpi-forecasting/kpi_forecasting/results_processing.py @@ -4,7 +4,7 @@ from google.cloud import bigquery from google.cloud.bigquery.enums import SqlTypeNames as bq_types -from kpi_forecasting.inputs import YAML +from kpi_forecasting.inputs import load_yaml import pandas as pd import numpy as np @@ -74,12 +74,13 @@ def _set_intra_forecast_agg_functions(self): def 
_load_config_data(self): """Extracts data from the list of config files passed to the class and stores it in the - config_data attribute. The filename is the key, and the contents (represnted as a DotMap) + config_data attribute. The filename is the key, and the contents are the values""" self.config_data = {} for config_file in self.input_config_list: full_path = f"{self.input_config_path}/{config_file}" - config_data = YAML(full_path).data + config_data = load_yaml(full_path) + print(config_data) self.config_data[config_file] = config_data def _extract_config_data(self): @@ -99,7 +100,7 @@ def _extract_config_data(self): config_file_list = list(self.config_data.keys()) for config_data in self.config_data.values(): # get segment data - metric_hub_data = config_data.metric_hub.toDict() + metric_hub_data = config_data["metric_hub"] if "segments" in metric_hub_data: segment_data = metric_hub_data["segments"] segment_data_list.append(segment_data) @@ -107,7 +108,7 @@ def _extract_config_data(self): segment_data_list.append(None) # get input table info - input_table_list.append(config_data.write_results.toDict()) + input_table_list.append(config_data["write_results"]) input_table_data = input_table_list.pop(0) input_table_matches_first = [input_table_data == el for el in input_table_list] diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py index 17ce4d27..97c5b229 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py @@ -3,7 +3,6 @@ import pytest import pandas as pd -from dotmap import DotMap import numpy as np from datetime import datetime, timedelta, timezone @@ -81,7 +80,7 @@ def test_post_init(good_class): end_date = "2124-02-02" good_class = good_class( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=start_date, end_date=end_date, @@ 
-101,7 +100,7 @@ def test_post_init_default_dates(good_class): # check default start and end time good_class = good_class( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date="", end_date="", @@ -122,7 +121,7 @@ def test_post_init_default_dates(good_class): def test_fit(good_class): good_class = good_class( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date="2124-01-01", end_date="2124-02-02", @@ -138,7 +137,7 @@ def test_fit(good_class): def test_predict_and_validate(good_class): good_class = good_class( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date="2124-01-01", end_date="2124-02-02", @@ -153,7 +152,7 @@ def test_predict_and_validate(good_class): def test_summarize(good_class): good_class = good_class( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date="2124-01-01", end_date="2124-02-02", diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py index c792db67..f7f14184 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py @@ -3,7 +3,6 @@ import collections import pandas as pd -from dotmap import DotMap import pytest import numpy as np @@ -21,7 +20,7 @@ def forecast(): forecast = FunnelForecast( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -82,13 +81,12 @@ def funnel_forecast_for_fit_tests(segment_info_fit_tests, mocker): }, } - parameter_dotmap = DotMap(parameter_dict) predict_start_date = "2124-01-01" predict_end_date = "2124-01-02" forecast = FunnelForecast( model_type="test", - parameters=parameter_dotmap, + parameters=parameter_dict, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -507,13 
+505,12 @@ def test_under_predict(mocker): }, } - parameter_dotmap = DotMap(parameter_dict) predict_start_date = "2124-01-02" predict_end_date = "2124-03-01" forecast = FunnelForecast( model_type="test", - parameters=parameter_dotmap, + parameters=parameter_dict, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -871,13 +868,12 @@ def test_set_segment_models(): }, } - parameter_dotmap = DotMap(parameter_dict) predict_start_date = "2124-01-01" predict_end_date = "2124-03-01" forecast = FunnelForecast( model_type="test", - parameters=parameter_dotmap, + parameters=parameter_dict, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -950,13 +946,12 @@ def test_set_segment_models_exception(): }, } - parameter_dotmap = DotMap(parameter_dict) predict_start_date = "2124-01-01" predict_end_date = "2124-03-01" forecast = FunnelForecast( model_type="test", - parameters=parameter_dotmap, + parameters=parameter_dict, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_performance_analysis.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_performance_analysis.py index edbc2cbb..3e4f0120 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_performance_analysis.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_performance_analysis.py @@ -58,6 +58,7 @@ def directory_of_configs(tmp_path_factory): "dataset": "y", "table": "z", }, + "metric_hub": {}, } f4 = tmpdir / "config_nosegments1_1.yaml" f5 = tmpdir / "config_nosegments1_2.yaml" @@ -73,6 +74,7 @@ def directory_of_configs(tmp_path_factory): "dataset": "q", "table": "z", }, + "metric_hub": {}, } f6 = tmpdir / "config_nosegments2_1.yaml" @@ -91,6 +93,7 @@ def get_forecast_performance_config(tmp_path_factory): "dataset": "", "table": "", }, + "metric_hub": {}, } f1 = tmpdir / "config.yaml" with open(f1, "w") as outfile: diff --git 
a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py index ce372cf6..f86b032c 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py @@ -1,5 +1,4 @@ import pandas as pd -from dotmap import DotMap import numpy as np import pytest import collections @@ -25,12 +24,11 @@ def forecast(): }, } - parameter_dotmap = DotMap(parameter_dict) predict_start_date = "2124-01-02" predict_end_date = "2124-03-01" return ProphetForecast( model_type="test", - parameters=parameter_dotmap, + parameters=parameter_dict, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -548,7 +546,7 @@ def test_summarize_non_overlapping_day(): forecast = ProphetForecast( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -651,7 +649,7 @@ def test_summarize_non_overlapping_month(): forecast = ProphetForecast( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -770,7 +768,7 @@ def test_summarize_overlapping_day(): forecast = ProphetForecast( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -875,7 +873,7 @@ def test_summarize_overlapping_month(): forecast = ProphetForecast( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, diff --git a/jobs/kpi-forecasting/requirements.txt b/jobs/kpi-forecasting/requirements.txt index 218d688a..cae076e6 100644 --- a/jobs/kpi-forecasting/requirements.txt +++ b/jobs/kpi-forecasting/requirements.txt @@ -10,7 +10,6 @@ contourpy==1.1.0 convertdate==2.4.0 cycler==0.11.0 db-dtypes==1.1.1 -dotmap==1.3.30 ephem==4.1.4 
exceptiongroup==1.1.1 fonttools==4.40.0 From fd1435b74b2dee6edbdb55faafb5fc670b34c138 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Fri, 9 Aug 2024 13:10:42 -0500 Subject: [PATCH 21/33] modified README to make it match better between FunnelForecast and ProphetForecast --- jobs/kpi-forecasting/README.md | 83 +++++++++- .../kpi_forecasting/configs/dau_desktop.yaml | 2 + .../kpi_forecasting/configs/dau_mobile.yaml | 2 + .../configs/search_forecasting_ad_clicks.yaml | 72 +++++---- ...search_forecasting_daily_active_users.yaml | 66 ++++---- .../search_forecasting_search_count.yaml | 66 ++++---- .../kpi_forecasting/models/base_forecast.py | 20 ++- .../kpi_forecasting/models/funnel_forecast.py | 70 ++++---- .../models/prophet_forecast.py | 10 ++ .../tests/test_funnel_forecast.py | 151 +++++++++--------- 10 files changed, 326 insertions(+), 216 deletions(-) diff --git a/jobs/kpi-forecasting/README.md b/jobs/kpi-forecasting/README.md index 31231cf8..ff1a6ed8 100644 --- a/jobs/kpi-forecasting/README.md +++ b/jobs/kpi-forecasting/README.md @@ -85,8 +85,87 @@ The tests can be run locally with `python -m pytest` in the root directory of th # YAML Configs -Each of the sections in the YAML files contains a list of arguments that are passed to their relevant objects or methods. -Definitions should be documented in the code. +Configuration for each forecast is found in the `configs` folder. 
Below is an example config file with sample values and a description of what the field means as a comment when it is not self-evident + +``` +metric_hub: # this configures the observed data fed to the model which is obtained via metrichub + app_name: "multi_product" # metric-hub app name + slug: "search_forecasting_ad_clicks" # metric-hub slug + alias: "search_forecasting_ad_clicks" # metric-hub alias + start_date: "2018-01-01" # date at which the observed data should start + end_date: "last complete month" + # date at which the observed data will end, can be a date or "last complete month" + # which uses `utils.parse_end_date` to determine the last complete month + segments: + # this section is optional and currently only used in funnel forecast, + # specifies which segments are used to partition the data, + # enabling separate models to be fit for each partition. + # Values underneath are a map of column names to be output by the + # metric-hub call and the SQL queries to populate those columns + device: "device" + channel: "'all'" + country: "CASE WHEN country = 'US' THEN 'US' ELSE 'ROW' END" + partner: "partner" + where: "partner = 'Google'" # filter to apply to the metric hub pull + +forecast_model: # this section configures the model + model_type: "funnel" + # type of model object to use, current options are "funnel" for FunnelForecast and "prophet" for ProphetForecast + start_date: NULL + # starting date for the predicted data (unless predict_historical_dates is set), + # if unset, value depends on predict_historical_dates.
+ end_date: NULL + # final date for the predicted data + use_holidays: False + # For prophet-based models, when true, call `model.add_country_holidays(country_name="US")` on model + predict_historical_dates: True + # if predict_historical_dates is True, set to first date of the observed data + # if predict_historical_dates is False, defaults to the day after the last day in the observed data + number_of_simulations: 1000 + # for prophet-based models, number of simulations to run + parameters: + # this section can be a map or a list. + # If it's a map, these parameters are used for all models + # (recall multiple models are trained if there is a metric_hub.segments) + # If it's a list, it will set different parameters + # for different subsets of the partition specified in `metric_hub.segments`. + - segment: + # specifies which subset of the partitions this applies to + # key is a column specified in metric_hub.segments + # value is a value that column can take to which the configuration is applied + device: desktop + start_date: "2018-01-01" # only applies to FunnelForecast, allows one to set start date for each sub-model + end_date: NULL # only applies to FunnelForecast, allows one to set end date for each sub-model + holidays: ["easter", "covid_sip11"] # holidays specified in `configs.model_inputs.holidays` to use. + regressors: ["post_esr_migration", "in_covid", "ad_click_bug"] # regressors specified in `configs.model_inputs.regressors` + grid_parameters: + # sets grid for hyperparameter tuning + changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] + changepoint_range: [0.8, 0.9] + n_changepoints: [25, 50] + weekly_seasonality: True + yearly_seasonality: True + cv_settings: + # sets parameters for prophet cross-validation used in FunnelForecast + initial: "1296 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" + ...
+ +summarize: + # parameters used to summarize and aggregate the predictions + periods: ["day", "month"] # periods to aggregate up to + numpy_aggregations: ["mean"] # numpy aggregation functions to use when aggregating predictions + percentiles: [10, 50, 90] # percentiles to calculate on aggregation + +write_results: + # set the project, dataset and table for output data + project: "moz-fx-data-shared-prod" + dataset: "search_derived" + table: "search_funnel_forecasts_v1" + components_table: "search_forecast_model_components_v1" +``` # Development diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml index 5ba432ea..83e80ab9 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml @@ -11,6 +11,8 @@ forecast_model: start_date: NULL end_date: NULL use_holidays: False + predict_historical_dates: False + number_of_simulations: 1000 parameters: seasonality_prior_scale: 0.00825 changepoint_prior_scale: 0.15983 diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml index 74889971..a3a9f3eb 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml @@ -11,6 +11,8 @@ forecast_model: start_date: NULL end_date: NULL use_holidays: True + predict_historical_dates: False + number_of_simulations: 1000 parameters: seasonality_prior_scale: 0.01 changepoint_prior_scale: 0.01 diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml index a756b518..ea8a2a64 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml @@ -17,42
+17,44 @@ forecast_model: start_date: NULL end_date: NULL use_holidays: False + predict_historical_dates: True + number_of_simulations: 1000 parameters: - model_setting_split_dim: "device" - segment_settings: - desktop: - start_date: "2018-01-01" - end_date: NULL - holidays: ["easter", "covid_sip11"] - regressors: ["post_esr_migration", "in_covid", "ad_click_bug"] - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] - changepoint_range: [0.8, 0.9] - n_changepoints: [25, 50] - weekly_seasonality: True - yearly_seasonality: True - cv_settings: - initial: "1296 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" - mobile: - start_date: "2022-01-01" - end_date: NULL - holidays: ["easter"] - regressors: ["after_fenix", "in_covid"] - grid_parameters: - changepoint_prior_scale: [.01, .1, .15, .2] - changepoint_range: [0.8, 0.9, 1] - n_changepoints: [30] - weekly_seasonality: True - yearly_seasonality: True - growth: "logistic" - cv_settings: - initial: "366 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" + - segment: + device: desktop + start_date: "2018-01-01" + end_date: NULL + holidays: ["easter", "covid_sip11"] + regressors: ["post_esr_migration", "in_covid", "ad_click_bug"] + grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] + changepoint_range: [0.8, 0.9] + n_changepoints: [25, 50] + weekly_seasonality: True + yearly_seasonality: True + cv_settings: + initial: "1296 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" + - segment: + device: mobile + start_date: "2022-01-01" + end_date: NULL + holidays: ["easter"] + regressors: ["after_fenix", "in_covid"] + grid_parameters: + changepoint_prior_scale: [.01, .1, .15, .2] + changepoint_range: [0.8, 0.9, 1] + n_changepoints: [30] + weekly_seasonality: True + yearly_seasonality: True + growth: "logistic" + cv_settings: + initial: "366 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" summarize: 
periods: ["day", "month"] diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml index b6643c4a..3ce3568e 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml @@ -17,39 +17,41 @@ forecast_model: start_date: NULL end_date: NULL use_holidays: False + predict_historical_dates: True + number_of_simulations: 1000 parameters: - model_setting_split_dim: "device" - segment_settings: - desktop: - start_date: "2018-01-01" - end_date: NULL - holidays: ["easter", "covid_sip11"] - regressors: ["post_esr_migration", "in_covid"] - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] - changepoint_range: [0.8, 0.9] - weekly_seasonality: True - yearly_seasonality: True - cv_settings: - initial: "1296 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" - mobile: - start_date: "2021-01-01" - end_date: NULL - holidays: ["easter"] - regressors: ["after_fenix", "in_covid"] - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1] - weekly_seasonality: True - yearly_seasonality: True - growth: "logistic" - cv_settings: - initial: "366 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" + - segment: + device: desktop + start_date: "2018-01-01" + end_date: NULL + holidays: ["easter", "covid_sip11"] + regressors: ["post_esr_migration", "in_covid"] + grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] + changepoint_range: [0.8, 0.9] + weekly_seasonality: True + yearly_seasonality: True + cv_settings: + initial: "1296 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" + - segment: + device: mobile + start_date: "2021-01-01" + end_date: NULL + holidays: ["easter"] + regressors: ["after_fenix", "in_covid"] + 
grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1] + weekly_seasonality: True + yearly_seasonality: True + growth: "logistic" + cv_settings: + initial: "366 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" summarize: periods: ["day", "month"] diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml index 8dd8f811..75f73ba2 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml @@ -17,39 +17,41 @@ forecast_model: start_date: NULL end_date: NULL use_holidays: False + predict_historical_dates: True + number_of_simulations: 1000 parameters: - model_setting_split_dim: "device" - segment_settings: - desktop: - start_date: "2018-01-01" - end_date: NULL - holidays: ["easter", "covid_sip11"] - regressors: ["post_esr_migration", "in_covid"] - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] - changepoint_range: [0.8, 0.9] - weekly_seasonality: True - yearly_seasonality: True - cv_settings: - initial: "1296 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" - mobile: - start_date: "2020-01-01" - end_date: NULL - holidays: ["easter"] - regressors: ["after_fenix", "in_covid"] - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1] - weekly_seasonality: True - yearly_seasonality: True - growth: "logistic" - cv_settings: - initial: "366 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" + - segment: + device: desktop + start_date: "2018-01-01" + end_date: NULL + holidays: ["easter", "covid_sip11"] + regressors: ["post_esr_migration", "in_covid"] + grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] + changepoint_range: [0.8, 0.9] + weekly_seasonality: True + yearly_seasonality: True + cv_settings: + initial: 
"1296 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" + - segment: + device: mobile + start_date: "2020-01-01" + end_date: NULL + holidays: ["easter"] + regressors: ["after_fenix", "in_covid"] + grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1] + weekly_seasonality: True + yearly_seasonality: True + growth: "logistic" + cv_settings: + initial: "366 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" summarize: periods: ["day", "month"] diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index 45c567d2..916c3f07 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -29,9 +29,6 @@ class BaseForecast(abc.ABC): date the metric should be queried. metric_hub (MetricHub): A MetricHub object that provides details about the metric to be forecasted. - number_of_simulations (int): The number of simulated timeseries that the forecast - should generate. Since many forecast models are probablistic, this enables the - measurement of variation across a range of possible outcomes. 
""" model_type: str @@ -40,7 +37,7 @@ class BaseForecast(abc.ABC): start_date: str end_date: str metric_hub: MetricHub - number_of_simulations: int = 1000 + predict_historical_dates: bool = False def _get_observed_data(self): if self.metric_hub: @@ -58,9 +55,18 @@ def __post_init__(self) -> None: # use default start/end dates if the user doesn't specify them self.start_date = pd.to_datetime(self.start_date or self._default_start_date) self.end_date = pd.to_datetime(self.end_date or self._default_end_date) - self.dates_to_predict = pd.DataFrame( - {"submission_date": pd.date_range(self.start_date, self.end_date).date} - ) + if self.predict_historical_dates: + self.dates_to_predict = pd.DataFrame( + { + "submission_date": pd.date_range( + self.metric_hub.start_date, self.end_date + ).date + } + ) + else: + self.dates_to_predict = pd.DataFrame( + {"submission_date": pd.date_range(self.start_date, self.end_date).date} + ) # initialize unset attributes self.model = None diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py index 52aa9cc8..aa4a4fb8 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py @@ -67,15 +67,6 @@ def __post_init__(self) -> None: # this is used to avoid the code below for testing purposes return - # Overwrite dates_to_predict to provide historical date forecasts - self.dates_to_predict = pd.DataFrame( - { - "submission_date": pd.date_range( - self.metric_hub.start_date, self.end_date - ).date - } - ) - self._set_segment_models(self.observed_df, self.metric_hub.segments.keys()) # initialize unset attributes @@ -85,10 +76,10 @@ def _set_segment_models( self, observed_df: pd.DataFrame, segment_column_list: list ) -> None: """Creates a SegmentSettings object for each segment specified in the - metric_hub.segments section of the config. 
These objects are stored in a list - in the segment_models attribute - Parameters can be specified independently for at most one dimension column - set using model_setting_split_dim in self.parameters + metric_hub.segments section of the config. It is populated from the list of + parameters in the forecast_model.parameters section of the configuration file. + The segments section of each element of the list specifies which values within which + segments the parameters are associated with. Args: observed_df (pd.DataFrame): dataframe containing observed data used to model @@ -100,45 +91,64 @@ def _set_segment_models( combination_df = observed_df[segment_column_list].drop_duplicates() # Construct dictionaries from those combinations + # this will be used to check that the config actually partitions the data segment_combinations = combination_df.to_dict("records") - # initialize a list to hold models for each segment - ## populate the list with segments and parameters for the segment - split_dim = self.parameters["model_setting_split_dim"] - - # check to make sure split_dim is one of the columns set in segment_column_list - if split_dim not in segment_column_list: - columns_str = ",".join(segment_column_list) + # get subset of segment that is used in partitioning + split_dims = None + for partition in self.parameters: + partition_dim = set(partition["segment"].keys()) + if split_dims and partition_dim != split_dims: + raise ValueError( + "Segment keys are not the same across different elements of parameters in the config file" + ) + elif split_dims is None: + split_dims = partition_dim + else: + # this is case where split_dim is set and matches partition_dim + continue + if not split_dims <= set(combination_df.keys()): + missing_dims = split_dims - set(combination_df.keys()) + missing_dims_str = ",".join(missing_dims) raise ValueError( - f"model_setting_split_dim set to {split_dim} which is not among segment columns: {columns_str}" + f"Segment keys missing from
metric hub segments: {missing_dims_str}" ) # For each segment combinination, get the model parameters from the config ## file. Parse the holidays and regressors specified in the config file. segment_models = [] for segment in segment_combinations: - model_params = self.parameters["segment_settings"][segment[split_dim]] - + # find the correct configuration + for partition in self.parameters: + partition_segment = partition["segment"] + # get subset of segment that is used to partition + subset_segment = { + key: val for key, val in segment.items() if key in split_dims + } + if partition_segment == subset_segment: + # parition is set to the desired value + # break out of loop + break holiday_list = [] regressor_list = [] - if model_params["holidays"]: - holiday_list = [holiday_collection[h] for h in model_params["holidays"]] - if model_params["regressors"]: + if "holidays" in partition: + holiday_list = [holiday_collection[h] for h in partition["holidays"]] + if "regressors" in partition: regressor_list = [ - regressor_collection[r] for r in model_params["regressors"] + regressor_collection[r] for r in partition["regressors"] ] # Create a SegmentModelSettings object for each segment combination segment_models.append( SegmentModelSettings( segment=segment, - start_date=model_params["start_date"], + start_date=partition["start_date"], end_date=self.end_date, holidays=[ProphetHoliday(**h) for h in holiday_list], regressors=[ProphetRegressor(**r) for r in regressor_list], - grid_parameters=dict(model_params["grid_parameters"]), - cv_settings=dict(model_params["cv_settings"]), + grid_parameters=dict(partition["grid_parameters"]), + cv_settings=dict(partition["cv_settings"]), ) ) self.segment_models = segment_models diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py index 19f57e1d..30d152b3 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py +++ 
b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py @@ -16,6 +16,16 @@ @dataclass class ProphetForecast(BaseForecast): + """Forecast object specifically for prophet forecast models + + Additional attributes: + number_of_simulations (int): The number of simulated timeseries that the forecast + should generate. Since many forecast models are probablistic, this enables the + measurement of variation across a range of possible outcomes. + """ + + number_of_simulations: int = 1000 + @property def column_names_map(self) -> Dict[str, str]: return {"submission_date": "ds", "value": "y"} diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py index f7f14184..885e2b52 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py @@ -59,34 +59,33 @@ def funnel_forecast_for_fit_tests(segment_info_fit_tests, mocker): """This method creates a forecast object from the segment dict created in the segment_info_fit_tests fixture. 
It also mocks some of the object methods to enable easier testing""" - parameter_dict = { - "model_setting_split_dim": "a", - "segment_settings": { - "A1": { - "start_date": segment_info_fit_tests["A1"]["start_date"], - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": segment_info_fit_tests["A1"]["grid_parameters"], - "cv_settings": {}, - }, - "A2": { - "start_date": segment_info_fit_tests["A2"]["start_date"], - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": segment_info_fit_tests["A2"]["grid_parameters"], - "cv_settings": {}, - }, + parameter_list = [ + { + "segment": {"a": "A1"}, + "start_date": segment_info_fit_tests["A1"]["start_date"], + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": segment_info_fit_tests["A1"]["grid_parameters"], + "cv_settings": {}, }, - } + { + "segment": {"a": "A2"}, + "start_date": segment_info_fit_tests["A2"]["start_date"], + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": segment_info_fit_tests["A2"]["grid_parameters"], + "cv_settings": {}, + }, + ] predict_start_date = "2124-01-01" predict_end_date = "2124-01-02" forecast = FunnelForecast( model_type="test", - parameters=parameter_dict, + parameters=parameter_list, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -491,26 +490,24 @@ def test_under_predict(mocker): # set segment models # 2124-01-01 chosen as a artibrary date to center tests on A1_start_date = "2124-01-01" - parameter_dict = { - "model_setting_split_dim": "a", - "segment_settings": { - "A1": { - "start_date": A1_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {"param1": [1, 2], "param2": [20, 10]}, - "cv_settings": {}, - }, - }, - } + parameter_list = [ + { + "segment": {"a": "A1"}, + "start_date": A1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {"param1": [1, 2], "param2": [20, 10]}, + 
"cv_settings": {}, + } + ] predict_start_date = "2124-01-02" predict_end_date = "2124-03-01" forecast = FunnelForecast( model_type="test", - parameters=parameter_dict, + parameters=parameter_list, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -846,34 +843,33 @@ def test_set_segment_models(): """test the set_segment_models method""" A1_start_date = "2018-01-01" A2_start_date = "2020-02-02" - parameter_dict = { - "model_setting_split_dim": "a", - "segment_settings": { - "A1": { - "start_date": A1_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {}, - "cv_settings": {}, - }, - "A2": { - "start_date": A2_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {}, - "cv_settings": {}, - }, + parameter_list = [ + { + "segment": {"a": "A1"}, + "start_date": A1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, }, - } + { + "segment": {"a": "A2"}, + "start_date": A2_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, + }, + ] predict_start_date = "2124-01-01" predict_end_date = "2124-03-01" forecast = FunnelForecast( model_type="test", - parameters=parameter_dict, + parameters=parameter_list, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -924,34 +920,33 @@ def test_set_segment_models_exception(): is specified that isn't in the data""" A1_start_date = "2018-01-01" A2_start_date = "2020-02-02" - parameter_dict = { - "model_setting_split_dim": "c", # not in data - "segment_settings": { - "A1": { - "start_date": A1_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {}, - "cv_settings": {}, - }, - "A2": { - "start_date": A2_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {}, - "cv_settings": {}, - }, + parameter_list = [ + { + 
"segment": {"c": "A1"}, + "start_date": A1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, }, - } + { + "segment": {"c": "A2"}, + "start_date": A2_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, + }, + ] predict_start_date = "2124-01-01" predict_end_date = "2124-03-01" forecast = FunnelForecast( model_type="test", - parameters=parameter_dict, + parameters=parameter_list, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -966,7 +961,7 @@ def test_set_segment_models_exception(): with pytest.raises( ValueError, - match="model_setting_split_dim set to c which is not among segment columns: a,b", + match="Segment keys missing from metric hub segments: c", ): forecast._set_segment_models( observed_df=observed_data, segment_column_list=segment_list From f551f4c7fd2f35b57456d169f3122cf1ff9c7d1e Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Fri, 9 Aug 2024 16:47:29 -0500 Subject: [PATCH 22/33] Update jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py Co-authored-by: Brad Ochocki Szasz --- jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py | 1 - 1 file changed, 1 deletion(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index ed958518..dcf64b91 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -215,7 +215,6 @@ def summarize( summary_df["forecast_end_date"] = self.end_date summary_df["forecast_trained_at"] = self.trained_at summary_df["forecast_predicted_at"] = self.predicted_at - summary_df["forecast_parameters"] = self.metadata_params self.summary_df = summary_df From 1a63912afd4ebd7b5304ec243a1b9a98a2ad7e98 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Fri, 9 Aug 2024 16:53:42 -0500 Subject: [PATCH 
23/33] Brad easy fixes --- .../kpi_forecasting/tests/test_prophet_forecast.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py index ce372cf6..59420cc4 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py @@ -227,8 +227,6 @@ def test_combine_forecast_observed(mocker, forecast): assert set(expected.columns) == set(output_df.columns) # force value columns to be floats in both cases to make check easier numeric_cols = ["value", "value_low", "value_mid", "value_high"] - # expected[numeric_cols] = expected[numeric_cols].astype(float) - # output_df[numeric_cols] = output_df[numeric_cols].astype(float) pd.testing.assert_frame_equal( output_df.sort_values(["submission_date", "measure"]).reset_index(drop=True), expected[output_df.columns].reset_index(drop=True), @@ -320,8 +318,6 @@ def test_under_summarize(mocker, forecast): assert set(expected.columns) == set(output_df.columns) # force value columns to be floats in both cases to make check easier numeric_cols = ["value", "value_low", "value_mid", "value_high"] - # expected[numeric_cols] = expected[numeric_cols].astype(float) - # output_df[numeric_cols] = output_df[numeric_cols].astype(float) pd.testing.assert_frame_equal( output_df.sort_values(["submission_date", "measure"]).reset_index(drop=True), expected[output_df.columns].reset_index(drop=True), @@ -343,7 +339,7 @@ def test_summarize(mocker, forecast): dummy_metric_hub = MetricHub("", "", "", "2124-01-01", "2124-01-01") - # 2024-01-01 is chosen as an arbitrary date to center the tests around + # 2124-01-01 is chosen as an arbitrary date to center the tests around # forecast predictions are set with the # mock_aggregate_forecast_observed function so they From 6a8c90cd5d4dc2478386ae502fb9d5f33702941b Mon Sep 17 
00:00:00 2001 From: Jared Snyder Date: Mon, 12 Aug 2024 13:21:29 -0500 Subject: [PATCH 24/33] remove magic year --- .../tests/test_base_forecast.py | 41 ++- .../tests/test_funnel_forecast.py | 333 ++++++++++-------- .../tests/test_prophet_forecast.py | 187 ++++++---- 3 files changed, 320 insertions(+), 241 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py index 17ce4d27..19a2db9d 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py @@ -1,15 +1,25 @@ from typing import List import collections +from datetime import date, datetime +from dateutil.relativedelta import relativedelta import pytest import pandas as pd from dotmap import DotMap import numpy as np -from datetime import datetime, timedelta, timezone +from datetime import timedelta, timezone from kpi_forecasting.models.base_forecast import BaseForecast +# Arbitrarily choose some date to use for the tests +TEST_DATE = date(2024, 1, 1) +TEST_DATE_STR = TEST_DATE.strftime("%Y-%m-%d") +TEST_DATE_NEXT_DAY = date(2024, 1, 2) +TEST_DATE_NEXT_DAY_STR = TEST_DATE_NEXT_DAY.strftime("%Y-%m-%d") +TEST_PREDICT_END = TEST_DATE + relativedelta(months=2) +TEST_PREDICT_END_STR = TEST_PREDICT_END.strftime("%Y-%m-%d") + class BadClass(BaseForecast): pass @@ -30,8 +40,9 @@ def _get_observed_data(self): self.observed_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2020-01-01"), - pd.to_datetime("1990-01-01"), + TEST_DATE, + TEST_DATE + - relativedelta(years=1), # just an arbitrary date in the past ] } ) @@ -77,8 +88,8 @@ def test_not_implemented(): def test_post_init(good_class): - start_date = "2124-01-01" - end_date = "2124-02-02" + start_date = TEST_DATE_STR + end_date = TEST_PREDICT_END_STR good_class = good_class( model_type="test", parameters=DotMap(), @@ -109,7 +120,7 @@ def 
test_post_init_default_dates(good_class): ) # this is the max date of the self.observed_data['submission_date'] plus one day # from the object definion - start_date = pd.to_datetime("2020-01-02") + start_date = TEST_DATE_NEXT_DAY end_date = ( datetime.now(timezone.utc).replace(tzinfo=None) + timedelta(weeks=78) ).date() @@ -124,15 +135,15 @@ def test_fit(good_class): model_type="test", parameters=DotMap(), use_holidays=None, - start_date="2124-01-01", - end_date="2124-02-02", + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, metric_hub=None, ) good_class.fit() assert good_class.model - # - assert good_class.model.is_fit == pd.to_datetime("2020-01-01") + # model sets is_fit to the largest day in the observed data + assert good_class.model.is_fit == TEST_DATE def test_predict_and_validate(good_class): @@ -140,8 +151,8 @@ def test_predict_and_validate(good_class): model_type="test", parameters=DotMap(), use_holidays=None, - start_date="2124-01-01", - end_date="2124-02-02", + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, metric_hub=None, ) # overwrite date range set in __post_init__ @@ -155,8 +166,8 @@ def test_summarize(good_class): model_type="test", parameters=DotMap(), use_holidays=None, - start_date="2124-01-01", - end_date="2124-02-02", + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, metric_hub=None, ) good_class.forecast_df = np.array([1, 2]) @@ -166,7 +177,7 @@ def test_summarize(good_class): ["alias", "app_name", "slug", "min_date", "max_date"], ) - dummy_metric_hub = MetricHub("", "", "", "2124-01-01", "2124-01-01") + dummy_metric_hub = MetricHub("", "", "", TEST_DATE_STR, TEST_DATE_STR) # add it here rather than in __init__ so it doesn't try to load data good_class.metric_hub = dummy_metric_hub diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py index c792db67..a8f865b5 100644 --- 
a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py @@ -1,6 +1,8 @@ """tests for the funnel forecast module""" import collections +from datetime import date, datetime +from dateutil.relativedelta import relativedelta import pandas as pd from dotmap import DotMap @@ -11,13 +13,21 @@ from kpi_forecasting.configs.model_inputs import ProphetRegressor, ProphetHoliday from kpi_forecasting.models.funnel_forecast import SegmentModelSettings, FunnelForecast +# Arbitrarily choose some date to use for the tests +TEST_DATE = date(2024, 1, 1) +TEST_DATE_STR = TEST_DATE.strftime("%Y-%m-%d") +TEST_DATE_NEXT_DAY = date(2024, 1, 2) +TEST_DATE_NEXT_DAY_STR = TEST_DATE_NEXT_DAY.strftime("%Y-%m-%d") +TEST_PREDICT_END = TEST_DATE + relativedelta(months=2) +TEST_PREDICT_END_STR = TEST_PREDICT_END.strftime("%Y-%m-%d") + @pytest.fixture() def forecast(): """This mocks a generic forecast object""" # 2024-01-01 is arbitarily chosen as a future date - predict_start_date = "2124-01-01" - predict_end_date = "2124-03-01" + predict_start_date = TEST_DATE_STR + predict_end_date = TEST_PREDICT_END_STR forecast = FunnelForecast( model_type="test", @@ -37,8 +47,8 @@ def segment_info_fit_tests(): in the functions that test fit methods""" # 2024-01-01 is arbitarily chosen as a future date - A1_start_date = "2124-01-01" - A2_start_date = "2124-01-02" + A1_start_date = TEST_DATE_STR + A2_start_date = TEST_DATE_NEXT_DAY_STR segment_info_dict = { "A1": { @@ -83,9 +93,8 @@ def funnel_forecast_for_fit_tests(segment_info_fit_tests, mocker): } parameter_dotmap = DotMap(parameter_dict) - predict_start_date = "2124-01-01" - predict_end_date = "2124-01-02" - + predict_start_date = TEST_DATE_STR + predict_end_date = TEST_DATE_NEXT_DAY_STR forecast = FunnelForecast( model_type="test", parameters=parameter_dotmap, @@ -178,8 +187,8 @@ def test_combine_forecast_observed(mocker, forecast): forecast_df = pd.DataFrame( { 
"submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -187,8 +196,8 @@ def test_combine_forecast_observed(mocker, forecast): observed_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "a": ["A1", "A1"], "value": [5, 6], @@ -238,8 +247,8 @@ def test_under_summarize(mocker, forecast): forecast_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -249,11 +258,11 @@ def test_under_summarize(mocker, forecast): observed_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2123-01-01").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "a": ["A1", "A1", "A1", "A2", "A2"], "value": [10, 20, 30, 40, 50], @@ -265,7 +274,7 @@ def test_under_summarize(mocker, forecast): ["start_date", "forecast_df", "segment", "trained_parameters"], ) dummy_segment_settings = SegmentSettings( - start_date="2124-01-01", + start_date=TEST_DATE_STR, forecast_df=forecast_df.copy(), segment={"a": "A1"}, trained_parameters={"trained_parameters": "yes"}, @@ -288,8 +297,8 @@ def test_under_summarize(mocker, forecast): observed_expected_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "a": ["A1", "A1"], "value": [20, 30], @@ -334,7 +343,7 @@ def test_summarize(mocker, forecast): ["alias", "app_name", "slug", "min_date", "max_date"], ) - dummy_metric_hub = MetricHub("", "", "", "2124-01-01", "2124-01-01") + dummy_metric_hub = MetricHub("", "", "", TEST_DATE_STR, 
TEST_DATE_STR) # forecast predictions are set with the # mock_aggregate_forecast_observed function so they @@ -342,8 +351,8 @@ def test_summarize(mocker, forecast): forecast_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -353,11 +362,11 @@ def test_summarize(mocker, forecast): observed_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2123-01-01").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "a": ["A1", "A1", "A1", "A2", "A2"], "value": [10, 20, 30, 40, 50], @@ -373,7 +382,7 @@ def test_summarize(mocker, forecast): # we're only testing that it is concatenated properly # with the segment data added dummy_segment_settings_A1 = SegmentSettings( - start_date="2124-01-01", + start_date=TEST_DATE_STR, forecast_df=forecast_df.copy(), segment={"a": "A1"}, trained_parameters={"trained_parameters": "yes"}, @@ -381,7 +390,7 @@ def test_summarize(mocker, forecast): ) dummy_segment_settings_A2 = SegmentSettings( - start_date="2124-01-01", + start_date=TEST_DATE_STR, forecast_df=forecast_df.copy(), segment={"a": "A2"}, trained_parameters={"trained_parameters": "yes"}, @@ -418,10 +427,10 @@ def test_summarize(mocker, forecast): observed_expected_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "a": ["A1", "A1", "A2", "A2"], "value": [20, 30, 40, 50], @@ -491,8 +500,8 @@ def test_summarize(mocker, forecast): def test_under_predict(mocker): """testing _predict""" # set segment models - # 2124-01-01 chosen as a 
artibrary date to center tests on - A1_start_date = "2124-01-01" + + A1_start_date = TEST_DATE_STR parameter_dict = { "model_setting_split_dim": "a", "segment_settings": { @@ -508,8 +517,8 @@ def test_under_predict(mocker): } parameter_dotmap = DotMap(parameter_dict) - predict_start_date = "2124-01-02" - predict_end_date = "2124-03-01" + predict_start_date = TEST_DATE_NEXT_DAY_STR + predict_end_date = TEST_PREDICT_END_STR forecast = FunnelForecast( model_type="test", @@ -535,8 +544,8 @@ def test_under_predict(mocker): "b": ["B1", "B2"], "y": [0, 1], "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -557,8 +566,8 @@ def test_under_predict(mocker): dates_to_predict = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ] } ) @@ -574,8 +583,8 @@ def test_under_predict(mocker): { 0: [0, model_value], "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -623,10 +632,10 @@ def test_predict(funnel_forecast_for_fit_tests, segment_info_fit_tests): "b": ["B1", "B2", "B1", "B2"], "y": [-1, 1, -1, 1], "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -652,8 +661,8 @@ def test_predict(funnel_forecast_for_fit_tests, segment_info_fit_tests): { 0: [0, model_value], "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -664,7 +673,7 @@ def test_predict(funnel_forecast_for_fit_tests, segment_info_fit_tests): expected_raw["submission_date"] >= pd.to_datetime(funnel_forecast_for_fit_tests.start_date).date() ) - 
expected = expected_raw[expected_time_filter] + expected = expected_raw[expected_time_filter].reset_index(drop=True) forecast_df = segment.forecast_df pd.testing.assert_frame_equal(forecast_df, expected) @@ -717,8 +726,8 @@ def test_auto_tuning(forecast, mocker): # set one segment with two sets of grid parameters segment_settings = SegmentModelSettings( segment={"a": "A1"}, - start_date="2124-01-01", - end_date="2124-03-01", + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, holidays=[], regressors=[], grid_parameters={"param1": [1, 2], "param2": [20, 10]}, @@ -738,8 +747,8 @@ def test_auto_tuning(forecast, mocker): "a": ["A1", "A1"], "b": ["B1", "B2"], "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-01").date(), + TEST_DATE, + TEST_DATE, ], } ) @@ -760,10 +769,10 @@ def test_under_fit(funnel_forecast_for_fit_tests, segment_info_fit_tests): "a": ["A1", "A1", "A2", "A2"], "b": ["B1", "B2", "B1", "B2"], "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -807,10 +816,10 @@ def test_fit(funnel_forecast_for_fit_tests, segment_info_fit_tests): "a": ["A1", "A1", "A2", "A2"], "b": ["B1", "B2", "B1", "B2"], "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -872,8 +881,8 @@ def test_set_segment_models(): } parameter_dotmap = DotMap(parameter_dict) - predict_start_date = "2124-01-01" - predict_end_date = "2124-03-01" + predict_start_date = TEST_DATE_STR + predict_end_date = TEST_PREDICT_END_STR forecast = FunnelForecast( model_type="test", @@ -951,8 +960,8 @@ def test_set_segment_models_exception(): } parameter_dotmap = 
DotMap(parameter_dict) - predict_start_date = "2124-01-01" - predict_end_date = "2124-03-01" + predict_start_date = TEST_DATE_STR + predict_end_date = TEST_PREDICT_END_STR forecast = FunnelForecast( model_type="test", @@ -982,6 +991,14 @@ def test_fill_regressor_dates(forecast): """test _fill_regressor_dates the name in the regressor info indicates which case is being tested Dates are chosen arbitrarily""" + # get the set start and end dates for the forecast fixture + # as datetime objects + default_start_datetime = datetime(TEST_DATE.year, TEST_DATE.month, TEST_DATE.day) + default_end_datetime = datetime( + TEST_PREDICT_END.year, TEST_PREDICT_END.month, TEST_PREDICT_END.day + ) + + # set the start date with an arbitrary date regressor_info = { "name": "only_start", "description": "only has a start", @@ -990,8 +1007,11 @@ def test_fill_regressor_dates(forecast): regressor = ProphetRegressor(**regressor_info) forecast._fill_regressor_dates(regressor) assert regressor.start_date == pd.to_datetime("2020-08-15") - assert regressor.end_date == pd.to_datetime("2124-03-01") + # this is the end dat for the forecast fixture + assert regressor.end_date == default_end_datetime + + # set the end date with an arbitrary date regressor_info = { "name": "only_end", "description": "only has a end", @@ -999,9 +1019,11 @@ def test_fill_regressor_dates(forecast): } regressor = ProphetRegressor(**regressor_info) forecast._fill_regressor_dates(regressor) - assert regressor.start_date == pd.to_datetime("2124-01-01") + # the start date for the forecast fixture is TEST_DATE + assert regressor.start_date == default_start_datetime assert regressor.end_date == pd.to_datetime("2125-08-15") + # set both the start and end dates to arbitrary dates regressor_info = { "name": "both", "description": "only has a start", @@ -1013,15 +1035,17 @@ def test_fill_regressor_dates(forecast): assert regressor.start_date == pd.to_datetime("2020-08-15") assert regressor.end_date == pd.to_datetime("2020-09-15") 
+ # use the defaults for both regressor_info = { "name": "neither", "description": "nothin to see here", } regressor = ProphetRegressor(**regressor_info) forecast._fill_regressor_dates(regressor) - assert regressor.start_date == pd.to_datetime("2124-01-01") - assert regressor.end_date == pd.to_datetime("2124-03-01") + assert regressor.start_date == default_start_datetime + assert regressor.end_date == default_end_datetime + # use arbitrary out of order dates to set regressor_info = { "name": "out_of_order", "description": "best better break", @@ -1039,6 +1063,11 @@ def test_fill_regressor_dates(forecast): def test_add_regressors(forecast): """test add regressors test case for each element of regressor_list_raw is indicated in name""" + + # choose arbitrary dates for dates + # name indicates the relationship of the window + # to the timeframe of the data as defined in the ds + # column of df below regressor_list_raw = [ { "name": "all_in", @@ -1120,8 +1149,8 @@ def test_build_train_dataframe_no_regressors(forecast): } segment_settings = SegmentModelSettings( segment={"a": 1, "b": 2}, - start_date="2124-01-01", - end_date="2124-02-01", + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, holidays=[], regressors=[ProphetRegressor(**r) for r in regressor_list], grid_parameters=grid_parameters, @@ -1134,12 +1163,12 @@ def test_build_train_dataframe_no_regressors(forecast): "b": [1, 1, 2, 2, 2, 2], "y": [1, 2, 3, 4, 5, 6], "submission_date": [ - pd.to_datetime("2124-12-01").date(), - pd.to_datetime("2124-12-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2123-01-01").date(), - pd.to_datetime("2123-01-02").date(), + TEST_DATE - relativedelta(months=1), + TEST_DATE_NEXT_DAY - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE + relativedelta(months=1), + TEST_DATE_NEXT_DAY + relativedelta(months=1), ], } ) @@ -1153,8 +1182,8 @@ def test_build_train_dataframe_no_regressors(forecast): 
"b": [2, 2], "y": [3, 4], "ds": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -1172,8 +1201,8 @@ def test_build_train_dataframe_no_regressors(forecast): "b": [2, 2], "y": [3, 4], "ds": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "floor": [1.5, 1.5], "cap": [6.0, 6.0], @@ -1193,20 +1222,24 @@ def test_build_train_dataframe(forecast): { "name": "all_in", "description": "it's all in", - "start_date": "2124-01-01", - "end_date": "2124-01-06", + "start_date": TEST_DATE_STR, + "end_date": (TEST_DATE + relativedelta(days=6)).strftime("%Y-%m-%d"), }, { "name": "all_out", "description": "it's all in", - "start_date": "2124-02-01", - "end_date": "2124-02-06", + "start_date": (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), }, { "name": "just_end", "description": "just the second one", - "start_date": "2124-01-02", - "end_date": "2124-02-06", + "start_date": (TEST_DATE + relativedelta(days=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), }, ] @@ -1226,8 +1259,8 @@ def test_build_train_dataframe(forecast): } segment_settings = SegmentModelSettings( segment={"a": 1, "b": 2}, - start_date="2124-01-01", - end_date="2124-02-01", + start_date=TEST_DATE_STR, + end_date=(TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d"), holidays=[], regressors=[ProphetRegressor(**r) for r in regressor_list], grid_parameters=grid_parameters, @@ -1240,12 +1273,12 @@ def test_build_train_dataframe(forecast): "b": [1, 1, 2, 2, 2, 2], "y": [1, 2, 3, 4, 5, 6], "submission_date": [ - pd.to_datetime("2124-12-01").date(), - pd.to_datetime("2124-12-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2123-01-01").date(), - 
pd.to_datetime("2123-01-02").date(), + TEST_DATE - relativedelta(months=1), + TEST_DATE_NEXT_DAY - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE + relativedelta(months=1), + TEST_DATE_NEXT_DAY + relativedelta(months=1), ], } ) @@ -1258,8 +1291,8 @@ def test_build_train_dataframe(forecast): "b": [2, 2], "y": [3, 4], "ds": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "all_in": [1, 1], "all_out": [0, 0], @@ -1279,8 +1312,8 @@ def test_build_train_dataframe(forecast): "b": [2, 2], "y": [3, 4], "ds": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "all_in": [1, 1], "all_out": [0, 0], @@ -1317,8 +1350,8 @@ def test_build_predict_dataframe_no_regressors(forecast): } segment_settings = SegmentModelSettings( segment={"a": 1, "b": 2}, - start_date="2124-01-01", - end_date="2124-02-01", + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, holidays=[], regressors=[ProphetRegressor(**r) for r in regressor_list], grid_parameters=grid_parameters, @@ -1331,12 +1364,12 @@ def test_build_predict_dataframe_no_regressors(forecast): dates_to_predict = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-12-01").date(), - pd.to_datetime("2124-12-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2123-01-01").date(), - pd.to_datetime("2123-01-02").date(), + TEST_DATE - relativedelta(months=1), + TEST_DATE_NEXT_DAY - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -1347,12 +1380,12 @@ def test_build_predict_dataframe_no_regressors(forecast): expected_predict_df = pd.DataFrame( { "ds": [ - pd.to_datetime("2124-12-01").date(), - pd.to_datetime("2124-12-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2123-01-01").date(), - 
pd.to_datetime("2123-01-02").date(), + TEST_DATE - relativedelta(months=1), + TEST_DATE_NEXT_DAY - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -1369,12 +1402,12 @@ def test_build_predict_dataframe_no_regressors(forecast): expected_predict_wlog_df = pd.DataFrame( { "ds": [ - pd.to_datetime("2124-12-01").date(), - pd.to_datetime("2124-12-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2123-01-01").date(), - pd.to_datetime("2123-01-02").date(), + TEST_DATE - relativedelta(months=1), + TEST_DATE_NEXT_DAY - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "floor": [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0], "cap": [10.0, 10.0, 10.0, 10.0, 10.0, 10.0], @@ -1394,20 +1427,24 @@ def test_build_predict_dataframe(forecast): { "name": "all_in", "description": "it's all in", - "start_date": "2124-01-01", - "end_date": "2124-01-06", + "start_date": TEST_DATE_STR, + "end_date": (TEST_DATE + relativedelta(days=6)).strftime("%Y-%m-%d"), }, { "name": "all_out", "description": "it's all in", - "start_date": "2124-02-01", - "end_date": "2124-02-06", + "start_date": (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), }, { "name": "just_end", "description": "just the second one", - "start_date": "2124-01-02", - "end_date": "2124-02-06", + "start_date": (TEST_DATE + relativedelta(days=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), }, ] @@ -1427,8 +1464,8 @@ def test_build_predict_dataframe(forecast): } segment_settings = SegmentModelSettings( segment={"a": 1, "b": 2}, - start_date="2124-01-01", - end_date="2124-02-01", + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, holidays=[], regressors=[ProphetRegressor(**r) for r in regressor_list], 
grid_parameters=grid_parameters, @@ -1440,10 +1477,7 @@ def test_build_predict_dataframe(forecast): dates_to_predict = pd.DataFrame( { - "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - ], + "submission_date": [TEST_DATE, TEST_DATE_NEXT_DAY], } ) @@ -1453,10 +1487,7 @@ def test_build_predict_dataframe(forecast): ) expected_train_df = pd.DataFrame( { - "ds": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - ], + "ds": [TEST_DATE, TEST_DATE_NEXT_DAY], "all_in": [1, 1], "all_out": [0, 0], "just_end": [0, 1], @@ -1474,10 +1505,7 @@ def test_build_predict_dataframe(forecast): ) expected_train_wlog_df = pd.DataFrame( { - "ds": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - ], + "ds": [TEST_DATE, TEST_DATE_NEXT_DAY], "all_in": [1, 1], "all_out": [0, 0], "just_end": [0, 1], @@ -1500,23 +1528,28 @@ def test_build_model(forecast): { "name": "all_in", "description": "it's all in", - "start_date": "2124-01-01", - "end_date": "2124-01-06", + "start_date": TEST_DATE_STR, + "end_date": (TEST_DATE + relativedelta(days=6)).strftime("%Y-%m-%d"), }, { "name": "all_out", "description": "it's all in", - "start_date": "2124-02-01", - "end_date": "2124-02-06", + "start_date": (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), }, { "name": "just_end", "description": "just the second one", - "start_date": "2124-01-02", - "end_date": "2124-02-06", + "start_date": (TEST_DATE + relativedelta(days=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), }, ] + # use holidays from holiday config file holiday_list = { "easter": { "name": "easter", @@ -1565,8 +1598,8 @@ def test_build_model(forecast): } segment_settings = SegmentModelSettings( segment={"a": 1, "b": 2}, - start_date="2124-01-01", - end_date="2124-02-01", + 
start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, holidays=[ProphetHoliday(**h) for h in holiday_list.values()], regressors=[ProphetRegressor(**r) for r in regressor_list], grid_parameters=grid_parameters, diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py index 59420cc4..1e211375 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py @@ -1,3 +1,6 @@ +from datetime import date +from dateutil.relativedelta import relativedelta + import pandas as pd from dotmap import DotMap import numpy as np @@ -7,10 +10,16 @@ from kpi_forecasting.models.prophet_forecast import ProphetForecast +# Arbitrarily choose some date to use for the tests +TEST_DATE = date(2024, 1, 1) +TEST_DATE_STR = TEST_DATE.strftime("%Y-%m-%d") +TEST_DATE_NEXT_DAY = date(2024, 1, 2) +TEST_DATE_NEXT_DAY_STR = TEST_DATE_NEXT_DAY.strftime("%Y-%m-%d") + @pytest.fixture def forecast(): - A1_start_date = "2124-01-01" + A1_start_date = TEST_DATE_STR parameter_dict = { "model_setting_split_dim": "a", "segment_settings": { @@ -26,8 +35,9 @@ } parameter_dotmap = DotMap(parameter_dict) - predict_start_date = "2124-01-02" - predict_end_date = "2124-03-01" + predict_start_date = TEST_DATE_NEXT_DAY_STR + # arbitarily set it a couple months in the future + predict_end_date = (TEST_DATE + relativedelta(months=2)).strftime("%Y-%m-%d") return ProphetForecast( model_type="test", parameters=parameter_dotmap, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -104,10 +114,10 @@ def test_under_fit(forecast, mocker): observed_data = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -128,10 +138,10 @@ def test_fit(forecast, mocker):
observed_data = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -150,16 +160,14 @@ def test_fit(forecast, mocker): def test_combine_forecast_observed(mocker, forecast): """tests the _combine_forecast_observed method""" - # 2024-01-01 is chosen as an arbitrary date to center the tests around - # forecast predictions are set with the # mock_aggregate_forecast_observed function so they # can be ommited here forecast_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -169,8 +177,8 @@ def test_combine_forecast_observed(mocker, forecast): observed_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "value": [10, 20], } @@ -192,8 +200,8 @@ def test_combine_forecast_observed(mocker, forecast): observed_expected_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "value": [10, 20], "measure": ["observed", "observed"], @@ -205,14 +213,14 @@ def test_combine_forecast_observed(mocker, forecast): forecast_expected_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "measure": ["mean", "mean", "p10", 
"p10", "p50", "p50", "p90", "p90"], "value": [0] * 8, @@ -239,16 +247,14 @@ def test_combine_forecast_observed(mocker, forecast): def test_under_summarize(mocker, forecast): """testing _summarize""" - # 2024-01-01 is chosen as an arbitrary date to center the tests around - # forecast predictions are set with the # mock_aggregate_forecast_observed function so they # can be ommited here forecast_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -258,8 +264,8 @@ def test_under_summarize(mocker, forecast): observed_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "value": [10, 20], } @@ -281,8 +287,8 @@ def test_under_summarize(mocker, forecast): observed_expected_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "value": [10, 20], "measure": ["observed", "observed"], @@ -294,14 +300,14 @@ def test_under_summarize(mocker, forecast): forecast_expected_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "measure": ["mean", "mean", "p10", "p10", "p50", "p50", "p90", "p90"], "value": [0] * 8, @@ -337,9 +343,7 @@ def test_summarize(mocker, forecast): ["alias", "app_name", "slug", "min_date", "max_date"], ) - dummy_metric_hub = MetricHub("", "", "", "2124-01-01", "2124-01-01") - - # 2124-01-01 is chosen as an 
arbitrary date to center the tests around + dummy_metric_hub = MetricHub("", "", "", TEST_DATE_STR, TEST_DATE_STR) # forecast predictions are set with the # mock_aggregate_forecast_observed function so they @@ -347,8 +351,8 @@ def test_summarize(mocker, forecast): forecast_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -358,8 +362,8 @@ def test_summarize(mocker, forecast): observed_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "value": [10, 20], } @@ -396,8 +400,8 @@ def test_summarize(mocker, forecast): observed_expected_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "value": [10, 20], "measure": ["observed", "observed"], @@ -409,14 +413,14 @@ def test_summarize(mocker, forecast): forecast_expected_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "measure": ["mean", "mean", "p10", "p10", "p50", "p50", "p90", "p90"], "value": [0] * 8, @@ -476,8 +480,8 @@ def test_under_predict(mocker, forecast): { "y": [0, 1], "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -485,8 +489,8 @@ def test_under_predict(mocker, forecast): dates_to_predict = pd.DataFrame( { "submission_date": [ - 
pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ] } ) @@ -502,8 +506,8 @@ def test_under_predict(mocker, forecast): { 0: [0, 2], "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -525,8 +529,8 @@ def test_under_predict(mocker, forecast): { 0: [0, 2], "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -536,11 +540,13 @@ def test_under_predict(mocker, forecast): def test_summarize_non_overlapping_day(): - observed_start_date = "2124-01-01" - observed_end_date = "2124-02-01" + observed_start_date = TEST_DATE_STR + observed_end_date = (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d") - predict_start_date = "2124-02-02" - predict_end_date = "2124-03-01" + predict_start_date = (TEST_DATE + relativedelta(months=1, days=1)).strftime( + "%Y-%m-%d" + ) + predict_end_date = (TEST_DATE + relativedelta(months=2)).strftime("%Y-%m-%d") forecast = ProphetForecast( model_type="test", @@ -562,10 +568,15 @@ def test_summarize_non_overlapping_day(): } ) + # there are the samples generated + # the mean and median are the aggregates used test_samples = np.array([1, 1, 2, 3, 5, 8, 13]) test_mean = np.mean(test_samples) test_median = np.median(test_samples) + # mean and median scale with a factor + # so a factor is multiplied on to make sure the aggregation is working + # across rows properly forecast_array = np.stack( [test_samples * i for i in range(1, 1 + len(predict_submission_dates))], axis=0, @@ -639,12 +650,22 @@ def test_summarize_non_overlapping_day(): def test_summarize_non_overlapping_month(): + # choose arbitrary year for the start and end dates + # two full months (Jan and Feb ) + # are in the observed data, the number of days (31 and 28 days respectively) + # in each month is used in the checks observed_start_date = 
"2124-01-01" observed_end_date = "2124-02-28" + # two full months (April and May ) + # are in the observed data, the number of days (28 and 31 days respectively) + # in each month is used in the checks predict_start_date = "2124-04-01" predict_end_date = "2124-05-31" + print(observed_start_date, observed_end_date) + print(predict_start_date, predict_end_date) + forecast = ProphetForecast( model_type="test", parameters=DotMap(), @@ -758,11 +779,11 @@ def test_summarize_non_overlapping_month(): def test_summarize_overlapping_day(): - observed_start_date = "2124-01-01" - observed_end_date = "2124-02-01" + observed_start_date = TEST_DATE_STR + observed_end_date = (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d") - predict_start_date = "2124-01-01" - predict_end_date = "2124-02-01" + predict_start_date = TEST_DATE_STR + predict_end_date = (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d") forecast = ProphetForecast( model_type="test", @@ -784,10 +805,15 @@ def test_summarize_overlapping_day(): } ) + # there are the samples generated + # the mean and median are the aggregates used test_samples = np.array([1, 1, 2, 3, 5, 8, 13]) test_mean = np.mean(test_samples) test_median = np.median(test_samples) + # mean and median scale with a factor + # so a factor is multiplied on to make sure the aggregation is working + # across rows properly forecast_array = np.stack( [test_samples * i for i in range(1, 1 + len(predict_submission_dates))], axis=0, @@ -863,6 +889,10 @@ def test_summarize_overlapping_day(): def test_summarize_overlapping_month(): + # choose arbitrary year for the start and end dates + # two full months (Jan and Feb ) + # are in the observed data, the number of days (31 and 28 days respectively) + # in each month is used in the checks observed_start_date = "2124-01-01" observed_end_date = "2124-02-28" @@ -889,10 +919,15 @@ def test_summarize_overlapping_month(): } ) + # there are the samples generated + # the mean and median are the aggregates used 
test_samples = np.array([1, 1, 2, 3, 5, 8, 13]) test_mean = np.mean(test_samples) test_median = np.median(test_samples) + # mean and median scale with a factor + # so a factor is multiplied on to make sure the aggregation is working + # across rows properly forecast_array = np.stack( [test_samples] * len(predict_submission_dates), axis=0, From 963a116f5a89b117fce06163b1df4515ca35aa7b Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Fri, 9 Aug 2024 09:28:19 -0500 Subject: [PATCH 25/33] removed DotMap --- jobs/kpi-forecasting/kpi_forecasting.py | 14 +++++++------- .../configs/model_inputs/__init__.py | 6 +++--- .../kpi-forecasting/kpi_forecasting/inputs.py | 19 +++++-------------- .../kpi_forecasting/metric_hub.py | 3 +-- .../kpi_forecasting/models/base_forecast.py | 2 +- .../kpi_forecasting/models/funnel_forecast.py | 12 +++--------- .../kpi_forecasting/results_processing.py | 11 ++++++----- .../tests/test_base_forecast.py | 11 +++++------ .../tests/test_funnel_forecast.py | 16 ++++++---------- .../tests/test_performance_analysis.py | 3 +++ .../tests/test_prophet_forecast.py | 12 +++++------- jobs/kpi-forecasting/requirements.txt | 1 - 12 files changed, 45 insertions(+), 65 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting.py b/jobs/kpi-forecasting/kpi_forecasting.py index e7dcca7c..d8c3f04c 100644 --- a/jobs/kpi-forecasting/kpi_forecasting.py +++ b/jobs/kpi-forecasting/kpi_forecasting.py @@ -1,4 +1,4 @@ -from kpi_forecasting.inputs import CLI, YAML +from kpi_forecasting.inputs import CLI, load_yaml from kpi_forecasting.models.prophet_forecast import ProphetForecast from kpi_forecasting.models.funnel_forecast import FunnelForecast from kpi_forecasting.metric_hub import MetricHub @@ -13,17 +13,17 @@ def main() -> None: # Load the config - config = YAML(filepath=CLI().args.config).data - model_type = config.forecast_model.model_type + config = load_yaml(filepath=CLI().args.config) + model_type = config["forecast_model"]["model_type"] if model_type in 
MODELS: - metric_hub = MetricHub(**config.metric_hub) - model = MODELS[model_type](metric_hub=metric_hub, **config.forecast_model) + metric_hub = MetricHub(**config["metric_hub"]) + model = MODELS[model_type](metric_hub=metric_hub, **config["forecast_model"]) model.fit() model.predict() - model.summarize(**config.summarize) - model.write_results(**config.write_results) + model.summarize(**config["summarize"]) + model.write_results(**config["write_results"]) else: raise ValueError(f"Don't know how to forecast using {model_type}.") diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/model_inputs/__init__.py b/jobs/kpi-forecasting/kpi_forecasting/configs/model_inputs/__init__.py index 1ebd482e..caacc611 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/model_inputs/__init__.py +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/model_inputs/__init__.py @@ -3,15 +3,15 @@ from pathlib import Path -from kpi_forecasting.inputs import YAML +from kpi_forecasting.inputs import load_yaml PARENT_PATH = Path(__file__).parent HOLIDAY_PATH = PARENT_PATH / "holidays.yaml" REGRESSOR_PATH = PARENT_PATH / "regressors.yaml" -holiday_collection = YAML(HOLIDAY_PATH) -regressor_collection = YAML(REGRESSOR_PATH) +holiday_collection = load_yaml(HOLIDAY_PATH) +regressor_collection = load_yaml(REGRESSOR_PATH) @attr.s(auto_attribs=True, frozen=False) diff --git a/jobs/kpi-forecasting/kpi_forecasting/inputs.py b/jobs/kpi-forecasting/kpi_forecasting/inputs.py index 034af27a..14da5545 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/inputs.py +++ b/jobs/kpi-forecasting/kpi_forecasting/inputs.py @@ -2,7 +2,6 @@ import yaml from dataclasses import dataclass -from dotmap import DotMap @dataclass @@ -20,18 +19,10 @@ def __post_init__(self) -> None: self.args = self.parser.parse_args() -@dataclass -class YAML: +def load_yaml(filepath: str) -> dict: """ - Create a data structure from a YAML config filepath. 
Instead of loading the - YAML as a dictionary, which requires verbose code to access nested dictionary - values, this class loads YAML as a dot map. Nested values can be accessed using - dot notation, like `YAML().data.section.subsection.value`. + Create a data structure from a YAML config filepath. """ - - filepath: str - - def __post_init__(self) -> None: - with open(self.filepath, "r") as f: - data = yaml.safe_load(f) - self.data = DotMap(data) + with open(filepath, "r") as f: + data = yaml.safe_load(f) + return data diff --git a/jobs/kpi-forecasting/kpi_forecasting/metric_hub.py b/jobs/kpi-forecasting/kpi_forecasting/metric_hub.py index 64cf9d42..e0a86c83 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/metric_hub.py +++ b/jobs/kpi-forecasting/kpi_forecasting/metric_hub.py @@ -1,7 +1,6 @@ import pandas as pd from dataclasses import dataclass -from dotmap import DotMap from google.cloud import bigquery from mozanalysis.config import ConfigLoader from textwrap import dedent @@ -36,7 +35,7 @@ class MetricHub: app_name: str slug: str start_date: str - segments: DotMap = None + segments: dict = None where: str = None end_date: str = None alias: str = None diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index dcf64b91..08a0f750 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -71,7 +71,7 @@ def __post_init__(self) -> None: self.metadata_params = json.dumps( { "model_type": self.model_type.lower(), - "model_params": self.parameters.toDict(), + "model_params": self.parameters, "use_holidays": self.use_holidays, } ) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py index 9652ce02..52aa9cc8 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py +++ 
b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py @@ -117,22 +117,16 @@ def _set_segment_models( ## file. Parse the holidays and regressors specified in the config file. segment_models = [] for segment in segment_combinations: - model_params = getattr( - self.parameters["segment_settings"], segment[split_dim] - ) + model_params = self.parameters["segment_settings"][segment[split_dim]] holiday_list = [] regressor_list = [] if model_params["holidays"]: - holiday_list = [ - getattr(holiday_collection.data, h) - for h in model_params["holidays"] - ] + holiday_list = [holiday_collection[h] for h in model_params["holidays"]] if model_params["regressors"]: regressor_list = [ - getattr(regressor_collection.data, r) - for r in model_params["regressors"] + regressor_collection[r] for r in model_params["regressors"] ] # Create a SegmentModelSettings object for each segment combination diff --git a/jobs/kpi-forecasting/kpi_forecasting/results_processing.py b/jobs/kpi-forecasting/kpi_forecasting/results_processing.py index f7e8ab88..1cb8a9d1 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/results_processing.py +++ b/jobs/kpi-forecasting/kpi_forecasting/results_processing.py @@ -4,7 +4,7 @@ from google.cloud import bigquery from google.cloud.bigquery.enums import SqlTypeNames as bq_types -from kpi_forecasting.inputs import YAML +from kpi_forecasting.inputs import load_yaml import pandas as pd import numpy as np @@ -74,12 +74,13 @@ def _set_intra_forecast_agg_functions(self): def _load_config_data(self): """Extracts data from the list of config files passed to the class and stores it in the - config_data attribute. The filename is the key, and the contents (represnted as a DotMap) + config_data attribute. 
The filename is the key, and the contents are the values""" self.config_data = {} for config_file in self.input_config_list: full_path = f"{self.input_config_path}/{config_file}" - config_data = YAML(full_path).data + config_data = load_yaml(full_path) + print(config_data) self.config_data[config_file] = config_data def _extract_config_data(self): @@ -99,7 +100,7 @@ def _extract_config_data(self): config_file_list = list(self.config_data.keys()) for config_data in self.config_data.values(): # get segment data - metric_hub_data = config_data.metric_hub.toDict() + metric_hub_data = config_data["metric_hub"] if "segments" in metric_hub_data: segment_data = metric_hub_data["segments"] segment_data_list.append(segment_data) @@ -107,7 +108,7 @@ def _extract_config_data(self): segment_data_list.append(None) # get input table info - input_table_list.append(config_data.write_results.toDict()) + input_table_list.append(config_data["write_results"]) input_table_data = input_table_list.pop(0) input_table_matches_first = [input_table_data == el for el in input_table_list] diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py index 19a2db9d..de8aa885 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py @@ -5,7 +5,6 @@ import pytest import pandas as pd -from dotmap import DotMap import numpy as np from datetime import timedelta, timezone @@ -92,7 +91,7 @@ def test_post_init(good_class): end_date = TEST_PREDICT_END_STR good_class = good_class( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=start_date, end_date=end_date, @@ -112,7 +111,7 @@ def test_post_init_default_dates(good_class): # check default start and end time good_class = good_class( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date="", end_date="", @@ 
-133,7 +132,7 @@ def test_post_init_default_dates(good_class): def test_fit(good_class): good_class = good_class( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=TEST_DATE_STR, end_date=TEST_PREDICT_END_STR, @@ -149,7 +148,7 @@ def test_fit(good_class): def test_predict_and_validate(good_class): good_class = good_class( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=TEST_DATE_STR, end_date=TEST_PREDICT_END_STR, @@ -164,7 +163,7 @@ def test_predict_and_validate(good_class): def test_summarize(good_class): good_class = good_class( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=TEST_DATE_STR, end_date=TEST_PREDICT_END_STR, diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py index a8f865b5..bb7355fd 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py @@ -5,7 +5,6 @@ from dateutil.relativedelta import relativedelta import pandas as pd -from dotmap import DotMap import pytest import numpy as np @@ -31,7 +30,7 @@ def forecast(): forecast = FunnelForecast( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -92,12 +91,12 @@ def funnel_forecast_for_fit_tests(segment_info_fit_tests, mocker): }, } - parameter_dotmap = DotMap(parameter_dict) predict_start_date = TEST_DATE_STR predict_end_date = TEST_DATE_NEXT_DAY_STR + forecast = FunnelForecast( model_type="test", - parameters=parameter_dotmap, + parameters=parameter_dict, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -516,13 +515,12 @@ def test_under_predict(mocker): }, } - parameter_dotmap = DotMap(parameter_dict) predict_start_date = TEST_DATE_NEXT_DAY_STR predict_end_date = 
TEST_PREDICT_END_STR forecast = FunnelForecast( model_type="test", - parameters=parameter_dotmap, + parameters=parameter_dict, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -880,13 +878,12 @@ def test_set_segment_models(): }, } - parameter_dotmap = DotMap(parameter_dict) predict_start_date = TEST_DATE_STR predict_end_date = TEST_PREDICT_END_STR forecast = FunnelForecast( model_type="test", - parameters=parameter_dotmap, + parameters=parameter_dict, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -959,13 +956,12 @@ def test_set_segment_models_exception(): }, } - parameter_dotmap = DotMap(parameter_dict) predict_start_date = TEST_DATE_STR predict_end_date = TEST_PREDICT_END_STR forecast = FunnelForecast( model_type="test", - parameters=parameter_dotmap, + parameters=parameter_dict, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_performance_analysis.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_performance_analysis.py index edbc2cbb..3e4f0120 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_performance_analysis.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_performance_analysis.py @@ -58,6 +58,7 @@ def directory_of_configs(tmp_path_factory): "dataset": "y", "table": "z", }, + "metric_hub": {}, } f4 = tmpdir / "config_nosegments1_1.yaml" f5 = tmpdir / "config_nosegments1_2.yaml" @@ -73,6 +74,7 @@ def directory_of_configs(tmp_path_factory): "dataset": "q", "table": "z", }, + "metric_hub": {}, } f6 = tmpdir / "config_nosegments2_1.yaml" @@ -91,6 +93,7 @@ def get_forecast_performance_config(tmp_path_factory): "dataset": "", "table": "", }, + "metric_hub": {}, } f1 = tmpdir / "config.yaml" with open(f1, "w") as outfile: diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py index 
1e211375..150ec4da 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py @@ -2,7 +2,6 @@ from dateutil.relativedelta import relativedelta import pandas as pd -from dotmap import DotMap import numpy as np import pytest import collections @@ -34,13 +33,12 @@ def forecast(): }, } - parameter_dotmap = DotMap(parameter_dict) predict_start_date = TEST_DATE_NEXT_DAY_STR # arbitarily set it a couple months in the future predict_end_date = (TEST_DATE + relativedelta(months=2)).strftime("%Y-%m-%d") return ProphetForecast( model_type="test", - parameters=parameter_dotmap, + parameters=parameter_dict, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -550,7 +548,7 @@ def test_summarize_non_overlapping_day(): forecast = ProphetForecast( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -668,7 +666,7 @@ def test_summarize_non_overlapping_month(): forecast = ProphetForecast( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -787,7 +785,7 @@ def test_summarize_overlapping_day(): forecast = ProphetForecast( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -901,7 +899,7 @@ def test_summarize_overlapping_month(): forecast = ProphetForecast( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, diff --git a/jobs/kpi-forecasting/requirements.txt b/jobs/kpi-forecasting/requirements.txt index 218d688a..cae076e6 100644 --- a/jobs/kpi-forecasting/requirements.txt +++ b/jobs/kpi-forecasting/requirements.txt @@ -10,7 +10,6 @@ contourpy==1.1.0 convertdate==2.4.0 cycler==0.11.0 db-dtypes==1.1.1 -dotmap==1.3.30 
ephem==4.1.4 exceptiongroup==1.1.1 fonttools==4.40.0 From 0f2f5096cf3e10556d0d0188fcea385fcaf8bb34 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Fri, 9 Aug 2024 13:10:42 -0500 Subject: [PATCH 26/33] modified README to make it match better between FunnelForecast and ProphetForecast --- jobs/kpi-forecasting/README.md | 83 +++++++++- .../kpi_forecasting/configs/dau_desktop.yaml | 2 + .../kpi_forecasting/configs/dau_mobile.yaml | 2 + .../configs/search_forecasting_ad_clicks.yaml | 72 +++++---- ...search_forecasting_daily_active_users.yaml | 66 ++++---- .../search_forecasting_search_count.yaml | 66 ++++---- .../kpi_forecasting/models/base_forecast.py | 20 ++- .../kpi_forecasting/models/funnel_forecast.py | 70 ++++---- .../models/prophet_forecast.py | 10 ++ .../tests/test_funnel_forecast.py | 151 +++++++++--------- 10 files changed, 326 insertions(+), 216 deletions(-) diff --git a/jobs/kpi-forecasting/README.md b/jobs/kpi-forecasting/README.md index 31231cf8..ff1a6ed8 100644 --- a/jobs/kpi-forecasting/README.md +++ b/jobs/kpi-forecasting/README.md @@ -85,8 +85,87 @@ The tests can be run locally with `python -m pytest` in the root directory of th # YAML Configs -Each of the sections in the YAML files contains a list of arguments that are passed to their relevant objects or methods. -Definitions should be documented in the code. +Configuration for each forecast is found in the `configs` folder. 
Below is an example config file with sample values and a description of what the field means as a comment whe it is not self-evident + +``` +metric_hub: # this configures the observed data fed to the model which is obtained via metrichub + app_name: "multi_product" # metric-hub app name + slug: "search_forecasting_ad_clicks" # metric-hub slug + alias: "search_forecasting_ad_clicks" # metric-hub alias + start_date: "2018-01-01" # date at which the observed data should start + end_date: "last complete month" + # date at which the observed data will end, can be a date or "last complete month" + # which uses `utils.parse_end_date` to determine the last complete month + segments: + # this section is optional and currently only used in funnel forecast, + # specifies which segments are used to partition the data, + # enabling separate models to be fit for each partition. + # Values underneath are a map of column names to be output by the + # metric-hub call and the SQL queries to populate those columns + device: "device" + channel: "'all'" + country: "CASE WHEN country = 'US' THEN 'US' ELSE 'ROW' END" + partner: "partner" + where: "partner = 'Google'" # filter to apply to the metric hub pull + +forecast_model: # this section configures the model + model_type: "funnel" + # type of model object to use, current options are "funnel" for FunnelForecast and "prophet" for ProphetForecast + start_date: NULL + # starting date for the predicted data (unless predict_historical_dates is set), + # if unset, value depends on predict_historical_dates. 
+
+ end_date: NULL
+ # final date for the predicted data
+ use_holidays: False
+ For prophet-based models, when true, call `model.add_country_holidays(country_name="US")` on model
+ predict_historical_dates: True
+ # if predict_historical_dates is True, set to first date of the observed data
+ # if predict_historical_dates is False, defaults to the day after the last day in the observed data
+ number_of_simulations: 1000
+ # for prophet-based models, number of simulations to run
+ parameters:
+ # this section can be a map or a list.
+ # If it's a map, these parameters are used for all models
+ # (recall multiple models are trained if there is a metric_hub.segments)
+ # If it's a list, it will set different parameters
+ # for different subsets of the partition specified in `metric_hub.segments`.
+ - segment:
+ # specifies which subset of the partitions this applies to
+ # key is a column specified in metric_hub.segments
+ # value is a value that column can take to which the configuration is applied
+ device: desktop
+ start_date: "2018-01-01" # only applies to FunnelForecast, allows one to set start date for each sub-model
+ end_date: NULL # only applies to FunnelForecast, allows one to set end date for each sub-model
+ holidays: ["easter", "covid_sip11"] # holidays specified in `configs.model_inputs.holidays` to use.
+ regressors: ["post_esr_migration", "in_covid", "ad_click_bug"] # regressors specified in `configs.model_inputs.regressors`
+ grid_parameters:
+ # sets grid for hyperparameter tuning
+ changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5]
+ changepoint_range: [0.8, 0.9]
+ n_changepoints: [25, 50]
+ weekly_seasonality: True
+ yearly_seasonality: True
+ cv_settings:
+ # sets parameters for prophet cross-validation used in FunnelForecast
+ initial: "1296 days"
+ period: "30 days"
+ horizon: "30 days"
+ parallel: "processes"
+ ...
+ +summarize: + # parameters used to summarize and aggregate the predictions + periods: ["day", "month"] # periods to aggregate up to + numpy_aggregations: ["mean"] # numpy aggregation functions to use when aggregating predictions + percentiles: [10, 50, 90] # precentiles to calculate on aggregation + +write_results: + # set the project, dataset and table for output data + project: "moz-fx-data-shared-prod" + dataset: "search_derived" + table: "search_funnel_forecasts_v1" + components_table: "search_forecast_model_components_v1" +``` # Development diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml index 5ba432ea..83e80ab9 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml @@ -11,6 +11,8 @@ forecast_model: start_date: NULL end_date: NULL use_holidays: False + predict_historical_dates: False + number_of_simulations: 1000 parameters: seasonality_prior_scale: 0.00825 changepoint_prior_scale: 0.15983 diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml index 74889971..a3a9f3eb 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml @@ -11,6 +11,8 @@ forecast_model: start_date: NULL end_date: NULL use_holidays: True + predict_historical_dates: False + number_of_simulations: 1000 parameters: seasonality_prior_scale: 0.01 changepoint_prior_scale: 0.01 diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml index a756b518..ea8a2a64 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml @@ -17,42 
+17,44 @@ forecast_model: start_date: NULL end_date: NULL use_holidays: False + predict_historical_dates: True + number_of_simulations: 1000 parameters: - model_setting_split_dim: "device" - segment_settings: - desktop: - start_date: "2018-01-01" - end_date: NULL - holidays: ["easter", "covid_sip11"] - regressors: ["post_esr_migration", "in_covid", "ad_click_bug"] - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] - changepoint_range: [0.8, 0.9] - n_changepoints: [25, 50] - weekly_seasonality: True - yearly_seasonality: True - cv_settings: - initial: "1296 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" - mobile: - start_date: "2022-01-01" - end_date: NULL - holidays: ["easter"] - regressors: ["after_fenix", "in_covid"] - grid_parameters: - changepoint_prior_scale: [.01, .1, .15, .2] - changepoint_range: [0.8, 0.9, 1] - n_changepoints: [30] - weekly_seasonality: True - yearly_seasonality: True - growth: "logistic" - cv_settings: - initial: "366 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" + - segment: + device: desktop + start_date: "2018-01-01" + end_date: NULL + holidays: ["easter", "covid_sip11"] + regressors: ["post_esr_migration", "in_covid", "ad_click_bug"] + grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] + changepoint_range: [0.8, 0.9] + n_changepoints: [25, 50] + weekly_seasonality: True + yearly_seasonality: True + cv_settings: + initial: "1296 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" + - segment: + device: mobile + start_date: "2022-01-01" + end_date: NULL + holidays: ["easter"] + regressors: ["after_fenix", "in_covid"] + grid_parameters: + changepoint_prior_scale: [.01, .1, .15, .2] + changepoint_range: [0.8, 0.9, 1] + n_changepoints: [30] + weekly_seasonality: True + yearly_seasonality: True + growth: "logistic" + cv_settings: + initial: "366 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" summarize: 
periods: ["day", "month"] diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml index b6643c4a..3ce3568e 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml @@ -17,39 +17,41 @@ forecast_model: start_date: NULL end_date: NULL use_holidays: False + predict_historical_dates: True + number_of_simulations: 1000 parameters: - model_setting_split_dim: "device" - segment_settings: - desktop: - start_date: "2018-01-01" - end_date: NULL - holidays: ["easter", "covid_sip11"] - regressors: ["post_esr_migration", "in_covid"] - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] - changepoint_range: [0.8, 0.9] - weekly_seasonality: True - yearly_seasonality: True - cv_settings: - initial: "1296 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" - mobile: - start_date: "2021-01-01" - end_date: NULL - holidays: ["easter"] - regressors: ["after_fenix", "in_covid"] - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1] - weekly_seasonality: True - yearly_seasonality: True - growth: "logistic" - cv_settings: - initial: "366 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" + - segment: + device: desktop + start_date: "2018-01-01" + end_date: NULL + holidays: ["easter", "covid_sip11"] + regressors: ["post_esr_migration", "in_covid"] + grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] + changepoint_range: [0.8, 0.9] + weekly_seasonality: True + yearly_seasonality: True + cv_settings: + initial: "1296 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" + - segment: + device: mobile + start_date: "2021-01-01" + end_date: NULL + holidays: ["easter"] + regressors: ["after_fenix", "in_covid"] + 
grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1] + weekly_seasonality: True + yearly_seasonality: True + growth: "logistic" + cv_settings: + initial: "366 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" summarize: periods: ["day", "month"] diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml index 8dd8f811..75f73ba2 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml @@ -17,39 +17,41 @@ forecast_model: start_date: NULL end_date: NULL use_holidays: False + predict_historical_dates: True + number_of_simulations: 1000 parameters: - model_setting_split_dim: "device" - segment_settings: - desktop: - start_date: "2018-01-01" - end_date: NULL - holidays: ["easter", "covid_sip11"] - regressors: ["post_esr_migration", "in_covid"] - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] - changepoint_range: [0.8, 0.9] - weekly_seasonality: True - yearly_seasonality: True - cv_settings: - initial: "1296 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" - mobile: - start_date: "2020-01-01" - end_date: NULL - holidays: ["easter"] - regressors: ["after_fenix", "in_covid"] - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1] - weekly_seasonality: True - yearly_seasonality: True - growth: "logistic" - cv_settings: - initial: "366 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" + - segment: + device: desktop + start_date: "2018-01-01" + end_date: NULL + holidays: ["easter", "covid_sip11"] + regressors: ["post_esr_migration", "in_covid"] + grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] + changepoint_range: [0.8, 0.9] + weekly_seasonality: True + yearly_seasonality: True + cv_settings: + initial: 
"1296 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" + - segment: + device: mobile + start_date: "2020-01-01" + end_date: NULL + holidays: ["easter"] + regressors: ["after_fenix", "in_covid"] + grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1] + weekly_seasonality: True + yearly_seasonality: True + growth: "logistic" + cv_settings: + initial: "366 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" summarize: periods: ["day", "month"] diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index 08a0f750..6d7bbae9 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -29,9 +29,6 @@ class BaseForecast(abc.ABC): date the metric should be queried. metric_hub (MetricHub): A MetricHub object that provides details about the metric to be forecasted. - number_of_simulations (int): The number of simulated timeseries that the forecast - should generate. Since many forecast models are probablistic, this enables the - measurement of variation across a range of possible outcomes. 
""" model_type: str @@ -40,7 +37,7 @@ class BaseForecast(abc.ABC): start_date: str end_date: str metric_hub: MetricHub - number_of_simulations: int = 1000 + predict_historical_dates: bool = False def _get_observed_data(self): if self.metric_hub: @@ -58,9 +55,18 @@ def __post_init__(self) -> None: # use default start/end dates if the user doesn't specify them self.start_date = pd.to_datetime(self.start_date or self._default_start_date) self.end_date = pd.to_datetime(self.end_date or self._default_end_date) - self.dates_to_predict = pd.DataFrame( - {"submission_date": pd.date_range(self.start_date, self.end_date).date} - ) + if self.predict_historical_dates: + self.dates_to_predict = pd.DataFrame( + { + "submission_date": pd.date_range( + self.metric_hub.start_date, self.end_date + ).date + } + ) + else: + self.dates_to_predict = pd.DataFrame( + {"submission_date": pd.date_range(self.start_date, self.end_date).date} + ) # initialize unset attributes self.model = None diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py index 52aa9cc8..aa4a4fb8 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py @@ -67,15 +67,6 @@ def __post_init__(self) -> None: # this is used to avoid the code below for testing purposes return - # Overwrite dates_to_predict to provide historical date forecasts - self.dates_to_predict = pd.DataFrame( - { - "submission_date": pd.date_range( - self.metric_hub.start_date, self.end_date - ).date - } - ) - self._set_segment_models(self.observed_df, self.metric_hub.segments.keys()) # initialize unset attributes @@ -85,10 +76,10 @@ def _set_segment_models( self, observed_df: pd.DataFrame, segment_column_list: list ) -> None: """Creates a SegmentSettings object for each segment specified in the - metric_hub.segments section of the config. 
These objects are stored in a list - in the segment_models attribute - Parameters can be specified independently for at most one dimension column - set using model_setting_split_dim in self.parameters + metric_hub.segments section of the config. It is populated from the list of + parameters in the forecast_model.parameters section of the configuration file. + The segements section of each element of the list specifies which values within which + segments the parameters are associated with. Args: observed_df (pd.DataFrame): dataframe containing observed data used to model @@ -100,45 +91,64 @@ def _set_segment_models( combination_df = observed_df[segment_column_list].drop_duplicates() # Construct dictionaries from those combinations + # this will be used to check that the config actually partitions the data segment_combinations = combination_df.to_dict("records") - # initialize a list to hold models for each segment - ## populate the list with segments and parameters for the segment - split_dim = self.parameters["model_setting_split_dim"] - - # check to make sure split_dim is one of the columns set in segment_column_list - if split_dim not in segment_column_list: - columns_str = ",".join(segment_column_list) + # get subset of segment that is used in partitioning + split_dims = None + for partition in self.parameters: + partition_dim = set(partition["segment"].keys()) + if split_dims and partition_dim != split_dims: + raise ValueError( + "Segment keys are not the same across different elements of parameters in the config file" + ) + elif split_dims is None: + split_dims = partition_dim + else: + # this is case where split_dim is set and matches paritition_dim + continue + if not split_dims <= set(combination_df.keys()): + missing_dims = split_dims - set(combination_df.keys()) + missing_dims_str = ",".join(missing_dims) raise ValueError( - f"model_setting_split_dim set to {split_dim} which is not among segment columns: {columns_str}" + f"Segment keys missing from 
metric hub segments: {missing_dims_str}" ) # For each segment combinination, get the model parameters from the config ## file. Parse the holidays and regressors specified in the config file. segment_models = [] for segment in segment_combinations: - model_params = self.parameters["segment_settings"][segment[split_dim]] - + # find the correct configuration + for partition in self.parameters: + partition_segment = partition["segment"] + # get subset of segment that is used to partition + subset_segment = { + key: val for key, val in segment.items() if key in split_dims + } + if partition_segment == subset_segment: + # parition is set to the desired value + # break out of loop + break holiday_list = [] regressor_list = [] - if model_params["holidays"]: - holiday_list = [holiday_collection[h] for h in model_params["holidays"]] - if model_params["regressors"]: + if "holidays" in partition: + holiday_list = [holiday_collection[h] for h in partition["holidays"]] + if "regressors" in partition: regressor_list = [ - regressor_collection[r] for r in model_params["regressors"] + regressor_collection[r] for r in partition["regressors"] ] # Create a SegmentModelSettings object for each segment combination segment_models.append( SegmentModelSettings( segment=segment, - start_date=model_params["start_date"], + start_date=partition["start_date"], end_date=self.end_date, holidays=[ProphetHoliday(**h) for h in holiday_list], regressors=[ProphetRegressor(**r) for r in regressor_list], - grid_parameters=dict(model_params["grid_parameters"]), - cv_settings=dict(model_params["cv_settings"]), + grid_parameters=dict(partition["grid_parameters"]), + cv_settings=dict(partition["cv_settings"]), ) ) self.segment_models = segment_models diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py index 19f57e1d..30d152b3 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py +++ 
b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py
@@ -16,6 +16,16 @@
 @dataclass
 class ProphetForecast(BaseForecast):
+ """Forecast object specifically for prophet forecast models
+
+ Additional attributes:
+ number_of_simulations (int): The number of simulated timeseries that the forecast
+ should generate. Since many forecast models are probabilistic, this enables the
+ measurement of variation across a range of possible outcomes.
+ """
+
+ number_of_simulations: int = 1000
+
 @property
 def column_names_map(self) -> Dict[str, str]:
 return {"submission_date": "ds", "value": "y"}
diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py
index bb7355fd..99ccab84 100644
--- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py
+++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py
@@ -69,34 +69,33 @@ def funnel_forecast_for_fit_tests(segment_info_fit_tests, mocker):
 """This method creates a forecast object from the segment
 dict created in the segment_info_fit_tests fixture.
It also mocks some of the object methods to enable easier testing""" - parameter_dict = { - "model_setting_split_dim": "a", - "segment_settings": { - "A1": { - "start_date": segment_info_fit_tests["A1"]["start_date"], - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": segment_info_fit_tests["A1"]["grid_parameters"], - "cv_settings": {}, - }, - "A2": { - "start_date": segment_info_fit_tests["A2"]["start_date"], - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": segment_info_fit_tests["A2"]["grid_parameters"], - "cv_settings": {}, - }, + parameter_list = [ + { + "segment": {"a": "A1"}, + "start_date": segment_info_fit_tests["A1"]["start_date"], + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": segment_info_fit_tests["A1"]["grid_parameters"], + "cv_settings": {}, }, - } + { + "segment": {"a": "A2"}, + "start_date": segment_info_fit_tests["A2"]["start_date"], + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": segment_info_fit_tests["A2"]["grid_parameters"], + "cv_settings": {}, + }, + ] predict_start_date = TEST_DATE_STR predict_end_date = TEST_DATE_NEXT_DAY_STR forecast = FunnelForecast( model_type="test", - parameters=parameter_dict, + parameters=parameter_list, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -501,26 +500,24 @@ def test_under_predict(mocker): # set segment models A1_start_date = TEST_DATE_STR - parameter_dict = { - "model_setting_split_dim": "a", - "segment_settings": { - "A1": { - "start_date": A1_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {"param1": [1, 2], "param2": [20, 10]}, - "cv_settings": {}, - }, - }, - } + parameter_list = [ + { + "segment": {"a": "A1"}, + "start_date": A1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {"param1": [1, 2], "param2": [20, 10]}, + "cv_settings": {}, + } + ] predict_start_date = 
TEST_DATE_NEXT_DAY_STR predict_end_date = TEST_PREDICT_END_STR forecast = FunnelForecast( model_type="test", - parameters=parameter_dict, + parameters=parameter_list, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -856,34 +853,33 @@ def test_set_segment_models(): """test the set_segment_models method""" A1_start_date = "2018-01-01" A2_start_date = "2020-02-02" - parameter_dict = { - "model_setting_split_dim": "a", - "segment_settings": { - "A1": { - "start_date": A1_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {}, - "cv_settings": {}, - }, - "A2": { - "start_date": A2_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {}, - "cv_settings": {}, - }, + parameter_list = [ + { + "segment": {"a": "A1"}, + "start_date": A1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, }, - } + { + "segment": {"a": "A2"}, + "start_date": A2_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, + }, + ] predict_start_date = TEST_DATE_STR predict_end_date = TEST_PREDICT_END_STR forecast = FunnelForecast( model_type="test", - parameters=parameter_dict, + parameters=parameter_list, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -934,34 +930,33 @@ def test_set_segment_models_exception(): is specified that isn't in the data""" A1_start_date = "2018-01-01" A2_start_date = "2020-02-02" - parameter_dict = { - "model_setting_split_dim": "c", # not in data - "segment_settings": { - "A1": { - "start_date": A1_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {}, - "cv_settings": {}, - }, - "A2": { - "start_date": A2_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {}, - "cv_settings": {}, - }, + parameter_list = [ + { + "segment": {"c": 
"A1"}, + "start_date": A1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, }, - } + { + "segment": {"c": "A2"}, + "start_date": A2_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, + }, + ] predict_start_date = TEST_DATE_STR predict_end_date = TEST_PREDICT_END_STR forecast = FunnelForecast( model_type="test", - parameters=parameter_dict, + parameters=parameter_list, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -976,7 +971,7 @@ def test_set_segment_models_exception(): with pytest.raises( ValueError, - match="model_setting_split_dim set to c which is not among segment columns: a,b", + match="Segment keys missing from metric hub segments: c", ): forecast._set_segment_models( observed_df=observed_data, segment_column_list=segment_list From e93162c0588ef031044fc2f873de7280beb27b8d Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Mon, 12 Aug 2024 14:20:12 -0500 Subject: [PATCH 27/33] added test for more complex segments --- .../tests/test_funnel_forecast.py | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py index 99ccab84..34cef8cc 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py @@ -924,6 +924,104 @@ def test_set_segment_models(): assert checkval == expectedval +def test_set_segment_models_multiple(): + """test the set_segment_models method + with segments on multiple columns""" + # set arbitrary dates + # they're only used to make sure segments are set correctly + A1B1_start_date = "2018-01-01" + A1B2_start_date = "2019-01-01" + A2B1_start_date = "2020-02-02" + A2B2_start_date = "2021-02-02" + parameter_list = [ + { + "segment": {"a": "A1", "b": 
"B1"}, + "start_date": A1B1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, + }, + { + "segment": {"a": "A1", "b": "B2"}, + "start_date": A1B2_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, + }, + { + "segment": {"a": "A2", "b": "B1"}, + "start_date": A2B1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, + }, + { + "segment": {"a": "A2", "b": "B2"}, + "start_date": A2B2_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, + }, + ] + + predict_start_date = TEST_DATE_STR + predict_end_date = TEST_PREDICT_END_STR + + forecast = FunnelForecast( + model_type="test", + parameters=parameter_list, + use_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) + + observed_data = pd.DataFrame( + {"a": ["A1", "A1", "A2", "A2", "A2"], "b": ["B1", "B2", "B1", "B2", "B2"]} + ) + + segment_list = ["a", "b"] + + forecast._set_segment_models( + observed_df=observed_data, segment_column_list=segment_list + ) + + # put the segments and the start date in the same dictionary to make + # comparison easier + # the important things to check is that all possible combinations + # of segments are present and that each has the parameters set properly + # start_date is a stand-in for these parameters and + # is determined by the value of a as specified in parameter_dict + check_segment_models = [ + dict(**el.segment, **{"start_date": el.start_date}) + for el in forecast.segment_models + ] + expected = [ + {"a": "A1", "b": "B1", "start_date": A1B1_start_date}, + {"a": "A1", "b": "B2", "start_date": A1B2_start_date}, + {"a": "A2", "b": "B1", "start_date": A2B1_start_date}, + {"a": "A2", "b": "B2", "start_date": A2B2_start_date}, + ] + + # can't make a set of dicts for comparison + # so sort 
the lists and compare each element + compare_sorted = zip( + sorted(check_segment_models, key=lambda x: (x["a"], x["b"])), + sorted(expected, key=lambda x: (x["a"], x["b"])), + ) + + for checkval, expectedval in compare_sorted: + assert checkval == expectedval + + def test_set_segment_models_exception(): """test the exception for segment_models where and exception is raised if a model_setting_split_dim From 5f0536d5578b09c444a53f5d7b6d778c905423fa Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Tue, 13 Aug 2024 13:28:00 -0500 Subject: [PATCH 28/33] renamed use_holidays to use_all_us_holidays --- jobs/kpi-forecasting/README.md | 2 +- .../kpi_forecasting/configs/dau_desktop.yaml | 2 +- .../kpi_forecasting/configs/dau_mobile.yaml | 2 +- .../configs/search_forecasting_ad_clicks.yaml | 2 +- .../search_forecasting_daily_active_users.yaml | 2 +- .../configs/search_forecasting_search_count.yaml | 2 +- .../kpi_forecasting/models/base_forecast.py | 6 +++--- .../kpi_forecasting/models/prophet_forecast.py | 4 ++-- .../kpi_forecasting/tests/test_base_forecast.py | 10 +++++----- .../tests/test_data/test_funnel_config.yaml | 2 +- .../kpi_forecasting/tests/test_funnel_forecast.py | 12 ++++++------ .../kpi_forecasting/tests/test_prophet_forecast.py | 10 +++++----- 12 files changed, 28 insertions(+), 28 deletions(-) diff --git a/jobs/kpi-forecasting/README.md b/jobs/kpi-forecasting/README.md index ff1a6ed8..efe75df6 100644 --- a/jobs/kpi-forecasting/README.md +++ b/jobs/kpi-forecasting/README.md @@ -116,7 +116,7 @@ forecast_model: # this section configures the model # if unset, value depends on predict_historical_dates. 
end_date: NULL # final date for the predicted data - use_holidays: False + use_all_us_holidays: False For prophet-based models, when true, call `model.add_country_holidays(country_name="US")` on model predict_historical_dates: True # if predict_historical_dates is True, set to first date of the observed data diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml index 83e80ab9..0b8966f2 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml @@ -10,7 +10,7 @@ forecast_model: model_type: "prophet" start_date: NULL end_date: NULL - use_holidays: False + use_all_us_holidays: False predict_historical_dates: False number_of_simulations: 1000 parameters: diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml index a3a9f3eb..c9288408 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml @@ -10,7 +10,7 @@ forecast_model: model_type: "prophet" start_date: NULL end_date: NULL - use_holidays: True + use_all_us_holidays: True predict_historical_dates: False number_of_simulations: 1000 parameters: diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml index ea8a2a64..7a01aa15 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml @@ -16,7 +16,7 @@ forecast_model: model_type: "funnel" start_date: NULL end_date: NULL - use_holidays: False + use_all_us_holidays: False predict_historical_dates: True number_of_simulations: 1000 parameters: diff --git 
a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml index 3ce3568e..dfb7bb49 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml @@ -16,7 +16,7 @@ forecast_model: model_type: "funnel" start_date: NULL end_date: NULL - use_holidays: False + use_all_us_holidays: False predict_historical_dates: True number_of_simulations: 1000 parameters: diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml index 75f73ba2..17431247 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml @@ -16,7 +16,7 @@ forecast_model: model_type: "funnel" start_date: NULL end_date: NULL - use_holidays: False + use_all_us_holidays: False predict_historical_dates: True number_of_simulations: 1000 parameters: diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index 6d7bbae9..99504fdb 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -20,7 +20,7 @@ class BaseForecast(abc.ABC): Args: model_type (str): The name of the forecasting model that's being used. parameters (Dict): Parameters that should be passed to the forecasting model. - use_holidays (bool): Whether or not the forecasting model should use holidays. + use_all_us_holidays (bool): Whether or not the forecasting model should use holidays. The base model does not apply holiday logic; that logic needs to be built in the child class. 
start_date (str): A 'YYYY-MM-DD' formatted-string that specifies the first @@ -33,7 +33,7 @@ class BaseForecast(abc.ABC): model_type: str parameters: Dict - use_holidays: bool + use_all_us_holidays: bool start_date: str end_date: str metric_hub: MetricHub @@ -78,7 +78,7 @@ def __post_init__(self) -> None: { "model_type": self.model_type.lower(), "model_params": self.parameters, - "use_holidays": self.use_holidays, + "use_all_us_holidays": self.use_all_us_holidays, } ) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py index 30d152b3..82a07fc4 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py @@ -37,7 +37,7 @@ def _build_model(self, parameter_dict): mcmc_samples=0, ) - if self.use_holidays: + if self.use_all_us_holidays: model.add_country_holidays(country_name="US") return model @@ -106,7 +106,7 @@ def _predict_legacy(self) -> pd.DataFrame: datetime.now(timezone.utc).replace(tzinfo=None).date() ) df["forecast_parameters"] = str( - json.dumps({**self.parameters, "holidays": self.use_holidays}) + json.dumps({**self.parameters, "holidays": self.use_all_us_holidays}) ) alias = self.metric_hub.alias.lower() diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py index de8aa885..c4d823d3 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py @@ -92,7 +92,7 @@ def test_post_init(good_class): good_class = good_class( model_type="test", parameters={}, - use_holidays=None, + use_all_us_holidays=None, start_date=start_date, end_date=end_date, metric_hub=None, @@ -112,7 +112,7 @@ def test_post_init_default_dates(good_class): good_class = good_class( model_type="test", parameters={}, - use_holidays=None, + 
use_all_us_holidays=None, start_date="", end_date="", metric_hub=None, @@ -133,7 +133,7 @@ def test_fit(good_class): good_class = good_class( model_type="test", parameters={}, - use_holidays=None, + use_all_us_holidays=None, start_date=TEST_DATE_STR, end_date=TEST_PREDICT_END_STR, metric_hub=None, @@ -149,7 +149,7 @@ def test_predict_and_validate(good_class): good_class = good_class( model_type="test", parameters={}, - use_holidays=None, + use_all_us_holidays=None, start_date=TEST_DATE_STR, end_date=TEST_PREDICT_END_STR, metric_hub=None, @@ -164,7 +164,7 @@ def test_summarize(good_class): good_class = good_class( model_type="test", parameters={}, - use_holidays=None, + use_all_us_holidays=None, start_date=TEST_DATE_STR, end_date=TEST_PREDICT_END_STR, metric_hub=None, diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_data/test_funnel_config.yaml b/jobs/kpi-forecasting/kpi_forecasting/tests/test_data/test_funnel_config.yaml index 2aebbeff..17943134 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_data/test_funnel_config.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_data/test_funnel_config.yaml @@ -15,7 +15,7 @@ forecast_model: model_type: "funnel" start_date: NULL end_date: NULL - use_holidays: False + use_all_us_holidays: False parameters: model_setting_split_dim: "device" segment_settings: diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py index 34cef8cc..535f84c2 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py @@ -31,7 +31,7 @@ def forecast(): forecast = FunnelForecast( model_type="test", parameters={}, - use_holidays=None, + use_all_us_holidays=None, start_date=predict_start_date, end_date=predict_end_date, metric_hub=None, @@ -96,7 +96,7 @@ def funnel_forecast_for_fit_tests(segment_info_fit_tests, mocker): forecast = 
FunnelForecast( model_type="test", parameters=parameter_list, - use_holidays=None, + use_all_us_holidays=None, start_date=predict_start_date, end_date=predict_end_date, metric_hub=None, @@ -518,7 +518,7 @@ def test_under_predict(mocker): forecast = FunnelForecast( model_type="test", parameters=parameter_list, - use_holidays=None, + use_all_us_holidays=None, start_date=predict_start_date, end_date=predict_end_date, metric_hub=None, @@ -880,7 +880,7 @@ def test_set_segment_models(): forecast = FunnelForecast( model_type="test", parameters=parameter_list, - use_holidays=None, + use_all_us_holidays=None, start_date=predict_start_date, end_date=predict_end_date, metric_hub=None, @@ -978,7 +978,7 @@ def test_set_segment_models_multiple(): forecast = FunnelForecast( model_type="test", parameters=parameter_list, - use_holidays=None, + use_all_us_holidays=None, start_date=predict_start_date, end_date=predict_end_date, metric_hub=None, @@ -1055,7 +1055,7 @@ def test_set_segment_models_exception(): forecast = FunnelForecast( model_type="test", parameters=parameter_list, - use_holidays=None, + use_all_us_holidays=None, start_date=predict_start_date, end_date=predict_end_date, metric_hub=None, diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py index 150ec4da..adc9c4ba 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py @@ -39,7 +39,7 @@ def forecast(): return ProphetForecast( model_type="test", parameters=parameter_dict, - use_holidays=None, + use_all_us_holidays=None, start_date=predict_start_date, end_date=predict_end_date, metric_hub=None, @@ -549,7 +549,7 @@ def test_summarize_non_overlapping_day(): forecast = ProphetForecast( model_type="test", parameters={}, - use_holidays=None, + use_all_us_holidays=None, start_date=predict_start_date, end_date=predict_end_date, 
metric_hub=None, @@ -667,7 +667,7 @@ def test_summarize_non_overlapping_month(): forecast = ProphetForecast( model_type="test", parameters={}, - use_holidays=None, + use_all_us_holidays=None, start_date=predict_start_date, end_date=predict_end_date, metric_hub=None, @@ -786,7 +786,7 @@ def test_summarize_overlapping_day(): forecast = ProphetForecast( model_type="test", parameters={}, - use_holidays=None, + use_all_us_holidays=None, start_date=predict_start_date, end_date=predict_end_date, metric_hub=None, @@ -900,7 +900,7 @@ def test_summarize_overlapping_month(): forecast = ProphetForecast( model_type="test", parameters={}, - use_holidays=None, + use_all_us_holidays=None, start_date=predict_start_date, end_date=predict_end_date, metric_hub=None, From e0903b3f708b703d1082f30cb3fc5b9ef23088f5 Mon Sep 17 00:00:00 2001 From: m-d-bowerman Date: Tue, 13 Aug 2024 13:21:47 -0700 Subject: [PATCH 29/33] typo --- jobs/kpi-forecasting/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jobs/kpi-forecasting/README.md b/jobs/kpi-forecasting/README.md index ff1a6ed8..2372ac0d 100644 --- a/jobs/kpi-forecasting/README.md +++ b/jobs/kpi-forecasting/README.md @@ -85,7 +85,7 @@ The tests can be run locally with `python -m pytest` in the root directory of th # YAML Configs -Configuration for each forecast is found in the `configs` folder. Below is an example config file with sample values and a description of what the field means as a comment whe it is not self-evident +Configuration for each forecast is found in the `configs` folder. 
Below is an example config file with sample values and a description of what the field means as a comment when it is not self-evident ``` metric_hub: # this configures the observed data fed to the model which is obtained via metrichub From 109dff7feb6ac97811ce39f93b5e4c47f7af656d Mon Sep 17 00:00:00 2001 From: m-d-bowerman Date: Tue, 13 Aug 2024 13:35:02 -0700 Subject: [PATCH 30/33] added detail to prophet parameter descriptions --- jobs/kpi-forecasting/README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/jobs/kpi-forecasting/README.md b/jobs/kpi-forecasting/README.md index 2372ac0d..ebafbf78 100644 --- a/jobs/kpi-forecasting/README.md +++ b/jobs/kpi-forecasting/README.md @@ -140,17 +140,17 @@ forecast_model: # this section configures the model regressors: ["post_esr_migration", "in_covid", "ad_click_bug"] # regressors specified in `configs.model_inputs.regressors` grid_parameters: # sets grid for hyperparameter tuning - changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] - changepoint_range: [0.8, 0.9] - n_changepoints: [25, 50] - weekly_seasonality: True - yearly_seasonality: True + changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] # parameter of prior distribution controlling how much the trend fluctuates at changepoints + changepoint_range: [0.8, 0.9] # the proportion of the time series over which the changepoints are distributed + n_changepoints: [25, 50] # number of trend changepoints, equally spaced over the time series + weekly_seasonality: True # if weekly seasonality is included in the model + yearly_seasonality: True # if yearly seasonality is included in the model cv_settings: # sets parameters for prophet cross-validation used in FunnelForecast - initial: "1296 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" + initial: "1296 days" # the initial training period, used to train the first iteration of the model for CV + period: "30 days" # spacing between cutoff dates, the sliding 
window over which each round of cross validation is performed + horizon: "30 days" # forecast horizon used to make predictions and calculate model fit metrics for optimization + parallel: "processes" # how parallelization is performed by Prophet, or None if no parallelization is used ... summarize: From c6ed03c9e5f3deff7b611c938349e0f838387e0d Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Wed, 14 Aug 2024 10:04:40 -0500 Subject: [PATCH 31/33] updated setting of default start date and added tests --- .../kpi_forecasting/models/base_forecast.py | 28 +++++++------ .../tests/test_base_forecast.py | 40 +++++++++++++++++++ 2 files changed, 55 insertions(+), 13 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index 99504fdb..896051f8 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -29,6 +29,9 @@ class BaseForecast(abc.ABC): date the metric should be queried. metric_hub (MetricHub): A MetricHub object that provides details about the metric to be forecasted. + predict_historical_dates (bool): If True, forecast starts at the first + date in the observed data.
If False, it uses the value of start_date if set + and the first day after the observed data ends otherwise """ model_type: str @@ -52,21 +55,17 @@ def __post_init__(self) -> None: self.collected_at = datetime.now(timezone.utc).replace(tzinfo=None) self._get_observed_data() + # raise an error if predict_historical_dates is True and start_date is set + if self.start_date and self.predict_historical_dates: + raise ValueError( + "forecast start_date set while predict_historical_dates is True" + ) # use default start/end dates if the user doesn't specify them self.start_date = pd.to_datetime(self.start_date or self._default_start_date) self.end_date = pd.to_datetime(self.end_date or self._default_end_date) - if self.predict_historical_dates: - self.dates_to_predict = pd.DataFrame( - { - "submission_date": pd.date_range( - self.metric_hub.start_date, self.end_date - ).date - } - ) - else: - self.dates_to_predict = pd.DataFrame( - {"submission_date": pd.date_range(self.start_date, self.end_date).date} - ) + self.dates_to_predict = pd.DataFrame( + {"submission_date": pd.date_range(self.start_date, self.end_date).date} + ) # initialize unset attributes self.model = None @@ -144,7 +143,10 @@ def _summarize( @property def _default_start_date(self) -> str: """The first day after the last date in the observed dataset.""" - return self.observed_df["submission_date"].max() + timedelta(days=1) + if self.predict_historical_dates: + return self.observed_df["submission_date"].min() + else: + return self.observed_df["submission_date"].max() + timedelta(days=1) @property def _default_end_date(self) -> str: diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py index c4d823d3..bfea0e5a 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py @@ -107,6 +107,24 @@ def test_post_init(good_class): assert
good_class.dates_to_predict.equals(dates_to_predict_expected) +def test_post_init_exception(good_class): + start_date = TEST_DATE_STR + end_date = TEST_PREDICT_END_STR + with pytest.raises( + ValueError, + match="forecast start_date set while predict_historical_dates is True", + ): + _ = good_class( + model_type="test", + parameters={}, + use_all_us_holidays=None, + start_date=start_date, + end_date=end_date, + metric_hub=None, + predict_historical_dates=True, + ) + + def test_post_init_default_dates(good_class): # check default start and end time good_class = good_class( @@ -129,6 +147,28 @@ def test_post_init_default_dates(good_class): assert good_class.dates_to_predict.equals(dates_to_predict_expected) +def test_post_init_default_dates_historical(good_class): + # check default start and end time + good_class = good_class( + model_type="test", + parameters={}, + use_all_us_holidays=None, + start_date="", + end_date="", + metric_hub=None, + predict_historical_dates=True, + ) + # this is the min date of the observed data + start_date = TEST_DATE - relativedelta(years=1) + end_date = ( + datetime.now(timezone.utc).replace(tzinfo=None) + timedelta(weeks=78) + ).date() + dates_to_predict_expected = pd.DataFrame( + {"submission_date": pd.date_range(start_date, end_date).date} + ) + assert good_class.dates_to_predict.equals(dates_to_predict_expected) + + def test_fit(good_class): good_class = good_class( model_type="test", From 2f8b3d09b2514e4a45a734798d6919b236cc82d9 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Wed, 14 Aug 2024 14:07:21 -0500 Subject: [PATCH 32/33] remove print --- jobs/kpi-forecasting/kpi_forecasting/results_processing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/results_processing.py b/jobs/kpi-forecasting/kpi_forecasting/results_processing.py index 1cb8a9d1..e2f199e5 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/results_processing.py +++ b/jobs/kpi-forecasting/kpi_forecasting/results_processing.py 
@@ -80,7 +80,6 @@ def _load_config_data(self): for config_file in self.input_config_list: full_path = f"{self.input_config_path}/{config_file}" config_data = load_yaml(full_path) - print(config_data) self.config_data[config_file] = config_data def _extract_config_data(self): From 877d07cbb3a26b4627fab28faebe57a20573ae74 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Wed, 14 Aug 2024 16:14:16 -0500 Subject: [PATCH 33/33] moved filter and updated tests to reflect this --- .../kpi_forecasting/models/funnel_forecast.py | 10 +++-- .../models/prophet_forecast.py | 4 +- .../tests/test_funnel_forecast.py | 45 ++++++++----------- 3 files changed, 28 insertions(+), 31 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py index b38e4df4..3c06863c 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py @@ -493,9 +493,7 @@ def _predict( segment_settings.components_df = components_df.copy() - return df.loc[ - pd.to_datetime(df["submission_date"]) >= pd.to_datetime(self.start_date) - ] + return df def _validate_forecast_df(self, df: pd.DataFrame) -> None: """ @@ -565,6 +563,12 @@ def _combine_forecast_observed( Returns: pd.DataFrame: combined dataframe containing aggregated values from observed and forecast """ + # filter the forecast data to just the data in the future + last_historic_date = observed_df["submission_date"].max() + forecast_df = forecast_df.loc[ + forecast_df["submission_date"] > last_historic_date + ] + forecast_summarized, observed_summarized = self._aggregate_forecast_observed( forecast_df, observed_df, period, numpy_aggregations, percentiles ) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py index 82a07fc4..3dc2b920 100644 ---
a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py @@ -362,8 +362,8 @@ def write_results( project_legacy: str, dataset_legacy: str, write_disposition: str = "WRITE_APPEND", - forecast_table_legacy: str = "kpi_automated_forecast_v1", - confidences_table_legacy: str = "kpi_automated_forecast_confidences_v1", + forecast_table_legacy: str = "kpi_automated_forecast_v1_branch", + confidences_table_legacy: str = "kpi_automated_forecast_confidences_v1_branch", ) -> None: """ Write `self.summary_df` to Big Query. diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py index aaeed8b7..6e43e409 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py @@ -194,8 +194,8 @@ def test_combine_forecast_observed(mocker, forecast): observed_df = pd.DataFrame( { "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, + TEST_DATE - relativedelta(days=2), + TEST_DATE - relativedelta(days=1), ], "a": ["A1", "A1"], "value": [5, 6], @@ -257,10 +257,10 @@ def test_under_summarize(mocker, forecast): { "submission_date": [ TEST_DATE - relativedelta(months=1), - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE, - TEST_DATE_NEXT_DAY, + TEST_DATE - relativedelta(days=2), + TEST_DATE - relativedelta(days=1), + TEST_DATE - relativedelta(days=2), + TEST_DATE - relativedelta(days=1), ], "a": ["A1", "A1", "A1", "A2", "A2"], "value": [10, 20, 30, 40, 50], @@ -272,7 +272,7 @@ def test_under_summarize(mocker, forecast): ["start_date", "forecast_df", "segment", "trained_parameters"], ) dummy_segment_settings = SegmentSettings( - start_date=TEST_DATE_STR, + start_date=(TEST_DATE - relativedelta(days=2)).strftime("%Y-%m-%d"), forecast_df=forecast_df.copy(), segment={"a": "A1"}, trained_parameters={"trained_parameters": "yes"}, @@ -295,8 
+295,8 @@ def test_under_summarize(mocker, forecast): observed_expected_df = pd.DataFrame( { "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, + TEST_DATE - relativedelta(days=2), + TEST_DATE - relativedelta(days=1), ], "a": ["A1", "A1"], "value": [20, 30], @@ -361,10 +361,10 @@ def test_summarize(mocker, forecast): { "submission_date": [ TEST_DATE - relativedelta(months=1), - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE, - TEST_DATE_NEXT_DAY, + TEST_DATE - relativedelta(days=2), + TEST_DATE - relativedelta(days=1), + TEST_DATE - relativedelta(days=2), + TEST_DATE - relativedelta(days=1), ], "a": ["A1", "A1", "A1", "A2", "A2"], "value": [10, 20, 30, 40, 50], @@ -380,7 +380,7 @@ def test_summarize(mocker, forecast): # we're only testing that it is concatenated properly # with the segment data added dummy_segment_settings_A1 = SegmentSettings( - start_date=TEST_DATE_STR, + start_date=(TEST_DATE - relativedelta(days=2)).strftime("%Y-%m-%d"), forecast_df=forecast_df.copy(), segment={"a": "A1"}, trained_parameters={"trained_parameters": "yes"}, @@ -388,7 +388,7 @@ def test_summarize(mocker, forecast): ) dummy_segment_settings_A2 = SegmentSettings( - start_date=TEST_DATE_STR, + start_date=(TEST_DATE - relativedelta(days=2)).strftime("%Y-%m-%d"), forecast_df=forecast_df.copy(), segment={"a": "A2"}, trained_parameters={"trained_parameters": "yes"}, @@ -425,10 +425,10 @@ def test_summarize(mocker, forecast): observed_expected_df = pd.DataFrame( { "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE, - TEST_DATE_NEXT_DAY, + TEST_DATE - relativedelta(days=2), + TEST_DATE - relativedelta(days=1), + TEST_DATE - relativedelta(days=2), + TEST_DATE - relativedelta(days=1), ], "a": ["A1", "A1", "A2", "A2"], "value": [20, 30, 40, 50], @@ -584,13 +584,6 @@ def test_under_predict(mocker): } ) - # time filter corresponds to the start time of the object - # as opposed to the segment - expected_time_filter = ( - expected["submission_date"] >= 
pd.to_datetime(forecast.start_date).date() - ) - expected = expected[expected_time_filter].reset_index(drop=True) - pd.testing.assert_frame_equal(out, expected) # check the components