From 2a60eefdce558ee4c302f1576e92655f6420af91 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Tue, 9 Jul 2024 15:56:06 -0500 Subject: [PATCH 01/33] refactored base_forecast and prophet_forecast to enable easier testing --- .../kpi_forecasting/models/base_forecast.py | 359 ++++-------------- .../models/prophet_forecast.py | 286 +++++++++++++- .../tests/test_base_forecast.py | 15 + 3 files changed, 361 insertions(+), 299 deletions(-) create mode 100644 jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index 13385ccd..647bb160 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -1,19 +1,18 @@ import json import numpy as np import pandas as pd +import abc + -from google.cloud import bigquery -from google.cloud.bigquery.enums import SqlTypeNames as bq_types from dataclasses import dataclass -from datetime import datetime, timedelta -from kpi_forecasting import pandas_extras as pdx +from datetime import datetime, timedelta, timezone from kpi_forecasting.metric_hub import MetricHub from pandas.api import types as pd_types from typing import Dict, List @dataclass -class BaseForecast: +class BaseForecast(abc.ABC): """ A base class for fitting, forecasting, and summarizing forecasts. This class should not be invoked directly; it should be inherited by a child class. 
The @@ -46,7 +45,7 @@ class BaseForecast: def __post_init__(self) -> None: # fetch observed observed data - self.collected_at = datetime.utcnow() + self.collected_at = datetime.now(timezone.utc) self.observed_df = self.metric_hub.fetch() # use default start/end dates if the user doesn't specify them @@ -70,26 +69,55 @@ def __post_init__(self) -> None: } ) - def _fit(self) -> None: - """ - Fit a forecasting model using `self.observed_df` that was generated using - Metric Hub data. This method should update `self.model`. + @abc.abstractmethod + def _fit(self, observed_df: pd.DataFrame) -> None: + """Fit a forecasting model using `observed_df.` This will typically + be the data that was generated using + Metric Hub in `__post_init__`. + This method should update `self.model`. + + Args: + observed_df (pd.DataFrame): observed data used to fit the model """ raise NotImplementedError - def _predict(self) -> pd.DataFrame: - """ - Forecast using `self.model`. This method should return a dataframe that will + @abc.abstractmethod + def _predict(self, dates_to_predict: pd.DataFrame) -> pd.DataFrame: + """Forecast using `self.model` on dates in `dates_to_predict`. + This method should return a dataframe that will be validated by `_validate_forecast_df`. + + Args: + dates_to_predict (pd.DataFrame): dataframe of dates to forecast for + + Returns: + pd.DataFrame: dataframe of predictions """ raise NotImplementedError - def _predict_legacy(self) -> pd.DataFrame: - """ - Forecast using `self.model`, adhering to the legacy data format. + @abc.abstractmethod + def _summarize( + self, + forecast_df: pd.DataFrame, + observed_df: pd.DataFrame, + period: str, + numpy_aggregations: List[str], + percentiles: List[int], + ) -> pd.DataFrame: + """Calculate summary metrics for `forecast_df` over a given period, and + add metadata. 
+ + Args: + forecast_df (pd.DataFrame): forecast dataframe created by `predict` + observed_df (pd.DataFrame): observed data used to generate prediction + period (str): aggregation period up to which metrics are aggregated + numpy_aggregations (List[str]): List of numpy aggregation names + percentiles (List[int]): List of percentiles to aggregate up to + + Returns: + pd.DataFrame: dataframe containing metrics listed in numpy_aggregations + and percentiles """ - # TODO: This method should be removed once the forecasting data model is updated: - # https://mozilla-hub.atlassian.net/browse/DS-2676 raise NotImplementedError @property @@ -100,7 +128,7 @@ def _default_start_date(self) -> str: @property def _default_end_date(self) -> str: """78 weeks (18 months) ahead of the current UTC date.""" - return (datetime.utcnow() + timedelta(weeks=78)).date() + return (datetime.now(timezone.utc) + timedelta(weeks=78)).date() def _set_seed(self) -> None: """Set random seed to ensure that fits and predictions are reproducible.""" @@ -133,205 +161,30 @@ def _validate_forecast_df(self) -> None: f" but column {i} has type {df[i].dtypes}." ) - def _summarize( - self, - period: str, - numpy_aggregations: List[str], - percentiles: List[int], - ) -> pd.DataFrame: - """ - Calculate summary metrics for `self.forecast_df` over a given period, and - add metadata. 
- """ - # build a list of all functions that we'll summarize the data by - aggregations = [getattr(np, i) for i in numpy_aggregations] - aggregations.extend([pdx.percentile(i) for i in percentiles]) - - # aggregate metric to the correct date period (day, month, year) - observed_summarized = pdx.aggregate_to_period(self.observed_df, period) - forecast_agg = pdx.aggregate_to_period(self.forecast_df, period) - - # find periods of overlap between observed and forecasted data - overlap = forecast_agg.merge( - observed_summarized, - on="submission_date", - how="left", - ).fillna(0) - - forecast_summarized = ( - forecast_agg.set_index("submission_date") - # Add observed data samples to any overlapping forecasted period. This - # ensures that any forecast made partway through a period accounts for - # previously observed data within the period. For example, when a monthly - # forecast is generated in the middle of the month. - .add(overlap[["value"]].values) - # calculate summary values, aggregating by submission_date, - .agg(aggregations, axis=1) - .reset_index() - # "melt" the df from wide-format to long-format. 
- .melt(id_vars="submission_date", var_name="measure") - ) - - # add datasource-specific metadata columns - forecast_summarized["source"] = "forecast" - observed_summarized["source"] = "historical" - observed_summarized["measure"] = "observed" - - # create a single dataframe that contains observed and forecasted data - df = pd.concat([observed_summarized, forecast_summarized]) - - # add summary metadata columns - df["aggregation_period"] = period.lower() - - # reorder columns to make interpretation easier - df = df[["submission_date", "aggregation_period", "source", "measure", "value"]] - - # add Metric Hub metadata columns - df["metric_alias"] = self.metric_hub.alias.lower() - df["metric_hub_app_name"] = self.metric_hub.app_name.lower() - df["metric_hub_slug"] = self.metric_hub.slug.lower() - df["metric_start_date"] = pd.to_datetime(self.metric_hub.min_date) - df["metric_end_date"] = pd.to_datetime(self.metric_hub.max_date) - df["metric_collected_at"] = self.collected_at - - # add forecast model metadata columns - df["forecast_start_date"] = self.start_date - df["forecast_end_date"] = self.end_date - df["forecast_trained_at"] = self.trained_at - df["forecast_predicted_at"] = self.predicted_at - df["forecast_parameters"] = self.metadata_params - - return df - - def _summarize_legacy(self) -> pd.DataFrame: - """ - Converts a `self.summary_df` to the legacy format used in - `moz-fx-data-shared-prod.telemetry_derived.kpi_automated_forecast_confidences_v1` - """ - # TODO: This method should be removed once the forecasting data model is updated: - # https://mozilla-hub.atlassian.net/browse/DS-2676 - - df = self.summary_df.copy(deep=True) - - # rename columns to legacy values - df.rename( - columns={ - "forecast_end_date": "asofdate", - "submission_date": "date", - "metric_alias": "target", - "aggregation_period": "unit", - }, - inplace=True, - ) - df["forecast_date"] = df["forecast_predicted_at"].dt.date - df["type"] = df["source"].replace("historical", "actual") - df = 
df.replace( - { - "measure": { - "observed": "value", - "p05": "yhat_p5", - "p10": "yhat_p10", - "p20": "yhat_p20", - "p30": "yhat_p30", - "p40": "yhat_p40", - "p50": "yhat_p50", - "p60": "yhat_p60", - "p70": "yhat_p70", - "p80": "yhat_p80", - "p90": "yhat_p90", - "p95": "yhat_p95", - }, - "target": { - "desktop_dau": "desktop", - "mobile_dau": "mobile", - }, - } - ) - - # pivot the df from "long" to "wide" format - index_columns = [ - "asofdate", - "date", - "target", - "unit", - "forecast_parameters", - "forecast_date", - ] - df = ( - df[index_columns + ["measure", "value"]] - .pivot( - index=index_columns, - columns="measure", - values="value", - ) - .reset_index() - ) - - # pivot sets the "name" attribute of the columns for some reason. It's - # None by default, so we just reset that here. - df.columns.name = None - - # When there's an overlap in the observed and forecasted period -- for - # example, when a monthly forecast is generated mid-month -- the legacy - # format only records the forecasted value, not the observed value. To - # account for this, we'll just find the max of the "mean" (forecasted) and - # "value" (observed) data. In all non-overlapping observed periods, the - # forecasted value will be NULL. In all non-overlapping forecasted periods, - # the observed value will be NULL. In overlapping periods, the forecasted - # value will always be larger because it is the sum of the observed and forecasted - # values. 
Below is a query that demonstrates the legacy behavior: - # - # SELECT * - # FROM `moz-fx-data-shared-prod.telemetry_derived.kpi_automated_forecast_confidences_v1` - # WHERE asofdate = "2023-12-31" - # AND target = "mobile" - # AND unit = "month" - # AND forecast_date = "2022-06-04" - # AND date BETWEEN "2022-05-01" AND "2022-06-01" - # ORDER BY date - df["value"] = df[["mean", "value"]].max(axis=1) - df.drop(columns=["mean"], inplace=True) - - # non-numeric columns are represented in the legacy bq schema as strings - string_cols = [ - "asofdate", - "date", - "target", - "unit", - "forecast_parameters", - "forecast_date", - ] - df[string_cols] = df[string_cols].astype(str) - - return df - def fit(self) -> None: """Fit a model using historic metric data provided by `metric_hub`.""" print(f"Fitting {self.model_type} model.", flush=True) self._set_seed() - self.trained_at = datetime.utcnow() - self._fit() + self.trained_at = datetime.now(timezone.utc) + self._fit(self.observed_df) def predict(self) -> None: - """Generate a forecast from `start_date` to `end_date`.""" + """Generate a forecast from `start_date` to `end_date`. + Result is set to `self.forecast_df`""" print(f"Forecasting from {self.start_date} to {self.end_date}.", flush=True) self._set_seed() - self.predicted_at = datetime.utcnow() - self.forecast_df = self._predict() + self.predicted_at = datetime.now(timezone.utc) + self.forecast_df = self._predict(self.dates_to_predict) self._validate_forecast_df() - # TODO: This line should be removed once the forecasting data model is updated: - # https://mozilla-hub.atlassian.net/browse/DS-2676 - self.forecast_df_legacy = self._predict_legacy() - def summarize( self, periods: List[str] = ["day", "month"], numpy_aggregations: List[str] = ["mean"], percentiles: List[int] = [10, 50, 90], - ) -> None: + ) -> pd.DataFrame: """ - Calculate summary metrics for `self.forecast_df` and add metadata. + Calculate summary metrics for `forecast_df` and add metadata. 
The dataframe returned here will be reported in Big Query when `write_results` is called. @@ -342,95 +195,21 @@ def summarize( be applied to summarize numeric values in a numpy dataframe. For example, ["mean"]. percentiles (List[int]): A list of integers representing the percentiles that should be reported in the summary. For example [50] would calculate the 50th percentile (i.e. the median). + + Returns: + pd.DataFrame: metric dataframe for all metrics and aggregations """ self.summary_df = pd.concat( - [self._summarize(i, numpy_aggregations, percentiles) for i in periods] + [ + self._summarize( + self.forecast_df, + self.observed_df, + i, + numpy_aggregations, + percentiles, + ) + for i in periods + ] ) - # TODO: remove this once the forecasting data model is updated: - # https://mozilla-hub.atlassian.net/browse/DS-2676 - self.summary_df_legacy = self._summarize_legacy() - - def write_results( - self, - project: str, - dataset: str, - table: str, - project_legacy: str, - dataset_legacy: str, - write_disposition: str = "WRITE_APPEND", - forecast_table_legacy: str = "kpi_automated_forecast_v1", - confidences_table_legacy: str = "kpi_automated_forecast_confidences_v1", - ) -> None: - """ - Write `self.summary_df` to Big Query. - - Args: - project (str): The Big Query project that the data should be written to. - dataset (str): The Big Query dataset that the data should be written to. - table (str): The Big Query table that the data should be written to. - write_disposition (str): In the event that the destination table exists, - should the table be overwritten ("WRITE_TRUNCATE") or appended to - ("WRITE_APPEND")? 
- """ - print(f"Writing results to `{project}.{dataset}.{table}`.", flush=True) - client = bigquery.Client(project=project) - schema = [ - bigquery.SchemaField("submission_date", bq_types.DATE), - bigquery.SchemaField("aggregation_period", bq_types.STRING), - bigquery.SchemaField("source", bq_types.STRING), - bigquery.SchemaField("measure", bq_types.STRING), - bigquery.SchemaField("value", bq_types.FLOAT), - bigquery.SchemaField("metric_alias", bq_types.STRING), - bigquery.SchemaField("metric_hub_app_name", bq_types.STRING), - bigquery.SchemaField("metric_hub_slug", bq_types.STRING), - bigquery.SchemaField("metric_start_date", bq_types.DATE), - bigquery.SchemaField("metric_end_date", bq_types.DATE), - bigquery.SchemaField("metric_collected_at", bq_types.TIMESTAMP), - bigquery.SchemaField("forecast_start_date", bq_types.DATE), - bigquery.SchemaField("forecast_end_date", bq_types.DATE), - bigquery.SchemaField("forecast_trained_at", bq_types.TIMESTAMP), - bigquery.SchemaField("forecast_predicted_at", bq_types.TIMESTAMP), - bigquery.SchemaField("forecast_parameters", bq_types.STRING), - ] - job = client.load_table_from_dataframe( - dataframe=self.summary_df, - destination=f"{project}.{dataset}.{table}", - job_config=bigquery.LoadJobConfig( - schema=schema, - autodetect=False, - write_disposition=write_disposition, - ), - ) - # Wait for the job to complete. 
- job.result() - - # TODO: remove the below jobs once the forecasting data model is updated: - # https://mozilla-hub.atlassian.net/browse/DS-2676 - - job = client.load_table_from_dataframe( - dataframe=self.forecast_df_legacy, - destination=f"{project_legacy}.{dataset_legacy}.{forecast_table_legacy}", - job_config=bigquery.LoadJobConfig( - write_disposition=write_disposition, - schema=[ - bigquery.SchemaField("ds", bq_types.TIMESTAMP), - bigquery.SchemaField("forecast_date", bq_types.STRING), - bigquery.SchemaField("forecast_parameters", bq_types.STRING), - ], - ), - ) - job.result() - - job = client.load_table_from_dataframe( - dataframe=self.summary_df_legacy, - destination=f"{project_legacy}.{dataset_legacy}.{confidences_table_legacy}", - job_config=bigquery.LoadJobConfig( - write_disposition=write_disposition, - schema=[ - bigquery.SchemaField("asofdate", bq_types.STRING), - bigquery.SchemaField("date", bq_types.STRING), - ], - ), - ) - job.result() + return self.summary_df diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py index 8402dda4..abc3a4f5 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py @@ -1,11 +1,16 @@ import json import pandas as pd import prophet +import numpy as np +from typing import Dict, List -from datetime import datetime + +from datetime import datetime, timezone from dataclasses import dataclass from kpi_forecasting.models.base_forecast import BaseForecast -from typing import Dict +from kpi_forecasting import pandas_extras as pdx +from google.cloud import bigquery +from google.cloud.bigquery.enums import SqlTypeNames as bq_types @dataclass @@ -14,7 +19,7 @@ class ProphetForecast(BaseForecast): def column_names_map(self) -> Dict[str, str]: return {"submission_date": "ds", "value": "y"} - def _fit(self) -> None: + def _fit(self, observed_df) -> None: 
self.model = prophet.Prophet( **self.parameters, uncertainty_samples=self.number_of_simulations, @@ -26,16 +31,15 @@ def _fit(self) -> None: # Modify observed data to have column names that Prophet expects, and fit # the model - self.model.fit(self.observed_df.rename(columns=self.column_names_map)) + self.model.fit(observed_df.rename(columns=self.column_names_map)) - def _predict(self) -> pd.DataFrame: + def _predict(self, dates_to_predict) -> pd.DataFrame: # generate the forecast samples samples = self.model.predictive_samples( - self.dates_to_predict.rename(columns=self.column_names_map) + dates_to_predict.rename(columns=self.column_names_map) ) df = pd.DataFrame(samples["yhat"]) - df["submission_date"] = self.dates_to_predict - + df["submission_date"] = dates_to_predict return df def _predict_legacy(self) -> pd.DataFrame: @@ -56,7 +60,7 @@ def _predict_legacy(self) -> pd.DataFrame: else: df["metric"] = self.metric_hub.alias - df["forecast_date"] = str(datetime.utcnow().date()) + df["forecast_date"] = str(datetime.now(timezone.utc).date()) df["forecast_parameters"] = str( json.dumps({**self.parameters, "holidays": self.use_holidays}) ) @@ -116,3 +120,267 @@ def _predict_legacy(self) -> pd.DataFrame: df[column] = 0.0 return df[columns] + + def _summarize( + self, + forecast_df, + observed_df, + period: str, + numpy_aggregations: List[str], + percentiles: List[int], + ) -> pd.DataFrame: + """ + Calculate summary metrics for `self.forecast_df` over a given period, and + add metadata. 
+ """ + # build a list of all functions that we'll summarize the data by + aggregations = [getattr(np, i) for i in numpy_aggregations] + aggregations.extend([pdx.percentile(i) for i in percentiles]) + + # aggregate metric to the correct date period (day, month, year) + observed_summarized = pdx.aggregate_to_period(observed_df, period) + forecast_agg = pdx.aggregate_to_period(forecast_df, period) + + # find periods of overlap between observed and forecasted data + overlap = forecast_agg.merge( + observed_summarized, + on="submission_date", + how="left", + ).fillna(0) + + forecast_summarized = ( + forecast_agg.set_index("submission_date") + # Add observed data samples to any overlapping forecasted period. This + # ensures that any forecast made partway through a period accounts for + # previously observed data within the period. For example, when a monthly + # forecast is generated in the middle of the month. + .add(overlap[["value"]].values) + # calculate summary values, aggregating by submission_date, + .agg(aggregations, axis=1) + .reset_index() + # "melt" the df from wide-format to long-format. 
+ .melt(id_vars="submission_date", var_name="measure") + ) + + # add datasource-specific metadata columns + forecast_summarized["source"] = "forecast" + observed_summarized["source"] = "historical" + observed_summarized["measure"] = "observed" + + # create a single dataframe that contains observed and forecasted data + df = pd.concat([observed_summarized, forecast_summarized]) + + # add summary metadata columns + df["aggregation_period"] = period.lower() + + # reorder columns to make interpretation easier + df = df[["submission_date", "aggregation_period", "source", "measure", "value"]] + + # add Metric Hub metadata columns + df["metric_alias"] = self.metric_hub.alias.lower() + df["metric_hub_app_name"] = self.metric_hub.app_name.lower() + df["metric_hub_slug"] = self.metric_hub.slug.lower() + df["metric_start_date"] = pd.to_datetime(self.metric_hub.min_date) + df["metric_end_date"] = pd.to_datetime(self.metric_hub.max_date) + df["metric_collected_at"] = self.collected_at + + # add forecast model metadata columns + df["forecast_start_date"] = self.start_date + df["forecast_end_date"] = self.end_date + df["forecast_trained_at"] = self.trained_at + df["forecast_predicted_at"] = self.predicted_at + df["forecast_parameters"] = self.metadata_params + + return df + + def _summarize_legacy(self) -> pd.DataFrame: + """ + Converts a `self.summary_df` to the legacy format used in + `moz-fx-data-shared-prod.telemetry_derived.kpi_automated_forecast_confidences_v1` + """ + # TODO: This method should be removed once the forecasting data model is updated: + # https://mozilla-hub.atlassian.net/browse/DS-2676 + + df = self.summary_df.copy(deep=True) + + # rename columns to legacy values + df.rename( + columns={ + "forecast_end_date": "asofdate", + "submission_date": "date", + "metric_alias": "target", + "aggregation_period": "unit", + }, + inplace=True, + ) + df["forecast_date"] = df["forecast_predicted_at"].dt.date + df["type"] = df["source"].replace("historical", "actual") + df = 
df.replace( + { + "measure": { + "observed": "value", + "p05": "yhat_p5", + "p10": "yhat_p10", + "p20": "yhat_p20", + "p30": "yhat_p30", + "p40": "yhat_p40", + "p50": "yhat_p50", + "p60": "yhat_p60", + "p70": "yhat_p70", + "p80": "yhat_p80", + "p90": "yhat_p90", + "p95": "yhat_p95", + }, + "target": { + "desktop_dau": "desktop", + "mobile_dau": "mobile", + }, + } + ) + + # pivot the df from "long" to "wide" format + index_columns = [ + "asofdate", + "date", + "target", + "unit", + "forecast_parameters", + "forecast_date", + ] + df = ( + df[index_columns + ["measure", "value"]] + .pivot( + index=index_columns, + columns="measure", + values="value", + ) + .reset_index() + ) + + # pivot sets the "name" attribute of the columns for some reason. It's + # None by default, so we just reset that here. + df.columns.name = None + + # When there's an overlap in the observed and forecasted period -- for + # example, when a monthly forecast is generated mid-month -- the legacy + # format only records the forecasted value, not the observed value. To + # account for this, we'll just find the max of the "mean" (forecasted) and + # "value" (observed) data. In all non-overlapping observed periods, the + # forecasted value will be NULL. In all non-overlapping forecasted periods, + # the observed value will be NULL. In overlapping periods, the forecasted + # value will always be larger because it is the sum of the observed and forecasted + # values. 
Below is a query that demonstrates the legacy behavior: + # + # SELECT * + # FROM `moz-fx-data-shared-prod.telemetry_derived.kpi_automated_forecast_confidences_v1` + # WHERE asofdate = "2023-12-31" + # AND target = "mobile" + # AND unit = "month" + # AND forecast_date = "2022-06-04" + # AND date BETWEEN "2022-05-01" AND "2022-06-01" + # ORDER BY date + df["value"] = df[["mean", "value"]].max(axis=1) + df.drop(columns=["mean"], inplace=True) + + # non-numeric columns are represented in the legacy bq schema as strings + string_cols = [ + "asofdate", + "date", + "target", + "unit", + "forecast_parameters", + "forecast_date", + ] + df[string_cols] = df[string_cols].astype(str) + + return df + + def write_results( + self, + project: str, + dataset: str, + table: str, + project_legacy: str, + dataset_legacy: str, + write_disposition: str = "WRITE_APPEND", + forecast_table_legacy: str = "kpi_automated_forecast_v1", + confidences_table_legacy: str = "kpi_automated_forecast_confidences_v1", + ) -> None: + """ + Write `self.summary_df` to Big Query. + + Args: + project (str): The Big Query project that the data should be written to. + dataset (str): The Big Query dataset that the data should be written to. + table (str): The Big Query table that the data should be written to. + write_disposition (str): In the event that the destination table exists, + should the table be overwritten ("WRITE_TRUNCATE") or appended to + ("WRITE_APPEND")? 
+ """ + # get legacy tables + # TODO: remove this once the forecasting data model is updated: + # https://mozilla-hub.atlassian.net/browse/DS-2676 + self.forecast_df_legacy = self._predict_legacy() + self.summary_df_legacy = self._summarize_legacy() + + print(f"Writing results to `{project}.{dataset}.{table}`.", flush=True) + client = bigquery.Client(project=project) + schema = [ + bigquery.SchemaField("submission_date", bq_types.DATE), + bigquery.SchemaField("aggregation_period", bq_types.STRING), + bigquery.SchemaField("source", bq_types.STRING), + bigquery.SchemaField("measure", bq_types.STRING), + bigquery.SchemaField("value", bq_types.FLOAT), + bigquery.SchemaField("metric_alias", bq_types.STRING), + bigquery.SchemaField("metric_hub_app_name", bq_types.STRING), + bigquery.SchemaField("metric_hub_slug", bq_types.STRING), + bigquery.SchemaField("metric_start_date", bq_types.DATE), + bigquery.SchemaField("metric_end_date", bq_types.DATE), + bigquery.SchemaField("metric_collected_at", bq_types.TIMESTAMP), + bigquery.SchemaField("forecast_start_date", bq_types.DATE), + bigquery.SchemaField("forecast_end_date", bq_types.DATE), + bigquery.SchemaField("forecast_trained_at", bq_types.TIMESTAMP), + bigquery.SchemaField("forecast_predicted_at", bq_types.TIMESTAMP), + bigquery.SchemaField("forecast_parameters", bq_types.STRING), + ] + job = client.load_table_from_dataframe( + dataframe=self.summary_df, + destination=f"{project}.{dataset}.{table}", + job_config=bigquery.LoadJobConfig( + schema=schema, + autodetect=False, + write_disposition=write_disposition, + ), + ) + # Wait for the job to complete. 
+ job.result() + + # TODO: remove the below jobs once the forecasting data model is updated: + # https://mozilla-hub.atlassian.net/browse/DS-2676 + + job = client.load_table_from_dataframe( + dataframe=self.forecast_df_legacy, + destination=f"{project_legacy}.{dataset_legacy}.{forecast_table_legacy}", + job_config=bigquery.LoadJobConfig( + write_disposition=write_disposition, + schema=[ + bigquery.SchemaField("ds", bq_types.TIMESTAMP), + bigquery.SchemaField("forecast_date", bq_types.STRING), + bigquery.SchemaField("forecast_parameters", bq_types.STRING), + ], + ), + ) + job.result() + + job = client.load_table_from_dataframe( + dataframe=self.summary_df_legacy, + destination=f"{project_legacy}.{dataset_legacy}.{confidences_table_legacy}", + job_config=bigquery.LoadJobConfig( + write_disposition=write_disposition, + schema=[ + bigquery.SchemaField("asofdate", bq_types.STRING), + bigquery.SchemaField("date", bq_types.STRING), + ], + ), + ) + job.result() diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py new file mode 100644 index 00000000..a0385c81 --- /dev/null +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py @@ -0,0 +1,15 @@ +import pytest + +from kpi_forecasting.models.base_forecast import BaseForecast + + +class BadClass(BaseForecast): + pass + + +def test_fit_not_implemented(): + with pytest.raises( + TypeError, + match="Can't instantiate abstract class BadClass with abstract methods _fit, _predict, _summarize", + ): + _ = BadClass() From 340fabf8e0564ce7a871cabf0c8ac89029632c8a Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Wed, 10 Jul 2024 15:06:12 -0500 Subject: [PATCH 02/33] Apply suggestions from code review change signatures of `fit` and `predict` to take arguments that default to attributes Co-authored-by: Brad Ochocki Szasz --- .../kpi_forecasting/models/base_forecast.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff 
--git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index 647bb160..452dac35 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -161,20 +161,20 @@ def _validate_forecast_df(self) -> None: f" but column {i} has type {df[i].dtypes}." ) - def fit(self) -> None: + def fit(self, observed_df: pd.DataFrame = self.observed_df) -> None: """Fit a model using historic metric data provided by `metric_hub`.""" print(f"Fitting {self.model_type} model.", flush=True) self._set_seed() self.trained_at = datetime.now(timezone.utc) - self._fit(self.observed_df) + self._fit(observed_df) - def predict(self) -> None: + def predict(self, dates_to_predict: pd.DataFrame = self.dates_to_predict) -> None: """Generate a forecast from `start_date` to `end_date`. Result is set to `self.forecast_df`""" print(f"Forecasting from {self.start_date} to {self.end_date}.", flush=True) self._set_seed() self.predicted_at = datetime.now(timezone.utc) - self.forecast_df = self._predict(self.dates_to_predict) + self.forecast_df = self._predict(dates_to_predict) self._validate_forecast_df() def summarize( From 6c7d3f2d06454fffde10a1f0d2b7e1ad2d6ff0da Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Wed, 10 Jul 2024 15:04:06 -0500 Subject: [PATCH 03/33] add test for fit --- .../kpi_forecasting/models/base_forecast.py | 26 +++++++- .../models/prophet_forecast.py | 27 ++++++++ .../tests/test_base_forecast.py | 62 ++++++++++++++++++- 3 files changed, 110 insertions(+), 5 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index 452dac35..5787ad61 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -7,7 +7,6 @@ from dataclasses import dataclass from 
datetime import datetime, timedelta, timezone from kpi_forecasting.metric_hub import MetricHub -from pandas.api import types as pd_types from typing import Dict, List @@ -43,10 +42,14 @@ class BaseForecast(abc.ABC): metric_hub: MetricHub number_of_simulations: int = 1000 + def _get_observed_data(self): + if self.metric_hub: + self.observed_df = self.metric_hub.fetch() + def __post_init__(self) -> None: # fetch observed observed data self.collected_at = datetime.now(timezone.utc) - self.observed_df = self.metric_hub.fetch() + self._get_observed_data() # use default start/end dates if the user doesn't specify them self.start_date = pd.to_datetime(self.start_date or self._default_start_date) @@ -74,7 +77,7 @@ def _fit(self, observed_df: pd.DataFrame) -> None: """Fit a forecasting model using `observed_df.` This will typically be the data that was generated using Metric Hub in `__post_init__`. - This method should update `self.model`. + This method should update (and potentially set) `self.model`. 
Args: observed_df (pd.DataFrame): observed data used to fit the model @@ -95,6 +98,14 @@ def _predict(self, dates_to_predict: pd.DataFrame) -> pd.DataFrame: """ raise NotImplementedError + @abc.abstractmethod + def _validate_forecast_df(self, forecast_df: pd.DataFrame) -> None: + """Method to validate reults produced by _predict + + Args: + forecast_df (pd.DataFrame): dataframe produced by `_predict`""" + raise NotImplementedError + @abc.abstractmethod def _summarize( self, @@ -134,6 +145,7 @@ def _set_seed(self) -> None: """Set random seed to ensure that fits and predictions are reproducible.""" np.random.seed(42) +<<<<<<< HEAD def _validate_forecast_df(self) -> None: """Validate that `self.forecast_df` has been generated correctly.""" df = self.forecast_df @@ -162,6 +174,9 @@ def _validate_forecast_df(self) -> None: ) def fit(self, observed_df: pd.DataFrame = self.observed_df) -> None: +======= + def fit(self) -> None: +>>>>>>> 590d1ad (add test for fit) """Fit a model using historic metric data provided by `metric_hub`.""" print(f"Fitting {self.model_type} model.", flush=True) self._set_seed() @@ -174,8 +189,13 @@ def predict(self, dates_to_predict: pd.DataFrame = self.dates_to_predict) -> Non print(f"Forecasting from {self.start_date} to {self.end_date}.", flush=True) self._set_seed() self.predicted_at = datetime.now(timezone.utc) +<<<<<<< HEAD self.forecast_df = self._predict(dates_to_predict) self._validate_forecast_df() +======= + self.forecast_df = self._predict(self.dates_to_predict) + self._validate_forecast_df(self.forecast_df) +>>>>>>> 590d1ad (add test for fit) def summarize( self, diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py index abc3a4f5..ecd4c66a 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py @@ -1,5 +1,6 @@ import json import pandas as pd +from 
pandas.api import types as pd_types import prophet import numpy as np from typing import Dict, List @@ -42,6 +43,32 @@ def _predict(self, dates_to_predict) -> pd.DataFrame: df["submission_date"] = dates_to_predict return df + def _validate_forecast_df(self, df) -> None: + """Validate that `self.forecast_df` has been generated correctly.""" + columns = df.columns + expected_shape = (len(self.dates_to_predict), 1 + self.number_of_simulations) + numeric_columns = df.drop(columns="submission_date").columns + + if "submission_date" not in columns: + raise ValueError("forecast_df must contain a 'submission_date' column.") + + if df.shape != expected_shape: + raise ValueError( + f"Expected forecast_df to have shape {expected_shape}, but it has shape {df.shape}." + ) + + if not df["submission_date"].equals(self.dates_to_predict["submission_date"]): + raise ValueError( + "forecast_df['submission_date'] does not match dates_to_predict['submission_date']." + ) + + for i in numeric_columns: + if not pd_types.is_numeric_dtype(self.forecast_df[i]): + raise ValueError( + "All forecast_df columns except 'submission_date' must be numeric," + f" but column {i} has type {df[i].dtypes}." 
+ ) + def _predict_legacy(self) -> pd.DataFrame: """ Recreate the legacy format used in diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py index a0385c81..30c5e06b 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py @@ -1,4 +1,9 @@ +from typing import Dict, List + import pytest +import pandas as pd +from dotmap import DotMap + from kpi_forecasting.models.base_forecast import BaseForecast @@ -7,9 +12,62 @@ class BadClass(BaseForecast): pass -def test_fit_not_implemented(): +@pytest.fixture() +def good_class(): + class GoodModel: + def __init__(self): + self.is_fit = False + + def fit(self, observed_data): + self.is_fit = max(observed_data) + + class GoodClass(BaseForecast): + # overwrite _get_observed_data + def _get_observed_data(self): + self.observed_df = range(10) + + def _fit(self, observed_df: pd.DataFrame) -> None: + self.model = GoodModel() + self.model.fit(observed_df) + + def _predict(self, dates_to_predict: pd.DataFrame) -> pd.DataFrame: + pass + + def _validate_forecast_df(self, forecast_df: pd.DataFrame) -> None: + pass + + def _summarize( + self, + forecast_df: pd.DataFrame, + observed_df: pd.DataFrame, + period: str, + numpy_aggregations: List[str], + percentiles: List[int], + ) -> pd.DataFrame: + pass + + return GoodClass + + +def test_not_implemented(): with pytest.raises( TypeError, - match="Can't instantiate abstract class BadClass with abstract methods _fit, _predict, _summarize", + match="Can't instantiate abstract class BadClass with abstract methods _fit, _predict, _summarize, _validate_forecast_df", ): _ = BadClass() + + +def test_fit(good_class): + good_class = good_class( + model_type="test", + parameters=DotMap(), + use_holidays=None, + start_date="2124-01-01", + end_date="2124-02-02", + metric_hub=None, + ) + good_class.fit() + assert good_class.model 
+ + # + assert good_class.model.is_fit == 9 From 38e721db89be9d82e5d645c0b18da04126ee8ef7 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Wed, 10 Jul 2024 15:13:40 -0500 Subject: [PATCH 04/33] revert signatures --- .../kpi_forecasting/models/base_forecast.py | 40 +------------------ 1 file changed, 2 insertions(+), 38 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index 5787ad61..0f619113 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -145,57 +145,21 @@ def _set_seed(self) -> None: """Set random seed to ensure that fits and predictions are reproducible.""" np.random.seed(42) -<<<<<<< HEAD - def _validate_forecast_df(self) -> None: - """Validate that `self.forecast_df` has been generated correctly.""" - df = self.forecast_df - columns = df.columns - expected_shape = (len(self.dates_to_predict), 1 + self.number_of_simulations) - numeric_columns = df.drop(columns="submission_date").columns - - if "submission_date" not in columns: - raise ValueError("forecast_df must contain a 'submission_date' column.") - - if df.shape != expected_shape: - raise ValueError( - f"Expected forecast_df to have shape {expected_shape}, but it has shape {df.shape}." - ) - - if not df["submission_date"].equals(self.dates_to_predict["submission_date"]): - raise ValueError( - "forecast_df['submission_date'] does not match dates_to_predict['submission_date']." - ) - - for i in numeric_columns: - if not pd_types.is_numeric_dtype(self.forecast_df[i]): - raise ValueError( - "All forecast_df columns except 'submission_date' must be numeric," - f" but column {i} has type {df[i].dtypes}." 
- ) - - def fit(self, observed_df: pd.DataFrame = self.observed_df) -> None: -======= def fit(self) -> None: ->>>>>>> 590d1ad (add test for fit) """Fit a model using historic metric data provided by `metric_hub`.""" print(f"Fitting {self.model_type} model.", flush=True) self._set_seed() self.trained_at = datetime.now(timezone.utc) - self._fit(observed_df) + self._fit(self.observed_df) - def predict(self, dates_to_predict: pd.DataFrame = self.dates_to_predict) -> None: + def predict(self) -> None: """Generate a forecast from `start_date` to `end_date`. Result is set to `self.forecast_df`""" print(f"Forecasting from {self.start_date} to {self.end_date}.", flush=True) self._set_seed() self.predicted_at = datetime.now(timezone.utc) -<<<<<<< HEAD - self.forecast_df = self._predict(dates_to_predict) - self._validate_forecast_df() -======= self.forecast_df = self._predict(self.dates_to_predict) self._validate_forecast_df(self.forecast_df) ->>>>>>> 590d1ad (add test for fit) def summarize( self, From 9b173370e6de383e3bd827f721dc9c2f78210862 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Wed, 10 Jul 2024 15:15:52 -0500 Subject: [PATCH 05/33] made timezone-aware stamps naive --- .../kpi_forecasting/models/base_forecast.py | 10 ++++++---- .../kpi_forecasting/models/prophet_forecast.py | 4 +++- .../kpi_forecasting/tests/test_metric_hub.py | 6 +++--- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index 0f619113..b0a84eb2 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -48,7 +48,7 @@ def _get_observed_data(self): def __post_init__(self) -> None: # fetch observed observed data - self.collected_at = datetime.now(timezone.utc) + self.collected_at = datetime.now(timezone.utc).replace(tzinfo=None) self._get_observed_data() # use default start/end 
dates if the user doesn't specify them @@ -139,7 +139,9 @@ def _default_start_date(self) -> str: @property def _default_end_date(self) -> str: """78 weeks (18 months) ahead of the current UTC date.""" - return (datetime.now(timezone.utc) + timedelta(weeks=78)).date() + return ( + datetime.now(timezone.utc).replace(tzinfo=None) + timedelta(weeks=78) + ).date() def _set_seed(self) -> None: """Set random seed to ensure that fits and predictions are reproducible.""" @@ -149,7 +151,7 @@ def fit(self) -> None: """Fit a model using historic metric data provided by `metric_hub`.""" print(f"Fitting {self.model_type} model.", flush=True) self._set_seed() - self.trained_at = datetime.now(timezone.utc) + self.trained_at = datetime.now(timezone.utc).replace(tzinfo=None) self._fit(self.observed_df) def predict(self) -> None: @@ -157,7 +159,7 @@ def predict(self) -> None: Result is set to `self.forecast_df`""" print(f"Forecasting from {self.start_date} to {self.end_date}.", flush=True) self._set_seed() - self.predicted_at = datetime.now(timezone.utc) + self.predicted_at = datetime.now(timezone.utc).replace(tzinfo=None) self.forecast_df = self._predict(self.dates_to_predict) self._validate_forecast_df(self.forecast_df) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py index ecd4c66a..652ced30 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py @@ -87,7 +87,9 @@ def _predict_legacy(self) -> pd.DataFrame: else: df["metric"] = self.metric_hub.alias - df["forecast_date"] = str(datetime.now(timezone.utc).date()) + df["forecast_date"] = str( + datetime.now(timezone.utc).replace(tzinfo=None).date() + ) df["forecast_parameters"] = str( json.dumps({**self.parameters, "holidays": self.use_holidays}) ) diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_metric_hub.py 
b/jobs/kpi-forecasting/kpi_forecasting/tests/test_metric_hub.py index 45d55948..4c58d436 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_metric_hub.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_metric_hub.py @@ -12,7 +12,7 @@ def test_metrichub_for_dau_kpi(): slug="mobile_daily_active_users_v1", start_date="2024-01-01", ) - now = to_datetime(datetime.now(timezone.utc)).date() + now = to_datetime(datetime.now(timezone.utc).replace(tzinfo=None)).date() query = test_metric_hub.query() query_where = f"WHERE submission_date BETWEEN '2024-01-01' AND '{now}'\nGROUP BY" @@ -76,7 +76,7 @@ def test_metrichub_no_end_date(): slug="mobile_daily_active_users_v1", start_date="2024-01-01", ) - now = to_datetime(datetime.now(timezone.utc)).date() + now = to_datetime(datetime.now(timezone.utc).replace(tzinfo=None)).date() assert test_metric_hub.end_date == now @@ -88,7 +88,7 @@ def test_metrichub_last_complete_month(): start_date="2024-01-01", end_date="last complete month", ) - now = to_datetime(datetime.now(timezone.utc)).date() + now = to_datetime(datetime.now(timezone.utc).replace(tzinfo=None)).date() prev_date = previous_period_last_date("last complete month", now) assert test_metric_hub.end_date == to_datetime(prev_date).date() From 90a822edaf2b0ae49298a418bf2faeafc0ea21b4 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Wed, 10 Jul 2024 16:06:12 -0500 Subject: [PATCH 06/33] finished base_forecast tests --- .../tests/test_base_forecast.py | 126 ++++++++++++++++-- 1 file changed, 113 insertions(+), 13 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py index 30c5e06b..b53c56b2 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py @@ -3,6 +3,8 @@ import pytest import pandas as pd from dotmap import DotMap +import numpy as np +from datetime import datetime, 
timedelta, timezone from kpi_forecasting.models.base_forecast import BaseForecast @@ -19,32 +21,47 @@ def __init__(self): self.is_fit = False def fit(self, observed_data): - self.is_fit = max(observed_data) + self.is_fit = max(observed_data["submission_date"]) class GoodClass(BaseForecast): # overwrite _get_observed_data def _get_observed_data(self): - self.observed_df = range(10) - - def _fit(self, observed_df: pd.DataFrame) -> None: + self.observed_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2020-01-01"), + pd.to_datetime("1990-01-01"), + ] + } + ) + + def _fit(self, observed_df: np.array) -> None: + # takes array as input to simplify tests self.model = GoodModel() self.model.fit(observed_df) - def _predict(self, dates_to_predict: pd.DataFrame) -> pd.DataFrame: - pass + def _predict(self, dates_to_predict: np.array) -> pd.DataFrame: + # takes array as input to simplify tests + return dates_to_predict * 2 - def _validate_forecast_df(self, forecast_df: pd.DataFrame) -> None: - pass + def _validate_forecast_df(self, forecast_df: np.array) -> None: + # takes array as input to simplify tests + assert np.all(forecast_df // 0 == 0) def _summarize( self, - forecast_df: pd.DataFrame, - observed_df: pd.DataFrame, + forecast_df: np.array, + observed_df: np.array, period: str, numpy_aggregations: List[str], - percentiles: List[int], + percentiles: List[str], ) -> pd.DataFrame: - pass + # input types changes to simplify test + np_func = getattr(np, numpy_aggregations[0]) + agg_val = np_func(forecast_df + observed_df) + return pd.DataFrame( + [{"number": agg_val, "period": period, "percentiles": percentiles[0]}] + ) return GoodClass @@ -57,6 +74,49 @@ def test_not_implemented(): _ = BadClass() +def test_post_init(good_class): + start_date = "2124-01-01" + end_date = "2124-02-02" + good_class = good_class( + model_type="test", + parameters=DotMap(), + use_holidays=None, + start_date=start_date, + end_date=end_date, + metric_hub=None, + ) + 
dates_to_predict_expected = pd.DataFrame( + { + "submission_date": pd.date_range( + pd.to_datetime(start_date), pd.to_datetime(end_date) + ).date + } + ) + assert good_class.dates_to_predict.equals(dates_to_predict_expected) + + +def test_post_init_default_dates(good_class): + # check default start and end time + good_class = good_class( + model_type="test", + parameters=DotMap(), + use_holidays=None, + start_date="", + end_date="", + metric_hub=None, + ) + # this is the max date of the self.observed_data['submission_date'] plus one day + # from the object definion + start_date = pd.to_datetime("2020-01-02") + end_date = ( + datetime.now(timezone.utc).replace(tzinfo=None) + timedelta(weeks=78) + ).date() + dates_to_predict_expected = pd.DataFrame( + {"submission_date": pd.date_range(start_date, end_date).date} + ) + assert good_class.dates_to_predict.equals(dates_to_predict_expected) + + def test_fit(good_class): good_class = good_class( model_type="test", @@ -70,4 +130,44 @@ def test_fit(good_class): assert good_class.model # - assert good_class.model.is_fit == 9 + assert good_class.model.is_fit == pd.to_datetime("2020-01-01") + + +def test_predict_and_validate(good_class): + good_class = good_class( + model_type="test", + parameters=DotMap(), + use_holidays=None, + start_date="2124-01-01", + end_date="2124-02-02", + metric_hub=None, + ) + # overwrite date range set in __post_init__ + good_class.dates_to_predict = np.arange(10) + good_class.predict() + assert np.all(good_class.forecast_df == good_class.dates_to_predict * 2) + + +def test_summarize(good_class): + good_class = good_class( + model_type="test", + parameters=DotMap(), + use_holidays=None, + start_date="2124-01-01", + end_date="2124-02-02", + metric_hub=None, + ) + good_class.forecast_df = np.array([1, 2]) + good_class.observed_df = np.array([3, 4]) + number_val = 10 + output = good_class.summarize( + periods=["a", "b", "c"], numpy_aggregations=["sum"], percentiles=["percentiles"] + ) + expected_output 
= pd.DataFrame( + [ + {"number": number_val, "period": el, "percentiles": "percentiles"} + for el in ["a", "b", "c"] + ] + ) + assert output.reset_index(drop=True).equals(expected_output) + assert good_class.summary_df.reset_index(drop=True).equals(expected_output) From 72fabefe8bcf03a66ba809668e4a73cc4bb6b5ee Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Thu, 11 Jul 2024 15:38:43 -0500 Subject: [PATCH 07/33] added tests for prophet class --- .../kpi_forecasting/models/base_forecast.py | 4 + .../models/prophet_forecast.py | 30 +- .../tests/test_base_forecast.py | 2 +- .../tests/test_prophet_forecast.py | 462 ++++++++++++++++++ 4 files changed, 490 insertions(+), 8 deletions(-) create mode 100644 jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index b0a84eb2..f41f3b59 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -44,6 +44,10 @@ class BaseForecast(abc.ABC): def _get_observed_data(self): if self.metric_hub: + # the columns in this dataframe + # are "value" for the metric, submission_date + # and any segments where the column name + # is the name of the segment self.observed_df = self.metric_hub.fetch() def __post_init__(self) -> None: diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py index 652ced30..b8539dab 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py @@ -150,27 +150,26 @@ def _predict_legacy(self) -> pd.DataFrame: return df[columns] - def _summarize( + def _combine_forecast_observed( self, forecast_df, observed_df, period: str, numpy_aggregations: List[str], percentiles: List[int], - ) -> pd.DataFrame: - """ - Calculate 
summary metrics for `self.forecast_df` over a given period, and - add metadata. - """ + ): # build a list of all functions that we'll summarize the data by aggregations = [getattr(np, i) for i in numpy_aggregations] aggregations.extend([pdx.percentile(i) for i in percentiles]) # aggregate metric to the correct date period (day, month, year) observed_summarized = pdx.aggregate_to_period(observed_df, period) - forecast_agg = pdx.aggregate_to_period(forecast_df, period) + forecast_agg = pdx.aggregate_to_period(forecast_df, period).sort_values( + "submission_date" + ) # find periods of overlap between observed and forecasted data + # merge preserves key order so overlap will be sorted by submission_date overlap = forecast_agg.merge( observed_summarized, on="submission_date", @@ -198,7 +197,24 @@ def _summarize( # create a single dataframe that contains observed and forecasted data df = pd.concat([observed_summarized, forecast_summarized]) + return df + + def _summarize( + self, + forecast_df, + observed_df, + period: str, + numpy_aggregations: List[str], + percentiles: List[int], + ) -> pd.DataFrame: + """ + Calculate summary metrics for `self.forecast_df` over a given period, and + add metadata. 
+ """ + df = self._combine_forecast_observed( + forecast_df, observed_df, period, numpy_aggregations, percentiles + ) # add summary metadata columns df["aggregation_period"] = period.lower() diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py index b53c56b2..dc3a7156 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py @@ -1,4 +1,4 @@ -from typing import Dict, List +from typing import List import pytest import pandas as pd diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py new file mode 100644 index 00000000..db2d2a3b --- /dev/null +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py @@ -0,0 +1,462 @@ +from typing import List + +import pytest +import pandas as pd +from dotmap import DotMap +import numpy as np +from datetime import datetime, timedelta, timezone + + +from kpi_forecasting.models.prophet_forecast import ProphetForecast + + +def test_summarize_non_overlapping_day(): + observed_start_date = "2124-01-01" + observed_end_date = "2124-02-01" + + predict_start_date = "2124-02-02" + predict_end_date = "2124-03-01" + + forecast = ProphetForecast( + model_type="test", + parameters=DotMap(), + use_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) + observed_submission_dates = pd.date_range( + pd.to_datetime(observed_start_date), pd.to_datetime(observed_end_date) + ).date + predict_submission_dates = forecast.dates_to_predict["submission_date"].values + + observed_df = pd.DataFrame( + { + "submission_date": observed_submission_dates, + "value": range(len(observed_submission_dates)), + } + ) + + test_samples = np.array([1, 1, 2, 3, 5, 8, 13]) + test_mean = np.mean(test_samples) + test_median = 
np.median(test_samples) + + forecast_array = np.stack( + [test_samples * i for i in range(1, 1 + len(predict_submission_dates))], + axis=0, + ) + forecast_data = {str(i): forecast_array[:, i] for i in range(len(test_samples))} + forecast_df = pd.DataFrame( + dict(**{"submission_date": predict_submission_dates}, **forecast_data) + ) + + output_df = forecast._combine_forecast_observed( + forecast_df, observed_df, "day", ["mean", "median"], [50] + ) + + expected_observed_df = observed_df.copy() + expected_observed_df["source"] = "historical" + expected_observed_df["measure"] = "observed" + expected_observed_df["submission_date"] = ( + pd.to_datetime(expected_observed_df["submission_date"].values) + .to_period("d") + .to_timestamp() + ) + + forecast_mean_df = pd.DataFrame( + { + "submission_date": pd.to_datetime(forecast_df["submission_date"].values) + .to_period("d") + .to_timestamp(), + "value": [ + test_mean * i for i in range(1, 1 + len(predict_submission_dates)) + ], + "source": ["forecast"] * len(predict_submission_dates), + "measure": ["mean"] * len(predict_submission_dates), + } + ) + + forecast_median_df = pd.DataFrame( + { + "submission_date": pd.to_datetime(forecast_df["submission_date"].values) + .to_period("d") + .to_timestamp(), + "value": [ + test_median * i for i in range(1, 1 + len(predict_submission_dates)) + ], + "source": ["forecast"] * len(predict_submission_dates), + "measure": ["median"] * len(predict_submission_dates), + } + ) + + forecast_p50_df = forecast_median_df.copy() + forecast_p50_df["measure"] = "p50" + + expected_df = pd.concat( + [expected_observed_df, forecast_mean_df, forecast_median_df, forecast_p50_df] + ) + + assert set(expected_df.columns) == set(output_df.columns) + columns = expected_df.columns + expected_df_compare = ( + expected_df[columns] + .sort_values(["submission_date", "source", "measure"]) + .reset_index(drop=True) + ) + output_df_compare = ( + output_df[columns] + .sort_values(["submission_date", "source", 
"measure"]) + .reset_index(drop=True) + ) + pd.testing.assert_frame_equal( + expected_df_compare, output_df_compare, check_exact=False + ) + + +def test_summarize_non_overlapping_month(): + observed_start_date = "2124-01-01" + observed_end_date = "2124-02-28" + + predict_start_date = "2124-04-01" + predict_end_date = "2124-05-31" + + forecast = ProphetForecast( + model_type="test", + parameters=DotMap(), + use_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) + observed_submission_dates = pd.date_range( + pd.to_datetime(observed_start_date), pd.to_datetime(observed_end_date) + ).date + predict_submission_dates = forecast.dates_to_predict["submission_date"].values + + observed_df = pd.DataFrame( + { + "submission_date": observed_submission_dates, + "value": [1] * len(observed_submission_dates), + } + ) + + test_samples = np.array([1, 1, 2, 3, 5, 8, 13]) + test_mean = np.mean(test_samples) + test_median = np.median(test_samples) + + forecast_array = np.stack( + [test_samples] * len(predict_submission_dates), + axis=0, + ) + forecast_data = {str(i): forecast_array[:, i] for i in range(len(test_samples))} + forecast_df = pd.DataFrame( + dict(**{"submission_date": predict_submission_dates}, **forecast_data) + ) + + output_df = forecast._combine_forecast_observed( + forecast_df, observed_df, "month", ["mean", "median"], [50] + ) + + expected_observed_dates = sorted( + pd.to_datetime(observed_df["submission_date"].values) + .to_period("m") + .to_timestamp() + .unique() + ) + expected_observed_df = pd.DataFrame( + { + "submission_date": expected_observed_dates, + "source": ["historical", "historical"], + "measure": ["observed", "observed"], + "value": [31, 28], # number of days in each month + } + ) + + forecast_observed_dates = sorted( + pd.to_datetime(forecast_df["submission_date"].values) + .to_period("m") + .to_timestamp() + .unique() + ) + forecast_mean_df = pd.DataFrame( + { + "submission_date": 
forecast_observed_dates, + "source": ["forecast", "forecast"], + "measure": ["mean", "mean"], + "value": [test_mean * 30, test_mean * 31], # number of days in each month + } + ) + + forecast_median_df = pd.DataFrame( + { + "submission_date": forecast_observed_dates, + "source": ["forecast", "forecast"], + "measure": ["median", "median"], + "value": [ + test_median * 30, + test_median * 31, + ], # number of days in each month + } + ) + + forecast_p50_df = pd.DataFrame( + { + "submission_date": forecast_observed_dates, + "source": ["forecast", "forecast"], + "measure": ["p50", "p50"], + "value": [ + test_median * 30, + test_median * 31, + ], # number of days in each month + } + ) + + expected_df = pd.concat( + [expected_observed_df, forecast_mean_df, forecast_median_df, forecast_p50_df] + ) + + assert set(expected_df.columns) == set(output_df.columns) + columns = expected_df.columns + expected_df_compare = ( + expected_df[columns] + .sort_values(["submission_date", "source", "measure"]) + .reset_index(drop=True) + ) + output_df_compare = ( + output_df[columns] + .sort_values(["submission_date", "source", "measure"]) + .reset_index(drop=True) + ) + pd.testing.assert_frame_equal( + expected_df_compare, output_df_compare, check_exact=False + ) + + +def test_summarize_overlapping_day(): + observed_start_date = "2124-01-01" + observed_end_date = "2124-02-01" + + predict_start_date = "2124-01-01" + predict_end_date = "2124-02-01" + + forecast = ProphetForecast( + model_type="test", + parameters=DotMap(), + use_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) + observed_submission_dates = pd.date_range( + pd.to_datetime(observed_start_date), pd.to_datetime(observed_end_date) + ).date + predict_submission_dates = forecast.dates_to_predict["submission_date"].values + + observed_df = pd.DataFrame( + { + "submission_date": observed_submission_dates, + "value": [1] * len(observed_submission_dates), + } + ) + + test_samples = 
np.array([1, 1, 2, 3, 5, 8, 13]) + test_mean = np.mean(test_samples) + test_median = np.median(test_samples) + + forecast_array = np.stack( + [test_samples * i for i in range(1, 1 + len(predict_submission_dates))], + axis=0, + ) + forecast_data = {str(i): forecast_array[:, i] for i in range(len(test_samples))} + forecast_df = pd.DataFrame( + dict(**{"submission_date": predict_submission_dates}, **forecast_data) + ) + + output_df = forecast._combine_forecast_observed( + forecast_df, observed_df, "day", ["mean", "median"], [50] + ) + + expected_observed_df = observed_df.copy() + expected_observed_df["source"] = "historical" + expected_observed_df["measure"] = "observed" + expected_observed_df["submission_date"] = ( + pd.to_datetime(expected_observed_df["submission_date"].values) + .to_period("d") + .to_timestamp() + ) + + # value has + 1 due to observed (which has value=1) being added + # due to overlap + forecast_mean_df = pd.DataFrame( + { + "submission_date": pd.to_datetime(forecast_df["submission_date"].values) + .to_period("d") + .to_timestamp(), + "value": [ + test_mean * i + 1 for i in range(1, 1 + len(predict_submission_dates)) + ], + "source": ["forecast"] * len(predict_submission_dates), + "measure": ["mean"] * len(predict_submission_dates), + } + ) + + forecast_median_df = pd.DataFrame( + { + "submission_date": pd.to_datetime(forecast_df["submission_date"].values) + .to_period("d") + .to_timestamp(), + "value": [ + test_median * i + 1 for i in range(1, 1 + len(predict_submission_dates)) + ], + "source": ["forecast"] * len(predict_submission_dates), + "measure": ["median"] * len(predict_submission_dates), + } + ) + + forecast_p50_df = forecast_median_df.copy() + forecast_p50_df["measure"] = "p50" + + expected_df = pd.concat( + [expected_observed_df, forecast_mean_df, forecast_median_df, forecast_p50_df] + ) + + assert set(expected_df.columns) == set(output_df.columns) + columns = expected_df.columns + expected_df_compare = ( + expected_df[columns] + 
.sort_values(["submission_date", "source", "measure"]) + .reset_index(drop=True) + ) + output_df_compare = ( + output_df[columns] + .sort_values(["submission_date", "source", "measure"]) + .reset_index(drop=True) + ) + pd.testing.assert_frame_equal( + expected_df_compare, output_df_compare, check_exact=False + ) + + +def test_summarize_overlapping_month(): + observed_start_date = "2124-01-01" + observed_end_date = "2124-02-28" + + predict_start_date = "2124-01-01" + predict_end_date = "2124-02-28" + + forecast = ProphetForecast( + model_type="test", + parameters=DotMap(), + use_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) + observed_submission_dates = pd.date_range( + pd.to_datetime(observed_start_date), pd.to_datetime(observed_end_date) + ).date + predict_submission_dates = forecast.dates_to_predict["submission_date"].values + + observed_df = pd.DataFrame( + { + "submission_date": observed_submission_dates, + "value": [1] * len(observed_submission_dates), + } + ) + + test_samples = np.array([1, 1, 2, 3, 5, 8, 13]) + test_mean = np.mean(test_samples) + test_median = np.median(test_samples) + + forecast_array = np.stack( + [test_samples] * len(predict_submission_dates), + axis=0, + ) + forecast_data = {str(i): forecast_array[:, i] for i in range(len(test_samples))} + forecast_df = pd.DataFrame( + dict(**{"submission_date": predict_submission_dates}, **forecast_data) + ) + + output_df = forecast._combine_forecast_observed( + forecast_df, observed_df, "month", ["mean", "median"], [50] + ) + + expected_observed_dates = sorted( + pd.to_datetime(observed_df["submission_date"].values) + .to_period("m") + .to_timestamp() + .unique() + ) + expected_observed_df = pd.DataFrame( + { + "submission_date": expected_observed_dates, + "source": ["historical", "historical"], + "measure": ["observed", "observed"], + "value": [31, 28], # number of days in each month + } + ) + + forecast_observed_dates = sorted( + 
pd.to_datetime(forecast_df["submission_date"].values) + .to_period("m") + .to_timestamp() + .unique() + ) + + # add extra length of month for aggregated value column that gets added + # due to overlap + forecast_mean_df = pd.DataFrame( + { + "submission_date": forecast_observed_dates, + "source": ["forecast", "forecast"], + "measure": ["mean", "mean"], + "value": [ + test_mean * 31 + 31, + test_mean * 28 + 28, + ], # number of days in each month + } + ) + + forecast_median_df = pd.DataFrame( + { + "submission_date": forecast_observed_dates, + "source": ["forecast", "forecast"], + "measure": ["median", "median"], + "value": [ + test_median * 31 + 31, + test_median * 28 + 28, + ], # number of days in each month + } + ) + + forecast_p50_df = pd.DataFrame( + { + "submission_date": forecast_observed_dates, + "source": ["forecast", "forecast"], + "measure": ["p50", "p50"], + "value": [ + test_median * 31 + 31, + test_median * 28 + 28, + ], # number of days in each month + } + ) + + expected_df = pd.concat( + [expected_observed_df, forecast_mean_df, forecast_median_df, forecast_p50_df] + ) + + assert set(expected_df.columns) == set(output_df.columns) + columns = expected_df.columns + expected_df_compare = ( + expected_df[columns] + .sort_values(["submission_date", "source", "measure"]) + .reset_index(drop=True) + ) + output_df_compare = ( + output_df[columns] + .sort_values(["submission_date", "source", "measure"]) + .reset_index(drop=True) + ) + pd.testing.assert_frame_equal( + expected_df_compare, output_df_compare, check_exact=False + ) From 1ece1dd3295cb87797cc9de33988b5318381f4df Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Thu, 11 Jul 2024 15:39:07 -0500 Subject: [PATCH 08/33] linting --- .../kpi_forecasting/tests/test_prophet_forecast.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py index db2d2a3b..dea754a3 100644 --- 
a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py @@ -1,10 +1,7 @@ -from typing import List -import pytest import pandas as pd from dotmap import DotMap import numpy as np -from datetime import datetime, timedelta, timezone from kpi_forecasting.models.prophet_forecast import ProphetForecast From 606e2e4dce2e61d58027de636f57692e35b5d7a1 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Thu, 11 Jul 2024 15:42:42 -0500 Subject: [PATCH 09/33] fixed divide by zero --- .../kpi_forecasting/tests/test_base_forecast.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py index dc3a7156..6a731560 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py @@ -46,7 +46,8 @@ def _predict(self, dates_to_predict: np.array) -> pd.DataFrame: def _validate_forecast_df(self, forecast_df: np.array) -> None: # takes array as input to simplify tests - assert np.all(forecast_df // 0 == 0) + # check that all are even after _predict runs + assert np.all(forecast_df % 2 == 0) def _summarize( self, From 585f2ca204e9920f089b8e3e6656de6d6a7b1aff Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Thu, 11 Jul 2024 15:44:29 -0500 Subject: [PATCH 10/33] linting again --- .../kpi_forecasting/tests/test_prophet_forecast.py | 1 - 1 file changed, 1 deletion(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py index dea754a3..18d3df67 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py @@ -1,4 +1,3 @@ - import pandas as pd from dotmap import DotMap import numpy as np From 
97bd46c63a8913f6213e23cb7acf75f556dda647 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Tue, 23 Jul 2024 12:03:02 -0500 Subject: [PATCH 11/33] adding tests to funnel_forecast --- .../kpi_forecasting/models/funnel_forecast.py | 25 +- .../tests/test_funnel_forecast.py | 700 ++++++++++++++++++ 2 files changed, 716 insertions(+), 9 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py index c5d4a980..35b42c1b 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py @@ -64,6 +64,10 @@ def __post_init__(self) -> None: """ super().__post_init__() + if self.metric_hub is None: + # this is used to avoid the code below for testing purposes + return + # Overwrite dates_to_predict to provide historical date forecasts self.dates_to_predict = pd.DataFrame( { @@ -155,6 +159,11 @@ def _fill_regressor_dates(self, regressor: ProphetRegressor) -> ProphetRegressor setattr(regressor, date, getattr(self, date)) elif isinstance(getattr(regressor, date), str): setattr(regressor, date, pd.to_datetime(getattr(regressor, date))) + + if regressor.end_date < regressor.start_date: + raise Exception( + f"Regressor {regressor.name} start date comes after end date" + ) return regressor def _build_model( @@ -252,7 +261,7 @@ def _build_model_dataframe( df["floor"] = segment_settings.trained_parameters["floor"] df["cap"] = segment_settings.trained_parameters["cap"] else: - raise ValueError("task not in ['train','predict']") + raise ValueError(f"task set to {task}, must be train or predict") if segment_settings.regressors: df = self._add_regressors(df, segment_settings.regressors) @@ -333,26 +342,24 @@ def _auto_tuning(self, segment_settings: SegmentModelSettings) -> Dict[str, floa return param_grid[min_abs_bias_index] - def _add_regressors(self, dat: pd.DataFrame, regressors: List[ProphetRegressor]): + def 
_add_regressors(self, df: pd.DataFrame, regressors: List[ProphetRegressor]): """ Add regressor columns to the dataframe for training or prediction. Args: - dat (pd.DataFrame): The input dataframe. + df (pd.DataFrame): The input dataframe. regressors (List[ProphetRegressor]): The list of regressors to add. Returns: pd.DataFrame: The dataframe with regressors added. """ - df = dat.copy().rename(columns=self.column_names_map) - df["ds"] = pd.to_datetime(df["ds"]) for regressor in regressors: regressor = self._fill_regressor_dates(regressor) # finds rows where date is in regressor date ranges and sets that regressor ## value to 1, else 0 df[regressor.name] = np.where( - (df["ds"] >= pd.to_datetime(regressor.start_date)) - & (df["ds"] <= pd.to_datetime(regressor.end_date)), + (df["ds"] >= pd.to_datetime(regressor.start_date).date()) + & (df["ds"] <= pd.to_datetime(regressor.end_date).date()), 0, 1, ) @@ -693,10 +700,10 @@ def write_results( if components_table: numeric_cols = self.components_df.dtypes[ - self.components_df.dtypes == float + self.components_df.dtypes is float ].index.tolist() string_cols = self.components_df.dtypes[ - self.components_df.dtypes == object + self.components_df.dtypes is object ].index.tolist() self.components_df["metric_slug"] = self.metric_hub.slug self.components_df["forecast_trained_at"] = self.trained_at diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py index e69de29b..bf8342ea 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py @@ -0,0 +1,700 @@ +import re + +import pandas as pd +from dotmap import DotMap +import pytest + + +from kpi_forecasting.configs.model_inputs import ProphetRegressor, ProphetHoliday +from kpi_forecasting.models.funnel_forecast import SegmentModelSettings, FunnelForecast + + +@pytest.fixture() +def forecast(): + 
predict_start_date = "2124-01-01" + predict_end_date = "2124-03-01" + + forecast = FunnelForecast( + model_type="test", + parameters=DotMap(), + use_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) + return forecast + + +def test_fill_regressor_dates(forecast): + regressor_info = { + "name": "only_start", + "description": "only has a start", + "start_date": "2020-08-15", + } + regressor = ProphetRegressor(**regressor_info) + forecast._fill_regressor_dates(regressor) + assert regressor.start_date == pd.to_datetime("2020-08-15") + assert regressor.end_date == pd.to_datetime("2124-03-01") + + regressor_info = { + "name": "only_end", + "description": "only has a end", + "end_date": "2125-08-15", + } + regressor = ProphetRegressor(**regressor_info) + forecast._fill_regressor_dates(regressor) + assert regressor.start_date == pd.to_datetime("2124-01-01") + assert regressor.end_date == pd.to_datetime("2125-08-15") + + regressor_info = { + "name": "both", + "description": "only has a start", + "start_date": "2020-08-15", + "end_date": "2020-09-15", + } + regressor = ProphetRegressor(**regressor_info) + forecast._fill_regressor_dates(regressor) + assert regressor.start_date == pd.to_datetime("2020-08-15") + assert regressor.end_date == pd.to_datetime("2020-09-15") + + regressor_info = { + "name": "neither", + "description": "nothin to see here", + } + regressor = ProphetRegressor(**regressor_info) + forecast._fill_regressor_dates(regressor) + assert regressor.start_date == pd.to_datetime("2124-01-01") + assert regressor.end_date == pd.to_datetime("2124-03-01") + + regressor_info = { + "name": "out_of_order", + "description": "best better break", + "start_date": "2020-08-15", + "end_date": "2000-09-15", + } + regressor = ProphetRegressor(**regressor_info) + with pytest.raises( + Exception, + match="Regressor out_of_order start date comes after end date", + ): + forecast._fill_regressor_dates(regressor) + + +def 
test_add_regressors(forecast): + regressor_list_raw = [ + { + "name": "all_in", + "description": "it's all in", + "start_date": "2124-01-01", + "end_date": "2124-01-06", + }, + { + "name": "all_out", + "description": "it's all in", + "start_date": "2124-02-01", + "end_date": "2124-02-06", + }, + { + "name": "just_end", + "description": "just the second half", + "start_date": "2124-01-03", + "end_date": "2124-02-06", + }, + { + "name": "just_middle", + "description": "just the middle two", + "start_date": "2124-01-02", + "end_date": "2124-01-03", + }, + ] + + regressor_list = [ProphetRegressor(**r) for r in regressor_list_raw] + + df = pd.DataFrame( + { + "ds": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-03").date(), + pd.to_datetime("2124-01-04").date(), + ], + } + ) + + output_df = forecast._add_regressors(df, regressors=regressor_list) + + expected_df = pd.DataFrame( + { + "ds": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-03").date(), + pd.to_datetime("2124-01-04").date(), + ], + "all_in": [0, 0, 0, 0], + "all_out": [1, 1, 1, 1], + "just_end": [1, 1, 0, 0], + "just_middle": [1, 0, 0, 1], + } + ) + + assert set(output_df.columns) == set(expected_df.columns) + pd.testing.assert_frame_equal(output_df, expected_df[output_df.columns]) + + +def test_build_model_dataframe_exception(forecast): + regressor_list = [] + + grid_parameters = { + "changepoint_prior_scale": [0.01, 0.1, 0.15, 0.2], + "changepoint_range": [0.8, 0.9, 1], + "n_changepoints": [30], + "weekly_seasonality": True, + "yearly_seasonality": True, + "growth": "logistic", + } + cv_settings = { + "initial": "366 days", + "period": "30 days", + "horizon": "30 days", + "parallel": "processes", + } + segment_settings = SegmentModelSettings( + segment={"a": 1, "b": 2}, + start_date="2124-01-01", + end_date="2124-02-01", + holidays=[], + regressors=[ProphetRegressor(**r) for r in 
regressor_list], + grid_parameters=grid_parameters, + cv_settings=cv_settings, + ) + + observed_df = pd.DataFrame( + { + "a": [1, 1, 1, 1, 3, 3], + "b": [1, 1, 2, 2, 2, 2], + "y": [1, 2, 3, 4, 5, 6], + "submission_date": [ + pd.to_datetime("2124-12-01").date(), + pd.to_datetime("2124-12-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2123-01-01").date(), + pd.to_datetime("2123-01-02").date(), + ], + } + ) + + forecast.observed_df = observed_df + + with pytest.raises(ValueError, match="task set to test, must be train or predict"): + _ = forecast._build_model_dataframe( + segment_settings=segment_settings, task="test" + ) + + +def test_build_model_dataframe_no_regressors_train(forecast): + regressor_list = [] + + grid_parameters = { + "changepoint_prior_scale": [0.01, 0.1, 0.15, 0.2], + "changepoint_range": [0.8, 0.9, 1], + "n_changepoints": [30], + "weekly_seasonality": True, + "yearly_seasonality": True, + "growth": "logistic", + } + cv_settings = { + "initial": "366 days", + "period": "30 days", + "horizon": "30 days", + "parallel": "processes", + } + segment_settings = SegmentModelSettings( + segment={"a": 1, "b": 2}, + start_date="2124-01-01", + end_date="2124-02-01", + holidays=[], + regressors=[ProphetRegressor(**r) for r in regressor_list], + grid_parameters=grid_parameters, + cv_settings=cv_settings, + ) + + observed_df = pd.DataFrame( + { + "a": [1, 1, 1, 1, 3, 3], + "b": [1, 1, 2, 2, 2, 2], + "y": [1, 2, 3, 4, 5, 6], + "submission_date": [ + pd.to_datetime("2124-12-01").date(), + pd.to_datetime("2124-12-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2123-01-01").date(), + pd.to_datetime("2123-01-02").date(), + ], + } + ) + + forecast.observed_df = observed_df + + output_train_df = forecast._build_model_dataframe( + segment_settings=segment_settings, task="train" + ) + expected_train_df = pd.DataFrame( + { + "a": [1, 1], + "b": [2, 2], 
+ "y": [3, 4], + "ds": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + pd.testing.assert_frame_equal( + output_train_df.reset_index(drop=True), expected_train_df + ) + + output_train_wlog_df = forecast._build_model_dataframe( + segment_settings=segment_settings, task="train", add_logistic_growth_cols=True + ) + expected_train_wlog_df = pd.DataFrame( + { + "a": [1, 1], + "b": [2, 2], + "y": [3, 4], + "ds": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "floor": [1.5, 1.5], + "cap": [6.0, 6.0], + } + ) + + assert set(output_train_wlog_df.columns) == set(expected_train_wlog_df.columns) + pd.testing.assert_frame_equal( + output_train_wlog_df.reset_index(drop=True), + expected_train_wlog_df[output_train_wlog_df.columns], + ) + + +def test_build_model_dataframe_train(forecast): + regressor_list = [ + { + "name": "all_in", + "description": "it's all in", + "start_date": "2124-01-01", + "end_date": "2124-01-06", + }, + { + "name": "all_out", + "description": "it's all in", + "start_date": "2124-02-01", + "end_date": "2124-02-06", + }, + { + "name": "just_end", + "description": "just the second one", + "start_date": "2124-01-02", + "end_date": "2124-02-06", + }, + ] + + grid_parameters = { + "changepoint_prior_scale": [0.01, 0.1, 0.15, 0.2], + "changepoint_range": [0.8, 0.9, 1], + "n_changepoints": [30], + "weekly_seasonality": True, + "yearly_seasonality": True, + "growth": "logistic", + } + cv_settings = { + "initial": "366 days", + "period": "30 days", + "horizon": "30 days", + "parallel": "processes", + } + segment_settings = SegmentModelSettings( + segment={"a": 1, "b": 2}, + start_date="2124-01-01", + end_date="2124-02-01", + holidays=[], + regressors=[ProphetRegressor(**r) for r in regressor_list], + grid_parameters=grid_parameters, + cv_settings=cv_settings, + ) + + observed_df = pd.DataFrame( + { + "a": [1, 1, 1, 1, 3, 3], + "b": [1, 1, 2, 2, 2, 2], + "y": [1, 2, 3, 4, 5, 6], + 
"submission_date": [ + pd.to_datetime("2124-12-01").date(), + pd.to_datetime("2124-12-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2123-01-01").date(), + pd.to_datetime("2123-01-02").date(), + ], + } + ) + + forecast.observed_df = observed_df + + output_train_df = forecast._build_model_dataframe( + segment_settings=segment_settings, task="train" + ) + expected_train_df = pd.DataFrame( + { + "a": [1, 1], + "b": [2, 2], + "y": [3, 4], + "ds": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "all_in": [0, 0], + "all_out": [1, 1], + "just_end": [1, 0], + } + ) + pd.testing.assert_frame_equal( + output_train_df.reset_index(drop=True), expected_train_df + ) + + output_train_wlog_df = forecast._build_model_dataframe( + segment_settings=segment_settings, task="train", add_logistic_growth_cols=True + ) + expected_train_wlog_df = pd.DataFrame( + { + "a": [1, 1], + "b": [2, 2], + "y": [3, 4], + "ds": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "all_in": [0, 0], + "all_out": [1, 1], + "just_end": [1, 0], + "floor": [1.5, 1.5], + "cap": [6.0, 6.0], + } + ) + + assert set(output_train_wlog_df.columns) == set(expected_train_wlog_df.columns) + pd.testing.assert_frame_equal( + output_train_wlog_df.reset_index(drop=True), + expected_train_wlog_df[output_train_wlog_df.columns], + ) + + +def test_build_model_dataframe_no_regressors_predict(forecast): + regressor_list = [] + + grid_parameters = { + "changepoint_prior_scale": [0.01, 0.1, 0.15, 0.2], + "changepoint_range": [0.8, 0.9, 1], + "n_changepoints": [30], + "weekly_seasonality": True, + "yearly_seasonality": True, + "growth": "logistic", + } + cv_settings = { + "initial": "366 days", + "period": "30 days", + "horizon": "30 days", + "parallel": "processes", + } + segment_settings = SegmentModelSettings( + segment={"a": 1, "b": 2}, + start_date="2124-01-01", + end_date="2124-02-01", + 
holidays=[], + regressors=[ProphetRegressor(**r) for r in regressor_list], + grid_parameters=grid_parameters, + cv_settings=cv_settings, + ) + + segment_settings.trained_parameters = {"floor": -1.0, "cap": 10.0} + + dates_to_predict = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-12-01").date(), + pd.to_datetime("2124-12-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2123-01-01").date(), + pd.to_datetime("2123-01-02").date(), + ], + } + ) + + forecast.dates_to_predict = dates_to_predict + + output_predict_df = forecast._build_model_dataframe( + segment_settings=segment_settings, task="predict" + ) + expected_predict_df = pd.DataFrame( + { + "ds": [ + pd.to_datetime("2124-12-01").date(), + pd.to_datetime("2124-12-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2123-01-01").date(), + pd.to_datetime("2123-01-02").date(), + ], + } + ) + pd.testing.assert_frame_equal( + output_predict_df.reset_index(drop=True), expected_predict_df + ) + + output_predict_wlog_df = forecast._build_model_dataframe( + segment_settings=segment_settings, task="predict", add_logistic_growth_cols=True + ) + expected_predict_wlog_df = pd.DataFrame( + { + "ds": [ + pd.to_datetime("2124-12-01").date(), + pd.to_datetime("2124-12-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2123-01-01").date(), + pd.to_datetime("2123-01-02").date(), + ], + "floor": [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0], + "cap": [10.0, 10.0, 10.0, 10.0, 10.0, 10.0], + } + ) + + assert set(output_predict_wlog_df.columns) == set(expected_predict_wlog_df.columns) + pd.testing.assert_frame_equal( + output_predict_wlog_df.reset_index(drop=True), + expected_predict_wlog_df[output_predict_wlog_df.columns], + ) + + +def test_build_model_dataframe_predict(forecast): + regressor_list = [ + { + "name": "all_in", + "description": "it's all in", 
+ "start_date": "2124-01-01", + "end_date": "2124-01-06", + }, + { + "name": "all_out", + "description": "it's all in", + "start_date": "2124-02-01", + "end_date": "2124-02-06", + }, + { + "name": "just_end", + "description": "just the second one", + "start_date": "2124-01-02", + "end_date": "2124-02-06", + }, + ] + + grid_parameters = { + "changepoint_prior_scale": [0.01, 0.1, 0.15, 0.2], + "changepoint_range": [0.8, 0.9, 1], + "n_changepoints": [30], + "weekly_seasonality": True, + "yearly_seasonality": True, + "growth": "logistic", + } + cv_settings = { + "initial": "366 days", + "period": "30 days", + "horizon": "30 days", + "parallel": "processes", + } + segment_settings = SegmentModelSettings( + segment={"a": 1, "b": 2}, + start_date="2124-01-01", + end_date="2124-02-01", + holidays=[], + regressors=[ProphetRegressor(**r) for r in regressor_list], + grid_parameters=grid_parameters, + cv_settings=cv_settings, + ) + + observed_df = pd.DataFrame( + { + "a": [1, 1, 1, 1, 3, 3], + "b": [1, 1, 2, 2, 2, 2], + "y": [1, 2, 3, 4, 5, 6], + "submission_date": [ + pd.to_datetime("2124-12-01").date(), + pd.to_datetime("2124-12-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2123-01-01").date(), + pd.to_datetime("2123-01-02").date(), + ], + } + ) + + forecast.observed_df = observed_df + + output_train_df = forecast._build_model_dataframe( + segment_settings=segment_settings, task="train" + ) + expected_train_df = pd.DataFrame( + { + "a": [1, 1], + "b": [2, 2], + "y": [3, 4], + "ds": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "all_in": [0, 0], + "all_out": [1, 1], + "just_end": [1, 0], + } + ) + pd.testing.assert_frame_equal( + output_train_df.reset_index(drop=True), expected_train_df + ) + + output_train_wlog_df = forecast._build_model_dataframe( + segment_settings=segment_settings, task="train", add_logistic_growth_cols=True + ) + expected_train_wlog_df = pd.DataFrame( 
+ { + "a": [1, 1], + "b": [2, 2], + "y": [3, 4], + "ds": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "all_in": [0, 0], + "all_out": [1, 1], + "just_end": [1, 0], + "floor": [1.5, 1.5], + "cap": [6.0, 6.0], + } + ) + + assert set(output_train_wlog_df.columns) == set(expected_train_wlog_df.columns) + pd.testing.assert_frame_equal( + output_train_wlog_df.reset_index(drop=True), + expected_train_wlog_df[output_train_wlog_df.columns], + ) + + +def test_build_model(forecast): + regressor_list = [ + { + "name": "all_in", + "description": "it's all in", + "start_date": "2124-01-01", + "end_date": "2124-01-06", + }, + { + "name": "all_out", + "description": "it's all in", + "start_date": "2124-02-01", + "end_date": "2124-02-06", + }, + { + "name": "just_end", + "description": "just the second one", + "start_date": "2124-01-02", + "end_date": "2124-02-06", + }, + ] + + holiday_list = { + "easter": { + "name": "easter", + "ds": [ + "2016-03-27", + "2017-04-16", + "2018-04-01", + "2019-04-21", + "2020-04-12", + "2021-04-04", + "2022-04-17", + "2023-04-09", + "2024-03-31", + "2025-04-20", + ], + "lower_window": -2, + "upper_window": 1, + }, + "covid_sip1": { + "name": "covid_sip1", + "ds": ["2020-03-14"], + "lower_window": 0, + "upper_window": 45, + }, + "covid_sip11": { + "name": "covid_sip11", + "ds": ["2020-03-14"], + "lower_window": -14, + "upper_window": 30, + }, + } + + grid_parameters = { + "changepoint_prior_scale": [0.01, 0.1, 0.15, 0.2], + "changepoint_range": [0.8, 0.9, 1], + "n_changepoints": [30], + "weekly_seasonality": True, + "yearly_seasonality": True, + "growth": "logistic", + } + cv_settings = { + "initial": "366 days", + "period": "30 days", + "horizon": "30 days", + "parallel": "processes", + } + segment_settings = SegmentModelSettings( + segment={"a": 1, "b": 2}, + start_date="2124-01-01", + end_date="2124-02-01", + holidays=[ProphetHoliday(**h) for h in holiday_list.values()], + regressors=[ProphetRegressor(**r) 
for r in regressor_list], + grid_parameters=grid_parameters, + cv_settings=cv_settings, + ) + + model = forecast._build_model( + segment_settings=segment_settings, + parameters={ + "changepoint_prior_scale": 0.01, + "changepoint_range": 0.8, + "n_changepoints": 30, + "weekly_seasonality": True, + "yearly_seasonality": True, + "growth": "logistic", + }, + ) + + holiday_df = model.holidays + expected_holidays = pd.concat( + [ + pd.DataFrame( + { + "holiday": h["name"], + "ds": pd.to_datetime(h["ds"]), + "lower_window": h["lower_window"], + "upper_window": h["upper_window"], + } + ) + for h in holiday_list.values() + ], + ignore_index=True, + ) + pd.testing.assert_frame_equal(holiday_df, expected_holidays) From c35247dfca7e877983535880e744af44f251cd3e Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Mon, 29 Jul 2024 09:58:44 -0500 Subject: [PATCH 12/33] added tests for funnel_forecast --- .../kpi_forecasting/models/funnel_forecast.py | 360 +++--- .../models/prophet_forecast.py | 29 +- .../tests/test_funnel_forecast.py | 1131 +++++++++++++++-- .../tests/test_performance_analysis.py | 1 - jobs/kpi-forecasting/requirements.txt | 1 + 5 files changed, 1256 insertions(+), 266 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py index 35b42c1b..c4683f16 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py @@ -18,8 +18,7 @@ holiday_collection, regressor_collection, ) -from kpi_forecasting.models.base_forecast import BaseForecast -from kpi_forecasting import pandas_extras as pdx +from kpi_forecasting.models.prophet_forecast import ProphetForecast @dataclass @@ -45,7 +44,7 @@ class SegmentModelSettings: @dataclass -class FunnelForecast(BaseForecast): +class FunnelForecast(ProphetForecast): """ FunnelForecast class for generating and managing forecast models. 
The class handles cases where forecasts for a combination of dimensions are required for a metric. @@ -77,11 +76,28 @@ def __post_init__(self) -> None: } ) + self._set_segment_models(self.observed_df, self.metric_hub.segments.keys()) + + # initialize unset attributes + self.components_df = None + + def _set_segment_models( + self, observed_df: pd.DataFrame, segment_column_list: list + ) -> None: + """Creates a SegmentSettings object for each segment specified in the + metric_hub.segments section of the config. These objects are stored in a list + in the segment_models attribute + Parameters can be specified independently for at most one dimension column + set using model_setting_split_dim in self.parameters + + Args: + observed_df (pd.DataFrame): dataframe containing observed data used to model + must contain columns specified in the keys of the segments section of the config + segment_column_list (list): list of columns of observed_df to use to determine segments + """ # Construct a DataFrame containing all combination of segment values ## in the observed_df - combination_df = self.observed_df[ - self.metric_hub.segments.keys() - ].drop_duplicates() + combination_df = observed_df[segment_column_list].drop_duplicates() # Construct dictionaries from those combinations segment_combinations = combination_df.to_dict("records") @@ -90,6 +106,13 @@ def __post_init__(self) -> None: ## populate the list with segments and parameters for the segment split_dim = self.parameters["model_setting_split_dim"] + # check to make sure split_dim is one of the columns set in segment_column_list + if split_dim not in segment_column_list: + columns_str = ",".join(segment_column_list) + raise ValueError( + f"model_setting_split_dim set to {split_dim} which is not among segment columns: {columns_str}" + ) + # For each segment combinination, get the model parameters from the config ## file. Parse the holidays and regressors specified in the config file. 
segment_models = [] @@ -126,9 +149,6 @@ def __post_init__(self) -> None: ) self.segment_models = segment_models - # initialize unset attributes - self.components_df = None - @property def column_names_map(self) -> Dict[str, str]: """ @@ -211,81 +231,102 @@ def _build_model( return m - def _build_model_dataframe( + def _build_train_dataframe( self, + observed_df, segment_settings: SegmentModelSettings, - task: str, add_logistic_growth_cols: bool = False, ) -> pd.DataFrame: """ - Build the model dataframe for training or prediction. + Build the model dataframe for training Args: + observed_df: dataframe of observed data segment_settings (SegmentModelSettings): The settings for the segment. - task (str): The task, either 'train' or 'predict'. add_logistic_growth_cols (bool, optional): Whether to add logistic growth columns. Defaults to False. Returns: pd.DataFrame: The dataframe for the model. """ - # build training dataframe - if task == "train": - # find indices in observed_df for rows that exactly match segment dict - segment_historical_indices = ( - self.observed_df[list(segment_settings.segment)] - == pd.Series(segment_settings.segment) - ).all(axis=1) - df = ( - self.observed_df.loc[ - (segment_historical_indices) - & ( # filter observed_df if segment start date > metric_hub start date - self.observed_df["submission_date"] - >= datetime.strptime( - segment_settings.start_date, "%Y-%m-%d" - ).date() - ) - ] - .rename(columns=self.column_names_map) - .copy() - ) - # define limits for logistic growth - if add_logistic_growth_cols: - df["floor"] = df["y"].min() * 0.5 - df["cap"] = df["y"].max() * 1.5 + # find indices in observed_df for rows that exactly match segment dict + segment_historical_indices = ( + observed_df[list(segment_settings.segment)] + == pd.Series(segment_settings.segment) + ).all(axis=1) + df = ( + observed_df.loc[ + (segment_historical_indices) + & ( # filter observed_df if segment start date > metric_hub start date + 
observed_df["submission_date"] + >= datetime.strptime(segment_settings.start_date, "%Y-%m-%d").date() + ) + ] + .rename(columns=self.column_names_map) + .copy() + ) + # define limits for logistic growth + if add_logistic_growth_cols: + df["floor"] = df["y"].min() * 0.5 + df["cap"] = df["y"].max() * 1.5 + + if segment_settings.regressors: + df = self._add_regressors(df, segment_settings.regressors) + return df + + def _build_predict_dataframe( + self, + dates_to_predict: pd.DataFrame, + segment_settings: SegmentModelSettings, + add_logistic_growth_cols: bool = False, + ) -> pd.DataFrame: + """creates dataframe used for prediction + + Args: + dates_to_predict (pd.DataFrame): dataframe of dates to predict + segment_settings (SegmentModelSettings): settings related to the segment + add_logistic_growth_cols (bool): Whether to add logistic growth columns. Defaults to False. + + Returns: + pd.DataFrame: dataframe to use used in prediction + """ # predict dataframe only needs dates to predict, logistic growth limits, and regressors - elif task == "predict": - df = self.dates_to_predict.rename(columns=self.column_names_map).copy() - if add_logistic_growth_cols: - df["floor"] = segment_settings.trained_parameters["floor"] - df["cap"] = segment_settings.trained_parameters["cap"] - else: - raise ValueError(f"task set to {task}, must be train or predict") + df = dates_to_predict.rename(columns=self.column_names_map).copy() + if add_logistic_growth_cols: + df["floor"] = segment_settings.trained_parameters["floor"] + df["cap"] = segment_settings.trained_parameters["cap"] if segment_settings.regressors: df = self._add_regressors(df, segment_settings.regressors) return df - def _fit(self) -> None: + def _fit(self, observed_df: pd.DataFrame) -> None: """ Fit and save a Prophet model for each segment combination. + + Args: + observed_df (pd.DataFrame): dataframe of observations. 
Expected to have columns + specified in the segments section of the config, + submission_date column with unique dates corresponding to each observation and + y column containing values of observations """ for segment_settings in self.segment_models: - parameters = self._auto_tuning(segment_settings) + parameters = self._auto_tuning(observed_df, segment_settings) # Initialize model; build model dataframe add_log_growth_cols = ( "growth" in parameters.keys() and parameters["growth"] == "logistic" ) - test_dat = self._build_model_dataframe( - segment_settings, "train", add_log_growth_cols + test_dat = self._build_train_dataframe( + observed_df, segment_settings, add_log_growth_cols ) model = self._build_model(segment_settings, parameters) model.fit(test_dat) if add_log_growth_cols: + # all values in these colunns are the same parameters["floor"] = test_dat["floor"].values[0] parameters["cap"] = test_dat["cap"].values[0] @@ -296,11 +337,39 @@ def _fit(self) -> None: segment_settings.trained_parameters = parameters segment_settings.segment_model = model - def _auto_tuning(self, segment_settings: SegmentModelSettings) -> Dict[str, float]: + def _get_crossvalidation_metric( + self, m: prophet.Prophet, cv_settings: dict + ) -> float: + """function for calculated the metric used for crossvalidation + + Args: + m (prophet.Prophet): Prophet model for crossvalidation + cv_settings (dict): settings set by segment in the config file + + Returns: + float: Metric where closer to zero means a better model + """ + df_cv = cross_validation(m, **cv_settings) + + df_bias = df_cv.groupby("cutoff")[["yhat", "y"]].sum().reset_index() + df_bias["pcnt_bias"] = df_bias["yhat"] / df_bias["y"] - 1 + # Prophet splits the historical data when doing cross validation using + # cutoffs. The `.tail(3)` limits the periods we consider for the best + # parameters to the 3 most recent cutoff periods. 
+ return df_bias.tail(3)["pcnt_bias"].mean() + + def _auto_tuning( + self, observed_df, segment_settings: SegmentModelSettings + ) -> Dict[str, float]: """ Perform automatic tuning of model parameters. Args: + observed_df (pd.DataFrame): dataframe of observed data + Expected to have columns: + specified in the segments section of the config, + submission_date column with unique dates corresponding to each observation and + y column containing values of observations segment_settings (SegmentModelSettings): The settings for the segment. Returns: @@ -320,8 +389,8 @@ def _auto_tuning(self, segment_settings: SegmentModelSettings) -> Dict[str, floa for v in itertools.product(*segment_settings.grid_parameters.values()) ] - test_dat = self._build_model_dataframe( - segment_settings, "train", add_log_growth_cols + test_dat = self._build_train_dataframe( + observed_df, segment_settings, add_log_growth_cols ) bias = [] @@ -329,14 +398,10 @@ def _auto_tuning(self, segment_settings: SegmentModelSettings) -> Dict[str, floa m = self._build_model(segment_settings, params) m.fit(test_dat) - df_cv = cross_validation(m, **segment_settings.cv_settings) - - df_bias = df_cv.groupby("cutoff")[["yhat", "y"]].sum().reset_index() - df_bias["pcnt_bias"] = df_bias["yhat"] / df_bias["y"] - 1 - # Prophet splits the historical data when doing cross validation using - # cutoffs. The `.tail(3)` limits the periods we consider for the best - # parameters to the 3 most recent cutoff periods. 
-        bias.append(df_bias.tail(3)["pcnt_bias"].mean())
+        crossval_metric = self._get_crossvalidation_metric(
+            m, segment_settings.cv_settings
+        )
+        bias.append(crossval_metric)
 
         min_abs_bias_index = np.argmin(np.abs(bias))
 
@@ -357,19 +422,20 @@ def _add_regressors(self, df: pd.DataFrame, regressors: List[ProphetRegressor]):
             regressor = self._fill_regressor_dates(regressor)
             # finds rows where date is in regressor date ranges and sets that regressor
             ## value to 1, else 0
-            df[regressor.name] = np.where(
+            df[regressor.name] = (
                 (df["ds"] >= pd.to_datetime(regressor.start_date).date())
-                & (df["ds"] <= pd.to_datetime(regressor.end_date).date()),
-                0,
-                1,
-            )
+                & (df["ds"] <= pd.to_datetime(regressor.end_date).date())
+            ).astype(int)
         return df
 
-    def _predict(self, segment_settings: SegmentModelSettings) -> pd.DataFrame:
+    def _predict(
+        self, dates_to_predict_raw: pd.DataFrame, segment_settings: SegmentModelSettings
+    ) -> pd.DataFrame:
         """
         Generate forecast samples for a segment.
 
         Args:
+            dates_to_predict_raw (pd.DataFrame): dataframe of dates to predict
             segment_settings (SegmentModelSettings): The settings for the segment.
Returns: @@ -380,14 +446,14 @@ def _predict(self, segment_settings: SegmentModelSettings) -> pd.DataFrame: and segment_settings.trained_parameters["growth"] == "logistic" ) # add regressors, logistic growth limits (if applicable) to predict dataframe - dates_to_predict = self._build_model_dataframe( - segment_settings, "predict", add_log_growth_cols + dates_to_predict = self._build_predict_dataframe( + dates_to_predict_raw, segment_settings, add_log_growth_cols ) # draws samples from Prophet posterior distribution, to provide percentile predictions samples = segment_settings.segment_model.predictive_samples(dates_to_predict) df = pd.DataFrame(samples["yhat"]) - df["submission_date"] = self.dates_to_predict + df["submission_date"] = dates_to_predict_raw component_cols = [ "ds", @@ -467,6 +533,54 @@ def _percentile_name_map(self, percentiles: List[int]) -> Dict[str, str]: "mean": "value", } + def _combine_forecast_observed( + self, + forecast_df: pd.DataFrame, + observed_df: pd.DataFrame, + period: str, + numpy_aggregations: List, + percentiles, + segment: dict, + ) -> pd.DataFrame: + """Calculate aggregates over the forecase and observed data + and concatenate the two dataframes + Args: + forecast_df (pd.DataFrame): forecast dataframe + observed_df (pd.DataFrame): observed dataframe + period (str): period to aggregate up to, must be in (day, month, year) + numpy_aggregations (List): List of aggregation functions to apply across samples from the + posterior-predictive distribution. 
Must take + in a numpy array and return a single value + percentiles: 3-element list of percentiles to calculate across samples from the posterior-predictive distribution + segment (dict): dictionary that lists columns and values corresponding to the segment + keys are the column name used to segment and values are the values + of that column corresponding to the current segment + + Returns: + pd.DataFrame: combined dataframe containing aggregated values from observed and forecast + """ + forecast_summarized, observed_summarized = self._aggregate_forecast_observed( + forecast_df, observed_df, period, numpy_aggregations, percentiles + ) + + # add datasource-specific metadata columns + forecast_summarized["source"] = "forecast" + observed_summarized["source"] = "historical" + + # add segment columns to forecast table + for dim, value in segment.items(): + forecast_summarized[dim] = value + + # rename forecast percentile to low, middle, high + # rename mean to value + forecast_summarized = forecast_summarized.rename( + columns=self._percentile_name_map(percentiles) + ) + + # create a single dataframe that contains observed and forecasted data + df = pd.concat([observed_summarized, forecast_summarized]) + return df + def _summarize( self, segment_settings: SegmentModelSettings, @@ -475,7 +589,8 @@ def _summarize( percentiles: List[int] = [10, 50, 90], ) -> pd.DataFrame: """ - Calculate summary metrics for `forecast_df` over a given period, and add metadata. + Calculate summary metrics on a specific segment + for `forecast_df` over a given period, and add metadata. Args: segment_settings (SegmentModelSettings): The settings for the segment. @@ -492,9 +607,6 @@ def _summarize( Can only pass a list of length 3 as percentiles, for lower, mid, and upper values. 
""" ) - # build a list of all functions that we'll summarize the data by - aggregations = [getattr(np, i) for i in numpy_aggregations] - aggregations.extend([pdx.percentile(i) for i in percentiles]) # the start date for this segment's historical data, in cases where the full time series ## of historical data is not used for model training @@ -508,82 +620,24 @@ def _summarize( == pd.Series(segment_settings.segment) ).all(axis=1) - # aggregate metric to the correct date period (day, month, year) - observed_summarized = pdx.aggregate_to_period( - ( - self.observed_df.loc[ - (segment_historical_indices) - & ( - self.observed_df["submission_date"] - >= segment_observed_start_date - ) - ].copy() - ), + segment_observed_df = self.observed_df.loc[ + (segment_historical_indices) + & (self.observed_df["submission_date"] >= segment_observed_start_date) + ].copy() + + df = self._combine_forecast_observed( + segment_settings.forecast_df, + segment_observed_df, period, + numpy_aggregations, + percentiles, + segment_settings.segment, ) - forecast_agg = pdx.aggregate_to_period(segment_settings.forecast_df, period) - - # find periods of overlap between observed and forecasted data - overlap = forecast_agg.merge( - observed_summarized, - on="submission_date", - how="left", - ).fillna(0) - - forecast_summarized = ( - forecast_agg.set_index("submission_date") - # Add observed data samples to any overlapping forecasted period. This - # ensures that any forecast made partway through a period accounts for - # previously observed data within the period. For example, when a monthly - # forecast is generated in the middle of the month. 
- .add(overlap[["value"]].values) - # calculate summary values, aggregating by submission_date, - .agg(aggregations, axis=1) - .reset_index() - ).rename(columns=self._percentile_name_map(percentiles)) - # add datasource-specific metadata columns - forecast_summarized["source"] = "forecast" - observed_summarized["source"] = "historical" - - # add segment columns to forecast table - for dim, value in segment_settings.segment.items(): - forecast_summarized[dim] = value - - # create a single dataframe that contains observed and forecasted data - df = pd.concat([observed_summarized, forecast_summarized]) + df["forecast_parameters"] = json.dumps(segment_settings.trained_parameters) # add summary metadata columns df["aggregation_period"] = period.lower() - - # reorder columns to make interpretation easier - df = df[ - [ - "submission_date", - "aggregation_period", - "source", - "value", - "value_low", - "value_mid", - "value_high", - ] - ] - - # add Metric Hub metadata columns - df["metric_alias"] = self.metric_hub.alias.lower() - df["metric_hub_app_name"] = self.metric_hub.app_name.lower() - df["metric_hub_slug"] = self.metric_hub.slug.lower() - df["metric_start_date"] = pd.to_datetime(self.metric_hub.min_date) - df["metric_end_date"] = pd.to_datetime(self.metric_hub.max_date) - df["metric_collected_at"] = self.collected_at - - # add forecast model metadata columns - df["forecast_start_date"] = self.start_date - df["forecast_end_date"] = self.end_date - df["forecast_trained_at"] = self.trained_at - df["forecast_predicted_at"] = self.predicted_at - df["forecast_parameters"] = json.dumps(segment_settings.trained_parameters) - return df def predict(self) -> None: @@ -593,7 +647,7 @@ def predict(self) -> None: self.predicted_at = datetime.utcnow() for segment_settings in self.segment_models: - forecast_df = self._predict(segment_settings) + forecast_df = self._predict(self.dates_to_predict, segment_settings) self._validate_forecast_df(forecast_df) 
segment_settings.forecast_df = forecast_df @@ -627,13 +681,29 @@ def summarize( ] ) for dim, dim_value in segment.segment.items(): - summary_df[dim] = dim_value segment.components_df[dim] = dim_value summary_df_list.append(summary_df.copy(deep=True)) components_df_list.append(segment.components_df) del summary_df - self.summary_df = pd.concat(summary_df_list, ignore_index=True) + df = pd.concat(summary_df_list, ignore_index=True) + + # add Metric Hub metadata columns + df["metric_alias"] = self.metric_hub.alias.lower() + df["metric_hub_app_name"] = self.metric_hub.app_name.lower() + df["metric_hub_slug"] = self.metric_hub.slug.lower() + df["metric_start_date"] = pd.to_datetime(self.metric_hub.min_date) + df["metric_end_date"] = pd.to_datetime(self.metric_hub.max_date) + df["metric_collected_at"] = self.collected_at + + # add forecast model metadata columns + df["forecast_start_date"] = self.start_date + df["forecast_end_date"] = self.end_date + df["forecast_trained_at"] = self.trained_at + df["forecast_predicted_at"] = self.predicted_at + + self.summary_df = df + self.components_df = pd.concat(components_df_list, ignore_index=True) def write_results( diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py index b8539dab..60b8982a 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py @@ -150,7 +150,7 @@ def _predict_legacy(self) -> pd.DataFrame: return df[columns] - def _combine_forecast_observed( + def _aggregate_forecast_observed( self, forecast_df, observed_df, @@ -186,17 +186,34 @@ def _combine_forecast_observed( # calculate summary values, aggregating by submission_date, .agg(aggregations, axis=1) .reset_index() - # "melt" the df from wide-format to long-format. 
- .melt(id_vars="submission_date", var_name="measure") ) + return forecast_summarized, observed_summarized + + def _combine_forecast_observed( + self, + forecast_df, + observed_df, + period: str, + numpy_aggregations: List[str], + percentiles: List[int], + ): + forecast_summarized, observed_summarized = self._aggregate_forecast_observed( + forecast_df, observed_df, period, numpy_aggregations, percentiles + ) + + # remaining column of metric values get the column name 'value' + forecast_summarized = forecast_summarized.melt( + id_vars="submission_date", var_name="measure" + ) + observed_summarized["measure"] = "observed" + # add datasource-specific metadata columns forecast_summarized["source"] = "forecast" observed_summarized["source"] = "historical" - observed_summarized["measure"] = "observed" - # create a single dataframe that contains observed and forecasted data - df = pd.concat([observed_summarized, forecast_summarized]) + df = pd.concat([forecast_summarized, observed_summarized]) + return df def _summarize( diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py index bf8342ea..c792db67 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py @@ -1,8 +1,11 @@ -import re +"""tests for the funnel forecast module""" + +import collections import pandas as pd from dotmap import DotMap import pytest +import numpy as np from kpi_forecasting.configs.model_inputs import ProphetRegressor, ProphetHoliday @@ -11,6 +14,8 @@ @pytest.fixture() def forecast(): + """This mocks a generic forecast object""" + # 2024-01-01 is arbitarily chosen as a future date predict_start_date = "2124-01-01" predict_end_date = "2124-03-01" @@ -25,7 +30,958 @@ def forecast(): return forecast +@pytest.fixture() +def segment_info_fit_tests(): + """This fixture creates segment info dictionaries + that mimic the 
content of the config file and are used + in the functions that test fit methods""" + + # 2024-01-01 is arbitarily chosen as a future date + A1_start_date = "2124-01-01" + A2_start_date = "2124-01-02" + + segment_info_dict = { + "A1": { + "start_date": A1_start_date, + "grid_parameters": {"param1": [1, 2], "param2": [20, 10]}, + "min_param_value": 10, + }, + "A2": { + "start_date": A2_start_date, + "grid_parameters": {"param1": [-1, -2], "param2": [3, 4]}, + "min_param_value": -3, # closest to zero + }, + } + return segment_info_dict + + +@pytest.fixture() +def funnel_forecast_for_fit_tests(segment_info_fit_tests, mocker): + """This method creates a forecast object from the segment dict + created in the segment_info_fit_tests fixture. It also + mocks some of the object methods to enable easier testing""" + parameter_dict = { + "model_setting_split_dim": "a", + "segment_settings": { + "A1": { + "start_date": segment_info_fit_tests["A1"]["start_date"], + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": segment_info_fit_tests["A1"]["grid_parameters"], + "cv_settings": {}, + }, + "A2": { + "start_date": segment_info_fit_tests["A2"]["start_date"], + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": segment_info_fit_tests["A2"]["grid_parameters"], + "cv_settings": {}, + }, + }, + } + + parameter_dotmap = DotMap(parameter_dict) + predict_start_date = "2124-01-01" + predict_end_date = "2124-01-02" + + forecast = FunnelForecast( + model_type="test", + parameters=parameter_dotmap, + use_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) + + mocker.patch.object(forecast, "_build_model", mock_build_model) + mocker.patch.object( + forecast, "_get_crossvalidation_metric", mock_get_crossvalidation_metric + ) + + return forecast + + +class MockModel: + """Used in place of prophet.Prophet for testing purposes""" + + def __init__(self, param1=0, param2=0): + self.value = param1 * 
param2 + self.history = None + + def fit(self, df, *args, **kwargs): + self.history = df + return None + + def predict(self, dates_to_predict): + output = dates_to_predict.copy() + + output[ + [ + "yhat", + "trend", + "trend_upper", + "trend_lower", + "weekly", + "weekly_upper", + "weekly_lower", + "yearly", + "yearly_upper", + "yearly_lower", + ] + ] = 0 # some dummy value so it has the right shape + + return output + + def predictive_samples(self, dates_to_predict): + # prophet function outputs dict of numpy arrays + # only element we care about is `yhat` + output = np.arange(len(dates_to_predict)) * self.value + return {"yhat": {0: output}} + + +def mock_build_model(segment_settings, parameters): + """mocks the FunnelForecast build_model method""" + return MockModel( + **parameters, + ) + + +def mock_get_crossvalidation_metric(m, *args, **kwargs): + """mocks the FunnelForecast get_crossvalidation_metric + method, meant to be used with MockModel""" + return m.value # value atrribute in MockModel + + +def mock_aggregate_forecast_observed( + forecast_df, observed_df, period, numpy_aggregations, percentiles +): + """Mocks the aggregate_forecast_observed function defined in ProphetForecast + and inherited in FunnelForecast. 
+ This function is tested extensively in test_prophet_forecast + so we can make dummy outputs for tests related to it""" + + # add dummy columns where aggregated metrics woudl go + percentile_columns = [f"p{el}" for el in percentiles] + output_forecast_df = forecast_df.copy() + output_forecast_df[numpy_aggregations + percentile_columns] = 0 + return output_forecast_df, observed_df.copy() + + +def test_combine_forecast_observed(mocker, forecast): + """tests the _combine_forecast_observed method""" + mocker.patch.object( + forecast, "_aggregate_forecast_observed", mock_aggregate_forecast_observed + ) + + forecast_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + observed_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "a": ["A1", "A1"], + "value": [5, 6], + } + ) + + numpy_aggregations = ["mean"] + percentiles = [10, 50, 90] + + output_df = forecast._combine_forecast_observed( + forecast_df=forecast_df, + observed_df=observed_df, + period="period", + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, + segment={"a": "A1"}, + ) + + # mean was renamed to value, percentiles to high, medium, low + forecast_df[["value", "value_low", "value_mid", "value_high"]] = 0 + forecast_df["a"] = "A1" # this column is already present in observed + + forecast_df["source"] = "forecast" + observed_df["source"] = "historical" + + # concat in same order to make our lives easier + expected = pd.concat([observed_df, forecast_df]) + assert set(expected.columns) == set(output_df.columns) + pd.testing.assert_frame_equal(output_df, expected[output_df.columns]) + + # should not be any nulls outside the metric column + non_metric_columns = [ + el + for el in output_df.columns + if el not in ["value", "value_low", "value_mid", "value_high"] + ] + assert not pd.isna(output_df[non_metric_columns]).any(axis=None) + 
+ +def test_under_summarize(mocker, forecast): + """testing _summarize""" + # 2024-01-01 is chosen as an arbitrary date to center the tests around + + # forecast predictions are set with the + # mock_aggregate_forecast_observed function so they + # can be ommited here + forecast_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + # rows with negative values are those expected to be removed + # by filters in summarize + observed_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2123-01-01").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "a": ["A1", "A1", "A1", "A2", "A2"], + "value": [10, 20, 30, 40, 50], + } + ) + + SegmentSettings = collections.namedtuple( + "SegmentSettings", + ["start_date", "forecast_df", "segment", "trained_parameters"], + ) + dummy_segment_settings = SegmentSettings( + start_date="2124-01-01", + forecast_df=forecast_df.copy(), + segment={"a": "A1"}, + trained_parameters={"trained_parameters": "yes"}, + ) + + mocker.patch.object( + forecast, "_aggregate_forecast_observed", mock_aggregate_forecast_observed + ) + + forecast.observed_df = observed_df + + numpy_aggregations = ["mean"] + percentiles = [10, 50, 90] + output_df = forecast._summarize( + segment_settings=dummy_segment_settings, + period="period", + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, + ) + observed_expected_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "a": ["A1", "A1"], + "value": [20, 30], + } + ) + + # percentile numeric values changed to names + # mean gets mapped to value + forecast_df[["value", "value_low", "value_mid", "value_high"]] = 0 + + forecast_df["a"] = "A1" # this column is already present in observed + + forecast_df["source"] = 
"forecast" + observed_expected_df["source"] = "historical" + + # concat in same order to make our lives easier + expected = pd.concat([observed_expected_df, forecast_df]) + expected["forecast_parameters"] = '{"trained_parameters": "yes"}' + expected["aggregation_period"] = "period" + + assert set(expected.columns) == set(output_df.columns) + # force value columns to be floats in both cases to make check easier + numeric_cols = ["value", "value_low", "value_mid", "value_high"] + expected[numeric_cols] = expected[numeric_cols].astype(float) + output_df[numeric_cols] = output_df[numeric_cols].astype(float) + pd.testing.assert_frame_equal( + output_df.reset_index(drop=True), + expected[output_df.columns].reset_index(drop=True), + ) + + # should not be any nulls outside the metric column + non_metric_columns = [el for el in output_df.columns if el not in numeric_cols] + assert not pd.isna(output_df[non_metric_columns]).any(axis=None) + + +def test_summarize(mocker, forecast): + """testing summarize""" + # create dummy metric hub object to when meta data from + # it is added we don't get an error + MetricHub = collections.namedtuple( + "MetricHub", + ["alias", "app_name", "slug", "min_date", "max_date"], + ) + + dummy_metric_hub = MetricHub("", "", "", "2124-01-01", "2124-01-01") + + # forecast predictions are set with the + # mock_aggregate_forecast_observed function so they + # can be ommited here + forecast_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + # rows with negative values are those expected to be removed + # by filters in summarize + observed_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2123-01-01").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "a": ["A1", "A1", "A1", "A2", "A2"], + "value": [10, 20, 30, 40, 50], + } + ) + + 
SegmentSettings = collections.namedtuple( + "SegmentSettings", + ["start_date", "forecast_df", "segment", "trained_parameters", "components_df"], + ) + + # for the components_df the contents aren't important here + # we're only testing that it is concatenated properly + # with the segment data added + dummy_segment_settings_A1 = SegmentSettings( + start_date="2124-01-01", + forecast_df=forecast_df.copy(), + segment={"a": "A1"}, + trained_parameters={"trained_parameters": "yes"}, + components_df=pd.DataFrame({"testcol": [1]}), + ) + + dummy_segment_settings_A2 = SegmentSettings( + start_date="2124-01-01", + forecast_df=forecast_df.copy(), + segment={"a": "A2"}, + trained_parameters={"trained_parameters": "yes"}, + components_df=pd.DataFrame({"testcol": [2]}), + ) + + segment_models = [dummy_segment_settings_A1, dummy_segment_settings_A2] + + mocker.patch.object( + forecast, "_aggregate_forecast_observed", mock_aggregate_forecast_observed + ) + + forecast.observed_df = observed_df + forecast.segment_models = segment_models + forecast.metric_hub = dummy_metric_hub + + # timestamp attributes created by fit and predict + # must be added manuall + forecast.collected_at = "" + forecast.trained_at = "" + forecast.predicted_at = "" + + numpy_aggregations = ["mean"] + percentiles = [10, 50, 90] + forecast.summarize( + periods=["period"], + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, + ) + + output_df = forecast.summary_df + + # time filter removes first element of observed_df + observed_expected_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "a": ["A1", "A1", "A2", "A2"], + "value": [20, 30, 40, 50], + } + ) + + # doubled because there are two segments in the observed data + forecast_df = pd.concat([forecast_df, forecast_df]) + + forecast_df[["value", "value_low", "value_mid", "value_high"]] = 
0 + forecast_df["source"] = "forecast" + + # segment data column is already present in observed + # needs to be added manually for forecast + forecast_df["a"] = [ + "A1", + "A1", + "A2", + "A2", + ] + + observed_expected_df["source"] = "historical" + + # concat in same order to make our lives easier + expected = pd.concat([observed_expected_df, forecast_df]) + expected["forecast_parameters"] = '{"trained_parameters": "yes"}' + expected["aggregation_period"] = "period" + + # not going to check all the metadata columns + # in assert_frame_equal. Just make sure they're there + metadata_columns = { + "metric_alias", + "metric_hub_app_name", + "metric_hub_slug", + "metric_start_date", + "metric_end_date", + "metric_collected_at", + "forecast_start_date", + "forecast_end_date", + "forecast_trained_at", + "forecast_predicted_at", + } + assert set(expected.columns) | metadata_columns == set(output_df.columns) + # force value columns to be floats in both cases to make check easier + numeric_cols = ["value", "value_low", "value_mid", "value_high"] + expected[numeric_cols] = expected[numeric_cols].astype(float) + output_df[numeric_cols] = output_df[numeric_cols].astype(float) + pd.testing.assert_frame_equal( + output_df.sort_values(["a", "submission_date"])[expected.columns].reset_index( + drop=True + ), + expected.sort_values(["a", "submission_date"]).reset_index(drop=True), + ) + + # should not be any nulls outside the metric column + non_metric_columns = [el for el in output_df.columns if el not in numeric_cols] + assert not pd.isna(output_df[non_metric_columns]).any(axis=None) + + # check components + # only checking that concatenation happened properly + # with segment data added + output_components = forecast.components_df + expected_components = pd.DataFrame({"testcol": [1, 2], "a": ["A1", "A2"]}) + pd.testing.assert_frame_equal(expected_components, output_components) + + +def test_under_predict(mocker): + """testing _predict""" + # set segment models + # 2124-01-01 
chosen as a artibrary date to center tests on + A1_start_date = "2124-01-01" + parameter_dict = { + "model_setting_split_dim": "a", + "segment_settings": { + "A1": { + "start_date": A1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {"param1": [1, 2], "param2": [20, 10]}, + "cv_settings": {}, + }, + }, + } + + parameter_dotmap = DotMap(parameter_dict) + predict_start_date = "2124-01-02" + predict_end_date = "2124-03-01" + + forecast = FunnelForecast( + model_type="test", + parameters=parameter_dotmap, + use_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) + + # this ensures forecast is using MockModel + mocker.patch.object(forecast, "_build_model", mock_build_model) + # the optimization is just using the value attribute of MockModel, + # which is the product of the parameteres passed. The crossvalidation + # will choose the parameters where the absolute value of the product is smallest + mocker.patch.object( + forecast, "_get_crossvalidation_metric", mock_get_crossvalidation_metric + ) + + observed_df = pd.DataFrame( + { + "a": ["A1", "A1"], + "b": ["B1", "B2"], + "y": [0, 1], + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + segment_list = ["a"] + + # manually set segment_models attribute here instead of in __post_init__ + # which is bypassed to avoid a metric hub call + forecast._set_segment_models( + observed_df=observed_df, segment_column_list=segment_list + ) + # check that we only have one element here + assert len(forecast.segment_models) == 1 + # because of the check above we can use the first element + # and know that's all the segments present + segment_settings = forecast.segment_models[0] + + dates_to_predict = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ] + } + ) + forecast.observed_df = observed_df + forecast.fit() + 
out = forecast._predict(dates_to_predict, segment_settings).reset_index(drop=True) + + # in MockModel, the predictive_samples method sets the output to + # np.arange(len(dates_to_predict)) * self.value for one column called 0 + # this helps ensure the forecast_df in segment_models is set properly + model_value = forecast.segment_models[0].segment_model.value + expected = pd.DataFrame( + { + 0: [0, model_value], + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + # time filter corresponds to the start time of the object + # as opposed to the segment + expected_time_filter = ( + expected["submission_date"] >= pd.to_datetime(forecast.start_date).date() + ) + expected = expected[expected_time_filter].reset_index(drop=True) + + pd.testing.assert_frame_equal(out, expected) + + # check the components + expected_components = observed_df[["submission_date", "y"]].copy() + expected_components[ + [ + "yhat", + "trend", + "trend_upper", + "trend_lower", + "weekly", + "weekly_upper", + "weekly_lower", + "yearly", + "yearly_upper", + "yearly_lower", + ] + ] = 0 + + components_df = forecast.segment_models[0].components_df + assert set(expected_components.columns) == set(components_df.columns) + pd.testing.assert_frame_equal( + components_df, expected_components[components_df.columns] + ) + + +def test_predict(funnel_forecast_for_fit_tests, segment_info_fit_tests): + """test the predict method. 
This is similar to test_under_predict + but multiple segments are acted upon""" + + observed_data = pd.DataFrame( + { + "a": ["A1", "A1", "A2", "A2"], + "b": ["B1", "B2", "B1", "B2"], + "y": [-1, 1, -1, 1], + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + segment_list = ["a"] + + funnel_forecast_for_fit_tests._set_segment_models( + observed_df=observed_data, segment_column_list=segment_list + ) + funnel_forecast_for_fit_tests.observed_df = observed_data + funnel_forecast_for_fit_tests.fit() + funnel_forecast_for_fit_tests.predict() + + for segment in funnel_forecast_for_fit_tests.segment_models: + key = segment.segment["a"] + + model_value = segment_info_fit_tests[key]["min_param_value"] + + # in MockModel, the predictive_samples method sets the output to + # np.arange(len(dates_to_predict)) * self.value for one column called 0 + # this helps ensure the forecast_df in segment_models is set properly + expected_raw = pd.DataFrame( + { + 0: [0, model_value], + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + # filter in predict happens against object start_date not + # segment start_date + expected_time_filter = ( + expected_raw["submission_date"] + >= pd.to_datetime(funnel_forecast_for_fit_tests.start_date).date() + ) + expected = expected_raw[expected_time_filter] + + forecast_df = segment.forecast_df + pd.testing.assert_frame_equal(forecast_df, expected) + + # check the components + expected_components = expected_raw[["submission_date"]].copy() + expected_components[ + [ + "yhat", + "trend", + "trend_upper", + "trend_lower", + "weekly", + "weekly_upper", + "weekly_lower", + "yearly", + "yearly_upper", + "yearly_lower", + ] + ] = 0 + + # because of time filtereing of training data, if the history has one + # element, y will but [0, 1]. 
The first element is turned into a NULL + # and then becomes a 0 because of fillna(0) + # if it has two it will have both elements and be [-1,1] + + if len(segment.segment_model.history) == 2: + expected_components["y"] = [-1, 1] + else: + expected_components["y"] = [0, 1] + + components_df = segment.components_df + + # there is weird stuff going on with the types but it shouldn't matter + # so coerce the type + expected_components["y"] = expected_components["y"].astype( + components_df["y"].dtype + ) + assert set(expected_components.columns) == set(components_df.columns) + pd.testing.assert_frame_equal( + components_df, + expected_components[components_df.columns], + check_column_type=False, + ) + + +def test_auto_tuning(forecast, mocker): + """test the auto_tuning function""" + + # set one segment with two sets of grid parameters + segment_settings = SegmentModelSettings( + segment={"a": "A1"}, + start_date="2124-01-01", + end_date="2124-03-01", + holidays=[], + regressors=[], + grid_parameters={"param1": [1, 2], "param2": [20, 10]}, + cv_settings={}, + ) + + mocker.patch.object(forecast, "_build_model", mock_build_model) + + # mock_get_crossvalidation_metric will choose the parameters that + # have the lowest absolute product + mocker.patch.object( + forecast, "_get_crossvalidation_metric", mock_get_crossvalidation_metric + ) + + observed_df = pd.DataFrame( + { + "a": ["A1", "A1"], + "b": ["B1", "B2"], + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-01").date(), + ], + } + ) + + forecast.segment_models = [segment_settings] + + best_params = forecast._auto_tuning(observed_df, segment_settings) + + # in the mocked class the two params get multiplied and the lowest combo gets select + assert best_params == {"param1": 1, "param2": 10} + + +def test_under_fit(funnel_forecast_for_fit_tests, segment_info_fit_tests): + """test the _fit method""" + + observed_data = pd.DataFrame( + { + "a": ["A1", "A1", "A2", "A2"], + "b": ["B1", 
"B2", "B1", "B2"], + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + segment_list = ["a"] + + funnel_forecast_for_fit_tests._set_segment_models( + observed_df=observed_data, segment_column_list=segment_list + ) + funnel_forecast_for_fit_tests._fit(observed_data) + + # _fit iterates though all the segments in segment_modles + # iterate through them and check based on the value in + # segment_info_fit_tests defined in the fixture of the same name + for segment in funnel_forecast_for_fit_tests.segment_models: + key = segment.segment["a"] + + assert segment.start_date == segment_info_fit_tests[key]["start_date"] + assert segment.grid_parameters == segment_info_fit_tests[key]["grid_parameters"] + segment_model = segment.segment_model + assert segment_model.value == segment_info_fit_tests[key]["min_param_value"] + + # the history attribute is used in the components output so check it is set properly + expected_training = observed_data[ + (observed_data["a"] == key) + & ( + observed_data["submission_date"] + >= pd.to_datetime(segment_info_fit_tests[key]["start_date"]).date() + ) + ].rename(columns={"submission_date": "ds"}) + + pd.testing.assert_frame_equal(segment_model.history, expected_training) + + +def test_fit(funnel_forecast_for_fit_tests, segment_info_fit_tests): + """test the fit function. It is inherited from BaseForecast + and calls _fit with the proper object attributes. 
Test looks very + similar to that for _fit""" + observed_data = pd.DataFrame( + { + "a": ["A1", "A1", "A2", "A2"], + "b": ["B1", "B2", "B1", "B2"], + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + segment_list = ["a"] + + funnel_forecast_for_fit_tests._set_segment_models( + observed_df=observed_data, segment_column_list=segment_list + ) + funnel_forecast_for_fit_tests.observed_df = observed_data + funnel_forecast_for_fit_tests.fit() + + # _fit is called by fit and iterates though all the segments in segment_modles + # iterate through them and check based on the value in + # segment_info_fit_tests defined in the fixture of the same name + for segment in funnel_forecast_for_fit_tests.segment_models: + key = segment.segment["a"] + + assert segment.start_date == segment_info_fit_tests[key]["start_date"] + assert segment.grid_parameters == segment_info_fit_tests[key]["grid_parameters"] + segment_model = segment.segment_model + assert segment_model.value == segment_info_fit_tests[key]["min_param_value"] + + # check history attribute + expected_training = observed_data[ + (observed_data["a"] == key) + & ( + observed_data["submission_date"] + >= pd.to_datetime(segment_info_fit_tests[key]["start_date"]).date() + ) + ].rename(columns={"submission_date": "ds"}) + pd.testing.assert_frame_equal(segment_model.history, expected_training) + + +def test_set_segment_models(): + """test the set_segment_models method""" + A1_start_date = "2018-01-01" + A2_start_date = "2020-02-02" + parameter_dict = { + "model_setting_split_dim": "a", + "segment_settings": { + "A1": { + "start_date": A1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, + }, + "A2": { + "start_date": A2_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + 
"cv_settings": {}, + }, + }, + } + + parameter_dotmap = DotMap(parameter_dict) + predict_start_date = "2124-01-01" + predict_end_date = "2124-03-01" + + forecast = FunnelForecast( + model_type="test", + parameters=parameter_dotmap, + use_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) + + observed_data = pd.DataFrame( + {"a": ["A1", "A1", "A2", "A2", "A2"], "b": ["B1", "B2", "B1", "B2", "B2"]} + ) + + segment_list = ["a", "b"] + + forecast._set_segment_models( + observed_df=observed_data, segment_column_list=segment_list + ) + + # put the segments and the start date in the same dictionary to make + # comparison easier + # the important things to check is that all possible combinations + # of segments are present and that each has the parameters set properly + # start_date is a stand-in for these parameters and + # is determined by the value of a as specified in parameter_dict + check_segment_models = [ + dict(**el.segment, **{"start_date": el.start_date}) + for el in forecast.segment_models + ] + expected = [ + {"a": "A1", "b": "B1", "start_date": A1_start_date}, + {"a": "A1", "b": "B2", "start_date": A1_start_date}, + {"a": "A2", "b": "B1", "start_date": A2_start_date}, + {"a": "A2", "b": "B2", "start_date": A2_start_date}, + ] + + # can't make a set of dicts for comparison + # so sort the lists and compare each element + compare_sorted = zip( + sorted(check_segment_models, key=lambda x: (x["a"], x["b"])), + sorted(expected, key=lambda x: (x["a"], x["b"])), + ) + + for checkval, expectedval in compare_sorted: + assert checkval == expectedval + + +def test_set_segment_models_exception(): + """test the exception for segment_models where + and exception is raised if a model_setting_split_dim + is specified that isn't in the data""" + A1_start_date = "2018-01-01" + A2_start_date = "2020-02-02" + parameter_dict = { + "model_setting_split_dim": "c", # not in data + "segment_settings": { + "A1": { + "start_date": 
A1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, + }, + "A2": { + "start_date": A2_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, + }, + }, + } + + parameter_dotmap = DotMap(parameter_dict) + predict_start_date = "2124-01-01" + predict_end_date = "2124-03-01" + + forecast = FunnelForecast( + model_type="test", + parameters=parameter_dotmap, + use_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) + + observed_data = pd.DataFrame( + {"a": ["A1", "A1", "A2", "A2", "A2"], "b": ["B1", "B2", "B1", "B2", "B2"]} + ) + + segment_list = ["a", "b"] + + with pytest.raises( + ValueError, + match="model_setting_split_dim set to c which is not among segment columns: a,b", + ): + forecast._set_segment_models( + observed_df=observed_data, segment_column_list=segment_list + ) + + def test_fill_regressor_dates(forecast): + """test _fill_regressor_dates + the name in the regressor info indicates which case is being tested + Dates are chosen arbitrarily""" regressor_info = { "name": "only_start", "description": "only has a start", @@ -81,6 +1037,8 @@ def test_fill_regressor_dates(forecast): def test_add_regressors(forecast): + """test add regressors + test case for each element of regressor_list_raw is indicated in name""" regressor_list_raw = [ { "name": "all_in", @@ -90,7 +1048,7 @@ def test_add_regressors(forecast): }, { "name": "all_out", - "description": "it's all in", + "description": "it's all out", "start_date": "2124-02-01", "end_date": "2124-02-06", }, @@ -131,10 +1089,10 @@ def test_add_regressors(forecast): pd.to_datetime("2124-01-03").date(), pd.to_datetime("2124-01-04").date(), ], - "all_in": [0, 0, 0, 0], - "all_out": [1, 1, 1, 1], - "just_end": [1, 1, 0, 0], - "just_middle": [1, 0, 0, 1], + "all_in": [1, 1, 1, 1], + "all_out": [0, 0, 0, 0], + "just_end": [0, 0, 1, 1], + "just_middle": 
[0, 1, 1, 0], } ) @@ -142,7 +1100,8 @@ def test_add_regressors(forecast): pd.testing.assert_frame_equal(output_df, expected_df[output_df.columns]) -def test_build_model_dataframe_exception(forecast): +def test_build_train_dataframe_no_regressors(forecast): + """test _build_train_dataframe with no regressors""" regressor_list = [] grid_parameters = { @@ -185,61 +1144,8 @@ def test_build_model_dataframe_exception(forecast): } ) - forecast.observed_df = observed_df - - with pytest.raises(ValueError, match="task set to test, must be train or predict"): - _ = forecast._build_model_dataframe( - segment_settings=segment_settings, task="test" - ) - - -def test_build_model_dataframe_no_regressors_train(forecast): - regressor_list = [] - - grid_parameters = { - "changepoint_prior_scale": [0.01, 0.1, 0.15, 0.2], - "changepoint_range": [0.8, 0.9, 1], - "n_changepoints": [30], - "weekly_seasonality": True, - "yearly_seasonality": True, - "growth": "logistic", - } - cv_settings = { - "initial": "366 days", - "period": "30 days", - "horizon": "30 days", - "parallel": "processes", - } - segment_settings = SegmentModelSettings( - segment={"a": 1, "b": 2}, - start_date="2124-01-01", - end_date="2124-02-01", - holidays=[], - regressors=[ProphetRegressor(**r) for r in regressor_list], - grid_parameters=grid_parameters, - cv_settings=cv_settings, - ) - - observed_df = pd.DataFrame( - { - "a": [1, 1, 1, 1, 3, 3], - "b": [1, 1, 2, 2, 2, 2], - "y": [1, 2, 3, 4, 5, 6], - "submission_date": [ - pd.to_datetime("2124-12-01").date(), - pd.to_datetime("2124-12-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2123-01-01").date(), - pd.to_datetime("2123-01-02").date(), - ], - } - ) - - forecast.observed_df = observed_df - - output_train_df = forecast._build_model_dataframe( - segment_settings=segment_settings, task="train" + output_train_df = forecast._build_train_dataframe( + observed_df, segment_settings=segment_settings ) 
expected_train_df = pd.DataFrame( { @@ -256,8 +1162,9 @@ def test_build_model_dataframe_no_regressors_train(forecast): output_train_df.reset_index(drop=True), expected_train_df ) - output_train_wlog_df = forecast._build_model_dataframe( - segment_settings=segment_settings, task="train", add_logistic_growth_cols=True + # test again but with add_logistic_growth_cols set to true + output_train_wlog_df = forecast._build_train_dataframe( + observed_df, segment_settings=segment_settings, add_logistic_growth_cols=True ) expected_train_wlog_df = pd.DataFrame( { @@ -280,7 +1187,8 @@ def test_build_model_dataframe_no_regressors_train(forecast): ) -def test_build_model_dataframe_train(forecast): +def test_build_train_dataframe(forecast): + """test _build_train_dataframe and include regressors""" regressor_list = [ { "name": "all_in", @@ -341,11 +1249,8 @@ def test_build_model_dataframe_train(forecast): ], } ) - - forecast.observed_df = observed_df - - output_train_df = forecast._build_model_dataframe( - segment_settings=segment_settings, task="train" + output_train_df = forecast._build_train_dataframe( + observed_df, segment_settings=segment_settings ) expected_train_df = pd.DataFrame( { @@ -356,17 +1261,17 @@ def test_build_model_dataframe_train(forecast): pd.to_datetime("2124-01-01").date(), pd.to_datetime("2124-01-02").date(), ], - "all_in": [0, 0], - "all_out": [1, 1], - "just_end": [1, 0], + "all_in": [1, 1], + "all_out": [0, 0], + "just_end": [0, 1], } ) pd.testing.assert_frame_equal( output_train_df.reset_index(drop=True), expected_train_df ) - output_train_wlog_df = forecast._build_model_dataframe( - segment_settings=segment_settings, task="train", add_logistic_growth_cols=True + output_train_wlog_df = forecast._build_train_dataframe( + observed_df, segment_settings=segment_settings, add_logistic_growth_cols=True ) expected_train_wlog_df = pd.DataFrame( { @@ -377,9 +1282,9 @@ def test_build_model_dataframe_train(forecast): pd.to_datetime("2124-01-01").date(), 
pd.to_datetime("2124-01-02").date(), ], - "all_in": [0, 0], - "all_out": [1, 1], - "just_end": [1, 0], + "all_in": [1, 1], + "all_out": [0, 0], + "just_end": [0, 1], "floor": [1.5, 1.5], "cap": [6.0, 6.0], } @@ -392,7 +1297,8 @@ def test_build_model_dataframe_train(forecast): ) -def test_build_model_dataframe_no_regressors_predict(forecast): +def test_build_predict_dataframe_no_regressors(forecast): + """test _build_predict with no regressors""" regressor_list = [] grid_parameters = { @@ -419,6 +1325,7 @@ def test_build_model_dataframe_no_regressors_predict(forecast): cv_settings=cv_settings, ) + # manually set trained_parameters, normally this would happen during training segment_settings.trained_parameters = {"floor": -1.0, "cap": 10.0} dates_to_predict = pd.DataFrame( @@ -434,10 +1341,8 @@ def test_build_model_dataframe_no_regressors_predict(forecast): } ) - forecast.dates_to_predict = dates_to_predict - - output_predict_df = forecast._build_model_dataframe( - segment_settings=segment_settings, task="predict" + output_predict_df = forecast._build_predict_dataframe( + dates_to_predict, segment_settings=segment_settings ) expected_predict_df = pd.DataFrame( { @@ -455,8 +1360,11 @@ def test_build_model_dataframe_no_regressors_predict(forecast): output_predict_df.reset_index(drop=True), expected_predict_df ) - output_predict_wlog_df = forecast._build_model_dataframe( - segment_settings=segment_settings, task="predict", add_logistic_growth_cols=True + # test against but with add_logistic_growth_cols set to true + output_predict_wlog_df = forecast._build_predict_dataframe( + dates_to_predict, + segment_settings=segment_settings, + add_logistic_growth_cols=True, ) expected_predict_wlog_df = pd.DataFrame( { @@ -480,7 +1388,8 @@ def test_build_model_dataframe_no_regressors_predict(forecast): ) -def test_build_model_dataframe_predict(forecast): +def test_build_predict_dataframe(forecast): + """test _build_predict_dataframe including regressors""" regressor_list = [ { 
"name": "all_in", @@ -526,62 +1435,54 @@ def test_build_model_dataframe_predict(forecast): cv_settings=cv_settings, ) - observed_df = pd.DataFrame( + # set training_parameters, which is usually done in the fit method + segment_settings.trained_parameters = {"floor": -1.0, "cap": 10.0} + + dates_to_predict = pd.DataFrame( { - "a": [1, 1, 1, 1, 3, 3], - "b": [1, 1, 2, 2, 2, 2], - "y": [1, 2, 3, 4, 5, 6], "submission_date": [ - pd.to_datetime("2124-12-01").date(), - pd.to_datetime("2124-12-02").date(), pd.to_datetime("2124-01-01").date(), pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2123-01-01").date(), - pd.to_datetime("2123-01-02").date(), ], } ) - forecast.observed_df = observed_df - - output_train_df = forecast._build_model_dataframe( - segment_settings=segment_settings, task="train" + output_train_df = forecast._build_predict_dataframe( + dates_to_predict, + segment_settings=segment_settings, ) expected_train_df = pd.DataFrame( { - "a": [1, 1], - "b": [2, 2], - "y": [3, 4], "ds": [ pd.to_datetime("2124-01-01").date(), pd.to_datetime("2124-01-02").date(), ], - "all_in": [0, 0], - "all_out": [1, 1], - "just_end": [1, 0], + "all_in": [1, 1], + "all_out": [0, 0], + "just_end": [0, 1], } ) pd.testing.assert_frame_equal( output_train_df.reset_index(drop=True), expected_train_df ) - output_train_wlog_df = forecast._build_model_dataframe( - segment_settings=segment_settings, task="train", add_logistic_growth_cols=True + # test again but with add_logistic_growth_cols set to true + output_train_wlog_df = forecast._build_predict_dataframe( + dates_to_predict, + segment_settings=segment_settings, + add_logistic_growth_cols=True, ) expected_train_wlog_df = pd.DataFrame( { - "a": [1, 1], - "b": [2, 2], - "y": [3, 4], "ds": [ pd.to_datetime("2124-01-01").date(), pd.to_datetime("2124-01-02").date(), ], - "all_in": [0, 0], - "all_out": [1, 1], - "just_end": [1, 0], - "floor": [1.5, 1.5], - "cap": [6.0, 6.0], + "all_in": [1, 1], + "all_out": [0, 0], + "just_end": [0, 1], 
+ "floor": [-1.0, -1.0], + "cap": [10.0, 10.0], } ) @@ -593,6 +1494,8 @@ def test_build_model_dataframe_predict(forecast): def test_build_model(forecast): + """test build_model + just runs the function and ensures no error is raised""" regressor_list = [ { "name": "all_in", diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_performance_analysis.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_performance_analysis.py index e6dc10b4..edbc2cbb 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_performance_analysis.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_performance_analysis.py @@ -1,6 +1,5 @@ import pytest import yaml -import cmath import pandas as pd diff --git a/jobs/kpi-forecasting/requirements.txt b/jobs/kpi-forecasting/requirements.txt index 498823ef..218d688a 100644 --- a/jobs/kpi-forecasting/requirements.txt +++ b/jobs/kpi-forecasting/requirements.txt @@ -56,6 +56,7 @@ pyasn1-modules==0.3.0 PyMeeus==0.5.12 pyparsing==3.0.9 pytest==7.3.2 +pytest-mock==3.14.0 pytest-ruff==0.3.2 python-dateutil==2.8.2 pytz==2023.3 From 6ab0527667a59d26111c9bc17f97f7d012c423df Mon Sep 17 00:00:00 2001 From: JCMOSCON1976 <167822375+JCMOSCON1976@users.noreply.github.com> Date: Mon, 29 Jul 2024 16:35:12 -0400 Subject: [PATCH 13/33] feat(workday):remove unwanted fields (#249) Co-authored-by: Julio Cezar Moscon --- .../scripts/api/Workday/Workday.py | 16 +++++++++++++--- .../scripts/api/XMatters/XMatters.py | 2 +- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/jobs/eam-integrations/scripts/api/Workday/Workday.py b/jobs/eam-integrations/scripts/api/Workday/Workday.py index 2d7287c8..ccb4a8a6 100644 --- a/jobs/eam-integrations/scripts/api/Workday/Workday.py +++ b/jobs/eam-integrations/scripts/api/Workday/Workday.py @@ -115,11 +115,21 @@ def get_users(): proxies=_config.proxies, ) results = json.loads(r.text) - return [user for user in results["Report_Entry"] + users = [user for user in results["Report_Entry"] if not 
(user.get("User_Home_Country", "") == "" and user.get("User_Home_Postal_Code", "") == "")] - - # return results["Report_Entry"] + for user in users: + user['User_Cost_Center'] = '' + user['User_Manager_Email_Address'] = '' + user['User_Functional_Group'] = '' + user['User_Work_Location'] = '' + user['User_Manager_Preferred_First_Name'] = '' + user['User_Manager_Preferred_Last_Name'] = '' + user["Worker_s_Manager"][0]["User_Manager_Preferred_First_Name"] = '' + user["Worker_s_Manager"][0]["User_Manager_Preferred_Last_Name"] = '' + + return users + except Exception: logger.critical(sys.exc_info()[0]) raise diff --git a/jobs/eam-integrations/scripts/api/XMatters/XMatters.py b/jobs/eam-integrations/scripts/api/XMatters/XMatters.py index 48ddb89a..e0b962a5 100644 --- a/jobs/eam-integrations/scripts/api/XMatters/XMatters.py +++ b/jobs/eam-integrations/scripts/api/XMatters/XMatters.py @@ -496,7 +496,7 @@ def delete_sites(xm_sites, xm_sites_in_wd): logger.info("\n") logger.info("Deleting empty sites from XMatters") for site in xm_sites: - if site not in xm_sites_in_wd and site != "Mountain View Office": + if site not in xm_sites_in_wd and site not in ["Default Site", "Mountain View Office"]: logger.info( "Site %s not in WorkDay. 
INACTIVATING %s from XMatters" % (site, xm_sites[site]) From 07e538891ef024812c3b0b17799b38d213f77f3d Mon Sep 17 00:00:00 2001 From: JCMOSCON1976 <167822375+JCMOSCON1976@users.noreply.github.com> Date: Tue, 30 Jul 2024 11:25:36 -0400 Subject: [PATCH 14/33] fix(exit):Added sys.exit() call (#250) Co-authored-by: Julio Cezar Moscon --- .../scripts/workday_everfi_integration.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/jobs/eam-integrations/scripts/workday_everfi_integration.py b/jobs/eam-integrations/scripts/workday_everfi_integration.py index 4aae9332..e7eb18fe 100644 --- a/jobs/eam-integrations/scripts/workday_everfi_integration.py +++ b/jobs/eam-integrations/scripts/workday_everfi_integration.py @@ -341,7 +341,7 @@ def run(self, force): except (APIAdaptorException, Exception) as e: self.logger.error(str(e)) self.logger.critical("Failed while Getting Workday users...") - + sys.exit(1) # ======================================================== # Getting Everfi users... @@ -353,7 +353,8 @@ def run(self, force): except (APIAdaptorException, Exception) as e: self.logger.error(str(e)) self.logger.critical("Failed while Getting Everfi users...") - + sys.exit(1) + # ======================================================== # Comparing users... # ======================================================== @@ -369,7 +370,8 @@ def run(self, force): except (Exception) as e: self.logger.error(str(e)) self.logger.critical("Failed while Comparing users...") - + sys.exit(1) + # ======================================================== # Deleting Everfi users ... # ======================================================== @@ -380,6 +382,7 @@ def run(self, force): except (APIAdaptorException, Exception) as e: self.logger.error(str(e)) self.logger.critical("Faile while Deleting Everfi users ...") + sys.exit(1) # ======================================================== # Adding Everfi users ... 
@@ -393,7 +396,7 @@ def run(self, force): except (APIAdaptorException, Exception) as e: self.logger.error(str(e)) self.logger.critical("Failed while Adding Everfi users ...") - + sys.exit(1) # ======================================================== # Updating Everfi users ... # ======================================================== @@ -413,6 +416,7 @@ def run(self, force): except (APIAdaptorException, Exception) as e: self.logger.error(str(e)) self.logger.critical("Failed while Updating Everfi users ...") + sys.exit(1) self.logger.info("End of integration") From b102a7a29e02ae1a7cf76be209d4e7ec6653f10b Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Tue, 30 Jul 2024 15:37:38 -0500 Subject: [PATCH 15/33] fix issue with call to _get_crossvalidation_metric --- jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py index c4683f16..f557a0d7 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py @@ -399,7 +399,7 @@ def _auto_tuning( m.fit(test_dat) crossval_metric = self._get_crossvalidation_metric( - m, **segment_settings.cv_settings + m, segment_settings.cv_settings ) bias.append(crossval_metric) From 0726287e4bf03efb62f6822c9bfe331bbfe5a226 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Mon, 5 Aug 2024 10:22:14 -0500 Subject: [PATCH 16/33] fixed type check --- .../kpi_forecasting/models/funnel_forecast.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py index f557a0d7..9652ce02 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py @@ 
-769,12 +769,8 @@ def write_results( job.result() if components_table: - numeric_cols = self.components_df.dtypes[ - self.components_df.dtypes is float - ].index.tolist() - string_cols = self.components_df.dtypes[ - self.components_df.dtypes is object - ].index.tolist() + numeric_cols = list(self.components_df.select_dtypes(include=float).columns) + string_cols = list(self.components_df.select_dtypes(include=object).columns) self.components_df["metric_slug"] = self.metric_hub.slug self.components_df["forecast_trained_at"] = self.trained_at From d8db825704de0542d9312591599bd83e917f0509 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Tue, 6 Aug 2024 13:18:08 -0500 Subject: [PATCH 17/33] added string case to aggregate_to_period and added tests --- .../kpi_forecasting/pandas_extras.py | 30 ++- .../tests/test_pandas_extras.py | 219 ++++++++++++++++++ 2 files changed, 248 insertions(+), 1 deletion(-) create mode 100644 jobs/kpi-forecasting/kpi_forecasting/tests/test_pandas_extras.py diff --git a/jobs/kpi-forecasting/kpi_forecasting/pandas_extras.py b/jobs/kpi-forecasting/kpi_forecasting/pandas_extras.py index e54fa60a..8ae622bf 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/pandas_extras.py +++ b/jobs/kpi-forecasting/kpi_forecasting/pandas_extras.py @@ -26,4 +26,32 @@ def aggregate_to_period( x = df.copy(deep=True) x[date_col] = pd.to_datetime(x[date_col]).dt.to_period(period[0]).dt.to_timestamp() - return x.groupby(date_col).agg(aggregation).reset_index() + + # treat numeric and string types separately + x_string = x.select_dtypes(include=["datetime64", object]) + x_numeric = x.select_dtypes(include=["float", "int", "datetime64"]) + + if set(x_string.columns) | set(x_numeric.columns) != set(x.columns): + missing_columns = set(x.columns) - ( + set(x_string.columns) | set(x_numeric.columns) + ) + missing_columns_str = ",".join(missing_columns) + raise ValueError( + f"Columns do not have string or numeric type: {missing_columns_str}" + ) + + x_numeric_agg = 
x_numeric.groupby(date_col).agg(aggregation).reset_index() + + # all values of x_string should be the same because it is just the dimensions + x_string_agg = x_string.drop_duplicates().reset_index(drop=True) + + if len(x_string_agg) != len(x_numeric_agg): + raise ValueError( + "String and Numeric dataframes have different length, likely due to strings not being unique up to aggregation" + ) + + # unique preseves order so we should be fine to concat + output_df = pd.concat( + [x_numeric_agg, x_string_agg.drop(columns=[date_col])], axis=1 + ) + return output_df diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_pandas_extras.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_pandas_extras.py new file mode 100644 index 00000000..c512e0c9 --- /dev/null +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_pandas_extras.py @@ -0,0 +1,219 @@ +import pandas as pd +import pytest + +from kpi_forecasting.pandas_extras import aggregate_to_period + + +def test_only_numeric(): + df = pd.DataFrame( + { + "submission_date": [ + "2020-01-01", + "2020-01-01", + "2020-01-02", + "2020-02-01", + "2020-02-02", + ], + "ints": [1, 2, 3, 4, 5], + "floats": [10.0, 20.0, 30.0, 40.0, 50.0], + } + ) + + day_output = aggregate_to_period(df, "day") + + expected_day = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2020-01-01"), + pd.to_datetime("2020-01-02"), + pd.to_datetime("2020-02-01"), + pd.to_datetime("2020-02-02"), + ], + "ints": [3, 3, 4, 5], + "floats": [30.0, 30.0, 40.0, 50.0], + } + ) + + pd.testing.assert_frame_equal(day_output, expected_day) + + month_output = aggregate_to_period(df, "month") + + expected_month = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2020-01-01"), + pd.to_datetime("2020-02-01"), + ], + "ints": [6, 9], + "floats": [60.0, 90.0], + } + ) + + pd.testing.assert_frame_equal(month_output, expected_month) + + +def test_with_string_and_numeric(): + df = pd.DataFrame( + { + "submission_date": [ + "2020-01-01", + "2020-01-01", + 
"2020-01-02", + "2020-02-01", + "2020-02-02", + ], + "ints": [1, 2, 3, 4, 5], + "floats": [10.0, 20.0, 30.0, 40.0, 50.0], + "string": ["jan", "jan", "jan", "feb", "feb"], + } + ) + + day_output = aggregate_to_period(df, "day") + + expected_day = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2020-01-01"), + pd.to_datetime("2020-01-02"), + pd.to_datetime("2020-02-01"), + pd.to_datetime("2020-02-02"), + ], + "ints": [3, 3, 4, 5], + "floats": [30.0, 30.0, 40.0, 50.0], + "string": ["jan", "jan", "feb", "feb"], + } + ) + + pd.testing.assert_frame_equal(day_output, expected_day) + + month_output = aggregate_to_period(df, "month") + + expected_month = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2020-01-01"), + pd.to_datetime("2020-02-01"), + ], + "ints": [6, 9], + "floats": [60.0, 90.0], + "string": ["jan", "feb"], + } + ) + + pd.testing.assert_frame_equal(month_output, expected_month) + + +def test_only_string(): + df = pd.DataFrame( + { + "submission_date": [ + "2020-01-01", + "2020-01-01", + "2020-01-02", + "2020-02-01", + "2020-02-02", + ], + "string": ["jan", "jan", "jan", "feb", "feb"], + } + ) + + day_output = aggregate_to_period(df, "day") + + expected_day = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2020-01-01"), + pd.to_datetime("2020-01-02"), + pd.to_datetime("2020-02-01"), + pd.to_datetime("2020-02-02"), + ], + "string": ["jan", "jan", "feb", "feb"], + } + ) + + pd.testing.assert_frame_equal(day_output, expected_day) + + month_output = aggregate_to_period(df, "month") + + expected_month = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2020-01-01"), + pd.to_datetime("2020-02-01"), + ], + "string": ["jan", "feb"], + } + ) + + pd.testing.assert_frame_equal(month_output, expected_month) + + +def test_non_unique_string_exception(): + df = pd.DataFrame( + { + "submission_date": [ + "2020-01-01", + "2020-01-01", + "2020-01-02", + "2020-02-01", + "2020-02-02", + ], + "ints": [1, 2, 3, 4, 5], + "floats": [10.0, 
20.0, 30.0, 40.0, 50.0], + "string": ["jan", "jane", "yan", "fev", "feb"], + } + ) + + with pytest.raises( + ValueError, + match="String and Numeric dataframes have different length, likely due to strings not being unique up to aggregation", + ): + _ = aggregate_to_period(df, "day") + + +def test_column_type_exception(): + df = pd.DataFrame( + { + "submission_date": [ + "2020-01-01", + "2020-01-01", + "2020-01-02", + "2020-02-01", + "2020-02-02", + ], + "ints": [1, 2, 3, 4, 5], + "floats": [10.0, 20.0, 30.0, 40.0, 50.0], + "string": ["jan", "jane", "yan", "fev", "feb"], + "bool": [True, True, True, False, False], + } + ) + + with pytest.raises( + ValueError, + match="Columns do not have string or numeric type: bool", + ): + _ = aggregate_to_period(df, "day") + + +def test_agg_exception(): + df = pd.DataFrame( + { + "submission_date": [ + "2020-01-01", + "2020-01-01", + "2020-01-02", + "2020-02-01", + "2020-02-02", + ], + "ints": [1, 2, 3, 4, 5], + "floats": [10.0, 20.0, 30.0, 40.0, 50.0], + "string": ["jan", "jane", "yan", "fev", "feb"], + "bool": [True, True, True, False, False], + } + ) + + with pytest.raises( + ValueError, + match="Don't know how to floor dates by hamburger. 
Please use 'day', 'month', or 'year'.", + ): + _ = aggregate_to_period(df, "hamburger") From 83aa2298051c2e439aca37c61974155efbb1aba9 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Wed, 7 Aug 2024 08:42:33 -0500 Subject: [PATCH 18/33] revert file --- .../scripts/workday_everfi_integration.py | 199 ++++++++---------- 1 file changed, 86 insertions(+), 113 deletions(-) diff --git a/jobs/eam-integrations/scripts/workday_everfi_integration.py b/jobs/eam-integrations/scripts/workday_everfi_integration.py index 59e51114..39cf41d0 100644 --- a/jobs/eam-integrations/scripts/workday_everfi_integration.py +++ b/jobs/eam-integrations/scripts/workday_everfi_integration.py @@ -1,3 +1,4 @@ + from workday_everfi.api.Workday import WorkdayAPI from workday_everfi.api.Everfi import EverfiAPI from api.util import Util, APIAdaptorException @@ -5,7 +6,6 @@ import logging import sys - def cal_user_location(wd_user, locs, loc_map_table): loc = "" location_country = wd_user.get("location_country", "") @@ -37,23 +37,19 @@ class Everfi: def __init__(self) -> None: self.everfi_api = EverfiAPI() self.logger = logging.getLogger(self.__class__.__name__) - + def get_everfi_users(self, locs, loc_map_table, hire_dates): - filter = {"filter[active]": "true"} - fields = { - "fields[users]": "email,first_name,last_name,sso_id,employee_id,student_id,location_id,active,user_rule_set_roles,category_labels" - } - return self.everfi_api.get_users( - fields, filter, locs, loc_map_table, hire_dates - ) + filter = {'filter[active]': 'true'} + fields = {'fields[users]': 'email,first_name,last_name,sso_id,employee_id,student_id,location_id,active,user_rule_set_roles,category_labels'} + return self.everfi_api.get_users(fields, filter, locs, loc_map_table, hire_dates) def deactivate_users(self, del_list, everfi_users): count = 0 - + for email in del_list: - id = everfi_users[email].get("id") + id = everfi_users[email].get('id') self.everfi_api.deactivate_user(id) - if "@" in email: + if '@' in email: n = 
email.split("@")[0] else: n = email @@ -62,10 +58,10 @@ def deactivate_users(self, del_list, everfi_users): if count % 20 == 0: self.logger.info(f"[{count} of {len(del_list)}] users deactivated.") return count - + def activate_user(self, id): - self.everfi_api.set_active(id, True) - + self.everfi_api.set_active(id,True) + def get_locations_mapping_table(self): return self.everfi_api.get_locations_mapping_table() @@ -81,21 +77,17 @@ def upd_everfi_users( ): errors_list = [] count_upd = 0 - loc_id_dict = { - x.get("id"): x.get("attributes").get("name") for x in locs.values() - } - + loc_id_dict = {x.get('id'):x.get('attributes').get('name') for x in locs.values()} + for email in upd_list_keys: wd_user = wd_users[email][1] - loc_id = cal_user_location(wd_user, locs, loc_map_table) - if int(loc_id) != everfi_users[email].get("attributes").get("location_id"): - if "@" in email: + loc_id = cal_user_location(wd_user, locs, loc_map_table) + if int(loc_id) != everfi_users[email].get('attributes').get('location_id'): + if '@' in email: n = email.split("@")[0] else: n = email - self.logger.info( - f"User {n[:4]} .. {n[-1]} changed location from {loc_id_dict[str(everfi_users[email].get('attributes').get('location_id'))]} to {loc_id_dict[loc_id]}" - ) + self.logger.info(f"User {n[:4]} .. 
{n[-1]} changed location from {loc_id_dict[str(everfi_users[email].get('attributes').get('location_id'))]} to {loc_id_dict[loc_id]}") json_data = { "data": { "type": "registration_sets", @@ -125,14 +117,12 @@ def upd_everfi_users( except Exception as e: self.logger.exception(e) errors_list.append(e) - - cat_label_user_id = self.get_category_label_user_id( - everfi_users[email]["id"] - ) + + cat_label_user_id = self.get_category_label_user_id(everfi_users[email]["id"]) if cat_label_user_id: self.delete_category_label_user(cat_label_user_id) - # wd_users[email][1]["hire_date"] = '2024-07-10' + #wd_users[email][1]["hire_date"] = '2024-07-10' hire_date_id = self.get_hire_date_id( wd_users[email][1]["hire_date"], hire_date_category_id, hire_dates ) @@ -144,29 +134,27 @@ def upd_everfi_users( except Exception as e: self.logger.exception(e) errors_list.append(e) - + if count_upd % 20 == 0: - self.logger.info( - f"[{count_upd} of {len(upd_list_keys)}] users updated." - ) - + self.logger.info(f"[{count_upd} of {len(upd_list_keys)}] users updated.") + count_upd += 1 - + return count_upd - + def get_category_label_user_id(self, id): - ret = self.everfi_api.get_category_label_user_id(id) - if len(ret.data.get("data", "")) > 0: - return ret.data.get("data", "")[0].get("id", "") + ret = self.everfi_api.get_category_label_user_id(id) + if len(ret.data.get('data',''))>0: + return ret.data.get('data','')[0].get('id','') else: return None - + def delete_category_label_user(self, id): - ret = self.everfi_api.delete_category_label_user(id) + ret = self.everfi_api.delete_category_label_user(id) return ret - - def bulk_clear_category_id(self, ids, category_id, category_label): - return self.everfi_api.bulk_clear_category_id(ids, category_id, category_label) + + def bulk_clear_category_id(self, ids, category_id,category_label): + return self.everfi_api.bulk_clear_category_id(ids, category_id,category_label) def get_hire_date_id(self, wd_hire_date, hire_date_category_id, hire_dates): 
wd_hire_date = wd_hire_date.split("-") @@ -225,34 +213,33 @@ def add_everfi_users( except Exception as e: self.logger.exception(e) self.logger.info("Trying to activate user and update ") - if e.args[0][0].get("id", "") == "user_rule_set": + if (e.args[0][0].get('id','')=='user_rule_set'): # try to active user # find user by email and then update the user with current data - filter = {"filter[email]": wd_user.get("primary_work_email", "")} - fields = {"fields[users]": "id,email"} - # find user id + filter = {'filter[email]': wd_user.get("primary_work_email", "")} + fields = {'fields[users]': 'id,email'} + #find user id user = self.everfi_api.search_user(fields, filter) - id = user.get(email, "").get("id", "") + id = user.get(email,'').get('id', '') if id: - # self.activate_user(id) - json_data["data"]["id"] = id - json_data["data"]["attributes"]["registrations"][0][ - "active" - ] = True - # active user and update fields - r = self.everfi_api.upd_user(id, json_data) - # remove hire date custom field - - # hd = wd_users[email][1]["hire_date"].split('-') + #self.activate_user(id) + json_data['data']['id'] = id + json_data['data']['attributes']['registrations'][0]['active'] = True + #active user and update fields + r = self.everfi_api.upd_user(id, json_data) + #remove hire date custom field + + #hd = wd_users[email][1]["hire_date"].split('-') cat_label_user_id = self.get_category_label_user_id(id) if cat_label_user_id: self.delete_category_label_user(cat_label_user_id) - # self.bulk_clear_category_id([id], hire_date_category_id, hd[1] + '-' + hd[0]) - else: + #self.bulk_clear_category_id([id], hire_date_category_id, hd[1] + '-' + hd[0]) + else: errors.append(e) continue - # wd_users[email][1]["hire_date"] = '2024-07-10' + + #wd_users[email][1]["hire_date"] = '2024-07-10' hire_date_id = self.get_hire_date_id( wd_users[email][1]["hire_date"], hire_date_category_id, hire_dates ) @@ -266,24 +253,25 @@ def add_everfi_users( errors.append(e) count_add += 1 - - if "@" in 
email: + + if '@' in email: n = email.split("@")[0] else: n = email self.logger.info(f"{n[:4]} .. {n[-1]} added") - + if count_add % 20 == 0: self.logger.info(f"[{count_add} of {len(add_list_keys)}] users added.") - + + + return count_add - class Workday: def build_comparison_string(self, wd_row, locs, loc_map_table): loc_id = cal_user_location(wd_row, locs, loc_map_table) - hire_date = wd_row["hire_date"].split("-") - + hire_date = wd_row['hire_date'].split('-') + is_manager = "supervisor" if wd_row.get("is_manager", "") else "non_supervisor" return ( wd_row["primary_work_email"] @@ -298,9 +286,7 @@ def build_comparison_string(self, wd_row, locs, loc_map_table): + "|" + is_manager + "|" - + hire_date[1] - + "-" - + hire_date[0] + + hire_date[1] + "-" + hire_date[0] + "|" + wd_row["primary_work_email"] ) @@ -319,7 +305,7 @@ def get_wd_users(self, locs, loc_map_table): (df["currently_active"] == True) & (df["moco_or_mofo"] == "MoCo") & (df["worker_type"] == "Employee") - | (df["primary_work_email"] == "jmoscon@mozilla.com") + | (df['primary_work_email'] == "jmoscon@mozilla.com") ] comp = { @@ -353,15 +339,16 @@ def compare_users(self, wd_comp, everfi_comp, wd_users, everfi_users): if everfi_comp[upd_email] != wd_comp[upd_email]: upd_list.append(upd_email) + return add_list, del_list, upd_list def run(self, limit): # ======================================================== # Getting Everfi hire dates, locations and locations mapping table ... 
- # ======================================================== + # ======================================================== try: self.logger.info("Getting everfi hire dates") - hire_date_category_id, hire_dates = self.everfi.everfi_api.get_hire_dates() + hire_date_category_id, hire_dates = self.everfi.everfi_api.get_hire_dates() self.logger.info(f"Number of hire dates: {len(hire_dates)}") self.logger.info("Getting everfi locations") @@ -374,9 +361,7 @@ def run(self, limit): except (APIAdaptorException, Exception) as e: self.logger.error(str(e)) - self.logger.critical( - "Failed while Getting Everfi hire dates,locations and locations mapping table ..." - ) + self.logger.critical("Failed while Getting Everfi hire dates,locations and locations mapping table ...") sys.exit(1) # ======================================================== @@ -388,7 +373,7 @@ def run(self, limit): self.logger.info(f"Number of wd users: {len(wd_users)}") except (APIAdaptorException, Exception) as e: self.logger.error(str(e)) - self.logger.critical("Failed while Getting Workday users...") + self.logger.critical("Failed while Getting Workday users...") sys.exit(1) # ======================================================== @@ -396,15 +381,13 @@ def run(self, limit): # ======================================================== self.logger.info("Getting Everfi users...") try: - everfi_comp, everfi_users = self.everfi.get_everfi_users( - locs, loc_map_table, hire_dates - ) + everfi_comp, everfi_users = self.everfi.get_everfi_users(locs, loc_map_table, hire_dates) self.logger.info(f"Number of Everfi users: {len(everfi_users)}") except (APIAdaptorException, Exception) as e: self.logger.error(str(e)) self.logger.critical("Failed while Getting Everfi users...") sys.exit(1) - + # ======================================================== # Comparing users... 
# ======================================================== @@ -414,51 +397,42 @@ def run(self, limit): wd_comp, everfi_comp, wd_users, everfi_users ) - self.logger.info( - f"Number of users to delete w/o limit={len(del_list)} with limit={len(del_list[:limit])}" - ) - self.logger.info( - f"Number of users to add w/o limit={len(add_list)} with limit={len(add_list[:limit])}" - ) - self.logger.info( - f"Number of users to update w/o limit={len(upd_list)} with limit={len(upd_list[:limit])}" - ) + self.logger.info(f"Number of users to delete w/o limit={len(del_list)} with limit={len(del_list[:limit])}") + self.logger.info(f"Number of users to add w/o limit={len(add_list)} with limit={len(add_list[:limit])}") + self.logger.info(f"Number of users to update w/o limit={len(upd_list)} with limit={len(upd_list[:limit])}") del_list = del_list[:limit] add_list = add_list[:limit] upd_list = upd_list[:limit] - - except Exception as e: + + except (Exception) as e: self.logger.error(str(e)) self.logger.critical("Failed while Comparing users...") sys.exit(1) - + + # ======================================================== # Deleting Everfi users ... # ======================================================== - self.logger.info("Deleting Everfi users ...") + self.logger.info("Deleting Everfi users ...") try: + count_dels = self.everfi.deactivate_users(del_list, everfi_users) self.logger.info(f"Number of users deleted {count_dels}") except (APIAdaptorException, Exception) as e: self.logger.error(str(e)) self.logger.critical("Faile while Deleting Everfi users ...") sys.exit(1) - + # ======================================================== # Adding Everfi users ... 
# ======================================================== self.logger.info("Adding Everfi users ...") try: count_add = self.everfi.add_everfi_users( - hire_date_category_id, - hire_dates, - locs, - add_list, - wd_users, - loc_map_table, + hire_date_category_id, hire_dates, locs, add_list, wd_users, loc_map_table ) - self.logger.info(f"Number of users added {count_add}") + self.logger.info(f"Number of users added {count_add}") except (APIAdaptorException, Exception) as e: self.logger.error(str(e)) self.logger.critical("Failed while Adding Everfi users ...") @@ -467,7 +441,7 @@ def run(self, limit): # Updating Everfi users ... # ======================================================== self.logger.info("Updating Everfi users ...") - + try: count_upd = self.everfi.upd_everfi_users( hire_date_category_id, @@ -483,10 +457,9 @@ def run(self, limit): self.logger.error(str(e)) self.logger.critical("Failed while Updating Everfi users ...") sys.exit(1) - + self.logger.info("End of integration") - if __name__ == "__main__": parser = argparse.ArgumentParser(description="Sync up XMatters with Workday") @@ -498,18 +471,18 @@ def run(self, limit): type=str, default="info", ) - + parser.add_argument( "-m", - "--max_limit", + "--max_limit", action="store", type=int, - help="limit the number of changes in Everfi", - default=10, + help="limit the number of changes in Everfi", + default=10 ) args = None args = parser.parse_args() - + log_level = Util.set_up_logging(args.level) logger = logging.getLogger(__name__) From d5a0e63437c4f5704a97f12094e12a0b12efc2d1 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Thu, 8 Aug 2024 11:59:24 -0500 Subject: [PATCH 19/33] added more tests to prophet_forecast --- .../kpi_forecasting/models/base_forecast.py | 20 +- .../models/prophet_forecast.py | 31 +- .../tests/test_base_forecast.py | 39 +- .../tests/test_prophet_forecast.py | 533 ++++++++++++++++++ 4 files changed, 598 insertions(+), 25 deletions(-) diff --git 
a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index f41f3b59..ed958518 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -189,7 +189,7 @@ def summarize( Returns: pd.DataFrame: metric dataframe for all metrics and aggregations """ - self.summary_df = pd.concat( + summary_df = pd.concat( [ self._summarize( self.forecast_df, @@ -202,4 +202,22 @@ def summarize( ] ) + # add Metric Hub metadata columns + summary_df["metric_alias"] = self.metric_hub.alias.lower() + summary_df["metric_hub_app_name"] = self.metric_hub.app_name.lower() + summary_df["metric_hub_slug"] = self.metric_hub.slug.lower() + summary_df["metric_start_date"] = pd.to_datetime(self.metric_hub.min_date) + summary_df["metric_end_date"] = pd.to_datetime(self.metric_hub.max_date) + summary_df["metric_collected_at"] = self.collected_at + + # add forecast model metadata columns + summary_df["forecast_start_date"] = self.start_date + summary_df["forecast_end_date"] = self.end_date + summary_df["forecast_trained_at"] = self.trained_at + summary_df["forecast_predicted_at"] = self.predicted_at + + summary_df["forecast_parameters"] = self.metadata_params + + self.summary_df = summary_df + return self.summary_df diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py index 60b8982a..19f57e1d 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py @@ -20,15 +20,20 @@ class ProphetForecast(BaseForecast): def column_names_map(self) -> Dict[str, str]: return {"submission_date": "ds", "value": "y"} - def _fit(self, observed_df) -> None: - self.model = prophet.Prophet( - **self.parameters, + def _build_model(self, parameter_dict): + model = prophet.Prophet( + **parameter_dict, 
uncertainty_samples=self.number_of_simulations, mcmc_samples=0, ) if self.use_holidays: - self.model.add_country_holidays(country_name="US") + model.add_country_holidays(country_name="US") + + return model + + def _fit(self, observed_df) -> None: + self.model = self._build_model(self.parameters) # Modify observed data to have column names that Prophet expects, and fit # the model @@ -235,24 +240,6 @@ def _summarize( # add summary metadata columns df["aggregation_period"] = period.lower() - # reorder columns to make interpretation easier - df = df[["submission_date", "aggregation_period", "source", "measure", "value"]] - - # add Metric Hub metadata columns - df["metric_alias"] = self.metric_hub.alias.lower() - df["metric_hub_app_name"] = self.metric_hub.app_name.lower() - df["metric_hub_slug"] = self.metric_hub.slug.lower() - df["metric_start_date"] = pd.to_datetime(self.metric_hub.min_date) - df["metric_end_date"] = pd.to_datetime(self.metric_hub.max_date) - df["metric_collected_at"] = self.collected_at - - # add forecast model metadata columns - df["forecast_start_date"] = self.start_date - df["forecast_end_date"] = self.end_date - df["forecast_trained_at"] = self.trained_at - df["forecast_predicted_at"] = self.predicted_at - df["forecast_parameters"] = self.metadata_params - return df def _summarize_legacy(self) -> pd.DataFrame: diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py index 6a731560..17ce4d27 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py @@ -1,4 +1,5 @@ from typing import List +import collections import pytest import pandas as pd @@ -160,6 +161,18 @@ def test_summarize(good_class): ) good_class.forecast_df = np.array([1, 2]) good_class.observed_df = np.array([3, 4]) + MetricHub = collections.namedtuple( + "MetricHub", + ["alias", "app_name", "slug", 
"min_date", "max_date"], + ) + + dummy_metric_hub = MetricHub("", "", "", "2124-01-01", "2124-01-01") + + # add it here rather than in __init__ so it doesn't try to load data + good_class.metric_hub = dummy_metric_hub + good_class.trained_at = "" + good_class.predicted_at = "" + number_val = 10 output = good_class.summarize( periods=["a", "b", "c"], numpy_aggregations=["sum"], percentiles=["percentiles"] @@ -170,5 +183,27 @@ def test_summarize(good_class): for el in ["a", "b", "c"] ] ) - assert output.reset_index(drop=True).equals(expected_output) - assert good_class.summary_df.reset_index(drop=True).equals(expected_output) + # not going to check all the metadata columns + # in assert_frame_equal. Just make sure they're there + metadata_columns = { + "metric_alias", + "metric_hub_app_name", + "metric_hub_slug", + "metric_start_date", + "metric_end_date", + "metric_collected_at", + "forecast_start_date", + "forecast_end_date", + "forecast_trained_at", + "forecast_predicted_at", + "forecast_parameters", + } + assert set(expected_output.columns) | metadata_columns == set(output.columns) + + pd.testing.assert_frame_equal( + output[expected_output.columns].reset_index(drop=True), expected_output + ) + pd.testing.assert_frame_equal( + good_class.summary_df[expected_output.columns].reset_index(drop=True), + expected_output, + ) diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py index 18d3df67..ce372cf6 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py @@ -1,11 +1,544 @@ import pandas as pd from dotmap import DotMap import numpy as np +import pytest +import collections from kpi_forecasting.models.prophet_forecast import ProphetForecast +@pytest.fixture +def forecast(): + A1_start_date = "2124-01-01" + parameter_dict = { + "model_setting_split_dim": "a", + 
"segment_settings": { + "A1": { + "start_date": A1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {"param1": [1, 2], "param2": [20, 10]}, + "cv_settings": {}, + }, + }, + } + + parameter_dotmap = DotMap(parameter_dict) + predict_start_date = "2124-01-02" + predict_end_date = "2124-03-01" + return ProphetForecast( + model_type="test", + parameters=parameter_dotmap, + use_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) + + +class MockModel: + """Used in place of prophet.Prophet for testing purposes""" + + def __init__(self, param1=0, param2=0, **kwargs): + self.value = param1 * param2 + self.history = None + + def fit(self, df, *args, **kwargs): + self.history = df + return None + + def predict(self, dates_to_predict): + output = dates_to_predict.copy() + + output[ + [ + "yhat", + "trend", + "trend_upper", + "trend_lower", + "weekly", + "weekly_upper", + "weekly_lower", + "yearly", + "yearly_upper", + "yearly_lower", + ] + ] = 0 # some dummy value so it has the right shape + + return output + + def predictive_samples(self, dates_to_predict): + # prophet function outputs dict of numpy arrays + # only element we care about is `yhat` + output = np.arange(len(dates_to_predict)) * self.value + return {"yhat": {0: output}} + + +def mock_build_model(parameters): + """mocks the FunnelForecast build_model method""" + return MockModel( + **parameters, + ) + + +def mock_aggregate_forecast_observed( + forecast_df, observed_df, period, numpy_aggregations, percentiles +): + """Mocks the aggregate_forecast_observed function defined in ProphetForecast + and inherited in FunnelForecast. 
+ This function is tested extensively in test_prophet_forecast + so we can make dummy outputs for tests related to it""" + + # add dummy columns where aggregated metrics woudl go + percentile_columns = [f"p{el}" for el in percentiles] + output_forecast_df = forecast_df.copy() + output_forecast_df[numpy_aggregations + percentile_columns] = 0 + return output_forecast_df, observed_df.copy() + + +def test_under_fit(forecast, mocker): + """test the _fit method""" + + observed_data = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + mocker.patch.object(forecast, "_build_model", mock_build_model) + + forecast._fit(observed_data) + + # checking that history is set in the mocked Model ensures fit was called on it + pd.testing.assert_frame_equal( + observed_data.rename(columns={"submission_date": "ds"}), forecast.model.history + ) + + +def test_fit(forecast, mocker): + """test the fit function. It is inherited from BaseForecast + and calls _fit with the proper object attributes. 
Test looks very + similar to that for _fit""" + observed_data = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + mocker.patch.object(forecast, "_build_model", mock_build_model) + + forecast.observed_df = observed_data + forecast.fit() + + # checking that history is set in the mocked Model ensures fit was called on it + pd.testing.assert_frame_equal( + observed_data.rename(columns={"submission_date": "ds"}), forecast.model.history + ) + + assert forecast.trained_at is not None + + +def test_combine_forecast_observed(mocker, forecast): + """tests the _combine_forecast_observed method""" + # 2024-01-01 is chosen as an arbitrary date to center the tests around + + # forecast predictions are set with the + # mock_aggregate_forecast_observed function so they + # can be ommited here + forecast_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + # rows with negative values are those expected to be removed + # by filters in summarize + observed_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "value": [10, 20], + } + ) + + mocker.patch.object( + forecast, "_aggregate_forecast_observed", mock_aggregate_forecast_observed + ) + + numpy_aggregations = ["mean"] + percentiles = [10, 50, 90] + output_df = forecast._combine_forecast_observed( + forecast_df, + observed_df, + period="period", + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, + ) + observed_expected_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "value": [10, 20], + "measure": ["observed", "observed"], + "source": ["historical", "historical"], + } + ) + + # 4x2 columns, 4 metrics (mean, 
p10, p50, p90) + forecast_expected_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "measure": ["mean", "mean", "p10", "p10", "p50", "p50", "p90", "p90"], + "value": [0] * 8, + "source": ["forecast"] * 8, + } + ) + + # concat in same order to make our lives easier + expected = pd.concat([observed_expected_df, forecast_expected_df]).sort_values( + ["submission_date", "measure"] + ) + assert set(expected.columns) == set(output_df.columns) + # force value columns to be floats in both cases to make check easier + numeric_cols = ["value", "value_low", "value_mid", "value_high"] + # expected[numeric_cols] = expected[numeric_cols].astype(float) + # output_df[numeric_cols] = output_df[numeric_cols].astype(float) + pd.testing.assert_frame_equal( + output_df.sort_values(["submission_date", "measure"]).reset_index(drop=True), + expected[output_df.columns].reset_index(drop=True), + ) + + # should not be any nulls outside the metric column + non_metric_columns = [el for el in output_df.columns if el not in numeric_cols] + assert not pd.isna(output_df[non_metric_columns]).any(axis=None) + + +def test_under_summarize(mocker, forecast): + """testing _summarize""" + # 2024-01-01 is chosen as an arbitrary date to center the tests around + + # forecast predictions are set with the + # mock_aggregate_forecast_observed function so they + # can be ommited here + forecast_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + # rows with negative values are those expected to be removed + # by filters in summarize + observed_df = pd.DataFrame( + { + "submission_date": [ + 
pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "value": [10, 20], + } + ) + + mocker.patch.object( + forecast, "_aggregate_forecast_observed", mock_aggregate_forecast_observed + ) + + numpy_aggregations = ["mean"] + percentiles = [10, 50, 90] + output_df = forecast._summarize( + forecast_df, + observed_df, + period="period", + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, + ) + observed_expected_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "value": [10, 20], + "measure": ["observed", "observed"], + "source": ["historical", "historical"], + } + ) + + # 4x2 columns, 4 metrics (mean, p10, p50, p90) + forecast_expected_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "measure": ["mean", "mean", "p10", "p10", "p50", "p50", "p90", "p90"], + "value": [0] * 8, + "source": ["forecast"] * 8, + } + ) + + # concat in same order to make our lives easier + expected = pd.concat([observed_expected_df, forecast_expected_df]).sort_values( + ["submission_date", "measure"] + ) + expected["aggregation_period"] = "period" + + assert set(expected.columns) == set(output_df.columns) + # force value columns to be floats in both cases to make check easier + numeric_cols = ["value", "value_low", "value_mid", "value_high"] + # expected[numeric_cols] = expected[numeric_cols].astype(float) + # output_df[numeric_cols] = output_df[numeric_cols].astype(float) + pd.testing.assert_frame_equal( + output_df.sort_values(["submission_date", "measure"]).reset_index(drop=True), + expected[output_df.columns].reset_index(drop=True), + ) + + # should not be 
any nulls outside the metric column + non_metric_columns = [el for el in output_df.columns if el not in numeric_cols] + assert not pd.isna(output_df[non_metric_columns]).any(axis=None) + + +def test_summarize(mocker, forecast): + """testing summarize""" + # create dummy metric hub object to when meta data from + # it is added we don't get an error + MetricHub = collections.namedtuple( + "MetricHub", + ["alias", "app_name", "slug", "min_date", "max_date"], + ) + + dummy_metric_hub = MetricHub("", "", "", "2124-01-01", "2124-01-01") + + # 2024-01-01 is chosen as an arbitrary date to center the tests around + + # forecast predictions are set with the + # mock_aggregate_forecast_observed function so they + # can be ommited here + forecast_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + # rows with negative values are those expected to be removed + # by filters in summarize + observed_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "value": [10, 20], + } + ) + + mocker.patch.object( + forecast, "_aggregate_forecast_observed", mock_aggregate_forecast_observed + ) + + numpy_aggregations = ["mean"] + percentiles = [10, 50, 90] + + forecast.observed_df = observed_df + forecast.forecast_df = forecast_df + forecast.metric_hub = dummy_metric_hub + + # timestamp attributes created by fit and predict + # must be added manuall + forecast.collected_at = "" + forecast.trained_at = "" + forecast.predicted_at = "" + forecast.metadata_params = "" + + numpy_aggregations = ["mean"] + percentiles = [10, 50, 90] + forecast.summarize( + periods=["period1", "period2"], + numpy_aggregations=numpy_aggregations, + percentiles=percentiles, + ) + + output_df = forecast.summary_df + + observed_expected_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), 
+ ], + "value": [10, 20], + "measure": ["observed", "observed"], + "source": ["historical", "historical"], + } + ) + + # 4x2 columns, 4 metrics (mean, p10, p50, p90) + forecast_expected_df = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + "measure": ["mean", "mean", "p10", "p10", "p50", "p50", "p90", "p90"], + "value": [0] * 8, + "source": ["forecast"] * 8, + } + ) + + # concat in same order to make our lives easier + expected = pd.concat([observed_expected_df, forecast_expected_df]).sort_values( + ["submission_date", "measure"] + ) + expected1 = expected.copy() + expected2 = expected.copy() + expected1["aggregation_period"] = "period1" + expected2["aggregation_period"] = "period2" + + expected = pd.concat([expected1, expected2]) + + # not going to check all the metadata columns + # in assert_frame_equal. 
Just make sure they're there + metadata_columns = { + "metric_alias", + "metric_hub_app_name", + "metric_hub_slug", + "metric_start_date", + "metric_end_date", + "metric_collected_at", + "forecast_start_date", + "forecast_end_date", + "forecast_trained_at", + "forecast_predicted_at", + "forecast_parameters", + } + assert set(expected.columns) | metadata_columns == set(output_df.columns) + # force value columns to be floats in both cases to make check easier + numeric_cols = ["value", "value_low", "value_mid", "value_high"] + pd.testing.assert_frame_equal( + output_df.sort_values(["submission_date", "aggregation_period", "measure"])[ + expected.columns + ].reset_index(drop=True), + expected.sort_values( + ["submission_date", "aggregation_period", "measure"] + ).reset_index(drop=True), + ) + + # should not be any nulls outside the metric column + non_metric_columns = [el for el in output_df.columns if el not in numeric_cols] + assert not pd.isna(output_df[non_metric_columns]).any(axis=None) + + +def test_under_predict(mocker, forecast): + """testing _predict""" + # this ensures forecast is using MockModel + mocker.patch.object(forecast, "_build_model", mock_build_model) + + observed_df = pd.DataFrame( + { + "y": [0, 1], + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + dates_to_predict = pd.DataFrame( + { + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ] + } + ) + forecast.observed_df = observed_df + forecast.parameters = {"param1": 1, "param2": 2} + forecast.fit() + out = forecast._predict(dates_to_predict).reset_index(drop=True) + + # in MockModel, the predictive_samples method sets the output to + # np.arange(len(dates_to_predict)) * self.value for one column called 0 + # this helps ensure the forecast_df in segment_models is set properly + expected = pd.DataFrame( + { + 0: [0, 2], + "submission_date": [ + pd.to_datetime("2124-01-01").date(), 
+ pd.to_datetime("2124-01-02").date(), + ], + } + ) + + pd.testing.assert_frame_equal(out, expected) + + # test predict while we're here + + forecast.dates_to_predict = dates_to_predict + forecast.number_of_simulations = 1 # so that _validate doesn't break + forecast.predict() + + out = forecast.forecast_df + + # in MockModel, the predictive_samples method sets the output to + # np.arange(len(dates_to_predict)) * self.value for one column called 0 + # this helps ensure the forecast_df in segment_models is set properly + expected = pd.DataFrame( + { + 0: [0, 2], + "submission_date": [ + pd.to_datetime("2124-01-01").date(), + pd.to_datetime("2124-01-02").date(), + ], + } + ) + + pd.testing.assert_frame_equal(out, expected) + assert forecast.predicted_at is not None + + def test_summarize_non_overlapping_day(): observed_start_date = "2124-01-01" observed_end_date = "2124-02-01" From b3edd109c0764bcf0d429cc0ea3549032505af89 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Fri, 9 Aug 2024 09:28:19 -0500 Subject: [PATCH 20/33] removed DotMap --- jobs/kpi-forecasting/kpi_forecasting.py | 14 +++++++------- .../configs/model_inputs/__init__.py | 6 +++--- .../kpi-forecasting/kpi_forecasting/inputs.py | 19 +++++-------------- .../kpi_forecasting/metric_hub.py | 3 +-- .../kpi_forecasting/models/base_forecast.py | 2 +- .../kpi_forecasting/models/funnel_forecast.py | 12 +++--------- .../kpi_forecasting/results_processing.py | 11 ++++++----- .../tests/test_base_forecast.py | 11 +++++------ .../tests/test_funnel_forecast.py | 15 +++++---------- .../tests/test_performance_analysis.py | 3 +++ .../tests/test_prophet_forecast.py | 12 +++++------- jobs/kpi-forecasting/requirements.txt | 1 - 12 files changed, 44 insertions(+), 65 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting.py b/jobs/kpi-forecasting/kpi_forecasting.py index e7dcca7c..d8c3f04c 100644 --- a/jobs/kpi-forecasting/kpi_forecasting.py +++ b/jobs/kpi-forecasting/kpi_forecasting.py @@ -1,4 +1,4 @@ -from 
kpi_forecasting.inputs import CLI, YAML +from kpi_forecasting.inputs import CLI, load_yaml from kpi_forecasting.models.prophet_forecast import ProphetForecast from kpi_forecasting.models.funnel_forecast import FunnelForecast from kpi_forecasting.metric_hub import MetricHub @@ -13,17 +13,17 @@ def main() -> None: # Load the config - config = YAML(filepath=CLI().args.config).data - model_type = config.forecast_model.model_type + config = load_yaml(filepath=CLI().args.config) + model_type = config["forecast_model"]["model_type"] if model_type in MODELS: - metric_hub = MetricHub(**config.metric_hub) - model = MODELS[model_type](metric_hub=metric_hub, **config.forecast_model) + metric_hub = MetricHub(**config["metric_hub"]) + model = MODELS[model_type](metric_hub=metric_hub, **config["forecast_model"]) model.fit() model.predict() - model.summarize(**config.summarize) - model.write_results(**config.write_results) + model.summarize(**config["summarize"]) + model.write_results(**config["write_results"]) else: raise ValueError(f"Don't know how to forecast using {model_type}.") diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/model_inputs/__init__.py b/jobs/kpi-forecasting/kpi_forecasting/configs/model_inputs/__init__.py index 1ebd482e..caacc611 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/model_inputs/__init__.py +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/model_inputs/__init__.py @@ -3,15 +3,15 @@ from pathlib import Path -from kpi_forecasting.inputs import YAML +from kpi_forecasting.inputs import load_yaml PARENT_PATH = Path(__file__).parent HOLIDAY_PATH = PARENT_PATH / "holidays.yaml" REGRESSOR_PATH = PARENT_PATH / "regressors.yaml" -holiday_collection = YAML(HOLIDAY_PATH) -regressor_collection = YAML(REGRESSOR_PATH) +holiday_collection = load_yaml(HOLIDAY_PATH) +regressor_collection = load_yaml(REGRESSOR_PATH) @attr.s(auto_attribs=True, frozen=False) diff --git a/jobs/kpi-forecasting/kpi_forecasting/inputs.py 
b/jobs/kpi-forecasting/kpi_forecasting/inputs.py index 034af27a..14da5545 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/inputs.py +++ b/jobs/kpi-forecasting/kpi_forecasting/inputs.py @@ -2,7 +2,6 @@ import yaml from dataclasses import dataclass -from dotmap import DotMap @dataclass @@ -20,18 +19,10 @@ def __post_init__(self) -> None: self.args = self.parser.parse_args() -@dataclass -class YAML: +def load_yaml(filepath: str) -> dict: """ - Create a data structure from a YAML config filepath. Instead of loading the - YAML as a dictionary, which requires verbose code to access nested dictionary - values, this class loads YAML as a dot map. Nested values can be accessed using - dot notation, like `YAML().data.section.subsection.value`. + Create a data structure from a YAML config filepath. """ - - filepath: str - - def __post_init__(self) -> None: - with open(self.filepath, "r") as f: - data = yaml.safe_load(f) - self.data = DotMap(data) + with open(filepath, "r") as f: + data = yaml.safe_load(f) + return data diff --git a/jobs/kpi-forecasting/kpi_forecasting/metric_hub.py b/jobs/kpi-forecasting/kpi_forecasting/metric_hub.py index 64cf9d42..e0a86c83 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/metric_hub.py +++ b/jobs/kpi-forecasting/kpi_forecasting/metric_hub.py @@ -1,7 +1,6 @@ import pandas as pd from dataclasses import dataclass -from dotmap import DotMap from google.cloud import bigquery from mozanalysis.config import ConfigLoader from textwrap import dedent @@ -36,7 +35,7 @@ class MetricHub: app_name: str slug: str start_date: str - segments: DotMap = None + segments: dict = None where: str = None end_date: str = None alias: str = None diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index ed958518..45c567d2 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -71,7 +71,7 @@ def 
__post_init__(self) -> None: self.metadata_params = json.dumps( { "model_type": self.model_type.lower(), - "model_params": self.parameters.toDict(), + "model_params": self.parameters, "use_holidays": self.use_holidays, } ) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py index 9652ce02..52aa9cc8 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py @@ -117,22 +117,16 @@ def _set_segment_models( ## file. Parse the holidays and regressors specified in the config file. segment_models = [] for segment in segment_combinations: - model_params = getattr( - self.parameters["segment_settings"], segment[split_dim] - ) + model_params = self.parameters["segment_settings"][segment[split_dim]] holiday_list = [] regressor_list = [] if model_params["holidays"]: - holiday_list = [ - getattr(holiday_collection.data, h) - for h in model_params["holidays"] - ] + holiday_list = [holiday_collection[h] for h in model_params["holidays"]] if model_params["regressors"]: regressor_list = [ - getattr(regressor_collection.data, r) - for r in model_params["regressors"] + regressor_collection[r] for r in model_params["regressors"] ] # Create a SegmentModelSettings object for each segment combination diff --git a/jobs/kpi-forecasting/kpi_forecasting/results_processing.py b/jobs/kpi-forecasting/kpi_forecasting/results_processing.py index f7e8ab88..1cb8a9d1 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/results_processing.py +++ b/jobs/kpi-forecasting/kpi_forecasting/results_processing.py @@ -4,7 +4,7 @@ from google.cloud import bigquery from google.cloud.bigquery.enums import SqlTypeNames as bq_types -from kpi_forecasting.inputs import YAML +from kpi_forecasting.inputs import load_yaml import pandas as pd import numpy as np @@ -74,12 +74,13 @@ def _set_intra_forecast_agg_functions(self): def 
_load_config_data(self): """Extracts data from the list of config files passed to the class and stores it in the - config_data attribute. The filename is the key, and the contents (represnted as a DotMap) + config_data attribute. The filename is the key, and the contents are the values""" self.config_data = {} for config_file in self.input_config_list: full_path = f"{self.input_config_path}/{config_file}" - config_data = YAML(full_path).data + config_data = load_yaml(full_path) + print(config_data) self.config_data[config_file] = config_data def _extract_config_data(self): @@ -99,7 +100,7 @@ def _extract_config_data(self): config_file_list = list(self.config_data.keys()) for config_data in self.config_data.values(): # get segment data - metric_hub_data = config_data.metric_hub.toDict() + metric_hub_data = config_data["metric_hub"] if "segments" in metric_hub_data: segment_data = metric_hub_data["segments"] segment_data_list.append(segment_data) @@ -107,7 +108,7 @@ def _extract_config_data(self): segment_data_list.append(None) # get input table info - input_table_list.append(config_data.write_results.toDict()) + input_table_list.append(config_data["write_results"]) input_table_data = input_table_list.pop(0) input_table_matches_first = [input_table_data == el for el in input_table_list] diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py index 17ce4d27..97c5b229 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py @@ -3,7 +3,6 @@ import pytest import pandas as pd -from dotmap import DotMap import numpy as np from datetime import datetime, timedelta, timezone @@ -81,7 +80,7 @@ def test_post_init(good_class): end_date = "2124-02-02" good_class = good_class( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=start_date, end_date=end_date, @@ 
-101,7 +100,7 @@ def test_post_init_default_dates(good_class): # check default start and end time good_class = good_class( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date="", end_date="", @@ -122,7 +121,7 @@ def test_post_init_default_dates(good_class): def test_fit(good_class): good_class = good_class( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date="2124-01-01", end_date="2124-02-02", @@ -138,7 +137,7 @@ def test_fit(good_class): def test_predict_and_validate(good_class): good_class = good_class( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date="2124-01-01", end_date="2124-02-02", @@ -153,7 +152,7 @@ def test_predict_and_validate(good_class): def test_summarize(good_class): good_class = good_class( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date="2124-01-01", end_date="2124-02-02", diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py index c792db67..f7f14184 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py @@ -3,7 +3,6 @@ import collections import pandas as pd -from dotmap import DotMap import pytest import numpy as np @@ -21,7 +20,7 @@ def forecast(): forecast = FunnelForecast( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -82,13 +81,12 @@ def funnel_forecast_for_fit_tests(segment_info_fit_tests, mocker): }, } - parameter_dotmap = DotMap(parameter_dict) predict_start_date = "2124-01-01" predict_end_date = "2124-01-02" forecast = FunnelForecast( model_type="test", - parameters=parameter_dotmap, + parameters=parameter_dict, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -507,13 
+505,12 @@ def test_under_predict(mocker): }, } - parameter_dotmap = DotMap(parameter_dict) predict_start_date = "2124-01-02" predict_end_date = "2124-03-01" forecast = FunnelForecast( model_type="test", - parameters=parameter_dotmap, + parameters=parameter_dict, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -871,13 +868,12 @@ def test_set_segment_models(): }, } - parameter_dotmap = DotMap(parameter_dict) predict_start_date = "2124-01-01" predict_end_date = "2124-03-01" forecast = FunnelForecast( model_type="test", - parameters=parameter_dotmap, + parameters=parameter_dict, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -950,13 +946,12 @@ def test_set_segment_models_exception(): }, } - parameter_dotmap = DotMap(parameter_dict) predict_start_date = "2124-01-01" predict_end_date = "2124-03-01" forecast = FunnelForecast( model_type="test", - parameters=parameter_dotmap, + parameters=parameter_dict, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_performance_analysis.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_performance_analysis.py index edbc2cbb..3e4f0120 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_performance_analysis.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_performance_analysis.py @@ -58,6 +58,7 @@ def directory_of_configs(tmp_path_factory): "dataset": "y", "table": "z", }, + "metric_hub": {}, } f4 = tmpdir / "config_nosegments1_1.yaml" f5 = tmpdir / "config_nosegments1_2.yaml" @@ -73,6 +74,7 @@ def directory_of_configs(tmp_path_factory): "dataset": "q", "table": "z", }, + "metric_hub": {}, } f6 = tmpdir / "config_nosegments2_1.yaml" @@ -91,6 +93,7 @@ def get_forecast_performance_config(tmp_path_factory): "dataset": "", "table": "", }, + "metric_hub": {}, } f1 = tmpdir / "config.yaml" with open(f1, "w") as outfile: diff --git 
a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py index ce372cf6..f86b032c 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py @@ -1,5 +1,4 @@ import pandas as pd -from dotmap import DotMap import numpy as np import pytest import collections @@ -25,12 +24,11 @@ def forecast(): }, } - parameter_dotmap = DotMap(parameter_dict) predict_start_date = "2124-01-02" predict_end_date = "2124-03-01" return ProphetForecast( model_type="test", - parameters=parameter_dotmap, + parameters=parameter_dict, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -548,7 +546,7 @@ def test_summarize_non_overlapping_day(): forecast = ProphetForecast( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -651,7 +649,7 @@ def test_summarize_non_overlapping_month(): forecast = ProphetForecast( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -770,7 +768,7 @@ def test_summarize_overlapping_day(): forecast = ProphetForecast( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -875,7 +873,7 @@ def test_summarize_overlapping_month(): forecast = ProphetForecast( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, diff --git a/jobs/kpi-forecasting/requirements.txt b/jobs/kpi-forecasting/requirements.txt index 218d688a..cae076e6 100644 --- a/jobs/kpi-forecasting/requirements.txt +++ b/jobs/kpi-forecasting/requirements.txt @@ -10,7 +10,6 @@ contourpy==1.1.0 convertdate==2.4.0 cycler==0.11.0 db-dtypes==1.1.1 -dotmap==1.3.30 ephem==4.1.4 
exceptiongroup==1.1.1 fonttools==4.40.0 From fd1435b74b2dee6edbdb55faafb5fc670b34c138 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Fri, 9 Aug 2024 13:10:42 -0500 Subject: [PATCH 21/33] modified README to make it match better between FunnelForecast and ProphetForecast --- jobs/kpi-forecasting/README.md | 83 +++++++++- .../kpi_forecasting/configs/dau_desktop.yaml | 2 + .../kpi_forecasting/configs/dau_mobile.yaml | 2 + .../configs/search_forecasting_ad_clicks.yaml | 72 +++++---- ...search_forecasting_daily_active_users.yaml | 66 ++++---- .../search_forecasting_search_count.yaml | 66 ++++---- .../kpi_forecasting/models/base_forecast.py | 20 ++- .../kpi_forecasting/models/funnel_forecast.py | 70 ++++---- .../models/prophet_forecast.py | 10 ++ .../tests/test_funnel_forecast.py | 151 +++++++++--------- 10 files changed, 326 insertions(+), 216 deletions(-) diff --git a/jobs/kpi-forecasting/README.md b/jobs/kpi-forecasting/README.md index 31231cf8..ff1a6ed8 100644 --- a/jobs/kpi-forecasting/README.md +++ b/jobs/kpi-forecasting/README.md @@ -85,8 +85,87 @@ The tests can be run locally with `python -m pytest` in the root directory of th # YAML Configs -Each of the sections in the YAML files contains a list of arguments that are passed to their relevant objects or methods. -Definitions should be documented in the code. +Configuration for each forecast is found in the `configs` folder. 
Below is an example config file with sample values and a description of what the field means as a comment when it is not self-evident + +``` +metric_hub: # this configures the observed data fed to the model which is obtained via metrichub + app_name: "multi_product" # metric-hub app name + slug: "search_forecasting_ad_clicks" # metric-hub slug + alias: "search_forecasting_ad_clicks" # metric-hub alias + start_date: "2018-01-01" # date at which the observed data should start + end_date: "last complete month" + # date at which the observed data will end, can be a date or "last complete month" + # which uses `utils.parse_end_date` to determine the last complete month + segments: + # this section is optional and currently only used in funnel forecast, + # specifies which segments are used to partition the data, + # enabling separate models to be fit for each partition. + # Values underneath are a map of column names to be output by the + # metric-hub call and the SQL queries to populate those columns + device: "device" + channel: "'all'" + country: "CASE WHEN country = 'US' THEN 'US' ELSE 'ROW' END" + partner: "partner" + where: "partner = 'Google'" # filter to apply to the metric hub pull + +forecast_model: # this section configures the model + model_type: "funnel" + # type of model object to use, current options are "funnel" for FunnelForecast and "prophet" for ProphetForecast + start_date: NULL + # starting date for the predicted data (unless predict_historical_dates is set), + # if unset, value depends on predict_historical_dates.
+ end_date: NULL + # final date for the predicted data + use_holidays: False + # For prophet-based models, when true, call `model.add_country_holidays(country_name="US")` on model + predict_historical_dates: True + # if predict_historical_dates is True, set to first date of the observed data + # if predict_historical_dates is False, defaults to the day after the last day in the observed data + number_of_simulations: 1000 + # for prophet-based models, number of simulations to run + parameters: + # this section can be a map or a list. + # If it's a map, these parameters are used for all models + # (recall multiple models are trained if there is a metric_hub.segments) + # If it's a list, it will set different parameters + # for different subsets of the partition specified in `metric_hub.segments`. + - segment: + # specifies which subset of the partitions this applies to + # key is a column specified in metric_hub.segments + # value is a value that column can take to which the configuration is applied + device: desktop + start_date: "2018-01-01" # only applies to FunnelForecast, allows one to set start date for each sub-model + end_date: NULL # only applies to FunnelForecast, allows one to set end date for each sub-model + holidays: ["easter", "covid_sip11"] # holidays specified in `configs.model_inputs.holidays` to use. + regressors: ["post_esr_migration", "in_covid", "ad_click_bug"] # regressors specified in `configs.model_inputs.regressors` + grid_parameters: + # sets grid for hyperparameter tuning + changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] + changepoint_range: [0.8, 0.9] + n_changepoints: [25, 50] + weekly_seasonality: True + yearly_seasonality: True + cv_settings: + # sets parameters for prophet cross-validation used in FunnelForecast + initial: "1296 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" + ...
+ +summarize: + # parameters used to summarize and aggregate the predictions + periods: ["day", "month"] # periods to aggregate up to + numpy_aggregations: ["mean"] # numpy aggregation functions to use when aggregating predictions + percentiles: [10, 50, 90] # percentiles to calculate on aggregation + +write_results: + # set the project, dataset and table for output data + project: "moz-fx-data-shared-prod" + dataset: "search_derived" + table: "search_funnel_forecasts_v1" + components_table: "search_forecast_model_components_v1" +``` # Development diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml index 5ba432ea..83e80ab9 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml @@ -11,6 +11,8 @@ forecast_model: start_date: NULL end_date: NULL use_holidays: False + predict_historical_dates: False + number_of_simulations: 1000 parameters: seasonality_prior_scale: 0.00825 changepoint_prior_scale: 0.15983 diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml index 74889971..a3a9f3eb 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml @@ -11,6 +11,8 @@ forecast_model: start_date: NULL end_date: NULL use_holidays: True + predict_historical_dates: False + number_of_simulations: 1000 parameters: seasonality_prior_scale: 0.01 changepoint_prior_scale: 0.01 diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml index a756b518..ea8a2a64 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml @@ -17,42
+17,44 @@ forecast_model: start_date: NULL end_date: NULL use_holidays: False + predict_historical_dates: True + number_of_simulations: 1000 parameters: - model_setting_split_dim: "device" - segment_settings: - desktop: - start_date: "2018-01-01" - end_date: NULL - holidays: ["easter", "covid_sip11"] - regressors: ["post_esr_migration", "in_covid", "ad_click_bug"] - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] - changepoint_range: [0.8, 0.9] - n_changepoints: [25, 50] - weekly_seasonality: True - yearly_seasonality: True - cv_settings: - initial: "1296 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" - mobile: - start_date: "2022-01-01" - end_date: NULL - holidays: ["easter"] - regressors: ["after_fenix", "in_covid"] - grid_parameters: - changepoint_prior_scale: [.01, .1, .15, .2] - changepoint_range: [0.8, 0.9, 1] - n_changepoints: [30] - weekly_seasonality: True - yearly_seasonality: True - growth: "logistic" - cv_settings: - initial: "366 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" + - segment: + device: desktop + start_date: "2018-01-01" + end_date: NULL + holidays: ["easter", "covid_sip11"] + regressors: ["post_esr_migration", "in_covid", "ad_click_bug"] + grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] + changepoint_range: [0.8, 0.9] + n_changepoints: [25, 50] + weekly_seasonality: True + yearly_seasonality: True + cv_settings: + initial: "1296 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" + - segment: + device: mobile + start_date: "2022-01-01" + end_date: NULL + holidays: ["easter"] + regressors: ["after_fenix", "in_covid"] + grid_parameters: + changepoint_prior_scale: [.01, .1, .15, .2] + changepoint_range: [0.8, 0.9, 1] + n_changepoints: [30] + weekly_seasonality: True + yearly_seasonality: True + growth: "logistic" + cv_settings: + initial: "366 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" summarize: 
periods: ["day", "month"] diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml index b6643c4a..3ce3568e 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml @@ -17,39 +17,41 @@ forecast_model: start_date: NULL end_date: NULL use_holidays: False + predict_historical_dates: True + number_of_simulations: 1000 parameters: - model_setting_split_dim: "device" - segment_settings: - desktop: - start_date: "2018-01-01" - end_date: NULL - holidays: ["easter", "covid_sip11"] - regressors: ["post_esr_migration", "in_covid"] - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] - changepoint_range: [0.8, 0.9] - weekly_seasonality: True - yearly_seasonality: True - cv_settings: - initial: "1296 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" - mobile: - start_date: "2021-01-01" - end_date: NULL - holidays: ["easter"] - regressors: ["after_fenix", "in_covid"] - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1] - weekly_seasonality: True - yearly_seasonality: True - growth: "logistic" - cv_settings: - initial: "366 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" + - segment: + device: desktop + start_date: "2018-01-01" + end_date: NULL + holidays: ["easter", "covid_sip11"] + regressors: ["post_esr_migration", "in_covid"] + grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] + changepoint_range: [0.8, 0.9] + weekly_seasonality: True + yearly_seasonality: True + cv_settings: + initial: "1296 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" + - segment: + device: mobile + start_date: "2021-01-01" + end_date: NULL + holidays: ["easter"] + regressors: ["after_fenix", "in_covid"] + 
grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1] + weekly_seasonality: True + yearly_seasonality: True + growth: "logistic" + cv_settings: + initial: "366 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" summarize: periods: ["day", "month"] diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml index 8dd8f811..75f73ba2 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml @@ -17,39 +17,41 @@ forecast_model: start_date: NULL end_date: NULL use_holidays: False + predict_historical_dates: True + number_of_simulations: 1000 parameters: - model_setting_split_dim: "device" - segment_settings: - desktop: - start_date: "2018-01-01" - end_date: NULL - holidays: ["easter", "covid_sip11"] - regressors: ["post_esr_migration", "in_covid"] - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] - changepoint_range: [0.8, 0.9] - weekly_seasonality: True - yearly_seasonality: True - cv_settings: - initial: "1296 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" - mobile: - start_date: "2020-01-01" - end_date: NULL - holidays: ["easter"] - regressors: ["after_fenix", "in_covid"] - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1] - weekly_seasonality: True - yearly_seasonality: True - growth: "logistic" - cv_settings: - initial: "366 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" + - segment: + device: desktop + start_date: "2018-01-01" + end_date: NULL + holidays: ["easter", "covid_sip11"] + regressors: ["post_esr_migration", "in_covid"] + grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] + changepoint_range: [0.8, 0.9] + weekly_seasonality: True + yearly_seasonality: True + cv_settings: + initial: 
"1296 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" + - segment: + device: mobile + start_date: "2020-01-01" + end_date: NULL + holidays: ["easter"] + regressors: ["after_fenix", "in_covid"] + grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1] + weekly_seasonality: True + yearly_seasonality: True + growth: "logistic" + cv_settings: + initial: "366 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" summarize: periods: ["day", "month"] diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index 45c567d2..916c3f07 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -29,9 +29,6 @@ class BaseForecast(abc.ABC): date the metric should be queried. metric_hub (MetricHub): A MetricHub object that provides details about the metric to be forecasted. - number_of_simulations (int): The number of simulated timeseries that the forecast - should generate. Since many forecast models are probablistic, this enables the - measurement of variation across a range of possible outcomes. 
""" model_type: str @@ -40,7 +37,7 @@ class BaseForecast(abc.ABC): start_date: str end_date: str metric_hub: MetricHub - number_of_simulations: int = 1000 + predict_historical_dates: bool = False def _get_observed_data(self): if self.metric_hub: @@ -58,9 +55,18 @@ def __post_init__(self) -> None: # use default start/end dates if the user doesn't specify them self.start_date = pd.to_datetime(self.start_date or self._default_start_date) self.end_date = pd.to_datetime(self.end_date or self._default_end_date) - self.dates_to_predict = pd.DataFrame( - {"submission_date": pd.date_range(self.start_date, self.end_date).date} - ) + if self.predict_historical_dates: + self.dates_to_predict = pd.DataFrame( + { + "submission_date": pd.date_range( + self.metric_hub.start_date, self.end_date + ).date + } + ) + else: + self.dates_to_predict = pd.DataFrame( + {"submission_date": pd.date_range(self.start_date, self.end_date).date} + ) # initialize unset attributes self.model = None diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py index 52aa9cc8..aa4a4fb8 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py @@ -67,15 +67,6 @@ def __post_init__(self) -> None: # this is used to avoid the code below for testing purposes return - # Overwrite dates_to_predict to provide historical date forecasts - self.dates_to_predict = pd.DataFrame( - { - "submission_date": pd.date_range( - self.metric_hub.start_date, self.end_date - ).date - } - ) - self._set_segment_models(self.observed_df, self.metric_hub.segments.keys()) # initialize unset attributes @@ -85,10 +76,10 @@ def _set_segment_models( self, observed_df: pd.DataFrame, segment_column_list: list ) -> None: """Creates a SegmentSettings object for each segment specified in the - metric_hub.segments section of the config. 
These objects are stored in a list - in the segment_models attribute - Parameters can be specified independently for at most one dimension column - set using model_setting_split_dim in self.parameters + metric_hub.segments section of the config. It is populated from the list of + parameters in the forecast_model.parameters section of the configuration file. + The segments section of each element of the list specifies which values within which + segments the parameters are associated with. Args: observed_df (pd.DataFrame): dataframe containing observed data used to model @@ -100,45 +91,64 @@ def _set_segment_models( combination_df = observed_df[segment_column_list].drop_duplicates() # Construct dictionaries from those combinations + # this will be used to check that the config actually partitions the data segment_combinations = combination_df.to_dict("records") - # initialize a list to hold models for each segment - ## populate the list with segments and parameters for the segment - split_dim = self.parameters["model_setting_split_dim"] - - # check to make sure split_dim is one of the columns set in segment_column_list - if split_dim not in segment_column_list: - columns_str = ",".join(segment_column_list) + # get subset of segment that is used in partitioning + split_dims = None + for partition in self.parameters: + partition_dim = set(partition["segment"].keys()) + if split_dims and partition_dim != split_dims: + raise ValueError( + "Segment keys are not the same across different elements of parameters in the config file" + ) + elif split_dims is None: + split_dims = partition_dim + else: + # this is case where split_dim is set and matches partition_dim + continue + if not split_dims <= set(combination_df.keys()): + missing_dims = split_dims - set(combination_df.keys()) + missing_dims_str = ",".join(missing_dims) raise ValueError( - f"model_setting_split_dim set to {split_dim} which is not among segment columns: {columns_str}" + f"Segment keys missing from
metric hub segments: {missing_dims_str}" ) # For each segment combinination, get the model parameters from the config ## file. Parse the holidays and regressors specified in the config file. segment_models = [] for segment in segment_combinations: - model_params = self.parameters["segment_settings"][segment[split_dim]] - + # find the correct configuration + for partition in self.parameters: + partition_segment = partition["segment"] + # get subset of segment that is used to partition + subset_segment = { + key: val for key, val in segment.items() if key in split_dims + } + if partition_segment == subset_segment: + # parition is set to the desired value + # break out of loop + break holiday_list = [] regressor_list = [] - if model_params["holidays"]: - holiday_list = [holiday_collection[h] for h in model_params["holidays"]] - if model_params["regressors"]: + if "holidays" in partition: + holiday_list = [holiday_collection[h] for h in partition["holidays"]] + if "regressors" in partition: regressor_list = [ - regressor_collection[r] for r in model_params["regressors"] + regressor_collection[r] for r in partition["regressors"] ] # Create a SegmentModelSettings object for each segment combination segment_models.append( SegmentModelSettings( segment=segment, - start_date=model_params["start_date"], + start_date=partition["start_date"], end_date=self.end_date, holidays=[ProphetHoliday(**h) for h in holiday_list], regressors=[ProphetRegressor(**r) for r in regressor_list], - grid_parameters=dict(model_params["grid_parameters"]), - cv_settings=dict(model_params["cv_settings"]), + grid_parameters=dict(partition["grid_parameters"]), + cv_settings=dict(partition["cv_settings"]), ) ) self.segment_models = segment_models diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py index 19f57e1d..30d152b3 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py +++ 
b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py @@ -16,6 +16,16 @@ @dataclass class ProphetForecast(BaseForecast): + """Forecast object specifically for prophet forecast models + + Additional attributes: + number_of_simulations (int): The number of simulated timeseries that the forecast + should generate. Since many forecast models are probablistic, this enables the + measurement of variation across a range of possible outcomes. + """ + + number_of_simulations: int = 1000 + @property def column_names_map(self) -> Dict[str, str]: return {"submission_date": "ds", "value": "y"} diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py index f7f14184..885e2b52 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py @@ -59,34 +59,33 @@ def funnel_forecast_for_fit_tests(segment_info_fit_tests, mocker): """This method creates a forecast object from the segment dict created in the segment_info_fit_tests fixture. 
It also mocks some of the object methods to enable easier testing""" - parameter_dict = { - "model_setting_split_dim": "a", - "segment_settings": { - "A1": { - "start_date": segment_info_fit_tests["A1"]["start_date"], - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": segment_info_fit_tests["A1"]["grid_parameters"], - "cv_settings": {}, - }, - "A2": { - "start_date": segment_info_fit_tests["A2"]["start_date"], - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": segment_info_fit_tests["A2"]["grid_parameters"], - "cv_settings": {}, - }, + parameter_list = [ + { + "segment": {"a": "A1"}, + "start_date": segment_info_fit_tests["A1"]["start_date"], + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": segment_info_fit_tests["A1"]["grid_parameters"], + "cv_settings": {}, }, - } + { + "segment": {"a": "A2"}, + "start_date": segment_info_fit_tests["A2"]["start_date"], + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": segment_info_fit_tests["A2"]["grid_parameters"], + "cv_settings": {}, + }, + ] predict_start_date = "2124-01-01" predict_end_date = "2124-01-02" forecast = FunnelForecast( model_type="test", - parameters=parameter_dict, + parameters=parameter_list, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -491,26 +490,24 @@ def test_under_predict(mocker): # set segment models # 2124-01-01 chosen as a artibrary date to center tests on A1_start_date = "2124-01-01" - parameter_dict = { - "model_setting_split_dim": "a", - "segment_settings": { - "A1": { - "start_date": A1_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {"param1": [1, 2], "param2": [20, 10]}, - "cv_settings": {}, - }, - }, - } + parameter_list = [ + { + "segment": {"a": "A1"}, + "start_date": A1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {"param1": [1, 2], "param2": [20, 10]}, + 
"cv_settings": {}, + } + ] predict_start_date = "2124-01-02" predict_end_date = "2124-03-01" forecast = FunnelForecast( model_type="test", - parameters=parameter_dict, + parameters=parameter_list, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -846,34 +843,33 @@ def test_set_segment_models(): """test the set_segment_models method""" A1_start_date = "2018-01-01" A2_start_date = "2020-02-02" - parameter_dict = { - "model_setting_split_dim": "a", - "segment_settings": { - "A1": { - "start_date": A1_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {}, - "cv_settings": {}, - }, - "A2": { - "start_date": A2_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {}, - "cv_settings": {}, - }, + parameter_list = [ + { + "segment": {"a": "A1"}, + "start_date": A1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, }, - } + { + "segment": {"a": "A2"}, + "start_date": A2_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, + }, + ] predict_start_date = "2124-01-01" predict_end_date = "2124-03-01" forecast = FunnelForecast( model_type="test", - parameters=parameter_dict, + parameters=parameter_list, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -924,34 +920,33 @@ def test_set_segment_models_exception(): is specified that isn't in the data""" A1_start_date = "2018-01-01" A2_start_date = "2020-02-02" - parameter_dict = { - "model_setting_split_dim": "c", # not in data - "segment_settings": { - "A1": { - "start_date": A1_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {}, - "cv_settings": {}, - }, - "A2": { - "start_date": A2_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {}, - "cv_settings": {}, - }, + parameter_list = [ + { + 
"segment": {"c": "A1"}, + "start_date": A1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, }, - } + { + "segment": {"c": "A2"}, + "start_date": A2_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, + }, + ] predict_start_date = "2124-01-01" predict_end_date = "2124-03-01" forecast = FunnelForecast( model_type="test", - parameters=parameter_dict, + parameters=parameter_list, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -966,7 +961,7 @@ def test_set_segment_models_exception(): with pytest.raises( ValueError, - match="model_setting_split_dim set to c which is not among segment columns: a,b", + match="Segment keys missing from metric hub segments: c", ): forecast._set_segment_models( observed_df=observed_data, segment_column_list=segment_list From f551f4c7fd2f35b57456d169f3122cf1ff9c7d1e Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Fri, 9 Aug 2024 16:47:29 -0500 Subject: [PATCH 22/33] Update jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py Co-authored-by: Brad Ochocki Szasz --- jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py | 1 - 1 file changed, 1 deletion(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index ed958518..dcf64b91 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -215,7 +215,6 @@ def summarize( summary_df["forecast_end_date"] = self.end_date summary_df["forecast_trained_at"] = self.trained_at summary_df["forecast_predicted_at"] = self.predicted_at - summary_df["forecast_parameters"] = self.metadata_params self.summary_df = summary_df From 1a63912afd4ebd7b5304ec243a1b9a98a2ad7e98 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Fri, 9 Aug 2024 16:53:42 -0500 Subject: [PATCH 
23/33] Brad easy fixes --- .../kpi_forecasting/tests/test_prophet_forecast.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py index ce372cf6..59420cc4 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py @@ -227,8 +227,6 @@ def test_combine_forecast_observed(mocker, forecast): assert set(expected.columns) == set(output_df.columns) # force value columns to be floats in both cases to make check easier numeric_cols = ["value", "value_low", "value_mid", "value_high"] - # expected[numeric_cols] = expected[numeric_cols].astype(float) - # output_df[numeric_cols] = output_df[numeric_cols].astype(float) pd.testing.assert_frame_equal( output_df.sort_values(["submission_date", "measure"]).reset_index(drop=True), expected[output_df.columns].reset_index(drop=True), @@ -320,8 +318,6 @@ def test_under_summarize(mocker, forecast): assert set(expected.columns) == set(output_df.columns) # force value columns to be floats in both cases to make check easier numeric_cols = ["value", "value_low", "value_mid", "value_high"] - # expected[numeric_cols] = expected[numeric_cols].astype(float) - # output_df[numeric_cols] = output_df[numeric_cols].astype(float) pd.testing.assert_frame_equal( output_df.sort_values(["submission_date", "measure"]).reset_index(drop=True), expected[output_df.columns].reset_index(drop=True), @@ -343,7 +339,7 @@ def test_summarize(mocker, forecast): dummy_metric_hub = MetricHub("", "", "", "2124-01-01", "2124-01-01") - # 2024-01-01 is chosen as an arbitrary date to center the tests around + # 2124-01-01 is chosen as an arbitrary date to center the tests around # forecast predictions are set with the # mock_aggregate_forecast_observed function so they From 6a8c90cd5d4dc2478386ae502fb9d5f33702941b Mon Sep 17 
00:00:00 2001 From: Jared Snyder Date: Mon, 12 Aug 2024 13:21:29 -0500 Subject: [PATCH 24/33] remove magic year --- .../tests/test_base_forecast.py | 41 ++- .../tests/test_funnel_forecast.py | 333 ++++++++++-------- .../tests/test_prophet_forecast.py | 187 ++++++---- 3 files changed, 320 insertions(+), 241 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py index 17ce4d27..19a2db9d 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py @@ -1,15 +1,25 @@ from typing import List import collections +from datetime import date, datetime +from dateutil.relativedelta import relativedelta import pytest import pandas as pd from dotmap import DotMap import numpy as np -from datetime import datetime, timedelta, timezone +from datetime import timedelta, timezone from kpi_forecasting.models.base_forecast import BaseForecast +# Arbitrarily choose some date to use for the tests +TEST_DATE = date(2024, 1, 1) +TEST_DATE_STR = TEST_DATE.strftime("%Y-%m-%d") +TEST_DATE_NEXT_DAY = date(2024, 1, 2) +TEST_DATE_NEXT_DAY_STR = TEST_DATE_NEXT_DAY.strftime("%Y-%m-%d") +TEST_PREDICT_END = TEST_DATE + relativedelta(months=2) +TEST_PREDICT_END_STR = TEST_PREDICT_END.strftime("%Y-%m-%d") + class BadClass(BaseForecast): pass @@ -30,8 +40,9 @@ def _get_observed_data(self): self.observed_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2020-01-01"), - pd.to_datetime("1990-01-01"), + TEST_DATE, + TEST_DATE + - relativedelta(years=1), # just an arbitrary date in the past ] } ) @@ -77,8 +88,8 @@ def test_not_implemented(): def test_post_init(good_class): - start_date = "2124-01-01" - end_date = "2124-02-02" + start_date = TEST_DATE_STR + end_date = TEST_PREDICT_END_STR good_class = good_class( model_type="test", parameters=DotMap(), @@ -109,7 +120,7 @@ def 
test_post_init_default_dates(good_class): ) # this is the max date of the self.observed_data['submission_date'] plus one day # from the object definion - start_date = pd.to_datetime("2020-01-02") + start_date = TEST_DATE_NEXT_DAY end_date = ( datetime.now(timezone.utc).replace(tzinfo=None) + timedelta(weeks=78) ).date() @@ -124,15 +135,15 @@ def test_fit(good_class): model_type="test", parameters=DotMap(), use_holidays=None, - start_date="2124-01-01", - end_date="2124-02-02", + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, metric_hub=None, ) good_class.fit() assert good_class.model - # - assert good_class.model.is_fit == pd.to_datetime("2020-01-01") + # model sets is_fit to the largest day in the observed data + assert good_class.model.is_fit == TEST_DATE def test_predict_and_validate(good_class): @@ -140,8 +151,8 @@ def test_predict_and_validate(good_class): model_type="test", parameters=DotMap(), use_holidays=None, - start_date="2124-01-01", - end_date="2124-02-02", + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, metric_hub=None, ) # overwrite date range set in __post_init__ @@ -155,8 +166,8 @@ def test_summarize(good_class): model_type="test", parameters=DotMap(), use_holidays=None, - start_date="2124-01-01", - end_date="2124-02-02", + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, metric_hub=None, ) good_class.forecast_df = np.array([1, 2]) @@ -166,7 +177,7 @@ def test_summarize(good_class): ["alias", "app_name", "slug", "min_date", "max_date"], ) - dummy_metric_hub = MetricHub("", "", "", "2124-01-01", "2124-01-01") + dummy_metric_hub = MetricHub("", "", "", TEST_DATE_STR, TEST_DATE_STR) # add it here rather than in __init__ so it doesn't try to load data good_class.metric_hub = dummy_metric_hub diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py index c792db67..a8f865b5 100644 --- 
a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py @@ -1,6 +1,8 @@ """tests for the funnel forecast module""" import collections +from datetime import date, datetime +from dateutil.relativedelta import relativedelta import pandas as pd from dotmap import DotMap @@ -11,13 +13,21 @@ from kpi_forecasting.configs.model_inputs import ProphetRegressor, ProphetHoliday from kpi_forecasting.models.funnel_forecast import SegmentModelSettings, FunnelForecast +# Arbitrarily choose some date to use for the tests +TEST_DATE = date(2024, 1, 1) +TEST_DATE_STR = TEST_DATE.strftime("%Y-%m-%d") +TEST_DATE_NEXT_DAY = date(2024, 1, 2) +TEST_DATE_NEXT_DAY_STR = TEST_DATE_NEXT_DAY.strftime("%Y-%m-%d") +TEST_PREDICT_END = TEST_DATE + relativedelta(months=2) +TEST_PREDICT_END_STR = TEST_PREDICT_END.strftime("%Y-%m-%d") + @pytest.fixture() def forecast(): """This mocks a generic forecast object""" # 2024-01-01 is arbitarily chosen as a future date - predict_start_date = "2124-01-01" - predict_end_date = "2124-03-01" + predict_start_date = TEST_DATE_STR + predict_end_date = TEST_PREDICT_END_STR forecast = FunnelForecast( model_type="test", @@ -37,8 +47,8 @@ def segment_info_fit_tests(): in the functions that test fit methods""" # 2024-01-01 is arbitarily chosen as a future date - A1_start_date = "2124-01-01" - A2_start_date = "2124-01-02" + A1_start_date = TEST_DATE_STR + A2_start_date = TEST_DATE_NEXT_DAY_STR segment_info_dict = { "A1": { @@ -83,9 +93,8 @@ def funnel_forecast_for_fit_tests(segment_info_fit_tests, mocker): } parameter_dotmap = DotMap(parameter_dict) - predict_start_date = "2124-01-01" - predict_end_date = "2124-01-02" - + predict_start_date = TEST_DATE_STR + predict_end_date = TEST_DATE_NEXT_DAY_STR forecast = FunnelForecast( model_type="test", parameters=parameter_dotmap, @@ -178,8 +187,8 @@ def test_combine_forecast_observed(mocker, forecast): forecast_df = pd.DataFrame( { 
"submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -187,8 +196,8 @@ def test_combine_forecast_observed(mocker, forecast): observed_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "a": ["A1", "A1"], "value": [5, 6], @@ -238,8 +247,8 @@ def test_under_summarize(mocker, forecast): forecast_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -249,11 +258,11 @@ def test_under_summarize(mocker, forecast): observed_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2123-01-01").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "a": ["A1", "A1", "A1", "A2", "A2"], "value": [10, 20, 30, 40, 50], @@ -265,7 +274,7 @@ def test_under_summarize(mocker, forecast): ["start_date", "forecast_df", "segment", "trained_parameters"], ) dummy_segment_settings = SegmentSettings( - start_date="2124-01-01", + start_date=TEST_DATE_STR, forecast_df=forecast_df.copy(), segment={"a": "A1"}, trained_parameters={"trained_parameters": "yes"}, @@ -288,8 +297,8 @@ def test_under_summarize(mocker, forecast): observed_expected_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "a": ["A1", "A1"], "value": [20, 30], @@ -334,7 +343,7 @@ def test_summarize(mocker, forecast): ["alias", "app_name", "slug", "min_date", "max_date"], ) - dummy_metric_hub = MetricHub("", "", "", "2124-01-01", "2124-01-01") + dummy_metric_hub = MetricHub("", "", "", TEST_DATE_STR, 
TEST_DATE_STR) # forecast predictions are set with the # mock_aggregate_forecast_observed function so they @@ -342,8 +351,8 @@ def test_summarize(mocker, forecast): forecast_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -353,11 +362,11 @@ def test_summarize(mocker, forecast): observed_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2123-01-01").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "a": ["A1", "A1", "A1", "A2", "A2"], "value": [10, 20, 30, 40, 50], @@ -373,7 +382,7 @@ def test_summarize(mocker, forecast): # we're only testing that it is concatenated properly # with the segment data added dummy_segment_settings_A1 = SegmentSettings( - start_date="2124-01-01", + start_date=TEST_DATE_STR, forecast_df=forecast_df.copy(), segment={"a": "A1"}, trained_parameters={"trained_parameters": "yes"}, @@ -381,7 +390,7 @@ def test_summarize(mocker, forecast): ) dummy_segment_settings_A2 = SegmentSettings( - start_date="2124-01-01", + start_date=TEST_DATE_STR, forecast_df=forecast_df.copy(), segment={"a": "A2"}, trained_parameters={"trained_parameters": "yes"}, @@ -418,10 +427,10 @@ def test_summarize(mocker, forecast): observed_expected_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "a": ["A1", "A1", "A2", "A2"], "value": [20, 30, 40, 50], @@ -491,8 +500,8 @@ def test_summarize(mocker, forecast): def test_under_predict(mocker): """testing _predict""" # set segment models - # 2124-01-01 chosen as a 
artibrary date to center tests on - A1_start_date = "2124-01-01" + + A1_start_date = TEST_DATE_STR parameter_dict = { "model_setting_split_dim": "a", "segment_settings": { @@ -508,8 +517,8 @@ def test_under_predict(mocker): } parameter_dotmap = DotMap(parameter_dict) - predict_start_date = "2124-01-02" - predict_end_date = "2124-03-01" + predict_start_date = TEST_DATE_NEXT_DAY_STR + predict_end_date = TEST_PREDICT_END_STR forecast = FunnelForecast( model_type="test", @@ -535,8 +544,8 @@ def test_under_predict(mocker): "b": ["B1", "B2"], "y": [0, 1], "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -557,8 +566,8 @@ def test_under_predict(mocker): dates_to_predict = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ] } ) @@ -574,8 +583,8 @@ def test_under_predict(mocker): { 0: [0, model_value], "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -623,10 +632,10 @@ def test_predict(funnel_forecast_for_fit_tests, segment_info_fit_tests): "b": ["B1", "B2", "B1", "B2"], "y": [-1, 1, -1, 1], "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -652,8 +661,8 @@ def test_predict(funnel_forecast_for_fit_tests, segment_info_fit_tests): { 0: [0, model_value], "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -664,7 +673,7 @@ def test_predict(funnel_forecast_for_fit_tests, segment_info_fit_tests): expected_raw["submission_date"] >= pd.to_datetime(funnel_forecast_for_fit_tests.start_date).date() ) - 
expected = expected_raw[expected_time_filter] + expected = expected_raw[expected_time_filter].reset_index(drop=True) forecast_df = segment.forecast_df pd.testing.assert_frame_equal(forecast_df, expected) @@ -717,8 +726,8 @@ def test_auto_tuning(forecast, mocker): # set one segment with two sets of grid parameters segment_settings = SegmentModelSettings( segment={"a": "A1"}, - start_date="2124-01-01", - end_date="2124-03-01", + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, holidays=[], regressors=[], grid_parameters={"param1": [1, 2], "param2": [20, 10]}, @@ -738,8 +747,8 @@ def test_auto_tuning(forecast, mocker): "a": ["A1", "A1"], "b": ["B1", "B2"], "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-01").date(), + TEST_DATE, + TEST_DATE, ], } ) @@ -760,10 +769,10 @@ def test_under_fit(funnel_forecast_for_fit_tests, segment_info_fit_tests): "a": ["A1", "A1", "A2", "A2"], "b": ["B1", "B2", "B1", "B2"], "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -807,10 +816,10 @@ def test_fit(funnel_forecast_for_fit_tests, segment_info_fit_tests): "a": ["A1", "A1", "A2", "A2"], "b": ["B1", "B2", "B1", "B2"], "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -872,8 +881,8 @@ def test_set_segment_models(): } parameter_dotmap = DotMap(parameter_dict) - predict_start_date = "2124-01-01" - predict_end_date = "2124-03-01" + predict_start_date = TEST_DATE_STR + predict_end_date = TEST_PREDICT_END_STR forecast = FunnelForecast( model_type="test", @@ -951,8 +960,8 @@ def test_set_segment_models_exception(): } parameter_dotmap = 
DotMap(parameter_dict) - predict_start_date = "2124-01-01" - predict_end_date = "2124-03-01" + predict_start_date = TEST_DATE_STR + predict_end_date = TEST_PREDICT_END_STR forecast = FunnelForecast( model_type="test", @@ -982,6 +991,14 @@ def test_fill_regressor_dates(forecast): """test _fill_regressor_dates the name in the regressor info indicates which case is being tested Dates are chosen arbitrarily""" + # get the set start and end dates for the forecast fixture + # as datetime objects + default_start_datetime = datetime(TEST_DATE.year, TEST_DATE.month, TEST_DATE.day) + default_end_datetime = datetime( + TEST_PREDICT_END.year, TEST_PREDICT_END.month, TEST_PREDICT_END.day + ) + + # set the start date with an arbitrary date regressor_info = { "name": "only_start", "description": "only has a start", @@ -990,8 +1007,11 @@ def test_fill_regressor_dates(forecast): regressor = ProphetRegressor(**regressor_info) forecast._fill_regressor_dates(regressor) assert regressor.start_date == pd.to_datetime("2020-08-15") - assert regressor.end_date == pd.to_datetime("2124-03-01") + # this is the end dat for the forecast fixture + assert regressor.end_date == default_end_datetime + + # set the end date with an arbitrary date regressor_info = { "name": "only_end", "description": "only has a end", @@ -999,9 +1019,11 @@ def test_fill_regressor_dates(forecast): } regressor = ProphetRegressor(**regressor_info) forecast._fill_regressor_dates(regressor) - assert regressor.start_date == pd.to_datetime("2124-01-01") + # the start date for the forecast fixture is TEST_DATE + assert regressor.start_date == default_start_datetime assert regressor.end_date == pd.to_datetime("2125-08-15") + # set both the start and end dates to arbitrary dates regressor_info = { "name": "both", "description": "only has a start", @@ -1013,15 +1035,17 @@ def test_fill_regressor_dates(forecast): assert regressor.start_date == pd.to_datetime("2020-08-15") assert regressor.end_date == pd.to_datetime("2020-09-15") 
+ # use the defaults for both regressor_info = { "name": "neither", "description": "nothin to see here", } regressor = ProphetRegressor(**regressor_info) forecast._fill_regressor_dates(regressor) - assert regressor.start_date == pd.to_datetime("2124-01-01") - assert regressor.end_date == pd.to_datetime("2124-03-01") + assert regressor.start_date == default_start_datetime + assert regressor.end_date == default_end_datetime + # use arbitrary out of order dates to set regressor_info = { "name": "out_of_order", "description": "best better break", @@ -1039,6 +1063,11 @@ def test_fill_regressor_dates(forecast): def test_add_regressors(forecast): """test add regressors test case for each element of regressor_list_raw is indicated in name""" + + # choose arbitrary dates for dates + # name indicates the relationship of the window + # to the timeframe of the data as defined in the ds + # column of df below regressor_list_raw = [ { "name": "all_in", @@ -1120,8 +1149,8 @@ def test_build_train_dataframe_no_regressors(forecast): } segment_settings = SegmentModelSettings( segment={"a": 1, "b": 2}, - start_date="2124-01-01", - end_date="2124-02-01", + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, holidays=[], regressors=[ProphetRegressor(**r) for r in regressor_list], grid_parameters=grid_parameters, @@ -1134,12 +1163,12 @@ def test_build_train_dataframe_no_regressors(forecast): "b": [1, 1, 2, 2, 2, 2], "y": [1, 2, 3, 4, 5, 6], "submission_date": [ - pd.to_datetime("2124-12-01").date(), - pd.to_datetime("2124-12-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2123-01-01").date(), - pd.to_datetime("2123-01-02").date(), + TEST_DATE - relativedelta(months=1), + TEST_DATE_NEXT_DAY - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE + relativedelta(months=1), + TEST_DATE_NEXT_DAY + relativedelta(months=1), ], } ) @@ -1153,8 +1182,8 @@ def test_build_train_dataframe_no_regressors(forecast): 
"b": [2, 2], "y": [3, 4], "ds": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -1172,8 +1201,8 @@ def test_build_train_dataframe_no_regressors(forecast): "b": [2, 2], "y": [3, 4], "ds": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "floor": [1.5, 1.5], "cap": [6.0, 6.0], @@ -1193,20 +1222,24 @@ def test_build_train_dataframe(forecast): { "name": "all_in", "description": "it's all in", - "start_date": "2124-01-01", - "end_date": "2124-01-06", + "start_date": TEST_DATE_STR, + "end_date": (TEST_DATE + relativedelta(days=6)).strftime("%Y-%m-%d"), }, { "name": "all_out", "description": "it's all in", - "start_date": "2124-02-01", - "end_date": "2124-02-06", + "start_date": (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), }, { "name": "just_end", "description": "just the second one", - "start_date": "2124-01-02", - "end_date": "2124-02-06", + "start_date": (TEST_DATE + relativedelta(days=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), }, ] @@ -1226,8 +1259,8 @@ def test_build_train_dataframe(forecast): } segment_settings = SegmentModelSettings( segment={"a": 1, "b": 2}, - start_date="2124-01-01", - end_date="2124-02-01", + start_date=TEST_DATE_STR, + end_date=(TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d"), holidays=[], regressors=[ProphetRegressor(**r) for r in regressor_list], grid_parameters=grid_parameters, @@ -1240,12 +1273,12 @@ def test_build_train_dataframe(forecast): "b": [1, 1, 2, 2, 2, 2], "y": [1, 2, 3, 4, 5, 6], "submission_date": [ - pd.to_datetime("2124-12-01").date(), - pd.to_datetime("2124-12-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2123-01-01").date(), - 
pd.to_datetime("2123-01-02").date(), + TEST_DATE - relativedelta(months=1), + TEST_DATE_NEXT_DAY - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE + relativedelta(months=1), + TEST_DATE_NEXT_DAY + relativedelta(months=1), ], } ) @@ -1258,8 +1291,8 @@ def test_build_train_dataframe(forecast): "b": [2, 2], "y": [3, 4], "ds": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "all_in": [1, 1], "all_out": [0, 0], @@ -1279,8 +1312,8 @@ def test_build_train_dataframe(forecast): "b": [2, 2], "y": [3, 4], "ds": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "all_in": [1, 1], "all_out": [0, 0], @@ -1317,8 +1350,8 @@ def test_build_predict_dataframe_no_regressors(forecast): } segment_settings = SegmentModelSettings( segment={"a": 1, "b": 2}, - start_date="2124-01-01", - end_date="2124-02-01", + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, holidays=[], regressors=[ProphetRegressor(**r) for r in regressor_list], grid_parameters=grid_parameters, @@ -1331,12 +1364,12 @@ def test_build_predict_dataframe_no_regressors(forecast): dates_to_predict = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-12-01").date(), - pd.to_datetime("2124-12-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2123-01-01").date(), - pd.to_datetime("2123-01-02").date(), + TEST_DATE - relativedelta(months=1), + TEST_DATE_NEXT_DAY - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -1347,12 +1380,12 @@ def test_build_predict_dataframe_no_regressors(forecast): expected_predict_df = pd.DataFrame( { "ds": [ - pd.to_datetime("2124-12-01").date(), - pd.to_datetime("2124-12-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2123-01-01").date(), - 
pd.to_datetime("2123-01-02").date(), + TEST_DATE - relativedelta(months=1), + TEST_DATE_NEXT_DAY - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -1369,12 +1402,12 @@ def test_build_predict_dataframe_no_regressors(forecast): expected_predict_wlog_df = pd.DataFrame( { "ds": [ - pd.to_datetime("2124-12-01").date(), - pd.to_datetime("2124-12-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2123-01-01").date(), - pd.to_datetime("2123-01-02").date(), + TEST_DATE - relativedelta(months=1), + TEST_DATE_NEXT_DAY - relativedelta(months=1), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "floor": [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0], "cap": [10.0, 10.0, 10.0, 10.0, 10.0, 10.0], @@ -1394,20 +1427,24 @@ def test_build_predict_dataframe(forecast): { "name": "all_in", "description": "it's all in", - "start_date": "2124-01-01", - "end_date": "2124-01-06", + "start_date": TEST_DATE_STR, + "end_date": (TEST_DATE + relativedelta(days=6)).strftime("%Y-%m-%d"), }, { "name": "all_out", "description": "it's all in", - "start_date": "2124-02-01", - "end_date": "2124-02-06", + "start_date": (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), }, { "name": "just_end", "description": "just the second one", - "start_date": "2124-01-02", - "end_date": "2124-02-06", + "start_date": (TEST_DATE + relativedelta(days=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), }, ] @@ -1427,8 +1464,8 @@ def test_build_predict_dataframe(forecast): } segment_settings = SegmentModelSettings( segment={"a": 1, "b": 2}, - start_date="2124-01-01", - end_date="2124-02-01", + start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, holidays=[], regressors=[ProphetRegressor(**r) for r in regressor_list], 
grid_parameters=grid_parameters, @@ -1440,10 +1477,7 @@ def test_build_predict_dataframe(forecast): dates_to_predict = pd.DataFrame( { - "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - ], + "submission_date": [TEST_DATE, TEST_DATE_NEXT_DAY], } ) @@ -1453,10 +1487,7 @@ def test_build_predict_dataframe(forecast): ) expected_train_df = pd.DataFrame( { - "ds": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - ], + "ds": [TEST_DATE, TEST_DATE_NEXT_DAY], "all_in": [1, 1], "all_out": [0, 0], "just_end": [0, 1], @@ -1474,10 +1505,7 @@ def test_build_predict_dataframe(forecast): ) expected_train_wlog_df = pd.DataFrame( { - "ds": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - ], + "ds": [TEST_DATE, TEST_DATE_NEXT_DAY], "all_in": [1, 1], "all_out": [0, 0], "just_end": [0, 1], @@ -1500,23 +1528,28 @@ def test_build_model(forecast): { "name": "all_in", "description": "it's all in", - "start_date": "2124-01-01", - "end_date": "2124-01-06", + "start_date": TEST_DATE_STR, + "end_date": (TEST_DATE + relativedelta(days=6)).strftime("%Y-%m-%d"), }, { "name": "all_out", "description": "it's all in", - "start_date": "2124-02-01", - "end_date": "2124-02-06", + "start_date": (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), }, { "name": "just_end", "description": "just the second one", - "start_date": "2124-01-02", - "end_date": "2124-02-06", + "start_date": (TEST_DATE + relativedelta(days=1)).strftime("%Y-%m-%d"), + "end_date": (TEST_DATE + relativedelta(months=1, days=6)).strftime( + "%Y-%m-%d" + ), }, ] + # use holidays from holiday config file holiday_list = { "easter": { "name": "easter", @@ -1565,8 +1598,8 @@ def test_build_model(forecast): } segment_settings = SegmentModelSettings( segment={"a": 1, "b": 2}, - start_date="2124-01-01", - end_date="2124-02-01", + 
start_date=TEST_DATE_STR, + end_date=TEST_PREDICT_END_STR, holidays=[ProphetHoliday(**h) for h in holiday_list.values()], regressors=[ProphetRegressor(**r) for r in regressor_list], grid_parameters=grid_parameters, diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py index 59420cc4..1e211375 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py @@ -1,3 +1,6 @@ +from datetime import date +from dateutil.relativedelta import relativedelta + import pandas as pd from dotmap import DotMap import numpy as np @@ -7,10 +10,16 @@ from kpi_forecasting.models.prophet_forecast import ProphetForecast +# Arbitrarily choose some date to use for the tests +TEST_DATE = date(2024, 1, 1) +TEST_DATE_STR = TEST_DATE.strftime("%Y-%m-%d") +TEST_DATE_NEXT_DAY = date(2024, 1, 2) +TEST_DATE_NEXT_DAY_STR = TEST_DATE_NEXT_DAY.strftime("%Y-%m-%d") + @pytest.fixture def forecast(): - A1_start_date = "2124-01-01" + A1_start_date = TEST_DATE_STR parameter_dict = { "model_setting_split_dim": "a", "segment_settings": { @@ -26,8 +35,9 @@ } parameter_dotmap = DotMap(parameter_dict) - predict_start_date = "2124-01-02" - predict_end_date = "2124-03-01" + predict_start_date = TEST_DATE_NEXT_DAY_STR + # arbitarily set it a couple months in the future + predict_end_date = (TEST_DATE + relativedelta(months=2)).strftime("%Y-%m-%d") return ProphetForecast( model_type="test", parameters=parameter_dotmap, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -104,10 +114,10 @@ def test_under_fit(forecast, mocker): observed_data = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -128,10 +138,10 @@ def test_fit(forecast, mocker):
observed_data = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -150,16 +160,14 @@ def test_fit(forecast, mocker): def test_combine_forecast_observed(mocker, forecast): """tests the _combine_forecast_observed method""" - # 2024-01-01 is chosen as an arbitrary date to center the tests around - # forecast predictions are set with the # mock_aggregate_forecast_observed function so they # can be ommited here forecast_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -169,8 +177,8 @@ def test_combine_forecast_observed(mocker, forecast): observed_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "value": [10, 20], } @@ -192,8 +200,8 @@ def test_combine_forecast_observed(mocker, forecast): observed_expected_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "value": [10, 20], "measure": ["observed", "observed"], @@ -205,14 +213,14 @@ def test_combine_forecast_observed(mocker, forecast): forecast_expected_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "measure": ["mean", "mean", "p10", 
"p10", "p50", "p50", "p90", "p90"], "value": [0] * 8, @@ -239,16 +247,14 @@ def test_combine_forecast_observed(mocker, forecast): def test_under_summarize(mocker, forecast): """testing _summarize""" - # 2024-01-01 is chosen as an arbitrary date to center the tests around - # forecast predictions are set with the # mock_aggregate_forecast_observed function so they # can be ommited here forecast_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -258,8 +264,8 @@ def test_under_summarize(mocker, forecast): observed_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "value": [10, 20], } @@ -281,8 +287,8 @@ def test_under_summarize(mocker, forecast): observed_expected_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "value": [10, 20], "measure": ["observed", "observed"], @@ -294,14 +300,14 @@ def test_under_summarize(mocker, forecast): forecast_expected_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "measure": ["mean", "mean", "p10", "p10", "p50", "p50", "p90", "p90"], "value": [0] * 8, @@ -337,9 +343,7 @@ def test_summarize(mocker, forecast): ["alias", "app_name", "slug", "min_date", "max_date"], ) - dummy_metric_hub = MetricHub("", "", "", "2124-01-01", "2124-01-01") - - # 2124-01-01 is chosen as an 
arbitrary date to center the tests around + dummy_metric_hub = MetricHub("", "", "", TEST_DATE_STR, TEST_DATE_STR) # forecast predictions are set with the # mock_aggregate_forecast_observed function so they @@ -347,8 +351,8 @@ def test_summarize(mocker, forecast): forecast_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -358,8 +362,8 @@ def test_summarize(mocker, forecast): observed_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "value": [10, 20], } @@ -396,8 +400,8 @@ def test_summarize(mocker, forecast): observed_expected_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "value": [10, 20], "measure": ["observed", "observed"], @@ -409,14 +413,14 @@ def test_summarize(mocker, forecast): forecast_expected_df = pd.DataFrame( { "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, + TEST_DATE, + TEST_DATE_NEXT_DAY, ], "measure": ["mean", "mean", "p10", "p10", "p50", "p50", "p90", "p90"], "value": [0] * 8, @@ -476,8 +480,8 @@ def test_under_predict(mocker, forecast): { "y": [0, 1], "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -485,8 +489,8 @@ def test_under_predict(mocker, forecast): dates_to_predict = pd.DataFrame( { "submission_date": [ - 
pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ] } ) @@ -502,8 +506,8 @@ def test_under_predict(mocker, forecast): { 0: [0, 2], "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -525,8 +529,8 @@ def test_under_predict(mocker, forecast): { 0: [0, 2], "submission_date": [ - pd.to_datetime("2124-01-01").date(), - pd.to_datetime("2124-01-02").date(), + TEST_DATE, + TEST_DATE_NEXT_DAY, ], } ) @@ -536,11 +540,13 @@ def test_under_predict(mocker, forecast): def test_summarize_non_overlapping_day(): - observed_start_date = "2124-01-01" - observed_end_date = "2124-02-01" + observed_start_date = TEST_DATE_STR + observed_end_date = (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d") - predict_start_date = "2124-02-02" - predict_end_date = "2124-03-01" + predict_start_date = (TEST_DATE + relativedelta(months=1, days=1)).strftime( + "%Y-%m-%d" + ) + predict_end_date = (TEST_DATE + relativedelta(months=2)).strftime("%Y-%m-%d") forecast = ProphetForecast( model_type="test", @@ -562,10 +568,15 @@ def test_summarize_non_overlapping_day(): } ) + # there are the samples generated + # the mean and median are the aggregates used test_samples = np.array([1, 1, 2, 3, 5, 8, 13]) test_mean = np.mean(test_samples) test_median = np.median(test_samples) + # mean and median scale with a factor + # so a factor is multiplied on to make sure the aggregation is working + # across rows properly forecast_array = np.stack( [test_samples * i for i in range(1, 1 + len(predict_submission_dates))], axis=0, @@ -639,12 +650,22 @@ def test_summarize_non_overlapping_day(): def test_summarize_non_overlapping_month(): + # choose arbitrary year for the start and end dates + # two full months (Jan and Feb ) + # are in the observed data, the number of days (31 and 28 days respectively) + # in each month is used in the checks observed_start_date = 
"2124-01-01" observed_end_date = "2124-02-28" + # two full months (April and May ) + # are in the observed data, the number of days (28 and 31 days respectively) + # in each month is used in the checks predict_start_date = "2124-04-01" predict_end_date = "2124-05-31" + print(observed_start_date, observed_end_date) + print(predict_start_date, predict_end_date) + forecast = ProphetForecast( model_type="test", parameters=DotMap(), @@ -758,11 +779,11 @@ def test_summarize_non_overlapping_month(): def test_summarize_overlapping_day(): - observed_start_date = "2124-01-01" - observed_end_date = "2124-02-01" + observed_start_date = TEST_DATE_STR + observed_end_date = (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d") - predict_start_date = "2124-01-01" - predict_end_date = "2124-02-01" + predict_start_date = TEST_DATE_STR + predict_end_date = (TEST_DATE + relativedelta(months=1)).strftime("%Y-%m-%d") forecast = ProphetForecast( model_type="test", @@ -784,10 +805,15 @@ def test_summarize_overlapping_day(): } ) + # there are the samples generated + # the mean and median are the aggregates used test_samples = np.array([1, 1, 2, 3, 5, 8, 13]) test_mean = np.mean(test_samples) test_median = np.median(test_samples) + # mean and median scale with a factor + # so a factor is multiplied on to make sure the aggregation is working + # across rows properly forecast_array = np.stack( [test_samples * i for i in range(1, 1 + len(predict_submission_dates))], axis=0, @@ -863,6 +889,10 @@ def test_summarize_overlapping_day(): def test_summarize_overlapping_month(): + # choose arbitrary year for the start and end dates + # two full months (Jan and Feb ) + # are in the observed data, the number of days (31 and 28 days respectively) + # in each month is used in the checks observed_start_date = "2124-01-01" observed_end_date = "2124-02-28" @@ -889,10 +919,15 @@ def test_summarize_overlapping_month(): } ) + # there are the samples generated + # the mean and median are the aggregates used 
test_samples = np.array([1, 1, 2, 3, 5, 8, 13]) test_mean = np.mean(test_samples) test_median = np.median(test_samples) + # mean and median scale with a factor + # so a factor is multiplied on to make sure the aggregation is working + # across rows properly forecast_array = np.stack( [test_samples] * len(predict_submission_dates), axis=0, From 963a116f5a89b117fce06163b1df4515ca35aa7b Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Fri, 9 Aug 2024 09:28:19 -0500 Subject: [PATCH 25/33] removed DotMap --- jobs/kpi-forecasting/kpi_forecasting.py | 14 +++++++------- .../configs/model_inputs/__init__.py | 6 +++--- .../kpi-forecasting/kpi_forecasting/inputs.py | 19 +++++-------------- .../kpi_forecasting/metric_hub.py | 3 +-- .../kpi_forecasting/models/base_forecast.py | 2 +- .../kpi_forecasting/models/funnel_forecast.py | 12 +++--------- .../kpi_forecasting/results_processing.py | 11 ++++++----- .../tests/test_base_forecast.py | 11 +++++------ .../tests/test_funnel_forecast.py | 16 ++++++---------- .../tests/test_performance_analysis.py | 3 +++ .../tests/test_prophet_forecast.py | 12 +++++------- jobs/kpi-forecasting/requirements.txt | 1 - 12 files changed, 45 insertions(+), 65 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting.py b/jobs/kpi-forecasting/kpi_forecasting.py index e7dcca7c..d8c3f04c 100644 --- a/jobs/kpi-forecasting/kpi_forecasting.py +++ b/jobs/kpi-forecasting/kpi_forecasting.py @@ -1,4 +1,4 @@ -from kpi_forecasting.inputs import CLI, YAML +from kpi_forecasting.inputs import CLI, load_yaml from kpi_forecasting.models.prophet_forecast import ProphetForecast from kpi_forecasting.models.funnel_forecast import FunnelForecast from kpi_forecasting.metric_hub import MetricHub @@ -13,17 +13,17 @@ def main() -> None: # Load the config - config = YAML(filepath=CLI().args.config).data - model_type = config.forecast_model.model_type + config = load_yaml(filepath=CLI().args.config) + model_type = config["forecast_model"]["model_type"] if model_type in 
MODELS: - metric_hub = MetricHub(**config.metric_hub) - model = MODELS[model_type](metric_hub=metric_hub, **config.forecast_model) + metric_hub = MetricHub(**config["metric_hub"]) + model = MODELS[model_type](metric_hub=metric_hub, **config["forecast_model"]) model.fit() model.predict() - model.summarize(**config.summarize) - model.write_results(**config.write_results) + model.summarize(**config["summarize"]) + model.write_results(**config["write_results"]) else: raise ValueError(f"Don't know how to forecast using {model_type}.") diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/model_inputs/__init__.py b/jobs/kpi-forecasting/kpi_forecasting/configs/model_inputs/__init__.py index 1ebd482e..caacc611 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/model_inputs/__init__.py +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/model_inputs/__init__.py @@ -3,15 +3,15 @@ from pathlib import Path -from kpi_forecasting.inputs import YAML +from kpi_forecasting.inputs import load_yaml PARENT_PATH = Path(__file__).parent HOLIDAY_PATH = PARENT_PATH / "holidays.yaml" REGRESSOR_PATH = PARENT_PATH / "regressors.yaml" -holiday_collection = YAML(HOLIDAY_PATH) -regressor_collection = YAML(REGRESSOR_PATH) +holiday_collection = load_yaml(HOLIDAY_PATH) +regressor_collection = load_yaml(REGRESSOR_PATH) @attr.s(auto_attribs=True, frozen=False) diff --git a/jobs/kpi-forecasting/kpi_forecasting/inputs.py b/jobs/kpi-forecasting/kpi_forecasting/inputs.py index 034af27a..14da5545 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/inputs.py +++ b/jobs/kpi-forecasting/kpi_forecasting/inputs.py @@ -2,7 +2,6 @@ import yaml from dataclasses import dataclass -from dotmap import DotMap @dataclass @@ -20,18 +19,10 @@ def __post_init__(self) -> None: self.args = self.parser.parse_args() -@dataclass -class YAML: +def load_yaml(filepath: str) -> dict: """ - Create a data structure from a YAML config filepath. 
Instead of loading the - YAML as a dictionary, which requires verbose code to access nested dictionary - values, this class loads YAML as a dot map. Nested values can be accessed using - dot notation, like `YAML().data.section.subsection.value`. + Create a data structure from a YAML config filepath. """ - - filepath: str - - def __post_init__(self) -> None: - with open(self.filepath, "r") as f: - data = yaml.safe_load(f) - self.data = DotMap(data) + with open(filepath, "r") as f: + data = yaml.safe_load(f) + return data diff --git a/jobs/kpi-forecasting/kpi_forecasting/metric_hub.py b/jobs/kpi-forecasting/kpi_forecasting/metric_hub.py index 64cf9d42..e0a86c83 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/metric_hub.py +++ b/jobs/kpi-forecasting/kpi_forecasting/metric_hub.py @@ -1,7 +1,6 @@ import pandas as pd from dataclasses import dataclass -from dotmap import DotMap from google.cloud import bigquery from mozanalysis.config import ConfigLoader from textwrap import dedent @@ -36,7 +35,7 @@ class MetricHub: app_name: str slug: str start_date: str - segments: DotMap = None + segments: dict = None where: str = None end_date: str = None alias: str = None diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index dcf64b91..08a0f750 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -71,7 +71,7 @@ def __post_init__(self) -> None: self.metadata_params = json.dumps( { "model_type": self.model_type.lower(), - "model_params": self.parameters.toDict(), + "model_params": self.parameters, "use_holidays": self.use_holidays, } ) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py index 9652ce02..52aa9cc8 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py +++ 
b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py @@ -117,22 +117,16 @@ def _set_segment_models( ## file. Parse the holidays and regressors specified in the config file. segment_models = [] for segment in segment_combinations: - model_params = getattr( - self.parameters["segment_settings"], segment[split_dim] - ) + model_params = self.parameters["segment_settings"][segment[split_dim]] holiday_list = [] regressor_list = [] if model_params["holidays"]: - holiday_list = [ - getattr(holiday_collection.data, h) - for h in model_params["holidays"] - ] + holiday_list = [holiday_collection[h] for h in model_params["holidays"]] if model_params["regressors"]: regressor_list = [ - getattr(regressor_collection.data, r) - for r in model_params["regressors"] + regressor_collection[r] for r in model_params["regressors"] ] # Create a SegmentModelSettings object for each segment combination diff --git a/jobs/kpi-forecasting/kpi_forecasting/results_processing.py b/jobs/kpi-forecasting/kpi_forecasting/results_processing.py index f7e8ab88..1cb8a9d1 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/results_processing.py +++ b/jobs/kpi-forecasting/kpi_forecasting/results_processing.py @@ -4,7 +4,7 @@ from google.cloud import bigquery from google.cloud.bigquery.enums import SqlTypeNames as bq_types -from kpi_forecasting.inputs import YAML +from kpi_forecasting.inputs import load_yaml import pandas as pd import numpy as np @@ -74,12 +74,13 @@ def _set_intra_forecast_agg_functions(self): def _load_config_data(self): """Extracts data from the list of config files passed to the class and stores it in the - config_data attribute. The filename is the key, and the contents (represnted as a DotMap) + config_data attribute. 
The filename is the key, and the contents are the values""" self.config_data = {} for config_file in self.input_config_list: full_path = f"{self.input_config_path}/{config_file}" - config_data = YAML(full_path).data + config_data = load_yaml(full_path) + print(config_data) self.config_data[config_file] = config_data def _extract_config_data(self): @@ -99,7 +100,7 @@ def _extract_config_data(self): config_file_list = list(self.config_data.keys()) for config_data in self.config_data.values(): # get segment data - metric_hub_data = config_data.metric_hub.toDict() + metric_hub_data = config_data["metric_hub"] if "segments" in metric_hub_data: segment_data = metric_hub_data["segments"] segment_data_list.append(segment_data) @@ -107,7 +108,7 @@ def _extract_config_data(self): segment_data_list.append(None) # get input table info - input_table_list.append(config_data.write_results.toDict()) + input_table_list.append(config_data["write_results"]) input_table_data = input_table_list.pop(0) input_table_matches_first = [input_table_data == el for el in input_table_list] diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py index 19a2db9d..de8aa885 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py @@ -5,7 +5,6 @@ import pytest import pandas as pd -from dotmap import DotMap import numpy as np from datetime import timedelta, timezone @@ -92,7 +91,7 @@ def test_post_init(good_class): end_date = TEST_PREDICT_END_STR good_class = good_class( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=start_date, end_date=end_date, @@ -112,7 +111,7 @@ def test_post_init_default_dates(good_class): # check default start and end time good_class = good_class( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date="", end_date="", @@ 
-133,7 +132,7 @@ def test_post_init_default_dates(good_class): def test_fit(good_class): good_class = good_class( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=TEST_DATE_STR, end_date=TEST_PREDICT_END_STR, @@ -149,7 +148,7 @@ def test_fit(good_class): def test_predict_and_validate(good_class): good_class = good_class( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=TEST_DATE_STR, end_date=TEST_PREDICT_END_STR, @@ -164,7 +163,7 @@ def test_predict_and_validate(good_class): def test_summarize(good_class): good_class = good_class( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=TEST_DATE_STR, end_date=TEST_PREDICT_END_STR, diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py index a8f865b5..bb7355fd 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py @@ -5,7 +5,6 @@ from dateutil.relativedelta import relativedelta import pandas as pd -from dotmap import DotMap import pytest import numpy as np @@ -31,7 +30,7 @@ def forecast(): forecast = FunnelForecast( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -92,12 +91,12 @@ def funnel_forecast_for_fit_tests(segment_info_fit_tests, mocker): }, } - parameter_dotmap = DotMap(parameter_dict) predict_start_date = TEST_DATE_STR predict_end_date = TEST_DATE_NEXT_DAY_STR + forecast = FunnelForecast( model_type="test", - parameters=parameter_dotmap, + parameters=parameter_dict, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -516,13 +515,12 @@ def test_under_predict(mocker): }, } - parameter_dotmap = DotMap(parameter_dict) predict_start_date = TEST_DATE_NEXT_DAY_STR predict_end_date = 
TEST_PREDICT_END_STR forecast = FunnelForecast( model_type="test", - parameters=parameter_dotmap, + parameters=parameter_dict, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -880,13 +878,12 @@ def test_set_segment_models(): }, } - parameter_dotmap = DotMap(parameter_dict) predict_start_date = TEST_DATE_STR predict_end_date = TEST_PREDICT_END_STR forecast = FunnelForecast( model_type="test", - parameters=parameter_dotmap, + parameters=parameter_dict, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -959,13 +956,12 @@ def test_set_segment_models_exception(): }, } - parameter_dotmap = DotMap(parameter_dict) predict_start_date = TEST_DATE_STR predict_end_date = TEST_PREDICT_END_STR forecast = FunnelForecast( model_type="test", - parameters=parameter_dotmap, + parameters=parameter_dict, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_performance_analysis.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_performance_analysis.py index edbc2cbb..3e4f0120 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_performance_analysis.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_performance_analysis.py @@ -58,6 +58,7 @@ def directory_of_configs(tmp_path_factory): "dataset": "y", "table": "z", }, + "metric_hub": {}, } f4 = tmpdir / "config_nosegments1_1.yaml" f5 = tmpdir / "config_nosegments1_2.yaml" @@ -73,6 +74,7 @@ def directory_of_configs(tmp_path_factory): "dataset": "q", "table": "z", }, + "metric_hub": {}, } f6 = tmpdir / "config_nosegments2_1.yaml" @@ -91,6 +93,7 @@ def get_forecast_performance_config(tmp_path_factory): "dataset": "", "table": "", }, + "metric_hub": {}, } f1 = tmpdir / "config.yaml" with open(f1, "w") as outfile: diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py index 
1e211375..150ec4da 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py @@ -2,7 +2,6 @@ from dateutil.relativedelta import relativedelta import pandas as pd -from dotmap import DotMap import numpy as np import pytest import collections @@ -34,13 +33,12 @@ def forecast(): }, } - parameter_dotmap = DotMap(parameter_dict) predict_start_date = TEST_DATE_NEXT_DAY_STR # arbitarily set it a couple months in the future predict_end_date = (TEST_DATE + relativedelta(months=2)).strftime("%Y-%m-%d") return ProphetForecast( model_type="test", - parameters=parameter_dotmap, + parameters=parameter_dict, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -550,7 +548,7 @@ def test_summarize_non_overlapping_day(): forecast = ProphetForecast( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -668,7 +666,7 @@ def test_summarize_non_overlapping_month(): forecast = ProphetForecast( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -787,7 +785,7 @@ def test_summarize_overlapping_day(): forecast = ProphetForecast( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -901,7 +899,7 @@ def test_summarize_overlapping_month(): forecast = ProphetForecast( model_type="test", - parameters=DotMap(), + parameters={}, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, diff --git a/jobs/kpi-forecasting/requirements.txt b/jobs/kpi-forecasting/requirements.txt index 218d688a..cae076e6 100644 --- a/jobs/kpi-forecasting/requirements.txt +++ b/jobs/kpi-forecasting/requirements.txt @@ -10,7 +10,6 @@ contourpy==1.1.0 convertdate==2.4.0 cycler==0.11.0 db-dtypes==1.1.1 -dotmap==1.3.30 
ephem==4.1.4 exceptiongroup==1.1.1 fonttools==4.40.0 From 0f2f5096cf3e10556d0d0188fcea385fcaf8bb34 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Fri, 9 Aug 2024 13:10:42 -0500 Subject: [PATCH 26/33] modified README to make it match better between FunnelForecast and ProphetForecast --- jobs/kpi-forecasting/README.md | 83 +++++++++- .../kpi_forecasting/configs/dau_desktop.yaml | 2 + .../kpi_forecasting/configs/dau_mobile.yaml | 2 + .../configs/search_forecasting_ad_clicks.yaml | 72 +++++---- ...search_forecasting_daily_active_users.yaml | 66 ++++---- .../search_forecasting_search_count.yaml | 66 ++++---- .../kpi_forecasting/models/base_forecast.py | 20 ++- .../kpi_forecasting/models/funnel_forecast.py | 70 ++++---- .../models/prophet_forecast.py | 10 ++ .../tests/test_funnel_forecast.py | 151 +++++++++--------- 10 files changed, 326 insertions(+), 216 deletions(-) diff --git a/jobs/kpi-forecasting/README.md b/jobs/kpi-forecasting/README.md index 31231cf8..ff1a6ed8 100644 --- a/jobs/kpi-forecasting/README.md +++ b/jobs/kpi-forecasting/README.md @@ -85,8 +85,87 @@ The tests can be run locally with `python -m pytest` in the root directory of th # YAML Configs -Each of the sections in the YAML files contains a list of arguments that are passed to their relevant objects or methods. -Definitions should be documented in the code. +Configuration for each forecast is found in the `configs` folder. 
Below is an example config file with sample values and a description of what the field means as a comment whe it is not self-evident + +``` +metric_hub: # this configures the observed data fed to the model which is obtained via metrichub + app_name: "multi_product" # metric-hub app name + slug: "search_forecasting_ad_clicks" # metric-hub slug + alias: "search_forecasting_ad_clicks" # metric-hub alias + start_date: "2018-01-01" # date at which the observed data should start + end_date: "last complete month" + # date at which the observed data will end, can be a date or "last complete month" + # which uses `utils.parse_end_date` to determine the last complete month + segments: + # this section is optional and currently only used in funnel forecast, + # specifies which segments are used to partition the data, + # enabling separate models to be fit for each partition. + # Values underneath are a map of column names to be output by the + # metric-hub call and the SQL queries to populate those columns + device: "device" + channel: "'all'" + country: "CASE WHEN country = 'US' THEN 'US' ELSE 'ROW' END" + partner: "partner" + where: "partner = 'Google'" # filter to apply to the metric hub pull + +forecast_model: # this section configures the model + model_type: "funnel" + # type of model object to use, current options are "funnel" for FunnelForecast and "prophet" for ProphetForecast + start_date: NULL + # starting date for the predicted data (unless predict_historical_dates is set), + # if unset, value depends on predict_historical_dates. 
+
+ end_date: NULL
+ # final date for the predicted data
+ use_holidays: False
+ For prophet-based models, when true, call `model.add_country_holidays(country_name="US")` on model
+ predict_historical_dates: True
+ # if predict_historical_dates is True, set to first date of the observed data
+ # if predict_historical_dates is False, defaults to the day after the last day in the observed data
+ number_of_simulations: 1000
+ # for prophet-based models, number of simulations to run
+ parameters:
+ # this section can be a map or a list.
+ # If it's a map, these parameters are used for all models
+ # (recall multiple models are trained if there is a metric_hub.segments)
+ # If it's a list, it will set different parameters
+ # for different subsets of the partition specified in `metric_hub.segments`.
+ - segment:
+ # specifies which subset of the partitions this applies to
+ # key is a column specified in metric_hub.segments
+ # value is a value that column can take to which the configuration is applied
+ device: desktop
+ start_date: "2018-01-01" # only applies to FunnelForecast, allows one to set start date for each sub-model
+ end_date: NULL # only applies to FunnelForecast, allows one to set end date for each sub-model
+ holidays: ["easter", "covid_sip11"] # holidays specified in `configs.model_inputs.holidays` to use.
+ regressors: ["post_esr_migration", "in_covid", "ad_click_bug"] # regressors specified in `configs.model_inputs.regressors`
+ grid_parameters:
+ # sets grid for hyperparameter tuning
+ changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5]
+ changepoint_range: [0.8, 0.9]
+ n_changepoints: [25, 50]
+ weekly_seasonality: True
+ yearly_seasonality: True
+ cv_settings:
+ # sets parameters for prophet cross-validation used in FunnelForecast
+ initial: "1296 days"
+ period: "30 days"
+ horizon: "30 days"
+ parallel: "processes"
+ ...
+ +summarize: + # parameters used to summarize and aggregate the predictions + periods: ["day", "month"] # periods to aggregate up to + numpy_aggregations: ["mean"] # numpy aggregation functions to use when aggregating predictions + percentiles: [10, 50, 90] # precentiles to calculate on aggregation + +write_results: + # set the project, dataset and table for output data + project: "moz-fx-data-shared-prod" + dataset: "search_derived" + table: "search_funnel_forecasts_v1" + components_table: "search_forecast_model_components_v1" +``` # Development diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml index 5ba432ea..83e80ab9 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml @@ -11,6 +11,8 @@ forecast_model: start_date: NULL end_date: NULL use_holidays: False + predict_historical_dates: False + number_of_simulations: 1000 parameters: seasonality_prior_scale: 0.00825 changepoint_prior_scale: 0.15983 diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml index 74889971..a3a9f3eb 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml @@ -11,6 +11,8 @@ forecast_model: start_date: NULL end_date: NULL use_holidays: True + predict_historical_dates: False + number_of_simulations: 1000 parameters: seasonality_prior_scale: 0.01 changepoint_prior_scale: 0.01 diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml index a756b518..ea8a2a64 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml @@ -17,42 
+17,44 @@ forecast_model: start_date: NULL end_date: NULL use_holidays: False + predict_historical_dates: True + number_of_simulations: 1000 parameters: - model_setting_split_dim: "device" - segment_settings: - desktop: - start_date: "2018-01-01" - end_date: NULL - holidays: ["easter", "covid_sip11"] - regressors: ["post_esr_migration", "in_covid", "ad_click_bug"] - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] - changepoint_range: [0.8, 0.9] - n_changepoints: [25, 50] - weekly_seasonality: True - yearly_seasonality: True - cv_settings: - initial: "1296 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" - mobile: - start_date: "2022-01-01" - end_date: NULL - holidays: ["easter"] - regressors: ["after_fenix", "in_covid"] - grid_parameters: - changepoint_prior_scale: [.01, .1, .15, .2] - changepoint_range: [0.8, 0.9, 1] - n_changepoints: [30] - weekly_seasonality: True - yearly_seasonality: True - growth: "logistic" - cv_settings: - initial: "366 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" + - segment: + device: desktop + start_date: "2018-01-01" + end_date: NULL + holidays: ["easter", "covid_sip11"] + regressors: ["post_esr_migration", "in_covid", "ad_click_bug"] + grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] + changepoint_range: [0.8, 0.9] + n_changepoints: [25, 50] + weekly_seasonality: True + yearly_seasonality: True + cv_settings: + initial: "1296 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" + - segment: + device: mobile + start_date: "2022-01-01" + end_date: NULL + holidays: ["easter"] + regressors: ["after_fenix", "in_covid"] + grid_parameters: + changepoint_prior_scale: [.01, .1, .15, .2] + changepoint_range: [0.8, 0.9, 1] + n_changepoints: [30] + weekly_seasonality: True + yearly_seasonality: True + growth: "logistic" + cv_settings: + initial: "366 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" summarize: 
periods: ["day", "month"] diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml index b6643c4a..3ce3568e 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml @@ -17,39 +17,41 @@ forecast_model: start_date: NULL end_date: NULL use_holidays: False + predict_historical_dates: True + number_of_simulations: 1000 parameters: - model_setting_split_dim: "device" - segment_settings: - desktop: - start_date: "2018-01-01" - end_date: NULL - holidays: ["easter", "covid_sip11"] - regressors: ["post_esr_migration", "in_covid"] - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] - changepoint_range: [0.8, 0.9] - weekly_seasonality: True - yearly_seasonality: True - cv_settings: - initial: "1296 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" - mobile: - start_date: "2021-01-01" - end_date: NULL - holidays: ["easter"] - regressors: ["after_fenix", "in_covid"] - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1] - weekly_seasonality: True - yearly_seasonality: True - growth: "logistic" - cv_settings: - initial: "366 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" + - segment: + device: desktop + start_date: "2018-01-01" + end_date: NULL + holidays: ["easter", "covid_sip11"] + regressors: ["post_esr_migration", "in_covid"] + grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] + changepoint_range: [0.8, 0.9] + weekly_seasonality: True + yearly_seasonality: True + cv_settings: + initial: "1296 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" + - segment: + device: mobile + start_date: "2021-01-01" + end_date: NULL + holidays: ["easter"] + regressors: ["after_fenix", "in_covid"] + 
grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1] + weekly_seasonality: True + yearly_seasonality: True + growth: "logistic" + cv_settings: + initial: "366 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" summarize: periods: ["day", "month"] diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml index 8dd8f811..75f73ba2 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml @@ -17,39 +17,41 @@ forecast_model: start_date: NULL end_date: NULL use_holidays: False + predict_historical_dates: True + number_of_simulations: 1000 parameters: - model_setting_split_dim: "device" - segment_settings: - desktop: - start_date: "2018-01-01" - end_date: NULL - holidays: ["easter", "covid_sip11"] - regressors: ["post_esr_migration", "in_covid"] - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] - changepoint_range: [0.8, 0.9] - weekly_seasonality: True - yearly_seasonality: True - cv_settings: - initial: "1296 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" - mobile: - start_date: "2020-01-01" - end_date: NULL - holidays: ["easter"] - regressors: ["after_fenix", "in_covid"] - grid_parameters: - changepoint_prior_scale: [0.001, 0.01, 0.1] - weekly_seasonality: True - yearly_seasonality: True - growth: "logistic" - cv_settings: - initial: "366 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" + - segment: + device: desktop + start_date: "2018-01-01" + end_date: NULL + holidays: ["easter", "covid_sip11"] + regressors: ["post_esr_migration", "in_covid"] + grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] + changepoint_range: [0.8, 0.9] + weekly_seasonality: True + yearly_seasonality: True + cv_settings: + initial: 
"1296 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" + - segment: + device: mobile + start_date: "2020-01-01" + end_date: NULL + holidays: ["easter"] + regressors: ["after_fenix", "in_covid"] + grid_parameters: + changepoint_prior_scale: [0.001, 0.01, 0.1] + weekly_seasonality: True + yearly_seasonality: True + growth: "logistic" + cv_settings: + initial: "366 days" + period: "30 days" + horizon: "30 days" + parallel: "processes" summarize: periods: ["day", "month"] diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index 08a0f750..6d7bbae9 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -29,9 +29,6 @@ class BaseForecast(abc.ABC): date the metric should be queried. metric_hub (MetricHub): A MetricHub object that provides details about the metric to be forecasted. - number_of_simulations (int): The number of simulated timeseries that the forecast - should generate. Since many forecast models are probablistic, this enables the - measurement of variation across a range of possible outcomes. 
""" model_type: str @@ -40,7 +37,7 @@ class BaseForecast(abc.ABC): start_date: str end_date: str metric_hub: MetricHub - number_of_simulations: int = 1000 + predict_historical_dates: bool = False def _get_observed_data(self): if self.metric_hub: @@ -58,9 +55,18 @@ def __post_init__(self) -> None: # use default start/end dates if the user doesn't specify them self.start_date = pd.to_datetime(self.start_date or self._default_start_date) self.end_date = pd.to_datetime(self.end_date or self._default_end_date) - self.dates_to_predict = pd.DataFrame( - {"submission_date": pd.date_range(self.start_date, self.end_date).date} - ) + if self.predict_historical_dates: + self.dates_to_predict = pd.DataFrame( + { + "submission_date": pd.date_range( + self.metric_hub.start_date, self.end_date + ).date + } + ) + else: + self.dates_to_predict = pd.DataFrame( + {"submission_date": pd.date_range(self.start_date, self.end_date).date} + ) # initialize unset attributes self.model = None diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py index 52aa9cc8..aa4a4fb8 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py @@ -67,15 +67,6 @@ def __post_init__(self) -> None: # this is used to avoid the code below for testing purposes return - # Overwrite dates_to_predict to provide historical date forecasts - self.dates_to_predict = pd.DataFrame( - { - "submission_date": pd.date_range( - self.metric_hub.start_date, self.end_date - ).date - } - ) - self._set_segment_models(self.observed_df, self.metric_hub.segments.keys()) # initialize unset attributes @@ -85,10 +76,10 @@ def _set_segment_models( self, observed_df: pd.DataFrame, segment_column_list: list ) -> None: """Creates a SegmentSettings object for each segment specified in the - metric_hub.segments section of the config. 
These objects are stored in a list - in the segment_models attribute - Parameters can be specified independently for at most one dimension column - set using model_setting_split_dim in self.parameters + metric_hub.segments section of the config. It is populated from the list of + parameters in the forecast_model.parameters section of the configuration file. + The segements section of each element of the list specifies which values within which + segments the parameters are associated with. Args: observed_df (pd.DataFrame): dataframe containing observed data used to model @@ -100,45 +91,64 @@ def _set_segment_models( combination_df = observed_df[segment_column_list].drop_duplicates() # Construct dictionaries from those combinations + # this will be used to check that the config actually partitions the data segment_combinations = combination_df.to_dict("records") - # initialize a list to hold models for each segment - ## populate the list with segments and parameters for the segment - split_dim = self.parameters["model_setting_split_dim"] - - # check to make sure split_dim is one of the columns set in segment_column_list - if split_dim not in segment_column_list: - columns_str = ",".join(segment_column_list) + # get subset of segment that is used in partitioning + split_dims = None + for partition in self.parameters: + partition_dim = set(partition["segment"].keys()) + if split_dims and partition_dim != split_dims: + raise ValueError( + "Segment keys are not the same across different elements of parameters in the config file" + ) + elif split_dims is None: + split_dims = partition_dim + else: + # this is case where split_dim is set and matches paritition_dim + continue + if not split_dims <= set(combination_df.keys()): + missing_dims = split_dims - set(combination_df.keys()) + missing_dims_str = ",".join(missing_dims) raise ValueError( - f"model_setting_split_dim set to {split_dim} which is not among segment columns: {columns_str}" + f"Segment keys missing from 
metric hub segments: {missing_dims_str}" ) # For each segment combinination, get the model parameters from the config ## file. Parse the holidays and regressors specified in the config file. segment_models = [] for segment in segment_combinations: - model_params = self.parameters["segment_settings"][segment[split_dim]] - + # find the correct configuration + for partition in self.parameters: + partition_segment = partition["segment"] + # get subset of segment that is used to partition + subset_segment = { + key: val for key, val in segment.items() if key in split_dims + } + if partition_segment == subset_segment: + # parition is set to the desired value + # break out of loop + break holiday_list = [] regressor_list = [] - if model_params["holidays"]: - holiday_list = [holiday_collection[h] for h in model_params["holidays"]] - if model_params["regressors"]: + if "holidays" in partition: + holiday_list = [holiday_collection[h] for h in partition["holidays"]] + if "regressors" in partition: regressor_list = [ - regressor_collection[r] for r in model_params["regressors"] + regressor_collection[r] for r in partition["regressors"] ] # Create a SegmentModelSettings object for each segment combination segment_models.append( SegmentModelSettings( segment=segment, - start_date=model_params["start_date"], + start_date=partition["start_date"], end_date=self.end_date, holidays=[ProphetHoliday(**h) for h in holiday_list], regressors=[ProphetRegressor(**r) for r in regressor_list], - grid_parameters=dict(model_params["grid_parameters"]), - cv_settings=dict(model_params["cv_settings"]), + grid_parameters=dict(partition["grid_parameters"]), + cv_settings=dict(partition["cv_settings"]), ) ) self.segment_models = segment_models diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py index 19f57e1d..30d152b3 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py +++ 
b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py
@@ -16,6 +16,16 @@
 @dataclass
 class ProphetForecast(BaseForecast):
+ """Forecast object specifically for prophet forecast models
+
+ Additional attributes:
+ number_of_simulations (int): The number of simulated timeseries that the forecast
+ should generate. Since many forecast models are probabilistic, this enables the
+ measurement of variation across a range of possible outcomes.
+ """
+
+ number_of_simulations: int = 1000
+
 @property
 def column_names_map(self) -> Dict[str, str]:
 return {"submission_date": "ds", "value": "y"}
diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py
index bb7355fd..99ccab84 100644
--- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py
+++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py
@@ -69,34 +69,33 @@ def funnel_forecast_for_fit_tests(segment_info_fit_tests, mocker):
 """This method creates a forecast object from the segment
 dict created in the segment_info_fit_tests fixture.
It also mocks some of the object methods to enable easier testing""" - parameter_dict = { - "model_setting_split_dim": "a", - "segment_settings": { - "A1": { - "start_date": segment_info_fit_tests["A1"]["start_date"], - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": segment_info_fit_tests["A1"]["grid_parameters"], - "cv_settings": {}, - }, - "A2": { - "start_date": segment_info_fit_tests["A2"]["start_date"], - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": segment_info_fit_tests["A2"]["grid_parameters"], - "cv_settings": {}, - }, + parameter_list = [ + { + "segment": {"a": "A1"}, + "start_date": segment_info_fit_tests["A1"]["start_date"], + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": segment_info_fit_tests["A1"]["grid_parameters"], + "cv_settings": {}, }, - } + { + "segment": {"a": "A2"}, + "start_date": segment_info_fit_tests["A2"]["start_date"], + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": segment_info_fit_tests["A2"]["grid_parameters"], + "cv_settings": {}, + }, + ] predict_start_date = TEST_DATE_STR predict_end_date = TEST_DATE_NEXT_DAY_STR forecast = FunnelForecast( model_type="test", - parameters=parameter_dict, + parameters=parameter_list, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -501,26 +500,24 @@ def test_under_predict(mocker): # set segment models A1_start_date = TEST_DATE_STR - parameter_dict = { - "model_setting_split_dim": "a", - "segment_settings": { - "A1": { - "start_date": A1_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {"param1": [1, 2], "param2": [20, 10]}, - "cv_settings": {}, - }, - }, - } + parameter_list = [ + { + "segment": {"a": "A1"}, + "start_date": A1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {"param1": [1, 2], "param2": [20, 10]}, + "cv_settings": {}, + } + ] predict_start_date = 
TEST_DATE_NEXT_DAY_STR predict_end_date = TEST_PREDICT_END_STR forecast = FunnelForecast( model_type="test", - parameters=parameter_dict, + parameters=parameter_list, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -856,34 +853,33 @@ def test_set_segment_models(): """test the set_segment_models method""" A1_start_date = "2018-01-01" A2_start_date = "2020-02-02" - parameter_dict = { - "model_setting_split_dim": "a", - "segment_settings": { - "A1": { - "start_date": A1_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {}, - "cv_settings": {}, - }, - "A2": { - "start_date": A2_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {}, - "cv_settings": {}, - }, + parameter_list = [ + { + "segment": {"a": "A1"}, + "start_date": A1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, }, - } + { + "segment": {"a": "A2"}, + "start_date": A2_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, + }, + ] predict_start_date = TEST_DATE_STR predict_end_date = TEST_PREDICT_END_STR forecast = FunnelForecast( model_type="test", - parameters=parameter_dict, + parameters=parameter_list, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -934,34 +930,33 @@ def test_set_segment_models_exception(): is specified that isn't in the data""" A1_start_date = "2018-01-01" A2_start_date = "2020-02-02" - parameter_dict = { - "model_setting_split_dim": "c", # not in data - "segment_settings": { - "A1": { - "start_date": A1_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {}, - "cv_settings": {}, - }, - "A2": { - "start_date": A2_start_date, - "end_date": None, - "holidays": [], - "regressors": [], - "grid_parameters": {}, - "cv_settings": {}, - }, + parameter_list = [ + { + "segment": {"c": 
"A1"}, + "start_date": A1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, }, - } + { + "segment": {"c": "A2"}, + "start_date": A2_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, + }, + ] predict_start_date = TEST_DATE_STR predict_end_date = TEST_PREDICT_END_STR forecast = FunnelForecast( model_type="test", - parameters=parameter_dict, + parameters=parameter_list, use_holidays=None, start_date=predict_start_date, end_date=predict_end_date, @@ -976,7 +971,7 @@ def test_set_segment_models_exception(): with pytest.raises( ValueError, - match="model_setting_split_dim set to c which is not among segment columns: a,b", + match="Segment keys missing from metric hub segments: c", ): forecast._set_segment_models( observed_df=observed_data, segment_column_list=segment_list From e93162c0588ef031044fc2f873de7280beb27b8d Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Mon, 12 Aug 2024 14:20:12 -0500 Subject: [PATCH 27/33] added test for more complex segments --- .../tests/test_funnel_forecast.py | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py index 99ccab84..34cef8cc 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py @@ -924,6 +924,104 @@ def test_set_segment_models(): assert checkval == expectedval +def test_set_segment_models_multiple(): + """test the set_segment_models method + with segments on multiple columns""" + # set arbitrary dates + # they're only used to make sure segments are set correctly + A1B1_start_date = "2018-01-01" + A1B2_start_date = "2019-01-01" + A2B1_start_date = "2020-02-02" + A2B2_start_date = "2021-02-02" + parameter_list = [ + { + "segment": {"a": "A1", "b": 
"B1"}, + "start_date": A1B1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, + }, + { + "segment": {"a": "A1", "b": "B2"}, + "start_date": A1B2_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, + }, + { + "segment": {"a": "A2", "b": "B1"}, + "start_date": A2B1_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, + }, + { + "segment": {"a": "A2", "b": "B2"}, + "start_date": A2B2_start_date, + "end_date": None, + "holidays": [], + "regressors": [], + "grid_parameters": {}, + "cv_settings": {}, + }, + ] + + predict_start_date = TEST_DATE_STR + predict_end_date = TEST_PREDICT_END_STR + + forecast = FunnelForecast( + model_type="test", + parameters=parameter_list, + use_holidays=None, + start_date=predict_start_date, + end_date=predict_end_date, + metric_hub=None, + ) + + observed_data = pd.DataFrame( + {"a": ["A1", "A1", "A2", "A2", "A2"], "b": ["B1", "B2", "B1", "B2", "B2"]} + ) + + segment_list = ["a", "b"] + + forecast._set_segment_models( + observed_df=observed_data, segment_column_list=segment_list + ) + + # put the segments and the start date in the same dictionary to make + # comparison easier + # the important things to check is that all possible combinations + # of segments are present and that each has the parameters set properly + # start_date is a stand-in for these parameters and + # is determined by the value of a as specified in parameter_dict + check_segment_models = [ + dict(**el.segment, **{"start_date": el.start_date}) + for el in forecast.segment_models + ] + expected = [ + {"a": "A1", "b": "B1", "start_date": A1B1_start_date}, + {"a": "A1", "b": "B2", "start_date": A1B2_start_date}, + {"a": "A2", "b": "B1", "start_date": A2B1_start_date}, + {"a": "A2", "b": "B2", "start_date": A2B2_start_date}, + ] + + # can't make a set of dicts for comparison + # so sort 
the lists and compare each element + compare_sorted = zip( + sorted(check_segment_models, key=lambda x: (x["a"], x["b"])), + sorted(expected, key=lambda x: (x["a"], x["b"])), + ) + + for checkval, expectedval in compare_sorted: + assert checkval == expectedval + + def test_set_segment_models_exception(): """test the exception for segment_models where and exception is raised if a model_setting_split_dim From 5f0536d5578b09c444a53f5d7b6d778c905423fa Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Tue, 13 Aug 2024 13:28:00 -0500 Subject: [PATCH 28/33] renamed use_holidays to use_all_us_holidays --- jobs/kpi-forecasting/README.md | 2 +- .../kpi_forecasting/configs/dau_desktop.yaml | 2 +- .../kpi_forecasting/configs/dau_mobile.yaml | 2 +- .../configs/search_forecasting_ad_clicks.yaml | 2 +- .../search_forecasting_daily_active_users.yaml | 2 +- .../configs/search_forecasting_search_count.yaml | 2 +- .../kpi_forecasting/models/base_forecast.py | 6 +++--- .../kpi_forecasting/models/prophet_forecast.py | 4 ++-- .../kpi_forecasting/tests/test_base_forecast.py | 10 +++++----- .../tests/test_data/test_funnel_config.yaml | 2 +- .../kpi_forecasting/tests/test_funnel_forecast.py | 12 ++++++------ .../kpi_forecasting/tests/test_prophet_forecast.py | 10 +++++----- 12 files changed, 28 insertions(+), 28 deletions(-) diff --git a/jobs/kpi-forecasting/README.md b/jobs/kpi-forecasting/README.md index ff1a6ed8..efe75df6 100644 --- a/jobs/kpi-forecasting/README.md +++ b/jobs/kpi-forecasting/README.md @@ -116,7 +116,7 @@ forecast_model: # this section configures the model # if unset, value depends on predict_historical_dates. 
end_date: NULL # final date for the predicted data - use_holidays: False + use_all_us_holidays: False For prophet-based models, when true, call `model.add_country_holidays(country_name="US")` on model predict_historical_dates: True # if predict_historical_dates is True, set to first date of the observed data diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml index 83e80ab9..0b8966f2 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_desktop.yaml @@ -10,7 +10,7 @@ forecast_model: model_type: "prophet" start_date: NULL end_date: NULL - use_holidays: False + use_all_us_holidays: False predict_historical_dates: False number_of_simulations: 1000 parameters: diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml index a3a9f3eb..c9288408 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/dau_mobile.yaml @@ -10,7 +10,7 @@ forecast_model: model_type: "prophet" start_date: NULL end_date: NULL - use_holidays: True + use_all_us_holidays: True predict_historical_dates: False number_of_simulations: 1000 parameters: diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml index ea8a2a64..7a01aa15 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_ad_clicks.yaml @@ -16,7 +16,7 @@ forecast_model: model_type: "funnel" start_date: NULL end_date: NULL - use_holidays: False + use_all_us_holidays: False predict_historical_dates: True number_of_simulations: 1000 parameters: diff --git 
a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml index 3ce3568e..dfb7bb49 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_daily_active_users.yaml @@ -16,7 +16,7 @@ forecast_model: model_type: "funnel" start_date: NULL end_date: NULL - use_holidays: False + use_all_us_holidays: False predict_historical_dates: True number_of_simulations: 1000 parameters: diff --git a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml index 75f73ba2..17431247 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/configs/search_forecasting_search_count.yaml @@ -16,7 +16,7 @@ forecast_model: model_type: "funnel" start_date: NULL end_date: NULL - use_holidays: False + use_all_us_holidays: False predict_historical_dates: True number_of_simulations: 1000 parameters: diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index 6d7bbae9..99504fdb 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -20,7 +20,7 @@ class BaseForecast(abc.ABC): Args: model_type (str): The name of the forecasting model that's being used. parameters (Dict): Parameters that should be passed to the forecasting model. - use_holidays (bool): Whether or not the forecasting model should use holidays. + use_all_us_holidays (bool): Whether or not the forecasting model should use holidays. The base model does not apply holiday logic; that logic needs to be built in the child class. 
start_date (str): A 'YYYY-MM-DD' formatted-string that specifies the first @@ -33,7 +33,7 @@ class BaseForecast(abc.ABC): model_type: str parameters: Dict - use_holidays: bool + use_all_us_holidays: bool start_date: str end_date: str metric_hub: MetricHub @@ -78,7 +78,7 @@ def __post_init__(self) -> None: { "model_type": self.model_type.lower(), "model_params": self.parameters, - "use_holidays": self.use_holidays, + "use_all_us_holidays": self.use_all_us_holidays, } ) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py index 30d152b3..82a07fc4 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py @@ -37,7 +37,7 @@ def _build_model(self, parameter_dict): mcmc_samples=0, ) - if self.use_holidays: + if self.use_all_us_holidays: model.add_country_holidays(country_name="US") return model @@ -106,7 +106,7 @@ def _predict_legacy(self) -> pd.DataFrame: datetime.now(timezone.utc).replace(tzinfo=None).date() ) df["forecast_parameters"] = str( - json.dumps({**self.parameters, "holidays": self.use_holidays}) + json.dumps({**self.parameters, "holidays": self.use_all_us_holidays}) ) alias = self.metric_hub.alias.lower() diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py index de8aa885..c4d823d3 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py @@ -92,7 +92,7 @@ def test_post_init(good_class): good_class = good_class( model_type="test", parameters={}, - use_holidays=None, + use_all_us_holidays=None, start_date=start_date, end_date=end_date, metric_hub=None, @@ -112,7 +112,7 @@ def test_post_init_default_dates(good_class): good_class = good_class( model_type="test", parameters={}, - use_holidays=None, + 
use_all_us_holidays=None, start_date="", end_date="", metric_hub=None, @@ -133,7 +133,7 @@ def test_fit(good_class): good_class = good_class( model_type="test", parameters={}, - use_holidays=None, + use_all_us_holidays=None, start_date=TEST_DATE_STR, end_date=TEST_PREDICT_END_STR, metric_hub=None, @@ -149,7 +149,7 @@ def test_predict_and_validate(good_class): good_class = good_class( model_type="test", parameters={}, - use_holidays=None, + use_all_us_holidays=None, start_date=TEST_DATE_STR, end_date=TEST_PREDICT_END_STR, metric_hub=None, @@ -164,7 +164,7 @@ def test_summarize(good_class): good_class = good_class( model_type="test", parameters={}, - use_holidays=None, + use_all_us_holidays=None, start_date=TEST_DATE_STR, end_date=TEST_PREDICT_END_STR, metric_hub=None, diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_data/test_funnel_config.yaml b/jobs/kpi-forecasting/kpi_forecasting/tests/test_data/test_funnel_config.yaml index 2aebbeff..17943134 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_data/test_funnel_config.yaml +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_data/test_funnel_config.yaml @@ -15,7 +15,7 @@ forecast_model: model_type: "funnel" start_date: NULL end_date: NULL - use_holidays: False + use_all_us_holidays: False parameters: model_setting_split_dim: "device" segment_settings: diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py index 34cef8cc..535f84c2 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py @@ -31,7 +31,7 @@ def forecast(): forecast = FunnelForecast( model_type="test", parameters={}, - use_holidays=None, + use_all_us_holidays=None, start_date=predict_start_date, end_date=predict_end_date, metric_hub=None, @@ -96,7 +96,7 @@ def funnel_forecast_for_fit_tests(segment_info_fit_tests, mocker): forecast = 
FunnelForecast( model_type="test", parameters=parameter_list, - use_holidays=None, + use_all_us_holidays=None, start_date=predict_start_date, end_date=predict_end_date, metric_hub=None, @@ -518,7 +518,7 @@ def test_under_predict(mocker): forecast = FunnelForecast( model_type="test", parameters=parameter_list, - use_holidays=None, + use_all_us_holidays=None, start_date=predict_start_date, end_date=predict_end_date, metric_hub=None, @@ -880,7 +880,7 @@ def test_set_segment_models(): forecast = FunnelForecast( model_type="test", parameters=parameter_list, - use_holidays=None, + use_all_us_holidays=None, start_date=predict_start_date, end_date=predict_end_date, metric_hub=None, @@ -978,7 +978,7 @@ def test_set_segment_models_multiple(): forecast = FunnelForecast( model_type="test", parameters=parameter_list, - use_holidays=None, + use_all_us_holidays=None, start_date=predict_start_date, end_date=predict_end_date, metric_hub=None, @@ -1055,7 +1055,7 @@ def test_set_segment_models_exception(): forecast = FunnelForecast( model_type="test", parameters=parameter_list, - use_holidays=None, + use_all_us_holidays=None, start_date=predict_start_date, end_date=predict_end_date, metric_hub=None, diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py index 150ec4da..adc9c4ba 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_prophet_forecast.py @@ -39,7 +39,7 @@ def forecast(): return ProphetForecast( model_type="test", parameters=parameter_dict, - use_holidays=None, + use_all_us_holidays=None, start_date=predict_start_date, end_date=predict_end_date, metric_hub=None, @@ -549,7 +549,7 @@ def test_summarize_non_overlapping_day(): forecast = ProphetForecast( model_type="test", parameters={}, - use_holidays=None, + use_all_us_holidays=None, start_date=predict_start_date, end_date=predict_end_date, 
metric_hub=None, @@ -667,7 +667,7 @@ def test_summarize_non_overlapping_month(): forecast = ProphetForecast( model_type="test", parameters={}, - use_holidays=None, + use_all_us_holidays=None, start_date=predict_start_date, end_date=predict_end_date, metric_hub=None, @@ -786,7 +786,7 @@ def test_summarize_overlapping_day(): forecast = ProphetForecast( model_type="test", parameters={}, - use_holidays=None, + use_all_us_holidays=None, start_date=predict_start_date, end_date=predict_end_date, metric_hub=None, @@ -900,7 +900,7 @@ def test_summarize_overlapping_month(): forecast = ProphetForecast( model_type="test", parameters={}, - use_holidays=None, + use_all_us_holidays=None, start_date=predict_start_date, end_date=predict_end_date, metric_hub=None, From e0903b3f708b703d1082f30cb3fc5b9ef23088f5 Mon Sep 17 00:00:00 2001 From: m-d-bowerman Date: Tue, 13 Aug 2024 13:21:47 -0700 Subject: [PATCH 29/33] typo --- jobs/kpi-forecasting/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jobs/kpi-forecasting/README.md b/jobs/kpi-forecasting/README.md index ff1a6ed8..2372ac0d 100644 --- a/jobs/kpi-forecasting/README.md +++ b/jobs/kpi-forecasting/README.md @@ -85,7 +85,7 @@ The tests can be run locally with `python -m pytest` in the root directory of th # YAML Configs -Configuration for each forecast is found in the `configs` folder. Below is an example config file with sample values and a description of what the field means as a comment whe it is not self-evident +Configuration for each forecast is found in the `configs` folder. 
Below is an example config file with sample values and a description of what the field means as a comment when it is not self-evident ``` metric_hub: # this configures the observed data fed to the model which is obtained via metrichub From 109dff7feb6ac97811ce39f93b5e4c47f7af656d Mon Sep 17 00:00:00 2001 From: m-d-bowerman Date: Tue, 13 Aug 2024 13:35:02 -0700 Subject: [PATCH 30/33] added detail to prophet parameter descriptions --- jobs/kpi-forecasting/README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/jobs/kpi-forecasting/README.md b/jobs/kpi-forecasting/README.md index 2372ac0d..ebafbf78 100644 --- a/jobs/kpi-forecasting/README.md +++ b/jobs/kpi-forecasting/README.md @@ -140,17 +140,17 @@ forecast_model: # this section configures the model regressors: ["post_esr_migration", "in_covid", "ad_click_bug"] # regressors specified in `configs.model_inputs.regressors` grid_parameters: # sets grid for hyperparameter tuning - changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] - changepoint_range: [0.8, 0.9] - n_changepoints: [25, 50] - weekly_seasonality: True - yearly_seasonality: True + changepoint_prior_scale: [0.001, 0.01, 0.1, 0.2, 0.5] # parameter of prior distribution controlling how much the trend fluctuates at changepoints + changepoint_range: [0.8, 0.9] # the proportion of the time series over which the changepoints are distributed + n_changepoints: [25, 50] # number of trend changepoints, equally spaced over the time series + weekly_seasonality: True # if weekly seasonality is included in the model + yearly_seasonality: True # if yearly seasonality is included in the model cv_settings: # sets parameters for prophet cross-validation used in FunnelForecast - initial: "1296 days" - period: "30 days" - horizon: "30 days" - parallel: "processes" + initial: "1296 days" # the initial training period, used to train the first iteration of the model for CV + period: "30 days" # spacing between cutoff dates, the sliding 
window over which each round of cross validation is performed + horizon: "30 days" # forecast horizon used to make predictions and calculate model fit metrics for optimization + parallel: "processes" # how parallelization is performed by Prophet, or None if no parallelization is used ... summarize: From c6ed03c9e5f3deff7b611c938349e0f838387e0d Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Wed, 14 Aug 2024 10:04:40 -0500 Subject: [PATCH 31/33] updated setting of default start date and added tests --- .../kpi_forecasting/models/base_forecast.py | 28 +++++++------ .../tests/test_base_forecast.py | 40 +++++++++++++++++++ 2 files changed, 55 insertions(+), 13 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py index 99504fdb..896051f8 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/base_forecast.py @@ -29,6 +29,9 @@ class BaseForecast(abc.ABC): date the metric should be queried. metric_hub (MetricHub): A MetricHub object that provides details about the metric to be forecasted. + predict_historical_dates (bool): If True, forecast starts at the first + date in the observed data.
If False, it uses the value of start_date if set + and the first day after the observed data ends otherwise """ model_type: str @@ -52,21 +55,17 @@ def __post_init__(self) -> None: self.collected_at = datetime.now(timezone.utc).replace(tzinfo=None) self._get_observed_data() + # raise an error if predict_historical_dates is True and start_date is set + if self.start_date and self.predict_historical_dates: + raise ValueError( + "forecast start_date set while predict_historical_dates is True" + ) # use default start/end dates if the user doesn't specify them self.start_date = pd.to_datetime(self.start_date or self._default_start_date) self.end_date = pd.to_datetime(self.end_date or self._default_end_date) - if self.predict_historical_dates: - self.dates_to_predict = pd.DataFrame( - { - "submission_date": pd.date_range( - self.metric_hub.start_date, self.end_date - ).date - } - ) - else: - self.dates_to_predict = pd.DataFrame( - {"submission_date": pd.date_range(self.start_date, self.end_date).date} - ) + self.dates_to_predict = pd.DataFrame( + {"submission_date": pd.date_range(self.start_date, self.end_date).date} + ) # initialize unset attributes self.model = None @@ -144,7 +143,10 @@ def _summarize( @property def _default_start_date(self) -> str: """The first day after the last date in the observed dataset.""" - return self.observed_df["submission_date"].max() + timedelta(days=1) + if self.predict_historical_dates: + return self.observed_df["submission_date"].min() + else: + return self.observed_df["submission_date"].max() + timedelta(days=1) @property def _default_end_date(self) -> str: diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py index c4d823d3..bfea0e5a 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_base_forecast.py @@ -107,6 +107,24 @@ def test_post_init(good_class): assert
good_class.dates_to_predict.equals(dates_to_predict_expected) +def test_post_init_exception(good_class): + start_date = TEST_DATE_STR + end_date = TEST_PREDICT_END_STR + with pytest.raises( + ValueError, + match="forecast start_date set while predict_historical_dates is True", + ): + _ = good_class( + model_type="test", + parameters={}, + use_all_us_holidays=None, + start_date=start_date, + end_date=end_date, + metric_hub=None, + predict_historical_dates=True, + ) + + def test_post_init_default_dates(good_class): # check default start and end time good_class = good_class( @@ -129,6 +147,28 @@ def test_post_init_default_dates(good_class): assert good_class.dates_to_predict.equals(dates_to_predict_expected) +def test_post_init_default_dates_historical(good_class): + # check default start and end time + good_class = good_class( + model_type="test", + parameters={}, + use_all_us_holidays=None, + start_date="", + end_date="", + metric_hub=None, + predict_historical_dates=True, + ) + # this is the min date of the observed data + start_date = TEST_DATE - relativedelta(years=1) + end_date = ( + datetime.now(timezone.utc).replace(tzinfo=None) + timedelta(weeks=78) + ).date() + dates_to_predict_expected = pd.DataFrame( + {"submission_date": pd.date_range(start_date, end_date).date} + ) + assert good_class.dates_to_predict.equals(dates_to_predict_expected) + + def test_fit(good_class): good_class = good_class( model_type="test", From 2f8b3d09b2514e4a45a734798d6919b236cc82d9 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Wed, 14 Aug 2024 14:07:21 -0500 Subject: [PATCH 32/33] remove print --- jobs/kpi-forecasting/kpi_forecasting/results_processing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/results_processing.py b/jobs/kpi-forecasting/kpi_forecasting/results_processing.py index 1cb8a9d1..e2f199e5 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/results_processing.py +++ b/jobs/kpi-forecasting/kpi_forecasting/results_processing.py 
@@ -80,7 +80,6 @@ def _load_config_data(self): for config_file in self.input_config_list: full_path = f"{self.input_config_path}/{config_file}" config_data = load_yaml(full_path) - print(config_data) self.config_data[config_file] = config_data def _extract_config_data(self): From 877d07cbb3a26b4627fab28faebe57a20573ae74 Mon Sep 17 00:00:00 2001 From: Jared Snyder Date: Wed, 14 Aug 2024 16:14:16 -0500 Subject: [PATCH 33/33] moved filter and updated tests to reflect this --- .../kpi_forecasting/models/funnel_forecast.py | 10 +++-- .../models/prophet_forecast.py | 4 +- .../tests/test_funnel_forecast.py | 45 ++++++++----------- 3 files changed, 28 insertions(+), 31 deletions(-) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py index b38e4df4..3c06863c 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/funnel_forecast.py @@ -493,9 +493,7 @@ def _predict( segment_settings.components_df = components_df.copy() - return df.loc[ - pd.to_datetime(df["submission_date"]) >= pd.to_datetime(self.start_date) - ] + return df def _validate_forecast_df(self, df: pd.DataFrame) -> None: """ @@ -565,6 +563,12 @@ def _combine_forecast_observed( Returns: pd.DataFrame: combined dataframe containing aggregated values from observed and forecast """ + # filter the forecast data to just the data in the future + last_historic_date = observed_df["submission_date"].max() + forecast_df = forecast_df.loc[ + forecast_df["submission_date"] > last_historic_date + ] + forecast_summarized, observed_summarized = self._aggregate_forecast_observed( forecast_df, observed_df, period, numpy_aggregations, percentiles ) diff --git a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py index 82a07fc4..3dc2b920 100644 ---
a/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/models/prophet_forecast.py @@ -362,8 +362,8 @@ def write_results( project_legacy: str, dataset_legacy: str, write_disposition: str = "WRITE_APPEND", - forecast_table_legacy: str = "kpi_automated_forecast_v1", - confidences_table_legacy: str = "kpi_automated_forecast_confidences_v1", + forecast_table_legacy: str = "kpi_automated_forecast_v1_branch", + confidences_table_legacy: str = "kpi_automated_forecast_confidences_v1_branch", ) -> None: """ Write `self.summary_df` to Big Query. diff --git a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py index aaeed8b7..6e43e409 100644 --- a/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py +++ b/jobs/kpi-forecasting/kpi_forecasting/tests/test_funnel_forecast.py @@ -194,8 +194,8 @@ def test_combine_forecast_observed(mocker, forecast): observed_df = pd.DataFrame( { "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, + TEST_DATE - relativedelta(days=2), + TEST_DATE - relativedelta(days=1), ], "a": ["A1", "A1"], "value": [5, 6], @@ -257,10 +257,10 @@ def test_under_summarize(mocker, forecast): { "submission_date": [ TEST_DATE - relativedelta(months=1), - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE, - TEST_DATE_NEXT_DAY, + TEST_DATE - relativedelta(days=2), + TEST_DATE - relativedelta(days=1), + TEST_DATE - relativedelta(days=2), + TEST_DATE - relativedelta(days=1), ], "a": ["A1", "A1", "A1", "A2", "A2"], "value": [10, 20, 30, 40, 50], @@ -272,7 +272,7 @@ def test_under_summarize(mocker, forecast): ["start_date", "forecast_df", "segment", "trained_parameters"], ) dummy_segment_settings = SegmentSettings( - start_date=TEST_DATE_STR, + start_date=(TEST_DATE - relativedelta(days=2)).strftime("%Y-%m-%d"), forecast_df=forecast_df.copy(), segment={"a": "A1"}, trained_parameters={"trained_parameters": "yes"}, @@ -295,8 
+295,8 @@ def test_under_summarize(mocker, forecast): observed_expected_df = pd.DataFrame( { "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, + TEST_DATE - relativedelta(days=2), + TEST_DATE - relativedelta(days=1), ], "a": ["A1", "A1"], "value": [20, 30], @@ -361,10 +361,10 @@ def test_summarize(mocker, forecast): { "submission_date": [ TEST_DATE - relativedelta(months=1), - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE, - TEST_DATE_NEXT_DAY, + TEST_DATE - relativedelta(days=2), + TEST_DATE - relativedelta(days=1), + TEST_DATE - relativedelta(days=2), + TEST_DATE - relativedelta(days=1), ], "a": ["A1", "A1", "A1", "A2", "A2"], "value": [10, 20, 30, 40, 50], @@ -380,7 +380,7 @@ def test_summarize(mocker, forecast): # we're only testing that it is concatenated properly # with the segment data added dummy_segment_settings_A1 = SegmentSettings( - start_date=TEST_DATE_STR, + start_date=(TEST_DATE - relativedelta(days=2)).strftime("%Y-%m-%d"), forecast_df=forecast_df.copy(), segment={"a": "A1"}, trained_parameters={"trained_parameters": "yes"}, @@ -388,7 +388,7 @@ def test_summarize(mocker, forecast): ) dummy_segment_settings_A2 = SegmentSettings( - start_date=TEST_DATE_STR, + start_date=(TEST_DATE - relativedelta(days=2)).strftime("%Y-%m-%d"), forecast_df=forecast_df.copy(), segment={"a": "A2"}, trained_parameters={"trained_parameters": "yes"}, @@ -425,10 +425,10 @@ def test_summarize(mocker, forecast): observed_expected_df = pd.DataFrame( { "submission_date": [ - TEST_DATE, - TEST_DATE_NEXT_DAY, - TEST_DATE, - TEST_DATE_NEXT_DAY, + TEST_DATE - relativedelta(days=2), + TEST_DATE - relativedelta(days=1), + TEST_DATE - relativedelta(days=2), + TEST_DATE - relativedelta(days=1), ], "a": ["A1", "A1", "A2", "A2"], "value": [20, 30, 40, 50], @@ -584,13 +584,6 @@ def test_under_predict(mocker): } ) - # time filter corresponds to the start time of the object - # as opposed to the segment - expected_time_filter = ( - expected["submission_date"] >= 
pd.to_datetime(forecast.start_date).date() - ) - expected = expected[expected_time_filter].reset_index(drop=True) - pd.testing.assert_frame_equal(out, expected) # check the components