From 1b9dd9bb845305ba8a1197d5e8e545c621a946e0 Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Wed, 31 Jul 2024 14:32:16 -0700 Subject: [PATCH 01/40] first draft --- src/mozanalysis/experiment.py | 241 +++++++++++++++++++++++++--------- src/mozanalysis/metrics.py | 55 +++++--- src/mozanalysis/types.py | 5 + 3 files changed, 222 insertions(+), 79 deletions(-) diff --git a/src/mozanalysis/experiment.py b/src/mozanalysis/experiment.py index dd05f94a..a4bf956e 100644 --- a/src/mozanalysis/experiment.py +++ b/src/mozanalysis/experiment.py @@ -6,6 +6,7 @@ import logging from enum import Enum from typing import TYPE_CHECKING +from typing_extensions import assert_never import attr @@ -14,6 +15,7 @@ from mozanalysis.config import ConfigLoader from mozanalysis.metrics import AnalysisBasis, DataSource, Metric from mozanalysis.utils import add_days, date_sub, hash_ish +from mozanalysis.types import AnalysisUnit if TYPE_CHECKING: from pandas import DataFrame @@ -134,6 +136,7 @@ class Experiment: num_dates_enrollment = attr.ib(default=None) app_id = attr.ib(default=None) app_name = attr.ib(default=None) + analysis_unit = attr.ib(type=AnalysisUnit, default=AnalysisUnit.CLIENT_ID) def get_app_name(self): """ @@ -189,6 +192,10 @@ def get_single_window_data( Specifies the query type to use to get the experiment's enrollments, unless overridden by ``custom_enrollments_query``. + analysis_unit (AnalysisUnit): the "unit" of analysis, namely the + id that defines an experimental unit. For example: `client_id` + for mobile experiments or `group_id` for desktop experiments. Can be + overridden through a ``custom_enrollments_query``, but shouldn't. custom_enrollments_query (str): A full SQL query that will generate the `enrollments` common table expression used in the main query. The query must produce the columns @@ -317,6 +324,10 @@ def get_time_series_data( Specifies the query type to use to get the experiment's enrollments, unless overridden by ``custom_enrollments_query``. 
+ analysis_unit (AnalysisUnit): the "unit" of analysis, namely the + id that defines an experimental unit. For example: `client_id` + for mobile experiments or `group_id` for desktop experiments. Can be + overridden through a ``custom_enrollments_query``, but shouldn't. custom_enrollments_query (str): A full SQL query that will generate the `enrollments` common table expression used in the main query. The query must produce the columns @@ -437,6 +448,10 @@ def build_enrollments_query( Specifies the query type to use to get the experiment's enrollments, unless overridden by ``custom_enrollments_query``. + analysis_unit (AnalysisUnit): the "unit" of analysis, namely the + id that defines an experimental unit. For example: `client_id` + for mobile experiments or `group_id` for desktop experiments. Can be + overridden through a ``custom_enrollments_query``, but shouldn't. custom_enrollments_query (str): A full SQL query that will generate the `enrollments` common table expression used in the main query. 
The query must produce the columns @@ -465,7 +480,9 @@ def build_enrollments_query( sample_size = sample_size or 100 enrollments_query = custom_enrollments_query or self._build_enrollments_query( - time_limits, enrollments_query_type, sample_size + time_limits, + enrollments_query_type, + sample_size, ) if exposure_signal: @@ -474,10 +491,14 @@ def build_enrollments_query( ) else: exposure_query = custom_exposure_query or self._build_exposure_query( - time_limits, enrollments_query_type + time_limits, + enrollments_query_type, ) - segments_query = self._build_segments_query(segment_list, time_limits) + segments_query = self._build_segments_query( + segment_list, + time_limits, + ) return f""" WITH raw_enrollments AS ({enrollments_query}), @@ -486,10 +507,10 @@ def build_enrollments_query( SELECT se.*, - e.* EXCEPT (client_id, branch) + e.* EXCEPT ({self.analysis_unit}, branch) FROM segmented_enrollments se LEFT JOIN exposures e - USING (client_id, branch) + USING ({self.analysis_unit.value}, branch) """ def build_metrics_query( @@ -573,7 +594,7 @@ def build_metrics_query( x.num_exposure_events FROM exposures x RIGHT JOIN raw_enrollments e - USING (client_id, branch) + USING ({id_column}, branch) ) SELECT enrollments.*, @@ -586,6 +607,7 @@ def build_metrics_query( metrics_columns=",\n ".join(metrics_columns), metrics_joins="\n".join(metrics_joins), enrollments_table=enrollments_table, + id_column=self.analysis_unit.value, ) @staticmethod @@ -611,30 +633,47 @@ def _build_enrollments_query( ) -> str: """Return SQL to query a list of enrollments and their branches""" if enrollments_query_type == EnrollmentsQueryType.NORMANDY: - return self._build_enrollments_query_normandy(time_limits, sample_size) + return self._build_enrollments_query_normandy( + time_limits, + sample_size, + ) elif enrollments_query_type == EnrollmentsQueryType.GLEAN_EVENT: if not self.app_id: raise ValueError( "App ID must be defined for building Glean enrollments query" ) + if not 
self.analysis_unit == AnalysisUnit.CLIENT_ID: + raise ValueError( + "Glean enrollments currently only support client_id analysis units" + ) return self._build_enrollments_query_glean_event( time_limits, self.app_id, sample_size ) elif enrollments_query_type == EnrollmentsQueryType.FENIX_FALLBACK: + if not self.analysis_unit == AnalysisUnit.CLIENT_ID: + raise ValueError( + "Fenix fallback enrollments currently only support client_id analysis units" # noqa: E501 + ) return self._build_enrollments_query_fenix_baseline( time_limits, sample_size ) elif enrollments_query_type == EnrollmentsQueryType.CIRRUS: + if not self.analysis_unit == AnalysisUnit.CLIENT_ID: + raise ValueError( + "Cirrus enrollments currently only support client_id analysis units" + ) if not self.app_id: raise ValueError( "App ID must be defined for building Cirrus enrollments query" ) return self._build_enrollments_query_cirrus(time_limits, self.app_id) else: - raise ValueError + assert_never(enrollments_query_type) def _build_exposure_query( - self, time_limits: TimeLimits, exposure_query_type: EnrollmentsQueryType + self, + time_limits: TimeLimits, + exposure_query_type: EnrollmentsQueryType, ) -> str: """Return SQL to query a list of exposures and their branches""" if exposure_query_type == EnrollmentsQueryType.NORMANDY: @@ -644,12 +683,24 @@ def _build_exposure_query( raise ValueError( "App ID must be defined for building Glean exposures query" ) + if not self.analysis_unit == AnalysisUnit.CLIENT_ID: + raise ValueError( + "Glean exposures currently only support client_id analysis units" + ) return self._build_exposure_query_glean_event(time_limits, self.app_id) elif exposure_query_type == EnrollmentsQueryType.FENIX_FALLBACK: + if not self.analysis_unit == AnalysisUnit.CLIENT_ID: + raise ValueError( + "Fenix fallback exposures currently only support client_id analysis units" # noqa: E501 + ) return self._build_exposure_query_glean_event( time_limits, "org_mozilla_firefox" ) elif 
exposure_query_type == EnrollmentsQueryType.CIRRUS: + if not self.analysis_unit == AnalysisUnit.CLIENT_ID: + raise ValueError( + "Cirrus exposures currently only support client_id analysis units" + ) if not self.app_id: raise ValueError( "App ID must be defined for building Cirrus exposures query" @@ -661,30 +712,56 @@ def _build_exposure_query( event_category="cirrus_events", ) else: - raise ValueError + assert_never(exposure_query_type) def _build_enrollments_query_normandy( - self, time_limits: TimeLimits, sample_size: int = 100 + self, + time_limits: TimeLimits, + sample_size: int = 100, ) -> str: """Return SQL to query enrollments for a normandy experiment""" - return f""" - SELECT - e.client_id, - `mozfun.map.get_key`(e.event_map_values, 'branch') - AS branch, - MIN(e.submission_date) AS enrollment_date, - COUNT(e.submission_date) AS num_enrollment_events - FROM - `moz-fx-data-shared-prod.telemetry.events` e - WHERE - e.event_category = 'normandy' - AND e.event_method = 'enroll' - AND e.submission_date - BETWEEN '{time_limits.first_enrollment_date}' AND '{time_limits.last_enrollment_date}' - AND e.event_string_value = '{self.experiment_slug}' - AND e.sample_id < {sample_size} - GROUP BY e.client_id, branch - """ # noqa:E501 + if self.analysis_unit == AnalysisUnit.CLIENT_ID: + return f""" + SELECT + e.client_id, + `mozfun.map.get_key`(e.event_map_values, 'branch') + AS branch, + MIN(e.submission_date) AS enrollment_date, + COUNT(e.submission_date) AS num_enrollment_events + FROM + `moz-fx-data-shared-prod.telemetry.events` e + WHERE + e.event_category = 'normandy' + AND e.event_method = 'enroll' + AND e.submission_date + BETWEEN '{time_limits.first_enrollment_date}' AND '{time_limits.last_enrollment_date}' + AND e.event_string_value = '{self.experiment_slug}' + AND e.sample_id < {sample_size} + GROUP BY e.client_id, branch + """ # noqa:E501 + elif self.analysis_unit == AnalysisUnit.GROUP_ID: + # TODO: update this based on the final structure of the group_id + 
# within the events ping + return f""" + SELECT + e.profile_group_id, + `mozfun.map.get_key`(e.event_map_values, 'branch') + AS branch, + MIN(e.submission_date) AS enrollment_date, + COUNT(e.submission_date) AS num_enrollment_events + FROM + `moz-fx-data-shared-prod.telemetry.events` e + WHERE + e.event_category = 'normandy' + AND e.event_method = 'enroll' + AND e.submission_date + BETWEEN '{time_limits.first_enrollment_date}' AND '{time_limits.last_enrollment_date}' + AND e.event_string_value = '{self.experiment_slug}' + AND e.sample_id < {sample_size} + GROUP BY e.profile_group_id, branch + """ # noqa:E501 + else: + assert_never(self.analysis_unit) def _build_enrollments_query_fenix_baseline( self, time_limits: TimeLimits, sample_size: int = 100 @@ -699,6 +776,7 @@ def _build_enrollments_query_fenix_baseline( """ # Try to ignore users who enrolled early - but only consider a # 7 day window + return """ SELECT b.client_info.client_id AS client_id, @@ -741,6 +819,7 @@ def _build_enrollments_query_glean_event( ``ping_info.experiments`` to get a list of who is in what branch and when they enrolled. """ + return f""" SELECT events.client_info.client_id AS client_id, mozfun.map.get_key( @@ -774,6 +853,7 @@ def _build_enrollments_query_cirrus( ``ping_info.experiments`` to get a list of who is in what branch and when they enrolled. 
""" + return f""" SELECT mozfun.map.get_key(e.extra, "user_id") AS client_id, @@ -798,32 +878,62 @@ def _build_enrollments_query_cirrus( def _build_exposure_query_normandy(self, time_limits: TimeLimits) -> str: """Return SQL to query exposures for a normandy experiment""" - return f""" - SELECT - e.client_id, - e.branch, - min(e.submission_date) AS exposure_date, - COUNT(e.submission_date) AS num_exposure_events - FROM raw_enrollments re - LEFT JOIN ( + if self.analysis_unit == AnalysisUnit.CLIENT_ID: + return f""" SELECT - client_id, - `mozfun.map.get_key`(event_map_values, 'branchSlug') AS branch, - submission_date - FROM - `moz-fx-data-shared-prod.telemetry.events` - WHERE - event_category = 'normandy' - AND (event_method = 'exposure' OR event_method = 'expose') - AND submission_date - BETWEEN '{time_limits.first_enrollment_date}' AND '{time_limits.last_enrollment_date}' - AND event_string_value = '{self.experiment_slug}' - ) e - ON re.client_id = e.client_id AND - re.branch = e.branch AND - e.submission_date >= re.enrollment_date - GROUP BY e.client_id, e.branch - """ # noqa: E501 + e.client_id, + e.branch, + min(e.submission_date) AS exposure_date, + COUNT(e.submission_date) AS num_exposure_events + FROM raw_enrollments re + LEFT JOIN ( + SELECT + client_id, + `mozfun.map.get_key`(event_map_values, 'branchSlug') AS branch, + submission_date + FROM + `moz-fx-data-shared-prod.telemetry.events` + WHERE + event_category = 'normandy' + AND (event_method = 'exposure' OR event_method = 'expose') + AND submission_date + BETWEEN '{time_limits.first_enrollment_date}' AND '{time_limits.last_enrollment_date}' + AND event_string_value = '{self.experiment_slug}' + ) e + ON re.client_id = e.client_id AND + re.branch = e.branch AND + e.submission_date >= re.enrollment_date + GROUP BY e.client_id, e.branch + """ # noqa: E501 + elif self.analysis_unit == AnalysisUnit.GROUP_ID: + return f""" + SELECT + e.profile_group_id, + e.branch, + min(e.submission_date) AS exposure_date, + 
COUNT(e.submission_date) AS num_exposure_events + FROM raw_enrollments re + LEFT JOIN ( + SELECT + profile_group_id, + `mozfun.map.get_key`(event_map_values, 'branchSlug') AS branch, + submission_date + FROM + `moz-fx-data-shared-prod.telemetry.events` + WHERE + event_category = 'normandy' + AND (event_method = 'exposure' OR event_method = 'expose') + AND submission_date + BETWEEN '{time_limits.first_enrollment_date}' AND '{time_limits.last_enrollment_date}' + AND event_string_value = '{self.experiment_slug}' + ) e + ON re.profile_group_id = e.profile_group_id AND + re.branch = e.branch AND + e.submission_date >= re.enrollment_date + GROUP BY e.profile_group_id, e.branch + """ # noqa: E501 + else: + assert_never(self.analysis_unit) def _build_exposure_query_glean_event( self, @@ -894,13 +1004,15 @@ def _build_metrics_query_bits( self.experiment_slug, self.app_id, analysis_basis, + self.analysis_unit, exposure_signal, ) + metrics_joins.append( f""" LEFT JOIN ( - {query_for_metrics} - ) ds_{i} USING (client_id, branch, analysis_window_start, analysis_window_end) - """ + {query_for_metrics} + ) ds_{i} USING ({self.analysis_unit.value}, branch, analysis_window_start, analysis_window_end) + """ ) for m in ds_metrics[ds]: @@ -920,7 +1032,9 @@ def _partition_by_data_source( } def _build_segments_query( - self, segment_list: list[Segment], time_limits: TimeLimits + self, + segment_list: list[Segment], + time_limits: TimeLimits, ) -> str: """Build a query adding segment columns to the enrollments view. 
@@ -950,12 +1064,14 @@ def _build_segments_query( ) def _build_segments_query_bits( - self, segment_list: list[Segment], time_limits: TimeLimits + self, + segment_list: list[Segment | str], + time_limits: TimeLimits, ) -> tuple[list[str], list[str]]: """Return lists of SQL fragments corresponding to segments.""" # resolve segment slugs - segments = [] + segments: list[Segment] = [] for segment in segment_list: if isinstance(segment, str): segments.append(ConfigLoader.get_segment(segment, self.get_app_name())) @@ -974,7 +1090,7 @@ def _build_segments_query_bits( segments_joins.append( f""" LEFT JOIN ( {query_for_segments} - ) ds_{i} USING (client_id, branch) + ) ds_{i} USING ({self.analysis_unit.value}, branch) """ ) @@ -1245,6 +1361,7 @@ class TimeSeriesResult: fully_qualified_table_name = attr.ib(type=str) analysis_windows = attr.ib(type=list) + analysis_unit = attr.ib(type=AnalysisUnit, default=AnalysisUnit.CLIENT_ID) def get(self, bq_context: BigQueryContext, analysis_window) -> DataFrame: """Get the DataFrame for a specific analysis window. @@ -1354,7 +1471,7 @@ def _build_analysis_window_subset_query( in "the standard format" for a single analysis window. 
""" return f""" - SELECT * EXCEPT (client_id, analysis_window_start, analysis_window_end) + SELECT * EXCEPT ({self.analysis_unit.value}, analysis_window_start, analysis_window_end) FROM {self.fully_qualified_table_name} WHERE analysis_window_start = {analysis_window.start} AND analysis_window_end = {analysis_window.end} diff --git a/src/mozanalysis/metrics.py b/src/mozanalysis/metrics.py index ad5bf189..f7539fa8 100644 --- a/src/mozanalysis/metrics.py +++ b/src/mozanalysis/metrics.py @@ -5,6 +5,8 @@ from enum import Enum from typing import TYPE_CHECKING +from typing_extensions import assert_never +from mozanalysis.types import AnalysisUnit if TYPE_CHECKING: from mozanalysis.experiment import TimeLimits @@ -69,8 +71,9 @@ class DataSource: submission_date_column = attr.ib(default="submission_date", type=str) default_dataset = attr.ib(default=None, type=str | None) app_name = attr.ib(default=None, type=str | None) + group_id_column = attr.ib(default="profile_group_id", type=str) - EXPERIMENT_COLUMN_TYPES = (None, "simple", "native", "glean") + EXPERIMENT_COLUMN_TYPES = (None, "simple", "native", "glean", "main_v5") @experiments_column_type.validator def _check_experiments_column_type(self, attribute, value): @@ -141,6 +144,7 @@ def build_query( experiment_slug: str, from_expr_dataset: str | None = None, analysis_basis: str = AnalysisBasis.ENROLLMENTS, + analysis_unit: AnalysisUnit = AnalysisUnit.CLIENT_ID, exposure_signal=None, ) -> str: """Return a nearly-self contained SQL query. @@ -148,9 +152,15 @@ def build_query( This query does not define ``enrollments`` but otherwise could be executed to query all metrics from this data source. 
""" + if analysis_unit == AnalysisUnit.CLIENT_ID: + ds_id = self.client_id_column or "client_id" + elif analysis_unit == AnalysisUnit.GROUP_ID: + ds_id = self.group_id_column or "profile_group_id" + else: + assert_never(analysis_unit) return """ SELECT - e.client_id, + e.{id_column}, e.branch, e.analysis_window_start, e.analysis_window_end, @@ -159,20 +169,20 @@ def build_query( {metrics} FROM enrollments e LEFT JOIN {from_expr} ds - ON ds.{client_id} = e.client_id + ON ds.{ds_id} = e.{id_column} AND ds.{submission_date} BETWEEN '{fddr}' AND '{lddr}' AND ds.{submission_date} BETWEEN DATE_ADD(e.{date}, interval e.analysis_window_start day) AND DATE_ADD(e.{date}, interval e.analysis_window_end day) {ignore_pre_enroll_first_day} GROUP BY - e.client_id, + e.{id_column}, e.branch, e.num_exposure_events, e.exposure_date, e.analysis_window_start, e.analysis_window_end""".format( - client_id=self.client_id_column or "client_id", + ds_id=ds_id, submission_date=self.submission_date_column or "submission_date", from_expr=self.from_expr_for(from_expr_dataset), fddr=time_limits.first_date_data_required, @@ -181,13 +191,16 @@ def build_query( f"{m.select_expr.format(experiment_slug=experiment_slug)} AS {m.name}" for m in metric_list ), - date="exposure_date" - if analysis_basis == AnalysisBasis.EXPOSURES and exposure_signal is None - else "enrollment_date", + date=( + "exposure_date" + if analysis_basis == AnalysisBasis.EXPOSURES and exposure_signal is None + else "enrollment_date" + ), ignore_pre_enroll_first_day=self.experiments_column_expr.format( submission_date=self.submission_date_column or "submission_date", experiment_slug=experiment_slug, ), + id_column=analysis_unit.value, ) def build_query_targets( @@ -198,6 +211,7 @@ def build_query_targets( analysis_length: int, from_expr_dataset: str | None = None, continuous_enrollment: bool = False, + analysis_unit: AnalysisUnit = AnalysisUnit.CLIENT_ID, ) -> str: """Return a nearly-self contained SQL query that constructs the 
metrics query for targeting historical data without @@ -206,6 +220,11 @@ def build_query_targets( This query does not define ``targets`` but otherwise could be executed to query all metrics from this data source. """ + if analysis_unit != AnalysisUnit.CLIENT_ID: + raise ValueError( + "`build_query_targets` currently supports client_id analysis" + ) + return """ SELECT t.client_id, @@ -228,22 +247,24 @@ def build_query_targets( f"{m.select_expr.format(experiment_name=experiment_name)} AS {m.name}" for m in metric_list ), - date_clause=""" + date_clause=( + """ AND ds.{submission_date} BETWEEN '{fddr}' AND '{lddr}' AND ds.{submission_date} BETWEEN DATE_ADD(t.enrollment_date, interval t.analysis_window_start day) AND DATE_ADD(t.enrollment_date, interval t.analysis_window_end day)""".format( - submission_date=self.submission_date_column or "submission_date", - fddr=time_limits.first_date_data_required, - lddr=time_limits.last_date_data_required, - ) - if not continuous_enrollment - else """AND ds.{submission_date} BETWEEN + submission_date=self.submission_date_column or "submission_date", + fddr=time_limits.first_date_data_required, + lddr=time_limits.last_date_data_required, + ) + if not continuous_enrollment + else """AND ds.{submission_date} BETWEEN t.enrollment_date AND DATE_ADD(t.enrollment_date, interval {analysis_length} day) """.format( - submission_date=self.submission_date_column or "submission_date", - analysis_length=analysis_length, + submission_date=self.submission_date_column or "submission_date", + analysis_length=analysis_length, + ) ), ) diff --git a/src/mozanalysis/types.py b/src/mozanalysis/types.py index 2b7f425e..a41baba9 100644 --- a/src/mozanalysis/types.py +++ b/src/mozanalysis/types.py @@ -19,3 +19,8 @@ class Uplift(str, Enum): EstimatesByBranch = dict[BranchLabel, Estimates] CompareBranchesOutput = dict[ComparativeOption, EstimatesByBranch] + + +class AnalysisUnit(str, Enum): + CLIENT_ID = "client_id" + GROUP_ID = "profile_group_id" From 
995741339ee42ebbd03023e8e120b58ae9279938 Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Wed, 31 Jul 2024 15:20:53 -0700 Subject: [PATCH 02/40] clean up typing for experiment.py --- src/mozanalysis/config.py | 6 +++-- src/mozanalysis/experiment.py | 41 +++++++++++++++++++++++++---------- 2 files changed, 34 insertions(+), 13 deletions(-) diff --git a/src/mozanalysis/config.py b/src/mozanalysis/config.py index af0ef1e6..447934be 100644 --- a/src/mozanalysis/config.py +++ b/src/mozanalysis/config.py @@ -6,6 +6,9 @@ from metric_config_parser.config import ConfigCollection +from mozanalysis.metrics import Metric + + METRIC_HUB_JETSTREAM_REPO = "https://github.com/mozilla/metric-hub/tree/main/jetstream" @@ -42,12 +45,11 @@ def with_configs_from( self.configs.merge(config_collection) return self - def get_metric(self, metric_slug: str, app_name: str): + def get_metric(self, metric_slug: str, app_name: str) -> Metric: """Load a metric definition for the given app. Returns a :class:`mozanalysis.metrics.Metric` instance. 
""" - from mozanalysis.metrics import Metric metric_definition = self.configs.get_metric_definition(metric_slug, app_name) if metric_definition is None: diff --git a/src/mozanalysis/experiment.py b/src/mozanalysis/experiment.py index a4bf956e..68e3dc5b 100644 --- a/src/mozanalysis/experiment.py +++ b/src/mozanalysis/experiment.py @@ -5,7 +5,7 @@ import logging from enum import Enum -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast from typing_extensions import assert_never import attr @@ -16,6 +16,7 @@ from mozanalysis.metrics import AnalysisBasis, DataSource, Metric from mozanalysis.utils import add_days, date_sub, hash_ish from mozanalysis.types import AnalysisUnit +from mozanalysis.segments import Segment, SegmentDataSource if TYPE_CHECKING: from pandas import DataFrame @@ -32,6 +33,10 @@ class EnrollmentsQueryType(str, Enum): GLEAN_EVENT = "glean-event" +AggregatorType = Metric | Segment +DataSourceType = DataSource | SegmentDataSource + + @attr.s(frozen=True, slots=True) class Experiment: """Query experiment data; store experiment metadata. @@ -131,7 +136,7 @@ class Experiment: before UTC midnight. 
""" - experiment_slug = attr.ib() + experiment_slug = attr.ib(type=str) start_date = attr.ib() num_dates_enrollment = attr.ib(default=None) app_id = attr.ib(default=None) @@ -425,7 +430,7 @@ def get_time_series_data( fully_qualified_table_name=bq_context.fully_qualify_table_name( full_res_table_name ), - analysis_windows=time_limits.analysis_windows, + analysis_windows=list(time_limits.analysis_windows), ) def build_enrollments_query( @@ -975,20 +980,21 @@ def _build_exposure_query_glean_event( def _build_metrics_query_bits( self, - metric_list: list[Metric], + metric_list: list[Metric | str], time_limits: TimeLimits, analysis_basis=AnalysisBasis.ENROLLMENTS, exposure_signal=None, ) -> tuple[list[str], list[str]]: """Return lists of SQL fragments corresponding to metrics.""" - metrics = [] + metrics: list[Metric] = [] for metric in metric_list: if isinstance(metric, str): metrics.append(ConfigLoader.get_metric(metric, self.get_app_name())) else: metrics.append(metric) - ds_metrics = self._partition_by_data_source(metrics) + ds_metrics = self._partition_metrics_by_data_source(metrics) + ds_metrics = cast(dict[DataSource, list[Metric]], ds_metrics) ds_metrics = { ds: metrics + ds.get_sanity_metrics(self.experiment_slug) for ds, metrics in ds_metrics.items() @@ -1020,9 +1026,20 @@ def _build_metrics_query_bits( return metrics_columns, metrics_joins - def _partition_by_data_source( - self, metric_or_segment_list: list[Metric] | list[Segment] - ) -> dict[DataSource | SegmentDataSource, list[Metric | Segment]]: + def _partition_segments_by_data_source( + self, metric_or_segment_list: list[Segment] + ) -> dict[SegmentDataSource, list[Segment]]: + """Return a dict mapping data sources to metric/segment lists.""" + data_sources = {m.data_source for m in metric_or_segment_list} + + return { + ds: [m for m in metric_or_segment_list if m.data_source == ds] + for ds in data_sources + } + + def _partition_metrics_by_data_source( + self, metric_or_segment_list: list[Metric] + ) 
-> dict[DataSource, list[Metric]]: """Return a dict mapping data sources to metric/segment lists.""" data_sources = {m.data_source for m in metric_or_segment_list} @@ -1049,7 +1066,7 @@ def _build_segments_query( # arrive with "how segments work" as their first question. segments_columns, segments_joins = self._build_segments_query_bits( - segment_list or [], time_limits + cast(list[Segment | str], segment_list) or [], time_limits ) return """ @@ -1078,7 +1095,7 @@ def _build_segments_query_bits( else: segments.append(segment) - ds_segments = self._partition_by_data_source(segments) + ds_segments = self._partition_segments_by_data_source(segments) segments_columns = [] segments_joins = [] @@ -1257,6 +1274,8 @@ def for_ts( ] ) + analysis_windows = cast(tuple[AnalysisWindow], analysis_windows) + last_date_data_required = add_days( last_enrollment_date, analysis_windows[-1].end ) From 5bd1bdf3d91079f715c310941bf1fa6a55faf09a Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Wed, 31 Jul 2024 15:24:07 -0700 Subject: [PATCH 03/40] ruff --- src/mozanalysis/config.py | 1 - src/mozanalysis/experiment.py | 12 +++++------- src/mozanalysis/metrics.py | 2 ++ 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/mozanalysis/config.py b/src/mozanalysis/config.py index 447934be..2c423417 100644 --- a/src/mozanalysis/config.py +++ b/src/mozanalysis/config.py @@ -8,7 +8,6 @@ from mozanalysis.metrics import Metric - METRIC_HUB_JETSTREAM_REPO = "https://github.com/mozilla/metric-hub/tree/main/jetstream" diff --git a/src/mozanalysis/experiment.py b/src/mozanalysis/experiment.py index 68e3dc5b..b838baba 100644 --- a/src/mozanalysis/experiment.py +++ b/src/mozanalysis/experiment.py @@ -6,23 +6,21 @@ import logging from enum import Enum from typing import TYPE_CHECKING, cast -from typing_extensions import assert_never import attr +from typing_extensions import assert_never from mozanalysis import APPS from mozanalysis.bq import BigQueryContext, sanitize_table_name_for_bq from 
mozanalysis.config import ConfigLoader from mozanalysis.metrics import AnalysisBasis, DataSource, Metric -from mozanalysis.utils import add_days, date_sub, hash_ish -from mozanalysis.types import AnalysisUnit from mozanalysis.segments import Segment, SegmentDataSource +from mozanalysis.types import AnalysisUnit +from mozanalysis.utils import add_days, date_sub, hash_ish if TYPE_CHECKING: from pandas import DataFrame - from mozanalysis.segments import Segment, SegmentDataSource - logger = logging.getLogger(__name__) @@ -1018,7 +1016,7 @@ def _build_metrics_query_bits( f""" LEFT JOIN ( {query_for_metrics} ) ds_{i} USING ({self.analysis_unit.value}, branch, analysis_window_start, analysis_window_end) - """ + """ # noqa: E501 ) for m in ds_metrics[ds]: @@ -1494,7 +1492,7 @@ def _build_analysis_window_subset_query( FROM {self.fully_qualified_table_name} WHERE analysis_window_start = {analysis_window.start} AND analysis_window_end = {analysis_window.end} - """ + """ # noqa: E501 def _build_aggregated_data_query( self, metric_list: list[Metric], aggregate_function: str diff --git a/src/mozanalysis/metrics.py b/src/mozanalysis/metrics.py index f7539fa8..a0107546 100644 --- a/src/mozanalysis/metrics.py +++ b/src/mozanalysis/metrics.py @@ -5,7 +5,9 @@ from enum import Enum from typing import TYPE_CHECKING + from typing_extensions import assert_never + from mozanalysis.types import AnalysisUnit if TYPE_CHECKING: From c2c7cbb98a040e55193a4cfa1d7774bbcd072110 Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Wed, 31 Jul 2024 15:58:48 -0700 Subject: [PATCH 04/40] added some tests --- src/mozanalysis/experiment.py | 4 +- src/mozanalysis/metrics.py | 3 +- tests/test_experiment.py | 309 +++++++++++++++++++++++++++++++++- 3 files changed, 305 insertions(+), 11 deletions(-) diff --git a/src/mozanalysis/experiment.py b/src/mozanalysis/experiment.py index b838baba..df10b73d 100644 --- a/src/mozanalysis/experiment.py +++ b/src/mozanalysis/experiment.py @@ -587,9 +587,7 @@ def 
build_metrics_query( FROM `{enrollments_table}` e CROSS JOIN analysis_windows aw ), - exposures AS ( - {exposure_query} - ), + exposures AS ({exposure_query}), enrollments AS ( SELECT e.* EXCEPT (exposure_date, num_exposure_events), diff --git a/src/mozanalysis/metrics.py b/src/mozanalysis/metrics.py index a0107546..659b06c2 100644 --- a/src/mozanalysis/metrics.py +++ b/src/mozanalysis/metrics.py @@ -160,8 +160,7 @@ def build_query( ds_id = self.group_id_column or "profile_group_id" else: assert_never(analysis_unit) - return """ - SELECT + return """SELECT e.{id_column}, e.branch, e.analysis_window_start, diff --git a/tests/test_experiment.py b/tests/test_experiment.py index 670ee4de..b4774b71 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -18,6 +18,7 @@ from mozanalysis.exposure import ExposureSignal from mozanalysis.metrics import AnalysisBasis, DataSource, Metric from mozanalysis.segments import Segment, SegmentDataSource +from mozanalysis.types import AnalysisUnit def test_time_limits_validates(): @@ -277,8 +278,11 @@ def test_analysis_window_validates_end(): AnalysisWindow(5, 4) -def test_query_not_detectably_malformed(): - exp = Experiment("slug", "2019-01-01", 8) +@pytest.mark.parametrize( + "analysis_unit", [AnalysisUnit.CLIENT_ID, AnalysisUnit.GROUP_ID] +) +def test_query_not_detectably_malformed(analysis_unit: AnalysisUnit): + exp = Experiment("slug", "2019-01-01", 8, analysis_unit=analysis_unit) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -296,6 +300,11 @@ def test_query_not_detectably_malformed(): sql_lint(enrollments_sql) assert "sample_id < None" not in enrollments_sql + if analysis_unit == AnalysisUnit.CLIENT_ID: + assert "client_id" in enrollments_sql + elif analysis_unit == AnalysisUnit.GROUP_ID: + assert "profile_group_id" in enrollments_sql + metrics_sql = exp.build_metrics_query( metric_list=[], time_limits=tl, @@ -304,9 +313,17 @@ def test_query_not_detectably_malformed(): sql_lint(metrics_sql) + if 
analysis_unit == AnalysisUnit.CLIENT_ID: + assert "client_id" in metrics_sql + elif analysis_unit == AnalysisUnit.GROUP_ID: + assert "profile_group_id" in metrics_sql -def test_megaquery_not_detectably_malformed(): - exp = Experiment("slug", "2019-01-01", 8) + +@pytest.mark.parametrize( + "analysis_unit", [AnalysisUnit.CLIENT_ID, AnalysisUnit.GROUP_ID] +) +def test_megaquery_not_detectably_malformed(analysis_unit: AnalysisUnit): + exp = Experiment("slug", "2019-01-01", 8, analysis_unit=analysis_unit) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -321,6 +338,11 @@ def test_megaquery_not_detectably_malformed(): sql_lint(enrollments_sql) + if analysis_unit == AnalysisUnit.CLIENT_ID: + assert "client_id" in enrollments_sql + elif analysis_unit == AnalysisUnit.GROUP_ID: + assert "profile_group_id" in enrollments_sql + metrics_sql = exp.build_metrics_query( metric_list=desktop_metrics, time_limits=tl, @@ -329,9 +351,17 @@ def test_megaquery_not_detectably_malformed(): sql_lint(metrics_sql) + if analysis_unit == AnalysisUnit.CLIENT_ID: + assert "client_id" in metrics_sql + elif analysis_unit == AnalysisUnit.GROUP_ID: + assert "profile_group_id" in metrics_sql -def test_segments_megaquery_not_detectably_malformed(): - exp = Experiment("slug", "2019-01-01", 8) + +@pytest.mark.parametrize( + "analysis_unit", [AnalysisUnit.CLIENT_ID, AnalysisUnit.GROUP_ID] +) +def test_segments_megaquery_not_detectably_malformed(analysis_unit: AnalysisUnit): + exp = Experiment("slug", "2019-01-01", 8, analysis_unit=analysis_unit) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -850,3 +880,270 @@ def test_resolve_missing_column_names(): ) assert "None" not in metric_sql + + +def test_enrollments_query_explicit_client_id(): + exp = Experiment("slug", "2019-01-01", 8) + + tl = TimeLimits.for_ts( + first_enrollment_date="2019-01-01", + last_date_full_data="2019-03-01", + time_series_period="weekly", + num_dates_enrollment=8, + ) + + enrollments_sql = 
exp.build_enrollments_query( + time_limits=tl, enrollments_query_type=EnrollmentsQueryType.NORMANDY + ) + + sql_lint(enrollments_sql) + + expected = """ + WITH raw_enrollments AS ( + SELECT + e.client_id, + `mozfun.map.get_key`(e.event_map_values, 'branch') + AS branch, + MIN(e.submission_date) AS enrollment_date, + COUNT(e.submission_date) AS num_enrollment_events + FROM + `moz-fx-data-shared-prod.telemetry.events` e + WHERE + e.event_category = 'normandy' + AND e.event_method = 'enroll' + AND e.submission_date + BETWEEN '2019-01-01' AND '2019-01-08' + AND e.event_string_value = 'slug' + AND e.sample_id < 100 + GROUP BY e.client_id, branch + ), + segmented_enrollments AS ( + SELECT + raw_enrollments.*, + + FROM raw_enrollments + + ), + exposures AS ( + SELECT + e.client_id, + e.branch, + min(e.submission_date) AS exposure_date, + COUNT(e.submission_date) AS num_exposure_events + FROM raw_enrollments re + LEFT JOIN ( + SELECT + client_id, + `mozfun.map.get_key`(event_map_values, 'branchSlug') AS branch, + submission_date + FROM + `moz-fx-data-shared-prod.telemetry.events` + WHERE + event_category = 'normandy' + AND (event_method = 'exposure' OR event_method = 'expose') + AND submission_date + BETWEEN '2019-01-01' AND '2019-01-08' + AND event_string_value = 'slug' + ) e + ON re.client_id = e.client_id AND + re.branch = e.branch AND + e.submission_date >= re.enrollment_date + GROUP BY e.client_id, e.branch + ) + + SELECT + se.*, + e.* EXCEPT (client_id, branch) + FROM segmented_enrollments se + LEFT JOIN exposures e + USING (client_id, branch) + """ + + assert enrollments_sql == expected + + metrics_sql = exp.build_metrics_query( + metric_list=[ + metric for metric in desktop_metrics if metric.name == "active_hours" + ], + time_limits=tl, + enrollments_table="enrollments", + ) + + sql_lint(metrics_sql) + + +def test_metrics_query_explicit_client_id(): + exp = Experiment("slug", "2019-01-01", 8) + + tl = TimeLimits.for_ts( + first_enrollment_date="2019-01-01", + 
last_date_full_data="2019-03-01", + time_series_period="weekly", + num_dates_enrollment=8, + ) + + enrollments_sql = exp.build_enrollments_query( + time_limits=tl, enrollments_query_type=EnrollmentsQueryType.NORMANDY + ) + + sql_lint(enrollments_sql) + + metrics_sql = exp.build_metrics_query( + metric_list=[ + metric for metric in desktop_metrics if metric.name == "active_hours" + ], + time_limits=tl, + enrollments_table="enrollments", + ) + + sql_lint(metrics_sql) + + expected = """ + WITH analysis_windows AS ( + (SELECT 0 AS analysis_window_start, 6 AS analysis_window_end) + UNION ALL + (SELECT 7 AS analysis_window_start, 13 AS analysis_window_end) + UNION ALL + (SELECT 14 AS analysis_window_start, 20 AS analysis_window_end) + UNION ALL + (SELECT 21 AS analysis_window_start, 27 AS analysis_window_end) + UNION ALL + (SELECT 28 AS analysis_window_start, 34 AS analysis_window_end) + UNION ALL + (SELECT 35 AS analysis_window_start, 41 AS analysis_window_end) + UNION ALL + (SELECT 42 AS analysis_window_start, 48 AS analysis_window_end) + ), + raw_enrollments AS ( + -- needed by "exposures" sub query + SELECT + e.*, + aw.* + FROM `enrollments` e + CROSS JOIN analysis_windows aw + ), + exposures AS ( + SELECT + * + FROM raw_enrollments e + ), + enrollments AS ( + SELECT + e.* EXCEPT (exposure_date, num_exposure_events), + x.exposure_date, + x.num_exposure_events + FROM exposures x + RIGHT JOIN raw_enrollments e + USING (client_id, branch) + ) + SELECT + enrollments.*, + ds_0.active_hours + FROM enrollments + LEFT JOIN ( + SELECT + e.client_id, + e.branch, + e.analysis_window_start, + e.analysis_window_end, + e.num_exposure_events, + e.exposure_date, + COALESCE(SUM(active_hours_sum), 0) AS active_hours + FROM enrollments e + LEFT JOIN mozdata.telemetry.clients_daily ds + ON ds.client_id = e.client_id + AND ds.submission_date BETWEEN '2019-01-01' AND '2019-02-25' + AND ds.submission_date BETWEEN + DATE_ADD(e.enrollment_date, interval e.analysis_window_start day) + AND 
DATE_ADD(e.enrollment_date, interval e.analysis_window_end day) + + GROUP BY + e.client_id, + e.branch, + e.num_exposure_events, + e.exposure_date, + e.analysis_window_start, + e.analysis_window_end + ) ds_0 USING (client_id, branch, analysis_window_start, analysis_window_end)""" + + assert expected == metrics_sql.rstrip() + + +def test_enrollments_query_explicit_group_id(): + exp = Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.GROUP_ID) + + tl = TimeLimits.for_ts( + first_enrollment_date="2019-01-01", + last_date_full_data="2019-03-01", + time_series_period="weekly", + num_dates_enrollment=8, + ) + + enrollments_sql = exp.build_enrollments_query( + time_limits=tl, enrollments_query_type=EnrollmentsQueryType.NORMANDY + ) + + sql_lint(enrollments_sql) + + expected = """ + WITH raw_enrollments AS ( + SELECT + e.profile_group_id, + `mozfun.map.get_key`(e.event_map_values, 'branch') + AS branch, + MIN(e.submission_date) AS enrollment_date, + COUNT(e.submission_date) AS num_enrollment_events + FROM + `moz-fx-data-shared-prod.telemetry.events` e + WHERE + e.event_category = 'normandy' + AND e.event_method = 'enroll' + AND e.submission_date + BETWEEN '2019-01-01' AND '2019-01-08' + AND e.event_string_value = 'slug' + AND e.sample_id < 100 + GROUP BY e.profile_group_id, branch + ), + segmented_enrollments AS ( + SELECT + raw_enrollments.*, + + FROM raw_enrollments + + ), + exposures AS ( + SELECT + e.profile_group_id, + e.branch, + min(e.submission_date) AS exposure_date, + COUNT(e.submission_date) AS num_exposure_events + FROM raw_enrollments re + LEFT JOIN ( + SELECT + profile_group_id, + `mozfun.map.get_key`(event_map_values, 'branchSlug') AS branch, + submission_date + FROM + `moz-fx-data-shared-prod.telemetry.events` + WHERE + event_category = 'normandy' + AND (event_method = 'exposure' OR event_method = 'expose') + AND submission_date + BETWEEN '2019-01-01' AND '2019-01-08' + AND event_string_value = 'slug' + ) e + ON re.profile_group_id = 
e.profile_group_id AND + re.branch = e.branch AND + e.submission_date >= re.enrollment_date + GROUP BY e.profile_group_id, e.branch + ) + + SELECT + se.*, + e.* EXCEPT (profile_group_id, branch) + FROM segmented_enrollments se + LEFT JOIN exposures e + USING (profile_group_id, branch) + """ + + assert enrollments_sql == expected From 52c6087f37ecb042a5697534a41164b06d0e498e Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Thu, 1 Aug 2024 07:30:30 -0700 Subject: [PATCH 05/40] dedent to lint --- tests/test_experiment.py | 376 ++++++++++++++++++++------------------- 1 file changed, 189 insertions(+), 187 deletions(-) diff --git a/tests/test_experiment.py b/tests/test_experiment.py index b4774b71..788a83f4 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -1,3 +1,5 @@ +from textwrap import dedent + import pytest from helpers.cheap_lint import sql_lint # local helper file from helpers.config_loader_lists import ( @@ -899,67 +901,67 @@ def test_enrollments_query_explicit_client_id(): sql_lint(enrollments_sql) expected = """ - WITH raw_enrollments AS ( - SELECT - e.client_id, - `mozfun.map.get_key`(e.event_map_values, 'branch') - AS branch, - MIN(e.submission_date) AS enrollment_date, - COUNT(e.submission_date) AS num_enrollment_events - FROM - `moz-fx-data-shared-prod.telemetry.events` e - WHERE - e.event_category = 'normandy' - AND e.event_method = 'enroll' - AND e.submission_date - BETWEEN '2019-01-01' AND '2019-01-08' - AND e.event_string_value = 'slug' - AND e.sample_id < 100 - GROUP BY e.client_id, branch - ), - segmented_enrollments AS ( - SELECT - raw_enrollments.*, - - FROM raw_enrollments - + WITH raw_enrollments AS ( + SELECT + e.client_id, + `mozfun.map.get_key`(e.event_map_values, 'branch') + AS branch, + MIN(e.submission_date) AS enrollment_date, + COUNT(e.submission_date) AS num_enrollment_events + FROM + `moz-fx-data-shared-prod.telemetry.events` e + WHERE + e.event_category = 'normandy' + AND e.event_method = 'enroll' + AND 
e.submission_date + BETWEEN '2019-01-01' AND '2019-01-08' + AND e.event_string_value = 'slug' + AND e.sample_id < 100 + GROUP BY e.client_id, branch ), - exposures AS ( - SELECT - e.client_id, - e.branch, - min(e.submission_date) AS exposure_date, - COUNT(e.submission_date) AS num_exposure_events - FROM raw_enrollments re - LEFT JOIN ( - SELECT - client_id, - `mozfun.map.get_key`(event_map_values, 'branchSlug') AS branch, - submission_date - FROM - `moz-fx-data-shared-prod.telemetry.events` - WHERE - event_category = 'normandy' - AND (event_method = 'exposure' OR event_method = 'expose') - AND submission_date - BETWEEN '2019-01-01' AND '2019-01-08' - AND event_string_value = 'slug' - ) e - ON re.client_id = e.client_id AND - re.branch = e.branch AND - e.submission_date >= re.enrollment_date - GROUP BY e.client_id, e.branch - ) - - SELECT - se.*, - e.* EXCEPT (client_id, branch) - FROM segmented_enrollments se - LEFT JOIN exposures e - USING (client_id, branch) - """ - - assert enrollments_sql == expected + segmented_enrollments AS ( +SELECT + raw_enrollments.*, + +FROM raw_enrollments + +), + exposures AS ( + SELECT + e.client_id, + e.branch, + min(e.submission_date) AS exposure_date, + COUNT(e.submission_date) AS num_exposure_events + FROM raw_enrollments re + LEFT JOIN ( + SELECT + client_id, + `mozfun.map.get_key`(event_map_values, 'branchSlug') AS branch, + submission_date + FROM + `moz-fx-data-shared-prod.telemetry.events` + WHERE + event_category = 'normandy' + AND (event_method = 'exposure' OR event_method = 'expose') + AND submission_date + BETWEEN '2019-01-01' AND '2019-01-08' + AND event_string_value = 'slug' + ) e + ON re.client_id = e.client_id AND + re.branch = e.branch AND + e.submission_date >= re.enrollment_date + GROUP BY e.client_id, e.branch + ) + + SELECT + se.*, + e.* EXCEPT (client_id, branch) + FROM segmented_enrollments se + LEFT JOIN exposures e + USING (client_id, branch) +""" + + assert dedent(enrollments_sql) == expected metrics_sql = 
exp.build_metrics_query( metric_list=[ @@ -999,74 +1001,74 @@ def test_metrics_query_explicit_client_id(): sql_lint(metrics_sql) expected = """ - WITH analysis_windows AS ( - (SELECT 0 AS analysis_window_start, 6 AS analysis_window_end) - UNION ALL - (SELECT 7 AS analysis_window_start, 13 AS analysis_window_end) - UNION ALL - (SELECT 14 AS analysis_window_start, 20 AS analysis_window_end) - UNION ALL - (SELECT 21 AS analysis_window_start, 27 AS analysis_window_end) - UNION ALL - (SELECT 28 AS analysis_window_start, 34 AS analysis_window_end) - UNION ALL - (SELECT 35 AS analysis_window_start, 41 AS analysis_window_end) - UNION ALL - (SELECT 42 AS analysis_window_start, 48 AS analysis_window_end) - ), - raw_enrollments AS ( - -- needed by "exposures" sub query - SELECT - e.*, - aw.* - FROM `enrollments` e - CROSS JOIN analysis_windows aw - ), - exposures AS ( - SELECT - * - FROM raw_enrollments e - ), - enrollments AS ( - SELECT - e.* EXCEPT (exposure_date, num_exposure_events), - x.exposure_date, - x.num_exposure_events - FROM exposures x - RIGHT JOIN raw_enrollments e - USING (client_id, branch) - ) +WITH analysis_windows AS ( + (SELECT 0 AS analysis_window_start, 6 AS analysis_window_end) +UNION ALL +(SELECT 7 AS analysis_window_start, 13 AS analysis_window_end) +UNION ALL +(SELECT 14 AS analysis_window_start, 20 AS analysis_window_end) +UNION ALL +(SELECT 21 AS analysis_window_start, 27 AS analysis_window_end) +UNION ALL +(SELECT 28 AS analysis_window_start, 34 AS analysis_window_end) +UNION ALL +(SELECT 35 AS analysis_window_start, 41 AS analysis_window_end) +UNION ALL +(SELECT 42 AS analysis_window_start, 48 AS analysis_window_end) +), +raw_enrollments AS ( + -- needed by "exposures" sub query + SELECT + e.*, + aw.* + FROM `enrollments` e + CROSS JOIN analysis_windows aw +), +exposures AS ( SELECT - enrollments.*, - ds_0.active_hours - FROM enrollments - LEFT JOIN ( - SELECT - e.client_id, - e.branch, - e.analysis_window_start, - e.analysis_window_end, - 
e.num_exposure_events, - e.exposure_date, - COALESCE(SUM(active_hours_sum), 0) AS active_hours - FROM enrollments e - LEFT JOIN mozdata.telemetry.clients_daily ds - ON ds.client_id = e.client_id - AND ds.submission_date BETWEEN '2019-01-01' AND '2019-02-25' - AND ds.submission_date BETWEEN - DATE_ADD(e.enrollment_date, interval e.analysis_window_start day) - AND DATE_ADD(e.enrollment_date, interval e.analysis_window_end day) - - GROUP BY - e.client_id, - e.branch, - e.num_exposure_events, - e.exposure_date, - e.analysis_window_start, - e.analysis_window_end - ) ds_0 USING (client_id, branch, analysis_window_start, analysis_window_end)""" - - assert expected == metrics_sql.rstrip() + * + FROM raw_enrollments e + ), +enrollments AS ( + SELECT + e.* EXCEPT (exposure_date, num_exposure_events), + x.exposure_date, + x.num_exposure_events + FROM exposures x + RIGHT JOIN raw_enrollments e + USING (client_id, branch) +) +SELECT + enrollments.*, + ds_0.active_hours +FROM enrollments + LEFT JOIN ( + SELECT + e.client_id, + e.branch, + e.analysis_window_start, + e.analysis_window_end, + e.num_exposure_events, + e.exposure_date, + COALESCE(SUM(active_hours_sum), 0) AS active_hours +FROM enrollments e + LEFT JOIN mozdata.telemetry.clients_daily ds + ON ds.client_id = e.client_id + AND ds.submission_date BETWEEN '2019-01-01' AND '2019-02-25' + AND ds.submission_date BETWEEN + DATE_ADD(e.enrollment_date, interval e.analysis_window_start day) + AND DATE_ADD(e.enrollment_date, interval e.analysis_window_end day) + +GROUP BY + e.client_id, + e.branch, + e.num_exposure_events, + e.exposure_date, + e.analysis_window_start, + e.analysis_window_end + ) ds_0 USING (client_id, branch, analysis_window_start, analysis_window_end)""" + + assert expected == dedent(metrics_sql.rstrip()) def test_enrollments_query_explicit_group_id(): @@ -1086,64 +1088,64 @@ def test_enrollments_query_explicit_group_id(): sql_lint(enrollments_sql) expected = """ - WITH raw_enrollments AS ( - SELECT - 
e.profile_group_id, - `mozfun.map.get_key`(e.event_map_values, 'branch') - AS branch, - MIN(e.submission_date) AS enrollment_date, - COUNT(e.submission_date) AS num_enrollment_events - FROM - `moz-fx-data-shared-prod.telemetry.events` e - WHERE - e.event_category = 'normandy' - AND e.event_method = 'enroll' - AND e.submission_date - BETWEEN '2019-01-01' AND '2019-01-08' - AND e.event_string_value = 'slug' - AND e.sample_id < 100 - GROUP BY e.profile_group_id, branch - ), - segmented_enrollments AS ( - SELECT - raw_enrollments.*, - - FROM raw_enrollments - + WITH raw_enrollments AS ( + SELECT + e.profile_group_id, + `mozfun.map.get_key`(e.event_map_values, 'branch') + AS branch, + MIN(e.submission_date) AS enrollment_date, + COUNT(e.submission_date) AS num_enrollment_events + FROM + `moz-fx-data-shared-prod.telemetry.events` e + WHERE + e.event_category = 'normandy' + AND e.event_method = 'enroll' + AND e.submission_date + BETWEEN '2019-01-01' AND '2019-01-08' + AND e.event_string_value = 'slug' + AND e.sample_id < 100 + GROUP BY e.profile_group_id, branch ), - exposures AS ( - SELECT - e.profile_group_id, - e.branch, - min(e.submission_date) AS exposure_date, - COUNT(e.submission_date) AS num_exposure_events - FROM raw_enrollments re - LEFT JOIN ( - SELECT - profile_group_id, - `mozfun.map.get_key`(event_map_values, 'branchSlug') AS branch, - submission_date - FROM - `moz-fx-data-shared-prod.telemetry.events` - WHERE - event_category = 'normandy' - AND (event_method = 'exposure' OR event_method = 'expose') - AND submission_date - BETWEEN '2019-01-01' AND '2019-01-08' - AND event_string_value = 'slug' - ) e - ON re.profile_group_id = e.profile_group_id AND - re.branch = e.branch AND - e.submission_date >= re.enrollment_date - GROUP BY e.profile_group_id, e.branch - ) - - SELECT - se.*, - e.* EXCEPT (profile_group_id, branch) - FROM segmented_enrollments se - LEFT JOIN exposures e - USING (profile_group_id, branch) - """ - - assert enrollments_sql == expected + 
segmented_enrollments AS ( +SELECT + raw_enrollments.*, + +FROM raw_enrollments + +), + exposures AS ( + SELECT + e.profile_group_id, + e.branch, + min(e.submission_date) AS exposure_date, + COUNT(e.submission_date) AS num_exposure_events + FROM raw_enrollments re + LEFT JOIN ( + SELECT + profile_group_id, + `mozfun.map.get_key`(event_map_values, 'branchSlug') AS branch, + submission_date + FROM + `moz-fx-data-shared-prod.telemetry.events` + WHERE + event_category = 'normandy' + AND (event_method = 'exposure' OR event_method = 'expose') + AND submission_date + BETWEEN '2019-01-01' AND '2019-01-08' + AND event_string_value = 'slug' + ) e + ON re.profile_group_id = e.profile_group_id AND + re.branch = e.branch AND + e.submission_date >= re.enrollment_date + GROUP BY e.profile_group_id, e.branch + ) + + SELECT + se.*, + e.* EXCEPT (profile_group_id, branch) + FROM segmented_enrollments se + LEFT JOIN exposures e + USING (profile_group_id, branch) +""" + + assert dedent(enrollments_sql) == expected From 404be93bb26ba534a499393216370667bb2b1b09 Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Thu, 1 Aug 2024 07:39:00 -0700 Subject: [PATCH 06/40] added unit tests --- src/mozanalysis/experiment.py | 28 ++++++----- tests/test_experiment.py | 89 +++++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+), 12 deletions(-) diff --git a/src/mozanalysis/experiment.py b/src/mozanalysis/experiment.py index df10b73d..59945fd7 100644 --- a/src/mozanalysis/experiment.py +++ b/src/mozanalysis/experiment.py @@ -35,6 +35,10 @@ class EnrollmentsQueryType(str, Enum): DataSourceType = DataSource | SegmentDataSource +class IncompatibleAnalysisUnit(ValueError): + pass + + @attr.s(frozen=True, slots=True) class Experiment: """Query experiment data; store experiment metadata. 
@@ -644,7 +648,7 @@ def _build_enrollments_query( "App ID must be defined for building Glean enrollments query" ) if not self.analysis_unit == AnalysisUnit.CLIENT_ID: - raise ValueError( + raise IncompatibleAnalysisUnit( "Glean enrollments currently only support client_id analysis units" ) return self._build_enrollments_query_glean_event( @@ -652,21 +656,21 @@ def _build_enrollments_query( ) elif enrollments_query_type == EnrollmentsQueryType.FENIX_FALLBACK: if not self.analysis_unit == AnalysisUnit.CLIENT_ID: - raise ValueError( + raise IncompatibleAnalysisUnit( "Fenix fallback enrollments currently only support client_id analysis units" # noqa: E501 ) return self._build_enrollments_query_fenix_baseline( time_limits, sample_size ) elif enrollments_query_type == EnrollmentsQueryType.CIRRUS: - if not self.analysis_unit == AnalysisUnit.CLIENT_ID: - raise ValueError( - "Cirrus enrollments currently only support client_id analysis units" - ) if not self.app_id: raise ValueError( "App ID must be defined for building Cirrus enrollments query" ) + if not self.analysis_unit == AnalysisUnit.CLIENT_ID: + raise IncompatibleAnalysisUnit( + "Cirrus enrollments currently only support client_id analysis units" + ) return self._build_enrollments_query_cirrus(time_limits, self.app_id) else: assert_never(enrollments_query_type) @@ -685,27 +689,27 @@ def _build_exposure_query( "App ID must be defined for building Glean exposures query" ) if not self.analysis_unit == AnalysisUnit.CLIENT_ID: - raise ValueError( + raise IncompatibleAnalysisUnit( "Glean exposures currently only support client_id analysis units" ) return self._build_exposure_query_glean_event(time_limits, self.app_id) elif exposure_query_type == EnrollmentsQueryType.FENIX_FALLBACK: if not self.analysis_unit == AnalysisUnit.CLIENT_ID: - raise ValueError( + raise IncompatibleAnalysisUnit( "Fenix fallback exposures currently only support client_id analysis units" # noqa: E501 ) return self._build_exposure_query_glean_event( 
time_limits, "org_mozilla_firefox" ) elif exposure_query_type == EnrollmentsQueryType.CIRRUS: - if not self.analysis_unit == AnalysisUnit.CLIENT_ID: - raise ValueError( - "Cirrus exposures currently only support client_id analysis units" - ) if not self.app_id: raise ValueError( "App ID must be defined for building Cirrus exposures query" ) + if not self.analysis_unit == AnalysisUnit.CLIENT_ID: + raise IncompatibleAnalysisUnit( + "Cirrus exposures currently only support client_id analysis units" + ) return self._build_exposure_query_glean_event( time_limits, self.app_id, diff --git a/tests/test_experiment.py b/tests/test_experiment.py index 788a83f4..3f18de10 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -15,6 +15,7 @@ AnalysisWindow, EnrollmentsQueryType, Experiment, + IncompatibleAnalysisUnit, TimeLimits, ) from mozanalysis.exposure import ExposureSignal @@ -1149,3 +1150,91 @@ def test_enrollments_query_explicit_group_id(): """ assert dedent(enrollments_sql) == expected + + +def test_glean_group_id_incompatible(): + exp = Experiment( + "slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.GROUP_ID, app_id="test_app" + ) + + tl = TimeLimits.for_ts( + first_enrollment_date="2019-01-01", + last_date_full_data="2019-03-01", + time_series_period="weekly", + num_dates_enrollment=8, + ) + + with pytest.raises(IncompatibleAnalysisUnit): + exp.build_enrollments_query( + time_limits=tl, enrollments_query_type=EnrollmentsQueryType.GLEAN_EVENT + ) + + +def test_glean_missing_app_id(): + exp = Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.GROUP_ID) + + tl = TimeLimits.for_ts( + first_enrollment_date="2019-01-01", + last_date_full_data="2019-03-01", + time_series_period="weekly", + num_dates_enrollment=8, + ) + + with pytest.raises( + ValueError, match="App ID must be defined for building Glean enrollments query" + ): + exp.build_enrollments_query( + time_limits=tl, enrollments_query_type=EnrollmentsQueryType.GLEAN_EVENT + ) + + +def 
test_cirrus_group_id_incompatible(): + exp = Experiment( + "slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.GROUP_ID, app_id="test_app" + ) + + tl = TimeLimits.for_ts( + first_enrollment_date="2019-01-01", + last_date_full_data="2019-03-01", + time_series_period="weekly", + num_dates_enrollment=8, + ) + + with pytest.raises(IncompatibleAnalysisUnit): + exp.build_enrollments_query( + time_limits=tl, enrollments_query_type=EnrollmentsQueryType.CIRRUS + ) + + +def test_cirrus_missing_app_id(): + exp = Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.GROUP_ID) + + tl = TimeLimits.for_ts( + first_enrollment_date="2019-01-01", + last_date_full_data="2019-03-01", + time_series_period="weekly", + num_dates_enrollment=8, + ) + + with pytest.raises( + ValueError, match="App ID must be defined for building Cirrus enrollments query" + ): + exp.build_enrollments_query( + time_limits=tl, enrollments_query_type=EnrollmentsQueryType.CIRRUS + ) + + +def test_fenix_group_id_incompatible(): + exp = Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.GROUP_ID) + + tl = TimeLimits.for_ts( + first_enrollment_date="2019-01-01", + last_date_full_data="2019-03-01", + time_series_period="weekly", + num_dates_enrollment=8, + ) + + with pytest.raises(IncompatibleAnalysisUnit): + exp.build_enrollments_query( + time_limits=tl, enrollments_query_type=EnrollmentsQueryType.FENIX_FALLBACK + ) From bc40cf1c83803d05b66d2afcc2221ac220a75956 Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Thu, 1 Aug 2024 07:42:06 -0700 Subject: [PATCH 07/40] more tests --- tests/test_experiment.py | 52 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tests/test_experiment.py b/tests/test_experiment.py index 3f18de10..582231fe 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -1170,6 +1170,24 @@ def test_glean_group_id_incompatible(): ) +def test_glean_group_id_incompatible_exposures(): + exp = Experiment( + "slug", "2019-01-01", 
8, analysis_unit=AnalysisUnit.GROUP_ID, app_id="test_app" + ) + + tl = TimeLimits.for_ts( + first_enrollment_date="2019-01-01", + last_date_full_data="2019-03-01", + time_series_period="weekly", + num_dates_enrollment=8, + ) + + with pytest.raises(IncompatibleAnalysisUnit): + exp._build_exposure_query( + time_limits=tl, exposure_query_type=EnrollmentsQueryType.GLEAN_EVENT + ) + + def test_glean_missing_app_id(): exp = Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.GROUP_ID) @@ -1206,6 +1224,24 @@ def test_cirrus_group_id_incompatible(): ) +def test_cirrus_group_id_incompatible_exposures(): + exp = Experiment( + "slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.GROUP_ID, app_id="test_app" + ) + + tl = TimeLimits.for_ts( + first_enrollment_date="2019-01-01", + last_date_full_data="2019-03-01", + time_series_period="weekly", + num_dates_enrollment=8, + ) + + with pytest.raises(IncompatibleAnalysisUnit): + exp._build_exposure_query( + time_limits=tl, exposure_query_type=EnrollmentsQueryType.CIRRUS + ) + + def test_cirrus_missing_app_id(): exp = Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.GROUP_ID) @@ -1238,3 +1274,19 @@ def test_fenix_group_id_incompatible(): exp.build_enrollments_query( time_limits=tl, enrollments_query_type=EnrollmentsQueryType.FENIX_FALLBACK ) + + +def test_fenix_group_id_incompatible_exposures(): + exp = Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.GROUP_ID) + + tl = TimeLimits.for_ts( + first_enrollment_date="2019-01-01", + last_date_full_data="2019-03-01", + time_series_period="weekly", + num_dates_enrollment=8, + ) + + with pytest.raises(IncompatibleAnalysisUnit): + exp._build_exposure_query( + time_limits=tl, exposure_query_type=EnrollmentsQueryType.FENIX_FALLBACK + ) From d22c90151baff42d20d5fdfd1386adf554de1e30 Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Thu, 1 Aug 2024 07:49:18 -0700 Subject: [PATCH 08/40] more unit tests --- tests/test_experiment.py | 36 
++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tests/test_experiment.py b/tests/test_experiment.py index 582231fe..346be894 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -1206,6 +1206,24 @@ def test_glean_missing_app_id(): ) +def test_glean_exposures_missing_app_id(): + exp = Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.GROUP_ID) + + tl = TimeLimits.for_ts( + first_enrollment_date="2019-01-01", + last_date_full_data="2019-03-01", + time_series_period="weekly", + num_dates_enrollment=8, + ) + + with pytest.raises( + ValueError, match="App ID must be defined for building Glean exposures query" + ): + exp._build_exposure_query( + time_limits=tl, exposure_query_type=EnrollmentsQueryType.GLEAN_EVENT + ) + + def test_cirrus_group_id_incompatible(): exp = Experiment( "slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.GROUP_ID, app_id="test_app" @@ -1260,6 +1278,24 @@ def test_cirrus_missing_app_id(): ) +def test_cirrus_missing_app_id_exposures(): + exp = Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.GROUP_ID) + + tl = TimeLimits.for_ts( + first_enrollment_date="2019-01-01", + last_date_full_data="2019-03-01", + time_series_period="weekly", + num_dates_enrollment=8, + ) + + with pytest.raises( + ValueError, match="App ID must be defined for building Cirrus exposures query" + ): + exp._build_exposure_query( + time_limits=tl, exposure_query_type=EnrollmentsQueryType.CIRRUS + ) + + def test_fenix_group_id_incompatible(): exp = Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.GROUP_ID) From 91a94d6587e1b8064df466b9d9b5856ff6e60054 Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Thu, 1 Aug 2024 07:59:18 -0700 Subject: [PATCH 09/40] Update pyproject.toml --- pyproject.toml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index af38a54a..f92f7b43 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,3 +75,10 @@ select = [ 
[tool.pytest.ini_options] norecursedirs = "tests/helpers" + +[tool.coverage.report] +exclude_also = [ + '^ +assert_never\(.*?\)$', + '^ +case _ as unreachable:$', +] + From 9e653e55e1654ca8150589b0d47521427c346c77 Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Thu, 1 Aug 2024 08:11:05 -0700 Subject: [PATCH 10/40] Update pyproject.toml --- pyproject.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f92f7b43..4cb7036b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,7 +78,6 @@ norecursedirs = "tests/helpers" [tool.coverage.report] exclude_also = [ - '^ +assert_never\(.*?\)$', - '^ +case _ as unreachable:$', + 'assert_never\(.*?\)', ] From 884cab002a6cbb2a439020e6ac70d764230144b5 Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Thu, 1 Aug 2024 08:27:52 -0700 Subject: [PATCH 11/40] Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4cb7036b..3ede4f06 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,6 +78,6 @@ norecursedirs = "tests/helpers" [tool.coverage.report] exclude_also = [ - 'assert_never\(.*?\)', + 'assert_never', ] From b6a5f17e155d2f0bbefcaad14adf887bde7f6c62 Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Thu, 1 Aug 2024 08:36:02 -0700 Subject: [PATCH 12/40] Update pyproject.toml --- pyproject.toml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3ede4f06..af38a54a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,9 +75,3 @@ select = [ [tool.pytest.ini_options] norecursedirs = "tests/helpers" - -[tool.coverage.report] -exclude_also = [ - 'assert_never', -] - From aa9f025650ae8032d8ce5da6a656a287879897f7 Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Thu, 1 Aug 2024 08:46:34 -0700 Subject: [PATCH 13/40] cleanup --- src/mozanalysis/experiment.py | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git 
a/src/mozanalysis/experiment.py b/src/mozanalysis/experiment.py index 59945fd7..0e4b97bc 100644 --- a/src/mozanalysis/experiment.py +++ b/src/mozanalysis/experiment.py @@ -31,10 +31,6 @@ class EnrollmentsQueryType(str, Enum): GLEAN_EVENT = "glean-event" -AggregatorType = Metric | Segment -DataSourceType = DataSource | SegmentDataSource - - class IncompatibleAnalysisUnit(ValueError): pass @@ -123,6 +119,12 @@ class Experiment: app_id (str, optional): For a Glean app, the name of the BigQuery dataset derived from its app ID, like `org_mozilla_firefox`. app_name (str, optional): The Glean app name, like `fenix`. + analysis_unit (AnalysisUnit, optional): the "unit" of analysis, namely the + id that defines an experimental unit. For example: `client_id` + for mobile experiments or `group_id` for desktop experiments. Is used + as the join key when building queries and sub-unit level data is + aggregated up to that level. Defaults to client_id unless specified + (AnalysisUnit.CLIENT_ID) Attributes: experiment_slug (str): Name of the study, used to identify @@ -199,10 +201,6 @@ def get_single_window_data( Specifies the query type to use to get the experiment's enrollments, unless overridden by ``custom_enrollments_query``. - analysis_unit (AnalysisUnit): the "unit" of analysis, namely the - id that defines an experimental unit. For example: `client_id` - for mobile experiments or `group_id` for desktop experiments. Can be - overridden through a ``custom_enrollments_query``, but shouldn't. custom_enrollments_query (str): A full SQL query that will generate the `enrollments` common table expression used in the main query. The query must produce the columns @@ -331,10 +329,6 @@ def get_time_series_data( Specifies the query type to use to get the experiment's enrollments, unless overridden by ``custom_enrollments_query``. - analysis_unit (AnalysisUnit): the "unit" of analysis, namely the - id that defines an experimental unit. 
For example: `client_id` - for mobile experiments or `group_id` for desktop experiments. Can be - overridden through a ``custom_enrollments_query``, but shouldn't. custom_enrollments_query (str): A full SQL query that will generate the `enrollments` common table expression used in the main query. The query must produce the columns @@ -455,10 +449,6 @@ def build_enrollments_query( Specifies the query type to use to get the experiment's enrollments, unless overridden by ``custom_enrollments_query``. - analysis_unit (AnalysisUnit): the "unit" of analysis, namely the - id that defines an experimental unit. For example: `client_id` - for mobile experiments or `group_id` for desktop experiments. Can be - overridden through a ``custom_enrollments_query``, but shouldn't. custom_enrollments_query (str): A full SQL query that will generate the `enrollments` common table expression used in the main query. The query must produce the columns From c799236a9532226e430324087719e0f989143e8a Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Thu, 1 Aug 2024 08:55:06 -0700 Subject: [PATCH 14/40] more cleanup, additional test --- src/mozanalysis/metrics.py | 2 +- tests/test_experiment.py | 97 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+), 1 deletion(-) diff --git a/src/mozanalysis/metrics.py b/src/mozanalysis/metrics.py index 659b06c2..a0e2271e 100644 --- a/src/mozanalysis/metrics.py +++ b/src/mozanalysis/metrics.py @@ -75,7 +75,7 @@ class DataSource: app_name = attr.ib(default=None, type=str | None) group_id_column = attr.ib(default="profile_group_id", type=str) - EXPERIMENT_COLUMN_TYPES = (None, "simple", "native", "glean", "main_v5") + EXPERIMENT_COLUMN_TYPES = (None, "simple", "native", "glean") @experiments_column_type.validator def _check_experiments_column_type(self, attribute, value): diff --git a/tests/test_experiment.py b/tests/test_experiment.py index 346be894..1ad7bc93 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -1152,6 
+1152,103 @@ def test_enrollments_query_explicit_group_id(): assert dedent(enrollments_sql) == expected +def test_metrics_query_explicit_group_id(): + exp = Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.GROUP_ID) + + tl = TimeLimits.for_ts( + first_enrollment_date="2019-01-01", + last_date_full_data="2019-03-01", + time_series_period="weekly", + num_dates_enrollment=8, + ) + + enrollments_sql = exp.build_enrollments_query( + time_limits=tl, enrollments_query_type=EnrollmentsQueryType.NORMANDY + ) + + sql_lint(enrollments_sql) + + metrics_sql = exp.build_metrics_query( + metric_list=[ + metric for metric in desktop_metrics if metric.name == "active_hours" + ], + time_limits=tl, + enrollments_table="enrollments", + ) + + sql_lint(metrics_sql) + + expected = """ +WITH analysis_windows AS ( + (SELECT 0 AS analysis_window_start, 6 AS analysis_window_end) +UNION ALL +(SELECT 7 AS analysis_window_start, 13 AS analysis_window_end) +UNION ALL +(SELECT 14 AS analysis_window_start, 20 AS analysis_window_end) +UNION ALL +(SELECT 21 AS analysis_window_start, 27 AS analysis_window_end) +UNION ALL +(SELECT 28 AS analysis_window_start, 34 AS analysis_window_end) +UNION ALL +(SELECT 35 AS analysis_window_start, 41 AS analysis_window_end) +UNION ALL +(SELECT 42 AS analysis_window_start, 48 AS analysis_window_end) +), +raw_enrollments AS ( + -- needed by "exposures" sub query + SELECT + e.*, + aw.* + FROM `enrollments` e + CROSS JOIN analysis_windows aw +), +exposures AS ( + SELECT + * + FROM raw_enrollments e + ), +enrollments AS ( + SELECT + e.* EXCEPT (exposure_date, num_exposure_events), + x.exposure_date, + x.num_exposure_events + FROM exposures x + RIGHT JOIN raw_enrollments e + USING (profile_group_id, branch) +) +SELECT + enrollments.*, + ds_0.active_hours +FROM enrollments + LEFT JOIN ( + SELECT + e.profile_group_id, + e.branch, + e.analysis_window_start, + e.analysis_window_end, + e.num_exposure_events, + e.exposure_date, + COALESCE(SUM(active_hours_sum), 
0) AS active_hours +FROM enrollments e + LEFT JOIN mozdata.telemetry.clients_daily ds + ON ds.profile_group_id = e.profile_group_id + AND ds.submission_date BETWEEN '2019-01-01' AND '2019-02-25' + AND ds.submission_date BETWEEN + DATE_ADD(e.enrollment_date, interval e.analysis_window_start day) + AND DATE_ADD(e.enrollment_date, interval e.analysis_window_end day) + +GROUP BY + e.profile_group_id, + e.branch, + e.num_exposure_events, + e.exposure_date, + e.analysis_window_start, + e.analysis_window_end + ) ds_0 USING (profile_group_id, branch, analysis_window_start, analysis_window_end)""" # noqa: E501 + + assert expected == dedent(metrics_sql.rstrip()) + + def test_glean_group_id_incompatible(): exp = Experiment( "slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.GROUP_ID, app_id="test_app" From 380b00d6de0a2028f3421a05a8dfac59542005f1 Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Thu, 1 Aug 2024 09:08:44 -0700 Subject: [PATCH 15/40] refactor, AnalysisUnit -> ExperimentalUnit --- src/mozanalysis/experiment.py | 68 +++++++++++---------- src/mozanalysis/metrics.py | 16 ++--- src/mozanalysis/types.py | 2 +- tests/test_experiment.py | 108 ++++++++++++++++++++++------------ 4 files changed, 116 insertions(+), 78 deletions(-) diff --git a/src/mozanalysis/experiment.py b/src/mozanalysis/experiment.py index 0e4b97bc..0d83b035 100644 --- a/src/mozanalysis/experiment.py +++ b/src/mozanalysis/experiment.py @@ -15,7 +15,7 @@ from mozanalysis.config import ConfigLoader from mozanalysis.metrics import AnalysisBasis, DataSource, Metric from mozanalysis.segments import Segment, SegmentDataSource -from mozanalysis.types import AnalysisUnit +from mozanalysis.types import ExperimentalUnit from mozanalysis.utils import add_days, date_sub, hash_ish if TYPE_CHECKING: @@ -31,7 +31,7 @@ class EnrollmentsQueryType(str, Enum): GLEAN_EVENT = "glean-event" -class IncompatibleAnalysisUnit(ValueError): +class IncompatibleExperimentalUnit(ValueError): pass @@ -119,12 +119,12 @@ class 
Experiment: app_id (str, optional): For a Glean app, the name of the BigQuery dataset derived from its app ID, like `org_mozilla_firefox`. app_name (str, optional): The Glean app name, like `fenix`. - analysis_unit (AnalysisUnit, optional): the "unit" of analysis, namely the - id that defines an experimental unit. For example: `client_id` + experimental_unit (ExperimentalUnit, optional): the "unit" of analysis, + which defines an experimental unit. For example: `client_id` for mobile experiments or `group_id` for desktop experiments. Is used as the join key when building queries and sub-unit level data is aggregated up to that level. Defaults to client_id unless specified - (AnalysisUnit.CLIENT_ID) + (ExperimentalUnit.CLIENT_ID) Attributes: experiment_slug (str): Name of the study, used to identify @@ -145,7 +145,9 @@ class Experiment: num_dates_enrollment = attr.ib(default=None) app_id = attr.ib(default=None) app_name = attr.ib(default=None) - analysis_unit = attr.ib(type=AnalysisUnit, default=AnalysisUnit.CLIENT_ID) + experimental_unit = attr.ib( + type=ExperimentalUnit, default=ExperimentalUnit.CLIENT_ID + ) def get_app_name(self): """ @@ -504,10 +506,10 @@ def build_enrollments_query( SELECT se.*, - e.* EXCEPT ({self.analysis_unit}, branch) + e.* EXCEPT ({self.experimental_unit}, branch) FROM segmented_enrollments se LEFT JOIN exposures e - USING ({self.analysis_unit.value}, branch) + USING ({self.experimental_unit.value}, branch) """ def build_metrics_query( @@ -602,7 +604,7 @@ def build_metrics_query( metrics_columns=",\n ".join(metrics_columns), metrics_joins="\n".join(metrics_joins), enrollments_table=enrollments_table, - id_column=self.analysis_unit.value, + id_column=self.experimental_unit.value, ) @staticmethod @@ -637,16 +639,16 @@ def _build_enrollments_query( raise ValueError( "App ID must be defined for building Glean enrollments query" ) - if not self.analysis_unit == AnalysisUnit.CLIENT_ID: - raise IncompatibleAnalysisUnit( + if not 
self.experimental_unit == ExperimentalUnit.CLIENT_ID: + raise IncompatibleExperimentalUnit( "Glean enrollments currently only support client_id analysis units" ) return self._build_enrollments_query_glean_event( time_limits, self.app_id, sample_size ) elif enrollments_query_type == EnrollmentsQueryType.FENIX_FALLBACK: - if not self.analysis_unit == AnalysisUnit.CLIENT_ID: - raise IncompatibleAnalysisUnit( + if not self.experimental_unit == ExperimentalUnit.CLIENT_ID: + raise IncompatibleExperimentalUnit( "Fenix fallback enrollments currently only support client_id analysis units" # noqa: E501 ) return self._build_enrollments_query_fenix_baseline( @@ -657,8 +659,8 @@ def _build_enrollments_query( raise ValueError( "App ID must be defined for building Cirrus enrollments query" ) - if not self.analysis_unit == AnalysisUnit.CLIENT_ID: - raise IncompatibleAnalysisUnit( + if not self.experimental_unit == ExperimentalUnit.CLIENT_ID: + raise IncompatibleExperimentalUnit( "Cirrus enrollments currently only support client_id analysis units" ) return self._build_enrollments_query_cirrus(time_limits, self.app_id) @@ -678,14 +680,14 @@ def _build_exposure_query( raise ValueError( "App ID must be defined for building Glean exposures query" ) - if not self.analysis_unit == AnalysisUnit.CLIENT_ID: - raise IncompatibleAnalysisUnit( + if not self.experimental_unit == ExperimentalUnit.CLIENT_ID: + raise IncompatibleExperimentalUnit( "Glean exposures currently only support client_id analysis units" ) return self._build_exposure_query_glean_event(time_limits, self.app_id) elif exposure_query_type == EnrollmentsQueryType.FENIX_FALLBACK: - if not self.analysis_unit == AnalysisUnit.CLIENT_ID: - raise IncompatibleAnalysisUnit( + if not self.experimental_unit == ExperimentalUnit.CLIENT_ID: + raise IncompatibleExperimentalUnit( "Fenix fallback exposures currently only support client_id analysis units" # noqa: E501 ) return self._build_exposure_query_glean_event( @@ -696,8 +698,8 @@ def 
_build_exposure_query( raise ValueError( "App ID must be defined for building Cirrus exposures query" ) - if not self.analysis_unit == AnalysisUnit.CLIENT_ID: - raise IncompatibleAnalysisUnit( + if not self.experimental_unit == ExperimentalUnit.CLIENT_ID: + raise IncompatibleExperimentalUnit( "Cirrus exposures currently only support client_id analysis units" ) return self._build_exposure_query_glean_event( @@ -715,7 +717,7 @@ def _build_enrollments_query_normandy( sample_size: int = 100, ) -> str: """Return SQL to query enrollments for a normandy experiment""" - if self.analysis_unit == AnalysisUnit.CLIENT_ID: + if self.experimental_unit == ExperimentalUnit.CLIENT_ID: return f""" SELECT e.client_id, @@ -734,7 +736,7 @@ def _build_enrollments_query_normandy( AND e.sample_id < {sample_size} GROUP BY e.client_id, branch """ # noqa:E501 - elif self.analysis_unit == AnalysisUnit.GROUP_ID: + elif self.experimental_unit == ExperimentalUnit.GROUP_ID: # TODO: update this based on the final structure of the group_id # within the events ping return f""" @@ -756,7 +758,7 @@ def _build_enrollments_query_normandy( GROUP BY e.profile_group_id, branch """ # noqa:E501 else: - assert_never(self.analysis_unit) + assert_never(self.experimental_unit) def _build_enrollments_query_fenix_baseline( self, time_limits: TimeLimits, sample_size: int = 100 @@ -873,7 +875,7 @@ def _build_enrollments_query_cirrus( def _build_exposure_query_normandy(self, time_limits: TimeLimits) -> str: """Return SQL to query exposures for a normandy experiment""" - if self.analysis_unit == AnalysisUnit.CLIENT_ID: + if self.experimental_unit == ExperimentalUnit.CLIENT_ID: return f""" SELECT e.client_id, @@ -900,7 +902,7 @@ def _build_exposure_query_normandy(self, time_limits: TimeLimits) -> str: e.submission_date >= re.enrollment_date GROUP BY e.client_id, e.branch """ # noqa: E501 - elif self.analysis_unit == AnalysisUnit.GROUP_ID: + elif self.experimental_unit == ExperimentalUnit.GROUP_ID: return f""" SELECT 
e.profile_group_id, @@ -928,7 +930,7 @@ def _build_exposure_query_normandy(self, time_limits: TimeLimits) -> str: GROUP BY e.profile_group_id, e.branch """ # noqa: E501 else: - assert_never(self.analysis_unit) + assert_never(self.experimental_unit) def _build_exposure_query_glean_event( self, @@ -1000,14 +1002,14 @@ def _build_metrics_query_bits( self.experiment_slug, self.app_id, analysis_basis, - self.analysis_unit, + self.experimental_unit, exposure_signal, ) metrics_joins.append( f""" LEFT JOIN ( {query_for_metrics} - ) ds_{i} USING ({self.analysis_unit.value}, branch, analysis_window_start, analysis_window_end) + ) ds_{i} USING ({self.experimental_unit.value}, branch, analysis_window_start, analysis_window_end) """ # noqa: E501 ) @@ -1097,7 +1099,7 @@ def _build_segments_query_bits( segments_joins.append( f""" LEFT JOIN ( {query_for_segments} - ) ds_{i} USING ({self.analysis_unit.value}, branch) + ) ds_{i} USING ({self.experimental_unit.value}, branch) """ ) @@ -1370,7 +1372,9 @@ class TimeSeriesResult: fully_qualified_table_name = attr.ib(type=str) analysis_windows = attr.ib(type=list) - analysis_unit = attr.ib(type=AnalysisUnit, default=AnalysisUnit.CLIENT_ID) + experimental_unit = attr.ib( + type=ExperimentalUnit, default=ExperimentalUnit.CLIENT_ID + ) def get(self, bq_context: BigQueryContext, analysis_window) -> DataFrame: """Get the DataFrame for a specific analysis window. @@ -1480,7 +1484,7 @@ def _build_analysis_window_subset_query( in "the standard format" for a single analysis window. 
""" return f""" - SELECT * EXCEPT ({self.analysis_unit.value}, analysis_window_start, analysis_window_end) + SELECT * EXCEPT ({self.experimental_unit.value}, analysis_window_start, analysis_window_end) FROM {self.fully_qualified_table_name} WHERE analysis_window_start = {analysis_window.start} AND analysis_window_end = {analysis_window.end} diff --git a/src/mozanalysis/metrics.py b/src/mozanalysis/metrics.py index a0e2271e..8e156123 100644 --- a/src/mozanalysis/metrics.py +++ b/src/mozanalysis/metrics.py @@ -8,7 +8,7 @@ from typing_extensions import assert_never -from mozanalysis.types import AnalysisUnit +from mozanalysis.types import ExperimentalUnit if TYPE_CHECKING: from mozanalysis.experiment import TimeLimits @@ -146,7 +146,7 @@ def build_query( experiment_slug: str, from_expr_dataset: str | None = None, analysis_basis: str = AnalysisBasis.ENROLLMENTS, - analysis_unit: AnalysisUnit = AnalysisUnit.CLIENT_ID, + experimental_unit: ExperimentalUnit = ExperimentalUnit.CLIENT_ID, exposure_signal=None, ) -> str: """Return a nearly-self contained SQL query. @@ -154,12 +154,12 @@ def build_query( This query does not define ``enrollments`` but otherwise could be executed to query all metrics from this data source. 
""" - if analysis_unit == AnalysisUnit.CLIENT_ID: + if experimental_unit == ExperimentalUnit.CLIENT_ID: ds_id = self.client_id_column or "client_id" - elif analysis_unit == AnalysisUnit.GROUP_ID: + elif experimental_unit == ExperimentalUnit.GROUP_ID: ds_id = self.group_id_column or "profile_group_id" else: - assert_never(analysis_unit) + assert_never(experimental_unit) return """SELECT e.{id_column}, e.branch, @@ -201,7 +201,7 @@ def build_query( submission_date=self.submission_date_column or "submission_date", experiment_slug=experiment_slug, ), - id_column=analysis_unit.value, + id_column=experimental_unit.value, ) def build_query_targets( @@ -212,7 +212,7 @@ def build_query_targets( analysis_length: int, from_expr_dataset: str | None = None, continuous_enrollment: bool = False, - analysis_unit: AnalysisUnit = AnalysisUnit.CLIENT_ID, + experimental_unit: ExperimentalUnit = ExperimentalUnit.CLIENT_ID, ) -> str: """Return a nearly-self contained SQL query that constructs the metrics query for targeting historical data without @@ -221,7 +221,7 @@ def build_query_targets( This query does not define ``targets`` but otherwise could be executed to query all metrics from this data source. 
""" - if analysis_unit != AnalysisUnit.CLIENT_ID: + if experimental_unit != ExperimentalUnit.CLIENT_ID: raise ValueError( "`build_query_targets` currently supports client_id analysis" ) diff --git a/src/mozanalysis/types.py b/src/mozanalysis/types.py index a41baba9..b4225667 100644 --- a/src/mozanalysis/types.py +++ b/src/mozanalysis/types.py @@ -21,6 +21,6 @@ class Uplift(str, Enum): CompareBranchesOutput = dict[ComparativeOption, EstimatesByBranch] -class AnalysisUnit(str, Enum): +class ExperimentalUnit(str, Enum): CLIENT_ID = "client_id" GROUP_ID = "profile_group_id" diff --git a/tests/test_experiment.py b/tests/test_experiment.py index 1ad7bc93..07bc25f9 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -15,13 +15,13 @@ AnalysisWindow, EnrollmentsQueryType, Experiment, - IncompatibleAnalysisUnit, + IncompatibleExperimentalUnit, TimeLimits, ) from mozanalysis.exposure import ExposureSignal from mozanalysis.metrics import AnalysisBasis, DataSource, Metric from mozanalysis.segments import Segment, SegmentDataSource -from mozanalysis.types import AnalysisUnit +from mozanalysis.types import ExperimentalUnit def test_time_limits_validates(): @@ -282,10 +282,10 @@ def test_analysis_window_validates_end(): @pytest.mark.parametrize( - "analysis_unit", [AnalysisUnit.CLIENT_ID, AnalysisUnit.GROUP_ID] + "experimental_unit", [ExperimentalUnit.CLIENT_ID, ExperimentalUnit.GROUP_ID] ) -def test_query_not_detectably_malformed(analysis_unit: AnalysisUnit): - exp = Experiment("slug", "2019-01-01", 8, analysis_unit=analysis_unit) +def test_query_not_detectably_malformed(experimental_unit: ExperimentalUnit): + exp = Experiment("slug", "2019-01-01", 8, experimental_unit=experimental_unit) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -303,9 +303,9 @@ def test_query_not_detectably_malformed(analysis_unit: AnalysisUnit): sql_lint(enrollments_sql) assert "sample_id < None" not in enrollments_sql - if analysis_unit == AnalysisUnit.CLIENT_ID: + if 
experimental_unit == ExperimentalUnit.CLIENT_ID: assert "client_id" in enrollments_sql - elif analysis_unit == AnalysisUnit.GROUP_ID: + elif experimental_unit == ExperimentalUnit.GROUP_ID: assert "profile_group_id" in enrollments_sql metrics_sql = exp.build_metrics_query( @@ -316,17 +316,17 @@ def test_query_not_detectably_malformed(analysis_unit: AnalysisUnit): sql_lint(metrics_sql) - if analysis_unit == AnalysisUnit.CLIENT_ID: + if experimental_unit == ExperimentalUnit.CLIENT_ID: assert "client_id" in metrics_sql - elif analysis_unit == AnalysisUnit.GROUP_ID: + elif experimental_unit == ExperimentalUnit.GROUP_ID: assert "profile_group_id" in metrics_sql @pytest.mark.parametrize( - "analysis_unit", [AnalysisUnit.CLIENT_ID, AnalysisUnit.GROUP_ID] + "experimental_unit", [ExperimentalUnit.CLIENT_ID, ExperimentalUnit.GROUP_ID] ) -def test_megaquery_not_detectably_malformed(analysis_unit: AnalysisUnit): - exp = Experiment("slug", "2019-01-01", 8, analysis_unit=analysis_unit) +def test_megaquery_not_detectably_malformed(experimental_unit: ExperimentalUnit): + exp = Experiment("slug", "2019-01-01", 8, experimental_unit=experimental_unit) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -341,9 +341,9 @@ def test_megaquery_not_detectably_malformed(analysis_unit: AnalysisUnit): sql_lint(enrollments_sql) - if analysis_unit == AnalysisUnit.CLIENT_ID: + if experimental_unit == ExperimentalUnit.CLIENT_ID: assert "client_id" in enrollments_sql - elif analysis_unit == AnalysisUnit.GROUP_ID: + elif experimental_unit == ExperimentalUnit.GROUP_ID: assert "profile_group_id" in enrollments_sql metrics_sql = exp.build_metrics_query( @@ -354,17 +354,19 @@ def test_megaquery_not_detectably_malformed(analysis_unit: AnalysisUnit): sql_lint(metrics_sql) - if analysis_unit == AnalysisUnit.CLIENT_ID: + if experimental_unit == ExperimentalUnit.CLIENT_ID: assert "client_id" in metrics_sql - elif analysis_unit == AnalysisUnit.GROUP_ID: + elif experimental_unit == 
ExperimentalUnit.GROUP_ID: assert "profile_group_id" in metrics_sql @pytest.mark.parametrize( - "analysis_unit", [AnalysisUnit.CLIENT_ID, AnalysisUnit.GROUP_ID] + "experimental_unit", [ExperimentalUnit.CLIENT_ID, ExperimentalUnit.GROUP_ID] ) -def test_segments_megaquery_not_detectably_malformed(analysis_unit: AnalysisUnit): - exp = Experiment("slug", "2019-01-01", 8, analysis_unit=analysis_unit) +def test_segments_megaquery_not_detectably_malformed( + experimental_unit: ExperimentalUnit, +): + exp = Experiment("slug", "2019-01-01", 8, experimental_unit=experimental_unit) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1073,7 +1075,9 @@ def test_metrics_query_explicit_client_id(): def test_enrollments_query_explicit_group_id(): - exp = Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.GROUP_ID) + exp = Experiment( + "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP_ID + ) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1153,7 +1157,9 @@ def test_enrollments_query_explicit_group_id(): def test_metrics_query_explicit_group_id(): - exp = Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.GROUP_ID) + exp = Experiment( + "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP_ID + ) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1251,7 +1257,11 @@ def test_metrics_query_explicit_group_id(): def test_glean_group_id_incompatible(): exp = Experiment( - "slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.GROUP_ID, app_id="test_app" + "slug", + "2019-01-01", + 8, + experimental_unit=ExperimentalUnit.GROUP_ID, + app_id="test_app", ) tl = TimeLimits.for_ts( @@ -1261,7 +1271,7 @@ def test_glean_group_id_incompatible(): num_dates_enrollment=8, ) - with pytest.raises(IncompatibleAnalysisUnit): + with pytest.raises(IncompatibleExperimentalUnit): exp.build_enrollments_query( time_limits=tl, enrollments_query_type=EnrollmentsQueryType.GLEAN_EVENT ) @@ -1269,7 +1279,11 @@ def 
test_glean_group_id_incompatible(): def test_glean_group_id_incompatible_exposures(): exp = Experiment( - "slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.GROUP_ID, app_id="test_app" + "slug", + "2019-01-01", + 8, + experimental_unit=ExperimentalUnit.GROUP_ID, + app_id="test_app", ) tl = TimeLimits.for_ts( @@ -1279,14 +1293,16 @@ def test_glean_group_id_incompatible_exposures(): num_dates_enrollment=8, ) - with pytest.raises(IncompatibleAnalysisUnit): + with pytest.raises(IncompatibleExperimentalUnit): exp._build_exposure_query( time_limits=tl, exposure_query_type=EnrollmentsQueryType.GLEAN_EVENT ) def test_glean_missing_app_id(): - exp = Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.GROUP_ID) + exp = Experiment( + "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP_ID + ) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1304,7 +1320,9 @@ def test_glean_missing_app_id(): def test_glean_exposures_missing_app_id(): - exp = Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.GROUP_ID) + exp = Experiment( + "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP_ID + ) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1323,7 +1341,11 @@ def test_glean_exposures_missing_app_id(): def test_cirrus_group_id_incompatible(): exp = Experiment( - "slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.GROUP_ID, app_id="test_app" + "slug", + "2019-01-01", + 8, + experimental_unit=ExperimentalUnit.GROUP_ID, + app_id="test_app", ) tl = TimeLimits.for_ts( @@ -1333,7 +1355,7 @@ def test_cirrus_group_id_incompatible(): num_dates_enrollment=8, ) - with pytest.raises(IncompatibleAnalysisUnit): + with pytest.raises(IncompatibleExperimentalUnit): exp.build_enrollments_query( time_limits=tl, enrollments_query_type=EnrollmentsQueryType.CIRRUS ) @@ -1341,7 +1363,11 @@ def test_cirrus_group_id_incompatible(): def test_cirrus_group_id_incompatible_exposures(): exp = Experiment( - "slug", "2019-01-01", 8, 
analysis_unit=AnalysisUnit.GROUP_ID, app_id="test_app" + "slug", + "2019-01-01", + 8, + experimental_unit=ExperimentalUnit.GROUP_ID, + app_id="test_app", ) tl = TimeLimits.for_ts( @@ -1351,14 +1377,16 @@ def test_cirrus_group_id_incompatible_exposures(): num_dates_enrollment=8, ) - with pytest.raises(IncompatibleAnalysisUnit): + with pytest.raises(IncompatibleExperimentalUnit): exp._build_exposure_query( time_limits=tl, exposure_query_type=EnrollmentsQueryType.CIRRUS ) def test_cirrus_missing_app_id(): - exp = Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.GROUP_ID) + exp = Experiment( + "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP_ID + ) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1376,7 +1404,9 @@ def test_cirrus_missing_app_id(): def test_cirrus_missing_app_id_exposures(): - exp = Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.GROUP_ID) + exp = Experiment( + "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP_ID + ) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1394,7 +1424,9 @@ def test_cirrus_missing_app_id_exposures(): def test_fenix_group_id_incompatible(): - exp = Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.GROUP_ID) + exp = Experiment( + "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP_ID + ) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1403,14 +1435,16 @@ def test_fenix_group_id_incompatible(): num_dates_enrollment=8, ) - with pytest.raises(IncompatibleAnalysisUnit): + with pytest.raises(IncompatibleExperimentalUnit): exp.build_enrollments_query( time_limits=tl, enrollments_query_type=EnrollmentsQueryType.FENIX_FALLBACK ) def test_fenix_group_id_incompatible_exposures(): - exp = Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.GROUP_ID) + exp = Experiment( + "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP_ID + ) tl = TimeLimits.for_ts( 
first_enrollment_date="2019-01-01", @@ -1419,7 +1453,7 @@ def test_fenix_group_id_incompatible_exposures(): num_dates_enrollment=8, ) - with pytest.raises(IncompatibleAnalysisUnit): + with pytest.raises(IncompatibleExperimentalUnit): exp._build_exposure_query( time_limits=tl, exposure_query_type=EnrollmentsQueryType.FENIX_FALLBACK ) From fee899478bcb766b505963a7372b469797b8af27 Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Thu, 1 Aug 2024 09:13:26 -0700 Subject: [PATCH 16/40] refactor enum levels --- src/mozanalysis/experiment.py | 30 ++++++++--------- src/mozanalysis/metrics.py | 10 +++--- src/mozanalysis/types.py | 4 +-- tests/test_experiment.py | 62 +++++++++++++---------------------- 4 files changed, 43 insertions(+), 63 deletions(-) diff --git a/src/mozanalysis/experiment.py b/src/mozanalysis/experiment.py index 0d83b035..63126fd9 100644 --- a/src/mozanalysis/experiment.py +++ b/src/mozanalysis/experiment.py @@ -124,7 +124,7 @@ class Experiment: for mobile experiments or `group_id` for desktop experiments. Is used as the join key when building queries and sub-unit level data is aggregated up to that level. 
Defaults to client_id unless specified - (ExperimentalUnit.CLIENT_ID) + (ExperimentalUnit.CLIENT) Attributes: experiment_slug (str): Name of the study, used to identify @@ -145,9 +145,7 @@ class Experiment: num_dates_enrollment = attr.ib(default=None) app_id = attr.ib(default=None) app_name = attr.ib(default=None) - experimental_unit = attr.ib( - type=ExperimentalUnit, default=ExperimentalUnit.CLIENT_ID - ) + experimental_unit = attr.ib(type=ExperimentalUnit, default=ExperimentalUnit.CLIENT) def get_app_name(self): """ @@ -639,7 +637,7 @@ def _build_enrollments_query( raise ValueError( "App ID must be defined for building Glean enrollments query" ) - if not self.experimental_unit == ExperimentalUnit.CLIENT_ID: + if not self.experimental_unit == ExperimentalUnit.CLIENT: raise IncompatibleExperimentalUnit( "Glean enrollments currently only support client_id analysis units" ) @@ -647,7 +645,7 @@ def _build_enrollments_query( time_limits, self.app_id, sample_size ) elif enrollments_query_type == EnrollmentsQueryType.FENIX_FALLBACK: - if not self.experimental_unit == ExperimentalUnit.CLIENT_ID: + if not self.experimental_unit == ExperimentalUnit.CLIENT: raise IncompatibleExperimentalUnit( "Fenix fallback enrollments currently only support client_id analysis units" # noqa: E501 ) @@ -659,7 +657,7 @@ def _build_enrollments_query( raise ValueError( "App ID must be defined for building Cirrus enrollments query" ) - if not self.experimental_unit == ExperimentalUnit.CLIENT_ID: + if not self.experimental_unit == ExperimentalUnit.CLIENT: raise IncompatibleExperimentalUnit( "Cirrus enrollments currently only support client_id analysis units" ) @@ -680,13 +678,13 @@ def _build_exposure_query( raise ValueError( "App ID must be defined for building Glean exposures query" ) - if not self.experimental_unit == ExperimentalUnit.CLIENT_ID: + if not self.experimental_unit == ExperimentalUnit.CLIENT: raise IncompatibleExperimentalUnit( "Glean exposures currently only support client_id 
analysis units" ) return self._build_exposure_query_glean_event(time_limits, self.app_id) elif exposure_query_type == EnrollmentsQueryType.FENIX_FALLBACK: - if not self.experimental_unit == ExperimentalUnit.CLIENT_ID: + if not self.experimental_unit == ExperimentalUnit.CLIENT: raise IncompatibleExperimentalUnit( "Fenix fallback exposures currently only support client_id analysis units" # noqa: E501 ) @@ -698,7 +696,7 @@ def _build_exposure_query( raise ValueError( "App ID must be defined for building Cirrus exposures query" ) - if not self.experimental_unit == ExperimentalUnit.CLIENT_ID: + if not self.experimental_unit == ExperimentalUnit.CLIENT: raise IncompatibleExperimentalUnit( "Cirrus exposures currently only support client_id analysis units" ) @@ -717,7 +715,7 @@ def _build_enrollments_query_normandy( sample_size: int = 100, ) -> str: """Return SQL to query enrollments for a normandy experiment""" - if self.experimental_unit == ExperimentalUnit.CLIENT_ID: + if self.experimental_unit == ExperimentalUnit.CLIENT: return f""" SELECT e.client_id, @@ -736,7 +734,7 @@ def _build_enrollments_query_normandy( AND e.sample_id < {sample_size} GROUP BY e.client_id, branch """ # noqa:E501 - elif self.experimental_unit == ExperimentalUnit.GROUP_ID: + elif self.experimental_unit == ExperimentalUnit.GROUP: # TODO: update this based on the final structure of the group_id # within the events ping return f""" @@ -875,7 +873,7 @@ def _build_enrollments_query_cirrus( def _build_exposure_query_normandy(self, time_limits: TimeLimits) -> str: """Return SQL to query exposures for a normandy experiment""" - if self.experimental_unit == ExperimentalUnit.CLIENT_ID: + if self.experimental_unit == ExperimentalUnit.CLIENT: return f""" SELECT e.client_id, @@ -902,7 +900,7 @@ def _build_exposure_query_normandy(self, time_limits: TimeLimits) -> str: e.submission_date >= re.enrollment_date GROUP BY e.client_id, e.branch """ # noqa: E501 - elif self.experimental_unit == 
ExperimentalUnit.GROUP_ID: + elif self.experimental_unit == ExperimentalUnit.GROUP: return f""" SELECT e.profile_group_id, @@ -1372,9 +1370,7 @@ class TimeSeriesResult: fully_qualified_table_name = attr.ib(type=str) analysis_windows = attr.ib(type=list) - experimental_unit = attr.ib( - type=ExperimentalUnit, default=ExperimentalUnit.CLIENT_ID - ) + experimental_unit = attr.ib(type=ExperimentalUnit, default=ExperimentalUnit.CLIENT) def get(self, bq_context: BigQueryContext, analysis_window) -> DataFrame: """Get the DataFrame for a specific analysis window. diff --git a/src/mozanalysis/metrics.py b/src/mozanalysis/metrics.py index 8e156123..73a6df2c 100644 --- a/src/mozanalysis/metrics.py +++ b/src/mozanalysis/metrics.py @@ -146,7 +146,7 @@ def build_query( experiment_slug: str, from_expr_dataset: str | None = None, analysis_basis: str = AnalysisBasis.ENROLLMENTS, - experimental_unit: ExperimentalUnit = ExperimentalUnit.CLIENT_ID, + experimental_unit: ExperimentalUnit = ExperimentalUnit.CLIENT, exposure_signal=None, ) -> str: """Return a nearly-self contained SQL query. @@ -154,9 +154,9 @@ def build_query( This query does not define ``enrollments`` but otherwise could be executed to query all metrics from this data source. 
""" - if experimental_unit == ExperimentalUnit.CLIENT_ID: + if experimental_unit == ExperimentalUnit.CLIENT: ds_id = self.client_id_column or "client_id" - elif experimental_unit == ExperimentalUnit.GROUP_ID: + elif experimental_unit == ExperimentalUnit.GROUP: ds_id = self.group_id_column or "profile_group_id" else: assert_never(experimental_unit) @@ -212,7 +212,7 @@ def build_query_targets( analysis_length: int, from_expr_dataset: str | None = None, continuous_enrollment: bool = False, - experimental_unit: ExperimentalUnit = ExperimentalUnit.CLIENT_ID, + experimental_unit: ExperimentalUnit = ExperimentalUnit.CLIENT, ) -> str: """Return a nearly-self contained SQL query that constructs the metrics query for targeting historical data without @@ -221,7 +221,7 @@ def build_query_targets( This query does not define ``targets`` but otherwise could be executed to query all metrics from this data source. """ - if experimental_unit != ExperimentalUnit.CLIENT_ID: + if experimental_unit != ExperimentalUnit.CLIENT: raise ValueError( "`build_query_targets` currently supports client_id analysis" ) diff --git a/src/mozanalysis/types.py b/src/mozanalysis/types.py index b4225667..3fdd8d86 100644 --- a/src/mozanalysis/types.py +++ b/src/mozanalysis/types.py @@ -22,5 +22,5 @@ class Uplift(str, Enum): class ExperimentalUnit(str, Enum): - CLIENT_ID = "client_id" - GROUP_ID = "profile_group_id" + CLIENT = "client_id" + GROUP = "profile_group_id" diff --git a/tests/test_experiment.py b/tests/test_experiment.py index 07bc25f9..e96a4ab1 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -282,7 +282,7 @@ def test_analysis_window_validates_end(): @pytest.mark.parametrize( - "experimental_unit", [ExperimentalUnit.CLIENT_ID, ExperimentalUnit.GROUP_ID] + "experimental_unit", [ExperimentalUnit.CLIENT, ExperimentalUnit.GROUP] ) def test_query_not_detectably_malformed(experimental_unit: ExperimentalUnit): exp = Experiment("slug", "2019-01-01", 8, 
experimental_unit=experimental_unit) @@ -303,9 +303,9 @@ def test_query_not_detectably_malformed(experimental_unit: ExperimentalUnit): sql_lint(enrollments_sql) assert "sample_id < None" not in enrollments_sql - if experimental_unit == ExperimentalUnit.CLIENT_ID: + if experimental_unit == ExperimentalUnit.CLIENT: assert "client_id" in enrollments_sql - elif experimental_unit == ExperimentalUnit.GROUP_ID: + elif experimental_unit == ExperimentalUnit.GROUP: assert "profile_group_id" in enrollments_sql metrics_sql = exp.build_metrics_query( @@ -316,14 +316,14 @@ def test_query_not_detectably_malformed(experimental_unit: ExperimentalUnit): sql_lint(metrics_sql) - if experimental_unit == ExperimentalUnit.CLIENT_ID: + if experimental_unit == ExperimentalUnit.CLIENT: assert "client_id" in metrics_sql - elif experimental_unit == ExperimentalUnit.GROUP_ID: + elif experimental_unit == ExperimentalUnit.GROUP: assert "profile_group_id" in metrics_sql @pytest.mark.parametrize( - "experimental_unit", [ExperimentalUnit.CLIENT_ID, ExperimentalUnit.GROUP_ID] + "experimental_unit", [ExperimentalUnit.CLIENT, ExperimentalUnit.GROUP] ) def test_megaquery_not_detectably_malformed(experimental_unit: ExperimentalUnit): exp = Experiment("slug", "2019-01-01", 8, experimental_unit=experimental_unit) @@ -341,9 +341,9 @@ def test_megaquery_not_detectably_malformed(experimental_unit: ExperimentalUnit) sql_lint(enrollments_sql) - if experimental_unit == ExperimentalUnit.CLIENT_ID: + if experimental_unit == ExperimentalUnit.CLIENT: assert "client_id" in enrollments_sql - elif experimental_unit == ExperimentalUnit.GROUP_ID: + elif experimental_unit == ExperimentalUnit.GROUP: assert "profile_group_id" in enrollments_sql metrics_sql = exp.build_metrics_query( @@ -354,14 +354,14 @@ def test_megaquery_not_detectably_malformed(experimental_unit: ExperimentalUnit) sql_lint(metrics_sql) - if experimental_unit == ExperimentalUnit.CLIENT_ID: + if experimental_unit == ExperimentalUnit.CLIENT: assert 
"client_id" in metrics_sql - elif experimental_unit == ExperimentalUnit.GROUP_ID: + elif experimental_unit == ExperimentalUnit.GROUP: assert "profile_group_id" in metrics_sql @pytest.mark.parametrize( - "experimental_unit", [ExperimentalUnit.CLIENT_ID, ExperimentalUnit.GROUP_ID] + "experimental_unit", [ExperimentalUnit.CLIENT, ExperimentalUnit.GROUP] ) def test_segments_megaquery_not_detectably_malformed( experimental_unit: ExperimentalUnit, @@ -1075,9 +1075,7 @@ def test_metrics_query_explicit_client_id(): def test_enrollments_query_explicit_group_id(): - exp = Experiment( - "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP_ID - ) + exp = Experiment("slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1157,9 +1155,7 @@ def test_enrollments_query_explicit_group_id(): def test_metrics_query_explicit_group_id(): - exp = Experiment( - "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP_ID - ) + exp = Experiment("slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1260,7 +1256,7 @@ def test_glean_group_id_incompatible(): "slug", "2019-01-01", 8, - experimental_unit=ExperimentalUnit.GROUP_ID, + experimental_unit=ExperimentalUnit.GROUP, app_id="test_app", ) @@ -1282,7 +1278,7 @@ def test_glean_group_id_incompatible_exposures(): "slug", "2019-01-01", 8, - experimental_unit=ExperimentalUnit.GROUP_ID, + experimental_unit=ExperimentalUnit.GROUP, app_id="test_app", ) @@ -1300,9 +1296,7 @@ def test_glean_group_id_incompatible_exposures(): def test_glean_missing_app_id(): - exp = Experiment( - "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP_ID - ) + exp = Experiment("slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1320,9 +1314,7 @@ def test_glean_missing_app_id(): def 
test_glean_exposures_missing_app_id(): - exp = Experiment( - "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP_ID - ) + exp = Experiment("slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1344,7 +1336,7 @@ def test_cirrus_group_id_incompatible(): "slug", "2019-01-01", 8, - experimental_unit=ExperimentalUnit.GROUP_ID, + experimental_unit=ExperimentalUnit.GROUP, app_id="test_app", ) @@ -1366,7 +1358,7 @@ def test_cirrus_group_id_incompatible_exposures(): "slug", "2019-01-01", 8, - experimental_unit=ExperimentalUnit.GROUP_ID, + experimental_unit=ExperimentalUnit.GROUP, app_id="test_app", ) @@ -1384,9 +1376,7 @@ def test_cirrus_group_id_incompatible_exposures(): def test_cirrus_missing_app_id(): - exp = Experiment( - "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP_ID - ) + exp = Experiment("slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1404,9 +1394,7 @@ def test_cirrus_missing_app_id(): def test_cirrus_missing_app_id_exposures(): - exp = Experiment( - "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP_ID - ) + exp = Experiment("slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1424,9 +1412,7 @@ def test_cirrus_missing_app_id_exposures(): def test_fenix_group_id_incompatible(): - exp = Experiment( - "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP_ID - ) + exp = Experiment("slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1442,9 +1428,7 @@ def test_fenix_group_id_incompatible(): def test_fenix_group_id_incompatible_exposures(): - exp = Experiment( - "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP_ID - ) + exp = Experiment("slug", "2019-01-01", 8, 
experimental_unit=ExperimentalUnit.GROUP) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", From aaf5033a98126db657eb28610e1b9e56543855ae Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Thu, 1 Aug 2024 09:55:54 -0700 Subject: [PATCH 17/40] update docstring --- src/mozanalysis/experiment.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/mozanalysis/experiment.py b/src/mozanalysis/experiment.py index 63126fd9..c898669d 100644 --- a/src/mozanalysis/experiment.py +++ b/src/mozanalysis/experiment.py @@ -120,11 +120,11 @@ class Experiment: dataset derived from its app ID, like `org_mozilla_firefox`. app_name (str, optional): The Glean app name, like `fenix`. experimental_unit (ExperimentalUnit, optional): the "unit" of analysis, - which defines an experimental unit. For example: `client_id` - for mobile experiments or `group_id` for desktop experiments. Is used + which defines an experimental unit. For example: `CLIENT` + for mobile experiments or `GROUP` for desktop experiments. Is used as the join key when building queries and sub-unit level data is - aggregated up to that level. Defaults to client_id unless specified - (ExperimentalUnit.CLIENT) + aggregated up to that level. 
Defaults to `ExperimentalUnit.CLIENT` + unless specified Attributes: experiment_slug (str): Name of the study, used to identify From d4fdbe17c1b5fef353e14cf27c0d2c0887f9e0f4 Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Thu, 1 Aug 2024 13:49:38 -0700 Subject: [PATCH 18/40] combined client/group query strings into one, added check for downsampling --- src/mozanalysis/experiment.py | 144 +++++++++----------------- tests/test_experiment.py | 185 +++++++++++++++++++--------------- 2 files changed, 150 insertions(+), 179 deletions(-) diff --git a/src/mozanalysis/experiment.py b/src/mozanalysis/experiment.py index c898669d..2175c510 100644 --- a/src/mozanalysis/experiment.py +++ b/src/mozanalysis/experiment.py @@ -715,48 +715,28 @@ def _build_enrollments_query_normandy( sample_size: int = 100, ) -> str: """Return SQL to query enrollments for a normandy experiment""" - if self.experimental_unit == ExperimentalUnit.CLIENT: - return f""" - SELECT - e.client_id, - `mozfun.map.get_key`(e.event_map_values, 'branch') - AS branch, - MIN(e.submission_date) AS enrollment_date, - COUNT(e.submission_date) AS num_enrollment_events - FROM - `moz-fx-data-shared-prod.telemetry.events` e - WHERE - e.event_category = 'normandy' - AND e.event_method = 'enroll' - AND e.submission_date - BETWEEN '{time_limits.first_enrollment_date}' AND '{time_limits.last_enrollment_date}' - AND e.event_string_value = '{self.experiment_slug}' - AND e.sample_id < {sample_size} - GROUP BY e.client_id, branch - """ # noqa:E501 - elif self.experimental_unit == ExperimentalUnit.GROUP: - # TODO: update this based on the final structure of the group_id - # within the events ping - return f""" - SELECT - e.profile_group_id, - `mozfun.map.get_key`(e.event_map_values, 'branch') - AS branch, - MIN(e.submission_date) AS enrollment_date, - COUNT(e.submission_date) AS num_enrollment_events - FROM - `moz-fx-data-shared-prod.telemetry.events` e - WHERE - e.event_category = 'normandy' - AND e.event_method = 'enroll' - 
AND e.submission_date - BETWEEN '{time_limits.first_enrollment_date}' AND '{time_limits.last_enrollment_date}' - AND e.event_string_value = '{self.experiment_slug}' - AND e.sample_id < {sample_size} - GROUP BY e.profile_group_id, branch - """ # noqa:E501 - else: - assert_never(self.experimental_unit) + if (self.experimental_unit == ExperimentalUnit.GROUP) and (sample_size < 100): + raise ValueError( + "Downsampling is not yet supported for group-level experiments" + ) + return f""" + SELECT + e.{self.experimental_unit.value}, + `mozfun.map.get_key`(e.event_map_values, 'branch') + AS branch, + MIN(e.submission_date) AS enrollment_date, + COUNT(e.submission_date) AS num_enrollment_events + FROM + `moz-fx-data-shared-prod.telemetry.events` e + WHERE + e.event_category = 'normandy' + AND e.event_method = 'enroll' + AND e.submission_date + BETWEEN '{time_limits.first_enrollment_date}' AND '{time_limits.last_enrollment_date}' + AND e.event_string_value = '{self.experiment_slug}' + AND e.sample_id < {sample_size} + GROUP BY e.{self.experimental_unit.value}, branch + """ # noqa:E501 def _build_enrollments_query_fenix_baseline( self, time_limits: TimeLimits, sample_size: int = 100 @@ -873,62 +853,32 @@ def _build_enrollments_query_cirrus( def _build_exposure_query_normandy(self, time_limits: TimeLimits) -> str: """Return SQL to query exposures for a normandy experiment""" - if self.experimental_unit == ExperimentalUnit.CLIENT: - return f""" - SELECT - e.client_id, - e.branch, - min(e.submission_date) AS exposure_date, - COUNT(e.submission_date) AS num_exposure_events - FROM raw_enrollments re - LEFT JOIN ( - SELECT - client_id, - `mozfun.map.get_key`(event_map_values, 'branchSlug') AS branch, - submission_date - FROM - `moz-fx-data-shared-prod.telemetry.events` - WHERE - event_category = 'normandy' - AND (event_method = 'exposure' OR event_method = 'expose') - AND submission_date - BETWEEN '{time_limits.first_enrollment_date}' AND '{time_limits.last_enrollment_date}' - AND 
event_string_value = '{self.experiment_slug}' - ) e - ON re.client_id = e.client_id AND - re.branch = e.branch AND - e.submission_date >= re.enrollment_date - GROUP BY e.client_id, e.branch - """ # noqa: E501 - elif self.experimental_unit == ExperimentalUnit.GROUP: - return f""" + return f""" + SELECT + e.{self.experimental_unit.value}, + e.branch, + min(e.submission_date) AS exposure_date, + COUNT(e.submission_date) AS num_exposure_events + FROM raw_enrollments re + LEFT JOIN ( SELECT - e.profile_group_id, - e.branch, - min(e.submission_date) AS exposure_date, - COUNT(e.submission_date) AS num_exposure_events - FROM raw_enrollments re - LEFT JOIN ( - SELECT - profile_group_id, - `mozfun.map.get_key`(event_map_values, 'branchSlug') AS branch, - submission_date - FROM - `moz-fx-data-shared-prod.telemetry.events` - WHERE - event_category = 'normandy' - AND (event_method = 'exposure' OR event_method = 'expose') - AND submission_date - BETWEEN '{time_limits.first_enrollment_date}' AND '{time_limits.last_enrollment_date}' - AND event_string_value = '{self.experiment_slug}' - ) e - ON re.profile_group_id = e.profile_group_id AND - re.branch = e.branch AND - e.submission_date >= re.enrollment_date - GROUP BY e.profile_group_id, e.branch - """ # noqa: E501 - else: - assert_never(self.experimental_unit) + {self.experimental_unit.value}, + `mozfun.map.get_key`(event_map_values, 'branchSlug') AS branch, + submission_date + FROM + `moz-fx-data-shared-prod.telemetry.events` + WHERE + event_category = 'normandy' + AND (event_method = 'exposure' OR event_method = 'expose') + AND submission_date + BETWEEN '{time_limits.first_enrollment_date}' AND '{time_limits.last_enrollment_date}' + AND event_string_value = '{self.experiment_slug}' + ) e + ON re.{self.experimental_unit.value} = e.{self.experimental_unit.value} AND + re.branch = e.branch AND + e.submission_date >= re.enrollment_date + GROUP BY e.{self.experimental_unit.value}, e.branch + """ # noqa: E501 def 
_build_exposure_query_glean_event( self, diff --git a/tests/test_experiment.py b/tests/test_experiment.py index 76f56775..99676c3d 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -903,23 +903,23 @@ def test_enrollments_query_explicit_client_id(): expected = """ WITH raw_enrollments AS ( - SELECT - e.client_id, - `mozfun.map.get_key`(e.event_map_values, 'branch') - AS branch, - MIN(e.submission_date) AS enrollment_date, - COUNT(e.submission_date) AS num_enrollment_events - FROM - `moz-fx-data-shared-prod.telemetry.events` e - WHERE - e.event_category = 'normandy' - AND e.event_method = 'enroll' - AND e.submission_date - BETWEEN '2019-01-01' AND '2019-01-08' - AND e.event_string_value = 'slug' - AND e.sample_id < 100 - GROUP BY e.client_id, branch - ), +SELECT + e.client_id, + `mozfun.map.get_key`(e.event_map_values, 'branch') + AS branch, + MIN(e.submission_date) AS enrollment_date, + COUNT(e.submission_date) AS num_enrollment_events +FROM + `moz-fx-data-shared-prod.telemetry.events` e +WHERE + e.event_category = 'normandy' + AND e.event_method = 'enroll' + AND e.submission_date + BETWEEN '2019-01-01' AND '2019-01-08' + AND e.event_string_value = 'slug' + AND e.sample_id < 100 +GROUP BY e.client_id, branch + ), segmented_enrollments AS ( SELECT raw_enrollments.*, @@ -928,31 +928,31 @@ def test_enrollments_query_explicit_client_id(): ), exposures AS ( +SELECT + e.client_id, + e.branch, + min(e.submission_date) AS exposure_date, + COUNT(e.submission_date) AS num_exposure_events +FROM raw_enrollments re +LEFT JOIN ( SELECT - e.client_id, - e.branch, - min(e.submission_date) AS exposure_date, - COUNT(e.submission_date) AS num_exposure_events - FROM raw_enrollments re - LEFT JOIN ( - SELECT - client_id, - `mozfun.map.get_key`(event_map_values, 'branchSlug') AS branch, - submission_date - FROM - `moz-fx-data-shared-prod.telemetry.events` - WHERE - event_category = 'normandy' - AND (event_method = 'exposure' OR event_method = 'expose') - AND 
submission_date - BETWEEN '2019-01-01' AND '2019-01-08' - AND event_string_value = 'slug' - ) e - ON re.client_id = e.client_id AND - re.branch = e.branch AND - e.submission_date >= re.enrollment_date - GROUP BY e.client_id, e.branch - ) + client_id, + `mozfun.map.get_key`(event_map_values, 'branchSlug') AS branch, + submission_date + FROM + `moz-fx-data-shared-prod.telemetry.events` + WHERE + event_category = 'normandy' + AND (event_method = 'exposure' OR event_method = 'expose') + AND submission_date + BETWEEN '2019-01-01' AND '2019-01-08' + AND event_string_value = 'slug' +) e +ON re.client_id = e.client_id AND + re.branch = e.branch AND + e.submission_date >= re.enrollment_date +GROUP BY e.client_id, e.branch + ) SELECT se.*, @@ -1090,23 +1090,23 @@ def test_enrollments_query_explicit_group_id(): expected = """ WITH raw_enrollments AS ( - SELECT - e.profile_group_id, - `mozfun.map.get_key`(e.event_map_values, 'branch') - AS branch, - MIN(e.submission_date) AS enrollment_date, - COUNT(e.submission_date) AS num_enrollment_events - FROM - `moz-fx-data-shared-prod.telemetry.events` e - WHERE - e.event_category = 'normandy' - AND e.event_method = 'enroll' - AND e.submission_date - BETWEEN '2019-01-01' AND '2019-01-08' - AND e.event_string_value = 'slug' - AND e.sample_id < 100 - GROUP BY e.profile_group_id, branch - ), +SELECT + e.profile_group_id, + `mozfun.map.get_key`(e.event_map_values, 'branch') + AS branch, + MIN(e.submission_date) AS enrollment_date, + COUNT(e.submission_date) AS num_enrollment_events +FROM + `moz-fx-data-shared-prod.telemetry.events` e +WHERE + e.event_category = 'normandy' + AND e.event_method = 'enroll' + AND e.submission_date + BETWEEN '2019-01-01' AND '2019-01-08' + AND e.event_string_value = 'slug' + AND e.sample_id < 100 +GROUP BY e.profile_group_id, branch + ), segmented_enrollments AS ( SELECT raw_enrollments.*, @@ -1115,31 +1115,31 @@ def test_enrollments_query_explicit_group_id(): ), exposures AS ( +SELECT + e.profile_group_id, + 
e.branch, + min(e.submission_date) AS exposure_date, + COUNT(e.submission_date) AS num_exposure_events +FROM raw_enrollments re +LEFT JOIN ( SELECT - e.profile_group_id, - e.branch, - min(e.submission_date) AS exposure_date, - COUNT(e.submission_date) AS num_exposure_events - FROM raw_enrollments re - LEFT JOIN ( - SELECT - profile_group_id, - `mozfun.map.get_key`(event_map_values, 'branchSlug') AS branch, - submission_date - FROM - `moz-fx-data-shared-prod.telemetry.events` - WHERE - event_category = 'normandy' - AND (event_method = 'exposure' OR event_method = 'expose') - AND submission_date - BETWEEN '2019-01-01' AND '2019-01-08' - AND event_string_value = 'slug' - ) e - ON re.profile_group_id = e.profile_group_id AND - re.branch = e.branch AND - e.submission_date >= re.enrollment_date - GROUP BY e.profile_group_id, e.branch - ) + profile_group_id, + `mozfun.map.get_key`(event_map_values, 'branchSlug') AS branch, + submission_date + FROM + `moz-fx-data-shared-prod.telemetry.events` + WHERE + event_category = 'normandy' + AND (event_method = 'exposure' OR event_method = 'expose') + AND submission_date + BETWEEN '2019-01-01' AND '2019-01-08' + AND event_string_value = 'slug' +) e +ON re.profile_group_id = e.profile_group_id AND + re.branch = e.branch AND + e.submission_date >= re.enrollment_date +GROUP BY e.profile_group_id, e.branch + ) SELECT se.*, @@ -1439,3 +1439,24 @@ def test_fenix_group_id_incompatible_exposures(): exp._build_exposure_query( time_limits=tl, exposure_query_type=EnrollmentsQueryType.FENIX_FALLBACK ) + + +def test_group_id_no_downsampling(): + exp = Experiment("slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP) + + tl = TimeLimits.for_ts( + first_enrollment_date="2019-01-01", + last_date_full_data="2019-03-01", + time_series_period="weekly", + num_dates_enrollment=8, + ) + + with pytest.raises( + ValueError, + match="Downsampling is not yet supported for group-level experiments", + ): + exp.build_enrollments_query( + 
time_limits=tl, + enrollments_query_type=EnrollmentsQueryType.NORMANDY, + sample_size=99, + ) From 11aee4ef6c1f101e0ea4ada6352e6aa7c332edeb Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Fri, 2 Aug 2024 08:24:55 -0700 Subject: [PATCH 19/40] Update src/mozanalysis/metrics.py Co-authored-by: Mike Williams <102263964+mikewilli@users.noreply.github.com> --- src/mozanalysis/metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mozanalysis/metrics.py b/src/mozanalysis/metrics.py index 73a6df2c..50d7833a 100644 --- a/src/mozanalysis/metrics.py +++ b/src/mozanalysis/metrics.py @@ -222,8 +222,8 @@ def build_query_targets( be executed to query all metrics from this data source. """ if experimental_unit != ExperimentalUnit.CLIENT: - raise ValueError( - "`build_query_targets` currently supports client_id analysis" + raise IncompatibleExperimentalUnit( + "`build_query_targets` currently only supports client_id analysis" ) return """ From 5a52dd59a86d16763e104d0911e1c4b1f2ca7776 Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Fri, 2 Aug 2024 08:25:05 -0700 Subject: [PATCH 20/40] Update src/mozanalysis/metrics.py Co-authored-by: Mike Williams <102263964+mikewilli@users.noreply.github.com> --- src/mozanalysis/metrics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mozanalysis/metrics.py b/src/mozanalysis/metrics.py index 50d7833a..e9e78665 100644 --- a/src/mozanalysis/metrics.py +++ b/src/mozanalysis/metrics.py @@ -160,6 +160,7 @@ def build_query( ds_id = self.group_id_column or "profile_group_id" else: assert_never(experimental_unit) + return """SELECT e.{id_column}, e.branch, From f3d76dd429bd2d7763991c461aec5e498b7539fb Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Fri, 2 Aug 2024 08:25:42 -0700 Subject: [PATCH 21/40] Update src/mozanalysis/experiment.py Co-authored-by: Mike Williams <102263964+mikewilli@users.noreply.github.com> --- src/mozanalysis/experiment.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff 
--git a/src/mozanalysis/experiment.py b/src/mozanalysis/experiment.py index 2175c510..50657bf9 100644 --- a/src/mozanalysis/experiment.py +++ b/src/mozanalysis/experiment.py @@ -967,13 +967,13 @@ def _build_metrics_query_bits( return metrics_columns, metrics_joins def _partition_segments_by_data_source( - self, metric_or_segment_list: list[Segment] + self, segment_list: list[Segment] ) -> dict[SegmentDataSource, list[Segment]]: - """Return a dict mapping data sources to metric/segment lists.""" - data_sources = {m.data_source for m in metric_or_segment_list} + """Return a dict mapping segment data sources to segment lists.""" + data_sources = {s.data_source for s in segment_list} return { - ds: [m for m in metric_or_segment_list if m.data_source == ds] + ds: [s for s in segment_list if s.data_source == ds] for ds in data_sources } From 9e03d864b9c4e6363294ed0d0564771c3a23263f Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Fri, 2 Aug 2024 08:26:02 -0700 Subject: [PATCH 22/40] Update src/mozanalysis/experiment.py Co-authored-by: Mike Williams <102263964+mikewilli@users.noreply.github.com> --- src/mozanalysis/experiment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mozanalysis/experiment.py b/src/mozanalysis/experiment.py index 50657bf9..4d7b9cae 100644 --- a/src/mozanalysis/experiment.py +++ b/src/mozanalysis/experiment.py @@ -978,7 +978,7 @@ def _partition_segments_by_data_source( } def _partition_metrics_by_data_source( - self, metric_or_segment_list: list[Metric] + self, metric_list: list[Metric] ) -> dict[DataSource, list[Metric]]: """Return a dict mapping data sources to metric/segment lists.""" data_sources = {m.data_source for m in metric_or_segment_list} From 44414789ca5d752df6f7ba100357bfac1f6f70d8 Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Fri, 2 Aug 2024 08:33:54 -0700 Subject: [PATCH 23/40] cleanup --- src/mozanalysis/experiment.py | 14 ++++---------- src/mozanalysis/metrics.py | 2 +- src/mozanalysis/types.py | 4 ++++ 
3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/src/mozanalysis/experiment.py b/src/mozanalysis/experiment.py index 4d7b9cae..e3557ac3 100644 --- a/src/mozanalysis/experiment.py +++ b/src/mozanalysis/experiment.py @@ -15,7 +15,7 @@ from mozanalysis.config import ConfigLoader from mozanalysis.metrics import AnalysisBasis, DataSource, Metric from mozanalysis.segments import Segment, SegmentDataSource -from mozanalysis.types import ExperimentalUnit +from mozanalysis.types import ExperimentalUnit, IncompatibleExperimentalUnit from mozanalysis.utils import add_days, date_sub, hash_ish if TYPE_CHECKING: @@ -31,10 +31,6 @@ class EnrollmentsQueryType(str, Enum): GLEAN_EVENT = "glean-event" -class IncompatibleExperimentalUnit(ValueError): - pass - - @attr.s(frozen=True, slots=True) class Experiment: """Query experiment data; store experiment metadata. @@ -973,19 +969,17 @@ def _partition_segments_by_data_source( data_sources = {s.data_source for s in segment_list} return { - ds: [s for s in segment_list if s.data_source == ds] - for ds in data_sources + ds: [s for s in segment_list if s.data_source == ds] for ds in data_sources } def _partition_metrics_by_data_source( self, metric_list: list[Metric] ) -> dict[DataSource, list[Metric]]: """Return a dict mapping data sources to metric/segment lists.""" - data_sources = {m.data_source for m in metric_or_segment_list} + data_sources = {m.data_source for m in metric_list} return { - ds: [m for m in metric_or_segment_list if m.data_source == ds] - for ds in data_sources + ds: [m for m in metric_list if m.data_source == ds] for ds in data_sources } def _build_segments_query( diff --git a/src/mozanalysis/metrics.py b/src/mozanalysis/metrics.py index e9e78665..9ca7afb3 100644 --- a/src/mozanalysis/metrics.py +++ b/src/mozanalysis/metrics.py @@ -8,7 +8,7 @@ from typing_extensions import assert_never -from mozanalysis.types import ExperimentalUnit +from mozanalysis.types import ExperimentalUnit, 
IncompatibleExperimentalUnit if TYPE_CHECKING: from mozanalysis.experiment import TimeLimits diff --git a/src/mozanalysis/types.py b/src/mozanalysis/types.py index 3fdd8d86..f8281621 100644 --- a/src/mozanalysis/types.py +++ b/src/mozanalysis/types.py @@ -24,3 +24,7 @@ class Uplift(str, Enum): class ExperimentalUnit(str, Enum): CLIENT = "client_id" GROUP = "profile_group_id" + + +class IncompatibleExperimentalUnit(ValueError): + pass From a64b2501103ad13ecaf94add93dceea4e8e3eb65 Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Fri, 2 Aug 2024 08:39:03 -0700 Subject: [PATCH 24/40] rename unit from GROUP to PROFILE_GROUP --- src/mozanalysis/experiment.py | 4 ++- src/mozanalysis/metrics.py | 2 +- src/mozanalysis/types.py | 2 +- tests/test_experiment.py | 58 +++++++++++++++++++++++------------ 4 files changed, 43 insertions(+), 23 deletions(-) diff --git a/src/mozanalysis/experiment.py b/src/mozanalysis/experiment.py index e3557ac3..8f73ddc8 100644 --- a/src/mozanalysis/experiment.py +++ b/src/mozanalysis/experiment.py @@ -711,7 +711,9 @@ def _build_enrollments_query_normandy( sample_size: int = 100, ) -> str: """Return SQL to query enrollments for a normandy experiment""" - if (self.experimental_unit == ExperimentalUnit.GROUP) and (sample_size < 100): + if (self.experimental_unit == ExperimentalUnit.PROFILE_GROUP) and ( + sample_size < 100 + ): raise ValueError( "Downsampling is not yet supported for group-level experiments" ) diff --git a/src/mozanalysis/metrics.py b/src/mozanalysis/metrics.py index 9ca7afb3..8c4fd090 100644 --- a/src/mozanalysis/metrics.py +++ b/src/mozanalysis/metrics.py @@ -156,7 +156,7 @@ def build_query( """ if experimental_unit == ExperimentalUnit.CLIENT: ds_id = self.client_id_column or "client_id" - elif experimental_unit == ExperimentalUnit.GROUP: + elif experimental_unit == ExperimentalUnit.PROFILE_GROUP: ds_id = self.group_id_column or "profile_group_id" else: assert_never(experimental_unit) diff --git a/src/mozanalysis/types.py 
b/src/mozanalysis/types.py index f8281621..eb80caf2 100644 --- a/src/mozanalysis/types.py +++ b/src/mozanalysis/types.py @@ -23,7 +23,7 @@ class Uplift(str, Enum): class ExperimentalUnit(str, Enum): CLIENT = "client_id" - GROUP = "profile_group_id" + PROFILE_GROUP = "profile_group_id" class IncompatibleExperimentalUnit(ValueError): diff --git a/tests/test_experiment.py b/tests/test_experiment.py index 99676c3d..4a749db5 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -282,7 +282,7 @@ def test_analysis_window_validates_end(): @pytest.mark.parametrize( - "experimental_unit", [ExperimentalUnit.CLIENT, ExperimentalUnit.GROUP] + "experimental_unit", [ExperimentalUnit.CLIENT, ExperimentalUnit.PROFILE_GROUP] ) def test_query_not_detectably_malformed(experimental_unit: ExperimentalUnit): exp = Experiment("slug", "2019-01-01", 8, experimental_unit=experimental_unit) @@ -305,7 +305,7 @@ def test_query_not_detectably_malformed(experimental_unit: ExperimentalUnit): if experimental_unit == ExperimentalUnit.CLIENT: assert "client_id" in enrollments_sql - elif experimental_unit == ExperimentalUnit.GROUP: + elif experimental_unit == ExperimentalUnit.PROFILE_GROUP: assert "profile_group_id" in enrollments_sql metrics_sql = exp.build_metrics_query( @@ -318,12 +318,12 @@ def test_query_not_detectably_malformed(experimental_unit: ExperimentalUnit): if experimental_unit == ExperimentalUnit.CLIENT: assert "client_id" in metrics_sql - elif experimental_unit == ExperimentalUnit.GROUP: + elif experimental_unit == ExperimentalUnit.PROFILE_GROUP: assert "profile_group_id" in metrics_sql @pytest.mark.parametrize( - "experimental_unit", [ExperimentalUnit.CLIENT, ExperimentalUnit.GROUP] + "experimental_unit", [ExperimentalUnit.CLIENT, ExperimentalUnit.PROFILE_GROUP] ) def test_megaquery_not_detectably_malformed(experimental_unit: ExperimentalUnit): exp = Experiment("slug", "2019-01-01", 8, experimental_unit=experimental_unit) @@ -343,7 +343,7 @@ def 
test_megaquery_not_detectably_malformed(experimental_unit: ExperimentalUnit) if experimental_unit == ExperimentalUnit.CLIENT: assert "client_id" in enrollments_sql - elif experimental_unit == ExperimentalUnit.GROUP: + elif experimental_unit == ExperimentalUnit.PROFILE_GROUP: assert "profile_group_id" in enrollments_sql metrics_sql = exp.build_metrics_query( @@ -356,12 +356,12 @@ def test_megaquery_not_detectably_malformed(experimental_unit: ExperimentalUnit) if experimental_unit == ExperimentalUnit.CLIENT: assert "client_id" in metrics_sql - elif experimental_unit == ExperimentalUnit.GROUP: + elif experimental_unit == ExperimentalUnit.PROFILE_GROUP: assert "profile_group_id" in metrics_sql @pytest.mark.parametrize( - "experimental_unit", [ExperimentalUnit.CLIENT, ExperimentalUnit.GROUP] + "experimental_unit", [ExperimentalUnit.CLIENT, ExperimentalUnit.PROFILE_GROUP] ) def test_segments_megaquery_not_detectably_malformed( experimental_unit: ExperimentalUnit, @@ -1073,7 +1073,9 @@ def test_metrics_query_explicit_client_id(): def test_enrollments_query_explicit_group_id(): - exp = Experiment("slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP) + exp = Experiment( + "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.PROFILE_GROUP + ) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1153,7 +1155,9 @@ def test_enrollments_query_explicit_group_id(): def test_metrics_query_explicit_group_id(): - exp = Experiment("slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP) + exp = Experiment( + "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.PROFILE_GROUP + ) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1254,7 +1258,7 @@ def test_glean_group_id_incompatible(): "slug", "2019-01-01", 8, - experimental_unit=ExperimentalUnit.GROUP, + experimental_unit=ExperimentalUnit.PROFILE_GROUP, app_id="test_app", ) @@ -1276,7 +1280,7 @@ def test_glean_group_id_incompatible_exposures(): "slug", "2019-01-01", 8, - 
experimental_unit=ExperimentalUnit.GROUP, + experimental_unit=ExperimentalUnit.PROFILE_GROUP, app_id="test_app", ) @@ -1294,7 +1298,9 @@ def test_glean_group_id_incompatible_exposures(): def test_glean_missing_app_id(): - exp = Experiment("slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP) + exp = Experiment( + "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.PROFILE_GROUP + ) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1312,7 +1318,9 @@ def test_glean_missing_app_id(): def test_glean_exposures_missing_app_id(): - exp = Experiment("slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP) + exp = Experiment( + "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.PROFILE_GROUP + ) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1334,7 +1342,7 @@ def test_cirrus_group_id_incompatible(): "slug", "2019-01-01", 8, - experimental_unit=ExperimentalUnit.GROUP, + experimental_unit=ExperimentalUnit.PROFILE_GROUP, app_id="test_app", ) @@ -1356,7 +1364,7 @@ def test_cirrus_group_id_incompatible_exposures(): "slug", "2019-01-01", 8, - experimental_unit=ExperimentalUnit.GROUP, + experimental_unit=ExperimentalUnit.PROFILE_GROUP, app_id="test_app", ) @@ -1374,7 +1382,9 @@ def test_cirrus_group_id_incompatible_exposures(): def test_cirrus_missing_app_id(): - exp = Experiment("slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP) + exp = Experiment( + "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.PROFILE_GROUP + ) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1392,7 +1402,9 @@ def test_cirrus_missing_app_id(): def test_cirrus_missing_app_id_exposures(): - exp = Experiment("slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP) + exp = Experiment( + "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.PROFILE_GROUP + ) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1410,7 +1422,9 @@ def test_cirrus_missing_app_id_exposures(): def 
test_fenix_group_id_incompatible(): - exp = Experiment("slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP) + exp = Experiment( + "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.PROFILE_GROUP + ) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1426,7 +1440,9 @@ def test_fenix_group_id_incompatible(): def test_fenix_group_id_incompatible_exposures(): - exp = Experiment("slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP) + exp = Experiment( + "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.PROFILE_GROUP + ) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1442,7 +1458,9 @@ def test_fenix_group_id_incompatible_exposures(): def test_group_id_no_downsampling(): - exp = Experiment("slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.GROUP) + exp = Experiment( + "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.PROFILE_GROUP + ) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", From abb48554b4b07acbb28dfad351700a6e94b4fc90 Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Fri, 2 Aug 2024 10:05:19 -0700 Subject: [PATCH 25/40] stricter typing, eliminates unexpected runtype type differences --- src/mozanalysis/config.py | 37 +++++------------- src/mozanalysis/experiment.py | 8 +++- src/mozanalysis/metrics.py | 71 ++++++++++++++++++++++++++++------- 3 files changed, 72 insertions(+), 44 deletions(-) diff --git a/src/mozanalysis/config.py b/src/mozanalysis/config.py index 0b64ca77..66e9c147 100644 --- a/src/mozanalysis/config.py +++ b/src/mozanalysis/config.py @@ -6,7 +6,7 @@ from metric_config_parser.config import ConfigCollection -from mozanalysis.metrics import Metric +from mozanalysis.metrics import Metric, DataSource METRIC_HUB_JETSTREAM_REPO = "https://github.com/mozilla/metric-hub/tree/main/jetstream" @@ -101,12 +101,11 @@ def get_metric(self, metric_slug: str, app_name: str) -> Metric: app_name=app_name, ) - def get_data_source(self, data_source_slug: str, 
app_name: str): + def get_data_source(self, data_source_slug: str, app_name: str) -> DataSource: """Load a data source definition for the given app. Returns a :class:`mozanalysis.metrics.DataSource` instance. """ - from mozanalysis.metrics import DataSource data_source_definition = self.configs.get_data_source_definition( data_source_slug, app_name @@ -121,19 +120,7 @@ def get_data_source(self, data_source_slug: str, app_name: str): f"Could not find application {app_name}, so data source {data_source_slug} could not be resolved" # noqa:E501 ) - return DataSource( - name=data_source_definition.name, - from_expr=data_source_definition.from_expression, - client_id_column=data_source_definition.client_id_column, - submission_date_column=data_source_definition.submission_date_column, - experiments_column_type=( - None - if data_source_definition.experiments_column_type == "none" - else data_source_definition.experiments_column_type - ), - default_dataset=data_source_definition.default_dataset, - app_name=app_name, - ) + return DataSource.from_mcp_data_source(data_source_definition, app_name) def get_segment(self, segment_slug: str, app_name: str): """Load a segment definition for the given app. 
@@ -231,12 +218,15 @@ class MinimalConfiguration: summaries = metric_definition.resolve(outcome_spec, conf, self.configs) metric = summaries[0].metric + if metric.data_source is None: + raise ValueError(f"Unable to resolve DataSource for Metric {metric.name}") + return Metric( name=metric.name, select_expr=metric.select_expression, friendly_name=metric.friendly_name, description=metric.description, - data_source=metric.data_source, + data_source=DataSource.from_mcp_data_source(metric.data_source, app_name), bigger_is_better=metric.bigger_is_better, ) @@ -267,17 +257,8 @@ def get_outcome_data_source( + f" in outcome {outcome_slug}" ) - return DataSource( - name=data_source_definition.name, - from_expr=data_source_definition.from_expression, - client_id_column=data_source_definition.client_id_column, - submission_date_column=data_source_definition.submission_date_column, - experiments_column_type=( - None - if data_source_definition.experiments_column_type == "none" - else data_source_definition.experiments_column_type - ), - default_dataset=data_source_definition.default_dataset, + return DataSource.from_mcp_data_source( + data_source_definition, app_name=app_name ) diff --git a/src/mozanalysis/experiment.py b/src/mozanalysis/experiment.py index 8f73ddc8..fdbf750a 100644 --- a/src/mozanalysis/experiment.py +++ b/src/mozanalysis/experiment.py @@ -136,12 +136,16 @@ class Experiment: before UTC midnight. 
""" - experiment_slug = attr.ib(type=str) + experiment_slug = attr.ib(type=str, validator=attr.validators.instance_of(str)) start_date = attr.ib() num_dates_enrollment = attr.ib(default=None) app_id = attr.ib(default=None) app_name = attr.ib(default=None) - experimental_unit = attr.ib(type=ExperimentalUnit, default=ExperimentalUnit.CLIENT) + experimental_unit = attr.ib( + type=ExperimentalUnit, + default=ExperimentalUnit.CLIENT, + converter=lambda s: ExperimentalUnit(s), # allows callers to pass a string + ) def get_app_name(self): """ diff --git a/src/mozanalysis/metrics.py b/src/mozanalysis/metrics.py index 8c4fd090..a0e032b8 100644 --- a/src/mozanalysis/metrics.py +++ b/src/mozanalysis/metrics.py @@ -17,6 +17,8 @@ import attr +from metric_config_parser.data_source import DataSource as ParserDataSource + logger = logging.getLogger(__name__) @@ -68,12 +70,27 @@ class DataSource: name = attr.ib(validator=attr.validators.instance_of(str)) _from_expr = attr.ib(validator=attr.validators.instance_of(str)) - experiments_column_type = attr.ib(default="simple", type=str) - client_id_column = attr.ib(default="client_id", type=str) - submission_date_column = attr.ib(default="submission_date", type=str) + experiments_column_type = attr.ib(default="simple", type=str | None) + client_id_column = attr.ib( + default=ExperimentalUnit.CLIENT.value, + type=str, + validator=[attr.validators.instance_of(str), attr.validators.min_len(1)], + converter=attr.converters.default_if_none(ExperimentalUnit.CLIENT.value), + ) + submission_date_column = attr.ib( + default="submission_date", + type=str, + validator=[attr.validators.instance_of(str), attr.validators.min_len(1)], + converter=attr.converters.default_if_none("submission_date"), + ) default_dataset = attr.ib(default=None, type=str | None) app_name = attr.ib(default=None, type=str | None) - group_id_column = attr.ib(default="profile_group_id", type=str) + group_id_column = attr.ib( + default=ExperimentalUnit.PROFILE_GROUP.value, + 
type=str, + validator=[attr.validators.instance_of(str), attr.validators.min_len(1)], + converter=attr.converters.default_if_none(ExperimentalUnit.PROFILE_GROUP.value), + ) EXPERIMENT_COLUMN_TYPES = (None, "simple", "native", "glean") @@ -155,9 +172,9 @@ def build_query( be executed to query all metrics from this data source. """ if experimental_unit == ExperimentalUnit.CLIENT: - ds_id = self.client_id_column or "client_id" + ds_id = self.client_id_column elif experimental_unit == ExperimentalUnit.PROFILE_GROUP: - ds_id = self.group_id_column or "profile_group_id" + ds_id = self.group_id_column else: assert_never(experimental_unit) @@ -185,7 +202,7 @@ def build_query( e.analysis_window_start, e.analysis_window_end""".format( ds_id=ds_id, - submission_date=self.submission_date_column or "submission_date", + submission_date=self.submission_date_column, from_expr=self.from_expr_for(from_expr_dataset), fddr=time_limits.first_date_data_required, lddr=time_limits.last_date_data_required, @@ -199,7 +216,7 @@ def build_query( else "enrollment_date" ), ignore_pre_enroll_first_day=self.experiments_column_expr.format( - submission_date=self.submission_date_column or "submission_date", + submission_date=self.submission_date_column, experiment_slug=experiment_slug, ), id_column=experimental_unit.value, @@ -243,7 +260,7 @@ def build_query_targets( t.enrollment_date, t.analysis_window_start, t.analysis_window_end""".format( - client_id=self.client_id_column or "client_id", + client_id=self.client_id_column, from_expr=self.from_expr_for(from_expr_dataset), metrics=",\n ".join( f"{m.select_expr.format(experiment_name=experiment_name)} AS {m.name}" @@ -255,7 +272,7 @@ def build_query_targets( AND ds.{submission_date} BETWEEN DATE_ADD(t.enrollment_date, interval t.analysis_window_start day) AND DATE_ADD(t.enrollment_date, interval t.analysis_window_end day)""".format( - submission_date=self.submission_date_column or "submission_date", + submission_date=self.submission_date_column, 
fddr=time_limits.first_date_data_required, lddr=time_limits.last_date_data_required, ) @@ -264,7 +281,7 @@ def build_query_targets( t.enrollment_date AND DATE_ADD(t.enrollment_date, interval {analysis_length} day) """.format( - submission_date=self.submission_date_column or "submission_date", + submission_date=self.submission_date_column, analysis_length=analysis_length, ) ), @@ -343,6 +360,30 @@ def get_sanity_metrics(self, experiment_slug: str) -> list[Metric]: else: raise ValueError + @classmethod + def from_mcp_data_source( + cls, + parser_data_source: ParserDataSource, + app_name: str | None = None, + group_id_column: str | None = ExperimentalUnit.PROFILE_GROUP.value, + ) -> "DataSource": + """metric-config-parser DataSource objects do not have an `app_name` + and do not, yet, have a group_id_column""" + return cls( + name=parser_data_source.name, + from_expr=parser_data_source.from_expression, + client_id_column=parser_data_source.client_id_column, + submission_date_column=parser_data_source.submission_date_column, + experiments_column_type=( + None + if parser_data_source.experiments_column_type == "none" + else parser_data_source.experiments_column_type + ), + default_dataset=parser_data_source.default_dataset, + app_name=app_name, + group_id_column=group_id_column, + ) + @attr.s(frozen=True, slots=True) class Metric: @@ -364,9 +405,11 @@ class Metric: used for validation """ - name = attr.ib(type=str) - data_source = attr.ib(type=DataSource) - select_expr = attr.ib(type=str) + name = attr.ib(type=str, validator=attr.validators.instance_of(str)) + data_source = attr.ib( + type=DataSource, validator=attr.validators.instance_of(DataSource) + ) + select_expr = attr.ib(type=str, validator=attr.validators.instance_of(str)) friendly_name = attr.ib(type=str | None, default=None) description = attr.ib(type=str | None, default=None) bigger_is_better = attr.ib(type=bool, default=True) From 519f51c8ee443efcbf09dcd5108d56a99fe38aa3 Mon Sep 17 00:00:00 2001 From: Daniel 
Berry Date: Fri, 2 Aug 2024 10:10:10 -0700 Subject: [PATCH 26/40] linting --- src/mozanalysis/config.py | 2 +- src/mozanalysis/metrics.py | 25 +++++++++---------------- 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/src/mozanalysis/config.py b/src/mozanalysis/config.py index 66e9c147..6a921c3e 100644 --- a/src/mozanalysis/config.py +++ b/src/mozanalysis/config.py @@ -6,7 +6,7 @@ from metric_config_parser.config import ConfigCollection -from mozanalysis.metrics import Metric, DataSource +from mozanalysis.metrics import DataSource, Metric METRIC_HUB_JETSTREAM_REPO = "https://github.com/mozilla/metric-hub/tree/main/jetstream" diff --git a/src/mozanalysis/metrics.py b/src/mozanalysis/metrics.py index a0e032b8..178cf1c4 100644 --- a/src/mozanalysis/metrics.py +++ b/src/mozanalysis/metrics.py @@ -11,14 +11,14 @@ from mozanalysis.types import ExperimentalUnit, IncompatibleExperimentalUnit if TYPE_CHECKING: + from metric_config_parser.data_source import DataSource as ParserDataSource + from mozanalysis.experiment import TimeLimits import logging import attr -from metric_config_parser.data_source import DataSource as ParserDataSource - logger = logging.getLogger(__name__) @@ -267,23 +267,16 @@ def build_query_targets( for m in metric_list ), date_clause=( - """ - AND ds.{submission_date} BETWEEN '{fddr}' AND '{lddr}' - AND ds.{submission_date} BETWEEN + f""" + AND ds.{self.submission_date_column} BETWEEN '{time_limits.first_date_data_required}' AND '{time_limits.last_date_data_required}' + AND ds.{self.submission_date_column} BETWEEN DATE_ADD(t.enrollment_date, interval t.analysis_window_start day) AND - DATE_ADD(t.enrollment_date, interval t.analysis_window_end day)""".format( - submission_date=self.submission_date_column, - fddr=time_limits.first_date_data_required, - lddr=time_limits.last_date_data_required, - ) + DATE_ADD(t.enrollment_date, interval t.analysis_window_end day)""" # noqa: E501 if not continuous_enrollment - else """AND 
ds.{submission_date} BETWEEN + else f"""AND ds.{self.submission_date_column} BETWEEN t.enrollment_date AND DATE_ADD(t.enrollment_date, interval {analysis_length} day) - """.format( - submission_date=self.submission_date_column, - analysis_length=analysis_length, - ) + """ ), ) @@ -366,7 +359,7 @@ def from_mcp_data_source( parser_data_source: ParserDataSource, app_name: str | None = None, group_id_column: str | None = ExperimentalUnit.PROFILE_GROUP.value, - ) -> "DataSource": + ) -> DataSource: """metric-config-parser DataSource objects do not have an `app_name` and do not, yet, have a group_id_column""" return cls( From bcf9f2f00dcb4adf47dac74dd9422d90e98c5f19 Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Fri, 2 Aug 2024 10:29:10 -0700 Subject: [PATCH 27/40] mypy validation for metrics.py --- src/mozanalysis/config.py | 15 +++++++++------ src/mozanalysis/metrics.py | 31 +++++++++++++++++++++++++++---- 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/src/mozanalysis/config.py b/src/mozanalysis/config.py index 6a921c3e..aeebb254 100644 --- a/src/mozanalysis/config.py +++ b/src/mozanalysis/config.py @@ -7,6 +7,7 @@ from metric_config_parser.config import ConfigCollection from mozanalysis.metrics import DataSource, Metric +from mozanalysis.segments import Segment, SegmentDataSource METRIC_HUB_JETSTREAM_REPO = "https://github.com/mozilla/metric-hub/tree/main/jetstream" @@ -122,12 +123,11 @@ def get_data_source(self, data_source_slug: str, app_name: str) -> DataSource: return DataSource.from_mcp_data_source(data_source_definition, app_name) - def get_segment(self, segment_slug: str, app_name: str): + def get_segment(self, segment_slug: str, app_name: str) -> Segment: """Load a segment definition for the given app. Returns a :class:`mozanalysis.segments.Segment` instance. 
""" - from mozanalysis.segments import Segment segment_definition = self.configs.get_segment_definition(segment_slug, app_name) if segment_definition is None: @@ -153,12 +153,13 @@ def get_segment(self, segment_slug: str, app_name: str): app_name=app_name, ) - def get_segment_data_source(self, data_source_slug: str, app_name: str): + def get_segment_data_source( + self, data_source_slug: str, app_name: str + ) -> SegmentDataSource: """Load a segment data source definition for the given app. Returns a :class:`mozanalysis.segments.SegmentDataSource` instance. """ - from mozanalysis.segments import SegmentDataSource data_source_definition = self.configs.get_segment_data_source_definition( data_source_slug, app_name @@ -184,7 +185,9 @@ def get_segment_data_source(self, data_source_slug: str, app_name: str): app_name=app_name, ) - def get_outcome_metric(self, metric_slug: str, outcome_slug: str, app_name: str): + def get_outcome_metric( + self, metric_slug: str, outcome_slug: str, app_name: str + ) -> Metric: """Load a metric definition from an outcome defined for the given app. Parametrized metrics are not supported, since they may not be defined outside @@ -232,7 +235,7 @@ class MinimalConfiguration: def get_outcome_data_source( self, data_source_slug: str, outcome_slug: str, app_name: str - ): + ) -> DataSource: """Load a data source definition from an outcome defined for the given app. Returns a :class:`mozanalysis.metrics.DataSource` instance. 
diff --git a/src/mozanalysis/metrics.py b/src/mozanalysis/metrics.py index 178cf1c4..298e09f1 100644 --- a/src/mozanalysis/metrics.py +++ b/src/mozanalysis/metrics.py @@ -29,6 +29,29 @@ class AnalysisBasis(Enum): EXPOSURES = "exposures" +# attr.s converters aren't compatible with mypy, define our own +# see: https://mypy.readthedocs.io/en/stable/additional_features.html#id1 +def client_id_column_converter(client_id_column: str | None) -> str: + if client_id_column is None: + return ExperimentalUnit.CLIENT.value + else: + return client_id_column + + +def group_id_column_converter(group_id_column: str | None) -> str: + if group_id_column is None: + return ExperimentalUnit.PROFILE_GROUP.value + else: + return group_id_column + + +def submission_date_column_converter(submission_date_column: str | None) -> str: + if submission_date_column is None: + return "submission_date" + else: + return submission_date_column + + @attr.s(frozen=True, slots=True) class DataSource: """Represents a table or view, from which Metrics may be defined. 
@@ -75,13 +98,13 @@ class DataSource: _from_expr = attr.ib(validator=attr.validators.instance_of(str)) experiments_column_type = attr.ib(default="simple", type=str | None) client_id_column = attr.ib( default=ExperimentalUnit.CLIENT.value, type=str, validator=[attr.validators.instance_of(str), attr.validators.min_len(1)], - converter=attr.converters.default_if_none(ExperimentalUnit.CLIENT.value), + converter=client_id_column_converter, ) submission_date_column = attr.ib( default="submission_date", type=str, validator=[attr.validators.instance_of(str), attr.validators.min_len(1)], - converter=attr.converters.default_if_none("submission_date"), + converter=submission_date_column_converter, ) default_dataset = attr.ib(default=None, type=str | None) app_name = attr.ib(default=None, type=str | None) group_id_column = attr.ib( @@ -89,7 +112,7 @@ class DataSource: default=ExperimentalUnit.PROFILE_GROUP.value, type=str, validator=[attr.validators.instance_of(str), attr.validators.min_len(1)], - converter=attr.converters.default_if_none(ExperimentalUnit.PROFILE_GROUP.value), + converter=group_id_column_converter, ) EXPERIMENT_COLUMN_TYPES = (None, "simple", "native", "glean") @@ -162,7 +185,7 @@ def build_query( time_limits: TimeLimits, experiment_slug: str, from_expr_dataset: str | None = None, - analysis_basis: str = AnalysisBasis.ENROLLMENTS, + analysis_basis: AnalysisBasis = AnalysisBasis.ENROLLMENTS, experimental_unit: ExperimentalUnit = ExperimentalUnit.CLIENT, exposure_signal=None, ) -> str: From 37be2931795ffb47a2d6e2806b619698878f3f1a Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Mon, 5 Aug 2024 07:59:30 -0700 Subject: [PATCH 28/40] ExperimentalUnit -> AnalysisUnit --- src/mozanalysis/experiment.py | 70 +++++++++++++------------- src/mozanalysis/metrics.py | 24 ++++----- src/mozanalysis/types.py | 4 +- tests/test_experiment.py | 94 ++++++++++++++--------------------- 4 files changed, 86 insertions(+), 106 deletions(-) diff --git a/src/mozanalysis/experiment.py b/src/mozanalysis/experiment.py index fdbf750a..ccff8154 100644 --- a/src/mozanalysis/experiment.py +++ b/src/mozanalysis/experiment.py @@ -15,7 +15,7 @@ from mozanalysis.config import ConfigLoader
from mozanalysis.metrics import AnalysisBasis, DataSource, Metric from mozanalysis.segments import Segment, SegmentDataSource -from mozanalysis.types import ExperimentalUnit, IncompatibleExperimentalUnit +from mozanalysis.types import AnalysisUnit, IncompatibleAnalysisUnit from mozanalysis.utils import add_days, date_sub, hash_ish if TYPE_CHECKING: @@ -115,11 +115,11 @@ class Experiment: app_id (str, optional): For a Glean app, the name of the BigQuery dataset derived from its app ID, like `org_mozilla_firefox`. app_name (str, optional): The Glean app name, like `fenix`. - experimental_unit (ExperimentalUnit, optional): the "unit" of analysis, + analysis_unit (AnalysisUnit, optional): the "unit" of analysis, which defines an experimental unit. For example: `CLIENT` for mobile experiments or `GROUP` for desktop experiments. Is used as the join key when building queries and sub-unit level data is - aggregated up to that level. Defaults to `ExperimentalUnit.CLIENT` + aggregated up to that level. 
Defaults to `AnalysisUnit.CLIENT` unless specified Attributes: @@ -141,10 +141,10 @@ class Experiment: num_dates_enrollment = attr.ib(default=None) app_id = attr.ib(default=None) app_name = attr.ib(default=None) - experimental_unit = attr.ib( - type=ExperimentalUnit, - default=ExperimentalUnit.CLIENT, - converter=lambda s: ExperimentalUnit(s), # allows callers to pass a string + analysis_unit = attr.ib( + type=AnalysisUnit, + default=AnalysisUnit.CLIENT, + converter=lambda s: AnalysisUnit(s), # allows callers to pass a string ) def get_app_name(self): @@ -504,10 +504,10 @@ def build_enrollments_query( SELECT se.*, - e.* EXCEPT ({self.experimental_unit}, branch) + e.* EXCEPT ({self.analysis_unit.value}, branch) FROM segmented_enrollments se LEFT JOIN exposures e - USING ({self.experimental_unit.value}, branch) + USING ({self.analysis_unit.value}, branch) """ def build_metrics_query( @@ -602,7 +602,7 @@ def build_metrics_query( metrics_columns=",\n ".join(metrics_columns), metrics_joins="\n".join(metrics_joins), enrollments_table=enrollments_table, - id_column=self.experimental_unit.value, + id_column=self.analysis_unit.value, ) @staticmethod @@ -637,16 +637,16 @@ def _build_enrollments_query( raise ValueError( "App ID must be defined for building Glean enrollments query" ) - if not self.experimental_unit == ExperimentalUnit.CLIENT: - raise IncompatibleExperimentalUnit( + if not self.analysis_unit == AnalysisUnit.CLIENT: + raise IncompatibleAnalysisUnit( "Glean enrollments currently only support client_id analysis units" ) return self._build_enrollments_query_glean_event( time_limits, self.app_id, sample_size ) elif enrollments_query_type == EnrollmentsQueryType.FENIX_FALLBACK: - if not self.experimental_unit == ExperimentalUnit.CLIENT: - raise IncompatibleExperimentalUnit( + if not self.analysis_unit == AnalysisUnit.CLIENT: + raise IncompatibleAnalysisUnit( "Fenix fallback enrollments currently only support client_id analysis units" # noqa: E501 ) return
self._build_enrollments_query_fenix_baseline( @@ -657,8 +657,8 @@ def _build_enrollments_query( raise ValueError( "App ID must be defined for building Cirrus enrollments query" ) - if not self.experimental_unit == ExperimentalUnit.CLIENT: - raise IncompatibleExperimentalUnit( + if not self.analysis_unit == AnalysisUnit.CLIENT: + raise IncompatibleAnalysisUnit( "Cirrus enrollments currently only support client_id analysis units" ) return self._build_enrollments_query_cirrus(time_limits, self.app_id) @@ -678,14 +678,14 @@ def _build_exposure_query( raise ValueError( "App ID must be defined for building Glean exposures query" ) - if not self.experimental_unit == ExperimentalUnit.CLIENT: - raise IncompatibleExperimentalUnit( + if not self.analysis_unit == AnalysisUnit.CLIENT: + raise IncompatibleAnalysisUnit( "Glean exposures currently only support client_id analysis units" ) return self._build_exposure_query_glean_event(time_limits, self.app_id) elif exposure_query_type == EnrollmentsQueryType.FENIX_FALLBACK: - if not self.experimental_unit == ExperimentalUnit.CLIENT: - raise IncompatibleExperimentalUnit( + if not self.analysis_unit == AnalysisUnit.CLIENT: + raise IncompatibleAnalysisUnit( "Fenix fallback exposures currently only support client_id analysis units" # noqa: E501 ) return self._build_exposure_query_glean_event( @@ -696,8 +696,8 @@ def _build_exposure_query( raise ValueError( "App ID must be defined for building Cirrus exposures query" ) - if not self.experimental_unit == ExperimentalUnit.CLIENT: - raise IncompatibleExperimentalUnit( + if not self.analysis_unit == AnalysisUnit.CLIENT: + raise IncompatibleAnalysisUnit( "Cirrus exposures currently only support client_id analysis units" ) return self._build_exposure_query_glean_event( @@ -715,15 +715,13 @@ def _build_enrollments_query_normandy( sample_size: int = 100, ) -> str: """Return SQL to query enrollments for a normandy experiment""" - if (self.experimental_unit == ExperimentalUnit.PROFILE_GROUP) and ( 
- sample_size < 100 - ): + if (self.analysis_unit == AnalysisUnit.PROFILE_GROUP) and (sample_size < 100): raise ValueError( "Downsampling is not yet supported for group-level experiments" ) return f""" SELECT - e.{self.experimental_unit.value}, + e.{self.analysis_unit.value}, `mozfun.map.get_key`(e.event_map_values, 'branch') AS branch, MIN(e.submission_date) AS enrollment_date, @@ -737,7 +735,7 @@ def _build_enrollments_query_normandy( BETWEEN '{time_limits.first_enrollment_date}' AND '{time_limits.last_enrollment_date}' AND e.event_string_value = '{self.experiment_slug}' AND e.sample_id < {sample_size} - GROUP BY e.{self.experimental_unit.value}, branch + GROUP BY e.{self.analysis_unit.value}, branch """ # noqa:E501 def _build_enrollments_query_fenix_baseline( @@ -857,14 +855,14 @@ def _build_exposure_query_normandy(self, time_limits: TimeLimits) -> str: """Return SQL to query exposures for a normandy experiment""" return f""" SELECT - e.{self.experimental_unit.value}, + e.{self.analysis_unit.value}, e.branch, min(e.submission_date) AS exposure_date, COUNT(e.submission_date) AS num_exposure_events FROM raw_enrollments re LEFT JOIN ( SELECT - {self.experimental_unit.value}, + {self.analysis_unit.value}, `mozfun.map.get_key`(event_map_values, 'branchSlug') AS branch, submission_date FROM @@ -876,10 +874,10 @@ def _build_exposure_query_normandy(self, time_limits: TimeLimits) -> str: BETWEEN '{time_limits.first_enrollment_date}' AND '{time_limits.last_enrollment_date}' AND event_string_value = '{self.experiment_slug}' ) e - ON re.{self.experimental_unit.value} = e.{self.experimental_unit.value} AND + ON re.{self.analysis_unit.value} = e.{self.analysis_unit.value} AND re.branch = e.branch AND e.submission_date >= re.enrollment_date - GROUP BY e.{self.experimental_unit.value}, e.branch + GROUP BY e.{self.analysis_unit.value}, e.branch """ # noqa: E501 def _build_exposure_query_glean_event( @@ -952,14 +950,14 @@ def _build_metrics_query_bits( self.experiment_slug, 
self.app_id, analysis_basis, - self.experimental_unit, + self.analysis_unit, exposure_signal, ) metrics_joins.append( f""" LEFT JOIN ( {query_for_metrics} - ) ds_{i} USING ({self.experimental_unit.value}, branch, analysis_window_start, analysis_window_end) + ) ds_{i} USING ({self.analysis_unit.value}, branch, analysis_window_start, analysis_window_end) """ # noqa: E501 ) @@ -1047,7 +1045,7 @@ def _build_segments_query_bits( segments_joins.append( f""" LEFT JOIN ( {query_for_segments} - ) ds_{i} USING ({self.experimental_unit.value}, branch) + ) ds_{i} USING ({self.analysis_unit.value}, branch) """ ) @@ -1320,7 +1318,7 @@ class TimeSeriesResult: fully_qualified_table_name = attr.ib(type=str) analysis_windows = attr.ib(type=list) - experimental_unit = attr.ib(type=ExperimentalUnit, default=ExperimentalUnit.CLIENT) + analysis_unit = attr.ib(type=AnalysisUnit, default=AnalysisUnit.CLIENT) def get(self, bq_context: BigQueryContext, analysis_window) -> DataFrame: """Get the DataFrame for a specific analysis window. @@ -1430,7 +1428,7 @@ def _build_analysis_window_subset_query( in "the standard format" for a single analysis window. 
""" return f""" - SELECT * EXCEPT ({self.experimental_unit.value}, analysis_window_start, analysis_window_end) + SELECT * EXCEPT ({self.analysis_unit.value}, analysis_window_start, analysis_window_end) FROM {self.fully_qualified_table_name} WHERE analysis_window_start = {analysis_window.start} AND analysis_window_end = {analysis_window.end} diff --git a/src/mozanalysis/metrics.py b/src/mozanalysis/metrics.py index 298e09f1..c804009a 100644 --- a/src/mozanalysis/metrics.py +++ b/src/mozanalysis/metrics.py @@ -8,7 +8,7 @@ from typing_extensions import assert_never -from mozanalysis.types import ExperimentalUnit, IncompatibleExperimentalUnit +from mozanalysis.types import AnalysisUnit, IncompatibleAnalysisUnit if TYPE_CHECKING: from metric_config_parser.data_source import DataSource as ParserDataSource @@ -33,14 +33,14 @@ class AnalysisBasis(Enum): # see: https://mypy.readthedocs.io/en/stable/additional_features.html#id1 def client_id_column_converter(client_id_column: str | None) -> str: if client_id_column is None: - return ExperimentalUnit.CLIENT.value + return AnalysisUnit.CLIENT.value else: return client_id_column def group_id_column_converter(group_id_column: str | None) -> str: if group_id_column is None: - return ExperimentalUnit.PROFILE_GROUP.value + return AnalysisUnit.PROFILE_GROUP.value else: return group_id_column @@ -95,7 +95,7 @@ class DataSource: _from_expr = attr.ib(validator=attr.validators.instance_of(str)) experiments_column_type = attr.ib(default="simple", type=str | None) client_id_column = attr.ib( - default=ExperimentalUnit.CLIENT.value, + default=AnalysisUnit.CLIENT.value, type=str, validator=[attr.validators.instance_of(str), attr.validators.min_len(1)], converter=client_id_column_converter, @@ -109,7 +109,7 @@ class DataSource: default_dataset = attr.ib(default=None, type=str | None) app_name = attr.ib(default=None, type=str | None) group_id_column = attr.ib( - default=ExperimentalUnit.PROFILE_GROUP.value, + 
default=AnalysisUnit.PROFILE_GROUP.value, type=str, validator=[attr.validators.instance_of(str), attr.validators.min_len(1)], converter=group_id_column_converter, @@ -186,7 +186,7 @@ def build_query( experiment_slug: str, from_expr_dataset: str | None = None, analysis_basis: AnalysisBasis = AnalysisBasis.ENROLLMENTS, - experimental_unit: ExperimentalUnit = ExperimentalUnit.CLIENT, + experimental_unit: AnalysisUnit = AnalysisUnit.CLIENT, exposure_signal=None, ) -> str: """Return a nearly-self contained SQL query. @@ -194,9 +194,9 @@ def build_query( This query does not define ``enrollments`` but otherwise could be executed to query all metrics from this data source. """ - if experimental_unit == ExperimentalUnit.CLIENT: + if experimental_unit == AnalysisUnit.CLIENT: ds_id = self.client_id_column - elif experimental_unit == ExperimentalUnit.PROFILE_GROUP: + elif experimental_unit == AnalysisUnit.PROFILE_GROUP: ds_id = self.group_id_column else: assert_never(experimental_unit) @@ -253,7 +253,7 @@ def build_query_targets( analysis_length: int, from_expr_dataset: str | None = None, continuous_enrollment: bool = False, - experimental_unit: ExperimentalUnit = ExperimentalUnit.CLIENT, + experimental_unit: AnalysisUnit = AnalysisUnit.CLIENT, ) -> str: """Return a nearly-self contained SQL query that constructs the metrics query for targeting historical data without @@ -262,8 +262,8 @@ def build_query_targets( This query does not define ``targets`` but otherwise could be executed to query all metrics from this data source. 
""" - if experimental_unit != ExperimentalUnit.CLIENT: - raise IncompatibleExperimentalUnit( + if experimental_unit != AnalysisUnit.CLIENT: + raise IncompatibleAnalysisUnit( "`build_query_targets` currently only supports client_id analysis" ) @@ -381,7 +381,7 @@ def from_mcp_data_source( cls, parser_data_source: ParserDataSource, app_name: str | None = None, - group_id_column: str | None = ExperimentalUnit.PROFILE_GROUP.value, + group_id_column: str | None = AnalysisUnit.PROFILE_GROUP.value, ) -> DataSource: """metric-config-parser DataSource objects do not have an `app_name` and do not, yet, have a group_id_column""" diff --git a/src/mozanalysis/types.py b/src/mozanalysis/types.py index eb80caf2..fd6b3b23 100644 --- a/src/mozanalysis/types.py +++ b/src/mozanalysis/types.py @@ -21,10 +21,10 @@ class Uplift(str, Enum): CompareBranchesOutput = dict[ComparativeOption, EstimatesByBranch] -class ExperimentalUnit(str, Enum): +class AnalysisUnit(str, Enum): CLIENT = "client_id" PROFILE_GROUP = "profile_group_id" -class IncompatibleExperimentalUnit(ValueError): +class IncompatibleAnalysisUnit(ValueError): pass diff --git a/tests/test_experiment.py b/tests/test_experiment.py index 4a749db5..3369ebe1 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -15,13 +15,13 @@ AnalysisWindow, EnrollmentsQueryType, Experiment, - IncompatibleExperimentalUnit, + IncompatibleAnalysisUnit, TimeLimits, ) from mozanalysis.exposure import ExposureSignal from mozanalysis.metrics import AnalysisBasis, DataSource, Metric from mozanalysis.segments import Segment, SegmentDataSource -from mozanalysis.types import ExperimentalUnit +from mozanalysis.types import AnalysisUnit def test_time_limits_validates(): @@ -282,10 +282,10 @@ def test_analysis_window_validates_end(): @pytest.mark.parametrize( - "experimental_unit", [ExperimentalUnit.CLIENT, ExperimentalUnit.PROFILE_GROUP] + "analysis_unit", [AnalysisUnit.CLIENT, AnalysisUnit.PROFILE_GROUP] ) -def 
test_query_not_detectably_malformed(experimental_unit: ExperimentalUnit): - exp = Experiment("slug", "2019-01-01", 8, experimental_unit=experimental_unit) +def test_query_not_detectably_malformed(analysis_unit: AnalysisUnit): + exp = Experiment("slug", "2019-01-01", 8, analysis_unit=analysis_unit) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -303,9 +303,9 @@ def test_query_not_detectably_malformed(experimental_unit: ExperimentalUnit): sql_lint(enrollments_sql) assert "sample_id < None" not in enrollments_sql - if experimental_unit == ExperimentalUnit.CLIENT: + if analysis_unit == AnalysisUnit.CLIENT: assert "client_id" in enrollments_sql - elif experimental_unit == ExperimentalUnit.PROFILE_GROUP: + elif analysis_unit == AnalysisUnit.PROFILE_GROUP: assert "profile_group_id" in enrollments_sql metrics_sql = exp.build_metrics_query( @@ -316,17 +316,17 @@ def test_query_not_detectably_malformed(experimental_unit: ExperimentalUnit): sql_lint(metrics_sql) - if experimental_unit == ExperimentalUnit.CLIENT: + if analysis_unit == AnalysisUnit.CLIENT: assert "client_id" in metrics_sql - elif experimental_unit == ExperimentalUnit.PROFILE_GROUP: + elif analysis_unit == AnalysisUnit.PROFILE_GROUP: assert "profile_group_id" in metrics_sql @pytest.mark.parametrize( - "experimental_unit", [ExperimentalUnit.CLIENT, ExperimentalUnit.PROFILE_GROUP] + "analysis_unit", [AnalysisUnit.CLIENT, AnalysisUnit.PROFILE_GROUP] ) -def test_megaquery_not_detectably_malformed(experimental_unit: ExperimentalUnit): - exp = Experiment("slug", "2019-01-01", 8, experimental_unit=experimental_unit) +def test_megaquery_not_detectably_malformed(analysis_unit: AnalysisUnit): + exp = Experiment("slug", "2019-01-01", 8, analysis_unit=analysis_unit) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -341,9 +341,9 @@ def test_megaquery_not_detectably_malformed(experimental_unit: ExperimentalUnit) sql_lint(enrollments_sql) - if experimental_unit == ExperimentalUnit.CLIENT: + if 
analysis_unit == AnalysisUnit.CLIENT: assert "client_id" in enrollments_sql - elif experimental_unit == ExperimentalUnit.PROFILE_GROUP: + elif analysis_unit == AnalysisUnit.PROFILE_GROUP: assert "profile_group_id" in enrollments_sql metrics_sql = exp.build_metrics_query( @@ -354,19 +354,19 @@ def test_megaquery_not_detectably_malformed(experimental_unit: ExperimentalUnit) sql_lint(metrics_sql) - if experimental_unit == ExperimentalUnit.CLIENT: + if analysis_unit == AnalysisUnit.CLIENT: assert "client_id" in metrics_sql - elif experimental_unit == ExperimentalUnit.PROFILE_GROUP: + elif analysis_unit == AnalysisUnit.PROFILE_GROUP: assert "profile_group_id" in metrics_sql @pytest.mark.parametrize( - "experimental_unit", [ExperimentalUnit.CLIENT, ExperimentalUnit.PROFILE_GROUP] + "analysis_unit", [AnalysisUnit.CLIENT, AnalysisUnit.PROFILE_GROUP] ) def test_segments_megaquery_not_detectably_malformed( - experimental_unit: ExperimentalUnit, + analysis_unit: AnalysisUnit, ): - exp = Experiment("slug", "2019-01-01", 8, experimental_unit=experimental_unit) + exp = Experiment("slug", "2019-01-01", 8, analysis_unit=analysis_unit) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1073,9 +1073,7 @@ def test_metrics_query_explicit_client_id(): def test_enrollments_query_explicit_group_id(): - exp = Experiment( - "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.PROFILE_GROUP - ) + exp = Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.PROFILE_GROUP) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1155,9 +1153,7 @@ def test_enrollments_query_explicit_group_id(): def test_metrics_query_explicit_group_id(): - exp = Experiment( - "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.PROFILE_GROUP - ) + exp = Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.PROFILE_GROUP) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1258,7 +1254,7 @@ def test_glean_group_id_incompatible(): "slug", 
"2019-01-01", 8, - experimental_unit=ExperimentalUnit.PROFILE_GROUP, + analysis_unit=AnalysisUnit.PROFILE_GROUP, app_id="test_app", ) @@ -1269,7 +1265,7 @@ def test_glean_group_id_incompatible(): num_dates_enrollment=8, ) - with pytest.raises(IncompatibleExperimentalUnit): + with pytest.raises(IncompatibleAnalysisUnit): exp.build_enrollments_query( time_limits=tl, enrollments_query_type=EnrollmentsQueryType.GLEAN_EVENT ) @@ -1280,7 +1276,7 @@ def test_glean_group_id_incompatible_exposures(): "slug", "2019-01-01", 8, - experimental_unit=ExperimentalUnit.PROFILE_GROUP, + analysis_unit=AnalysisUnit.PROFILE_GROUP, app_id="test_app", ) @@ -1291,16 +1287,14 @@ def test_glean_group_id_incompatible_exposures(): num_dates_enrollment=8, ) - with pytest.raises(IncompatibleExperimentalUnit): + with pytest.raises(IncompatibleAnalysisUnit): exp._build_exposure_query( time_limits=tl, exposure_query_type=EnrollmentsQueryType.GLEAN_EVENT ) def test_glean_missing_app_id(): - exp = Experiment( - "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.PROFILE_GROUP - ) + exp = Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.PROFILE_GROUP) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1318,9 +1312,7 @@ def test_glean_missing_app_id(): def test_glean_exposures_missing_app_id(): - exp = Experiment( - "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.PROFILE_GROUP - ) + exp = Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.PROFILE_GROUP) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1342,7 +1334,7 @@ def test_cirrus_group_id_incompatible(): "slug", "2019-01-01", 8, - experimental_unit=ExperimentalUnit.PROFILE_GROUP, + analysis_unit=AnalysisUnit.PROFILE_GROUP, app_id="test_app", ) @@ -1353,7 +1345,7 @@ def test_cirrus_group_id_incompatible(): num_dates_enrollment=8, ) - with pytest.raises(IncompatibleExperimentalUnit): + with pytest.raises(IncompatibleAnalysisUnit): exp.build_enrollments_query( time_limits=tl, 
enrollments_query_type=EnrollmentsQueryType.CIRRUS ) @@ -1364,7 +1356,7 @@ def test_cirrus_group_id_incompatible_exposures(): "slug", "2019-01-01", 8, - experimental_unit=ExperimentalUnit.PROFILE_GROUP, + analysis_unit=AnalysisUnit.PROFILE_GROUP, app_id="test_app", ) @@ -1375,16 +1367,14 @@ def test_cirrus_group_id_incompatible_exposures(): num_dates_enrollment=8, ) - with pytest.raises(IncompatibleExperimentalUnit): + with pytest.raises(IncompatibleAnalysisUnit): exp._build_exposure_query( time_limits=tl, exposure_query_type=EnrollmentsQueryType.CIRRUS ) def test_cirrus_missing_app_id(): - exp = Experiment( - "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.PROFILE_GROUP - ) + exp = Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.PROFILE_GROUP) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1402,9 +1392,7 @@ def test_cirrus_missing_app_id(): def test_cirrus_missing_app_id_exposures(): - exp = Experiment( - "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.PROFILE_GROUP - ) + exp = Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.PROFILE_GROUP) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1422,9 +1410,7 @@ def test_cirrus_missing_app_id_exposures(): def test_fenix_group_id_incompatible(): - exp = Experiment( - "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.PROFILE_GROUP - ) + exp = Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.PROFILE_GROUP) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1433,16 +1419,14 @@ def test_fenix_group_id_incompatible(): num_dates_enrollment=8, ) - with pytest.raises(IncompatibleExperimentalUnit): + with pytest.raises(IncompatibleAnalysisUnit): exp.build_enrollments_query( time_limits=tl, enrollments_query_type=EnrollmentsQueryType.FENIX_FALLBACK ) def test_fenix_group_id_incompatible_exposures(): - exp = Experiment( - "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.PROFILE_GROUP - ) + exp = 
Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.PROFILE_GROUP) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", @@ -1451,16 +1435,14 @@ def test_fenix_group_id_incompatible_exposures(): num_dates_enrollment=8, ) - with pytest.raises(IncompatibleExperimentalUnit): + with pytest.raises(IncompatibleAnalysisUnit): exp._build_exposure_query( time_limits=tl, exposure_query_type=EnrollmentsQueryType.FENIX_FALLBACK ) def test_group_id_no_downsampling(): - exp = Experiment( - "slug", "2019-01-01", 8, experimental_unit=ExperimentalUnit.PROFILE_GROUP - ) + exp = Experiment("slug", "2019-01-01", 8, analysis_unit=AnalysisUnit.PROFILE_GROUP) tl = TimeLimits.for_ts( first_enrollment_date="2019-01-01", From 78b0b418cafceda575968f1e94c942f45d293ea7 Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Mon, 5 Aug 2024 08:30:37 -0700 Subject: [PATCH 29/40] make string multiline --- src/mozanalysis/experiment.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/mozanalysis/experiment.py b/src/mozanalysis/experiment.py index ccff8154..eaa408d8 100644 --- a/src/mozanalysis/experiment.py +++ b/src/mozanalysis/experiment.py @@ -1427,12 +1427,15 @@ def _build_analysis_window_subset_query( This method returns SQL to query this table to obtain results in "the standard format" for a single analysis window. 
""" + except_clause = ( + f"{self.analysis_unit.value}, analysis_window_start, analysis_window_end" + ) return f""" - SELECT * EXCEPT ({self.analysis_unit.value}, analysis_window_start, analysis_window_end) + SELECT * EXCEPT ({except_clause}) FROM {self.fully_qualified_table_name} WHERE analysis_window_start = {analysis_window.start} AND analysis_window_end = {analysis_window.end} - """ # noqa: E501 + """ def _build_aggregated_data_query( self, metric_list: list[Metric], aggregate_function: str From 20d81e2a613e292761c6831c7b1cf0b0b501ad09 Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Mon, 5 Aug 2024 08:43:04 -0700 Subject: [PATCH 30/40] change timelimits analysis windows type --- src/mozanalysis/experiment.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/mozanalysis/experiment.py b/src/mozanalysis/experiment.py index eaa408d8..9e3d0520 100644 --- a/src/mozanalysis/experiment.py +++ b/src/mozanalysis/experiment.py @@ -1098,7 +1098,7 @@ class TimeLimits: first_date_data_required = attr.ib(type=str) last_date_data_required = attr.ib(type=str) - analysis_windows = attr.ib() # type: tuple[AnalysisWindow] + analysis_windows = attr.ib() # type: tuple[AnalysisWindow,...] 
@classmethod def for_single_analysis_window( @@ -1212,8 +1212,6 @@ def for_ts( ] ) - analysis_windows = cast(tuple[AnalysisWindow], analysis_windows) - last_date_data_required = add_days( last_enrollment_date, analysis_windows[-1].end ) From cf08094f2bdace7feb8b5fd7d48a7b7a7ed31b3c Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Mon, 5 Aug 2024 09:07:29 -0700 Subject: [PATCH 31/40] TimeSeriesResult analysis window parameter match TimeSeries --- src/mozanalysis/experiment.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mozanalysis/experiment.py b/src/mozanalysis/experiment.py index 9e3d0520..003f856a 100644 --- a/src/mozanalysis/experiment.py +++ b/src/mozanalysis/experiment.py @@ -426,7 +426,7 @@ def get_time_series_data( fully_qualified_table_name=bq_context.fully_qualify_table_name( full_res_table_name ), - analysis_windows=list(time_limits.analysis_windows), + analysis_windows=time_limits.analysis_windows, ) def build_enrollments_query( @@ -1315,7 +1315,7 @@ class TimeSeriesResult: """ fully_qualified_table_name = attr.ib(type=str) - analysis_windows = attr.ib(type=list) + analysis_windows = attr.ib(type=tuple[AnalysisWindow, ...]) analysis_unit = attr.ib(type=AnalysisUnit, default=AnalysisUnit.CLIENT) def get(self, bq_context: BigQueryContext, analysis_window) -> DataFrame: From c80119c93f604f21669829eccc34565d1bb5ef7f Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Thu, 8 Aug 2024 07:27:10 -0700 Subject: [PATCH 32/40] Update metrics.py --- src/mozanalysis/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mozanalysis/metrics.py b/src/mozanalysis/metrics.py index c804009a..3a617c37 100644 --- a/src/mozanalysis/metrics.py +++ b/src/mozanalysis/metrics.py @@ -104,7 +104,7 @@ class DataSource: default="submission_date", type=str, validator=[attr.validators.instance_of(str), attr.validators.min_len(1)], - converter=group_id_column_converter, + converter=submission_date_column_converter, ) 
default_dataset = attr.ib(default=None, type=str | None) app_name = attr.ib(default=None, type=str | None) From 4ddaee7223994ae913cbe3cdf23bd9dc41a656f1 Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Tue, 20 Aug 2024 10:02:02 -0700 Subject: [PATCH 33/40] update metric-config-parser --- requirements-dev.in | 2 +- requirements-dev.txt | 6 +++--- requirements.in | 2 +- requirements.txt | 6 +++--- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/requirements-dev.in b/requirements-dev.in index 07231a72..7e84a38a 100644 --- a/requirements-dev.in +++ b/requirements-dev.in @@ -226,7 +226,7 @@ more-itertools==10.4.0 # -c requirements.txt # jaraco-classes # jaraco-functools -mozilla-metric-config-parser==2024.7.1 +mozilla-metric-config-parser==2024.8.1 # via # -c requirements.txt # mozanalysis diff --git a/requirements-dev.txt b/requirements-dev.txt index 257f49a0..aad64973 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -953,9 +953,9 @@ more-itertools==10.4.0 \ # -r requirements-dev.in # jaraco-classes # jaraco-functools -mozilla-metric-config-parser==2024.7.1 \ - --hash=sha256:07ba32624cb9a38662bdff259dd4ec385bdc8ca4500b461d8a9025a736a25c6b \ - --hash=sha256:ad5169b4cec7b0fa013b3136584b7ee3d9b8a6a807d2eb71e5d9f269517f9d22 +mozilla-metric-config-parser==2024.8.1 \ + --hash=sha256:49fb6e67367809e3750108246e50dc1e68778564c2c158ba26bd4835b0b9c89c \ + --hash=sha256:cadc4ba9fb8399be0b857abb4bdd1f12ff1943159463fec0d128b71b2e72b554 # via -r requirements-dev.in mozilla-nimbus-schemas==2023.10.3 \ --hash=sha256:8771344a63b0d197dbebd8d7955ce8034d0f5063a13899f4da7d8a99803a76da \ diff --git a/requirements.in b/requirements.in index 0bfe022f..7eb0b03f 100644 --- a/requirements.in +++ b/requirements.in @@ -124,7 +124,7 @@ more-itertools==10.4.0 # via # jaraco-classes # jaraco-functools -mozilla-metric-config-parser==2024.7.1 +mozilla-metric-config-parser==2024.8.1 # via mozanalysis mozilla-nimbus-schemas==2023.10.3 # via mozilla-metric-config-parser diff 
--git a/requirements.txt b/requirements.txt index ca4c694b..ba439d6e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -924,9 +924,9 @@ more-itertools==10.4.0 \ # -r requirements.in # jaraco-classes # jaraco-functools -mozilla-metric-config-parser==2024.7.1 \ - --hash=sha256:07ba32624cb9a38662bdff259dd4ec385bdc8ca4500b461d8a9025a736a25c6b \ - --hash=sha256:ad5169b4cec7b0fa013b3136584b7ee3d9b8a6a807d2eb71e5d9f269517f9d22 +mozilla-metric-config-parser==2024.8.1 \ + --hash=sha256:49fb6e67367809e3750108246e50dc1e68778564c2c158ba26bd4835b0b9c89c \ + --hash=sha256:cadc4ba9fb8399be0b857abb4bdd1f12ff1943159463fec0d128b71b2e72b554 # via -r requirements.in mozilla-nimbus-schemas==2023.10.3 \ --hash=sha256:8771344a63b0d197dbebd8d7955ce8034d0f5063a13899f4da7d8a99803a76da \ From 4b2fc044aa3716e91ec659e7f6170cec4b7360eb Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Tue, 20 Aug 2024 10:09:21 -0700 Subject: [PATCH 34/40] import AnalysisUnit from metric-config-parser --- src/mozanalysis/experiment.py | 3 ++- src/mozanalysis/metrics.py | 3 ++- src/mozanalysis/types.py | 5 ----- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/mozanalysis/experiment.py b/src/mozanalysis/experiment.py index 003f856a..39b540be 100644 --- a/src/mozanalysis/experiment.py +++ b/src/mozanalysis/experiment.py @@ -8,6 +8,7 @@ from typing import TYPE_CHECKING, cast import attr +from metric_config_parser import AnalysisUnit from typing_extensions import assert_never from mozanalysis import APPS @@ -15,7 +16,7 @@ from mozanalysis.config import ConfigLoader from mozanalysis.metrics import AnalysisBasis, DataSource, Metric from mozanalysis.segments import Segment, SegmentDataSource -from mozanalysis.types import AnalysisUnit, IncompatibleAnalysisUnit +from mozanalysis.types import IncompatibleAnalysisUnit from mozanalysis.utils import add_days, date_sub, hash_ish if TYPE_CHECKING: diff --git a/src/mozanalysis/metrics.py b/src/mozanalysis/metrics.py index 3a617c37..85bc0791 100644 --- 
a/src/mozanalysis/metrics.py +++ b/src/mozanalysis/metrics.py @@ -6,9 +6,10 @@ from enum import Enum from typing import TYPE_CHECKING +from metric_config_parser import AnalysisUnit from typing_extensions import assert_never -from mozanalysis.types import AnalysisUnit, IncompatibleAnalysisUnit +from mozanalysis.types import IncompatibleAnalysisUnit if TYPE_CHECKING: from metric_config_parser.data_source import DataSource as ParserDataSource diff --git a/src/mozanalysis/types.py b/src/mozanalysis/types.py index fd6b3b23..96ffe934 100644 --- a/src/mozanalysis/types.py +++ b/src/mozanalysis/types.py @@ -21,10 +21,5 @@ class Uplift(str, Enum): CompareBranchesOutput = dict[ComparativeOption, EstimatesByBranch] -class AnalysisUnit(str, Enum): - CLIENT = "client_id" - PROFILE_GROUP = "profile_group_id" - - class IncompatibleAnalysisUnit(ValueError): pass From cf0f5149dba8e769402c8bc43222cd03ea4b1138 Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Tue, 20 Aug 2024 10:15:13 -0700 Subject: [PATCH 35/40] update test --- tests/test_experiment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_experiment.py b/tests/test_experiment.py index 3369ebe1..c1f37ba9 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -21,7 +21,7 @@ from mozanalysis.exposure import ExposureSignal from mozanalysis.metrics import AnalysisBasis, DataSource, Metric from mozanalysis.segments import Segment, SegmentDataSource -from mozanalysis.types import AnalysisUnit +from metric_config_parser import AnalysisUnit def test_time_limits_validates(): From bb9af586a28801195f3368ae05719cedfe7a7dfb Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Tue, 20 Aug 2024 10:24:44 -0700 Subject: [PATCH 36/40] circle's ruff out of sync with dependencies --- .circleci/config.yml | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 70da2131..d66486f0 100644 --- a/.circleci/config.yml +++ 
b/.circleci/config.yml @@ -26,15 +26,12 @@ test_settings: &test_settings chmod +x codecov ./codecov -F "$(basename $PWD | sed s/[^a-z]/_/g)" - - #################### # Jobs: see https://circleci.com/docs/2.0/jobs-steps/ #################### version: 2 jobs: - py310: <<: *test_settings docker: @@ -44,12 +41,12 @@ jobs: docker: - image: cimg/python:3.10 steps: - - checkout - - run: - name: Run linting - command: | - pip install ruff - ruff check src/ tests/ + - checkout + - run: + name: Run linting + command: | + pip install -r requirements.txt + ruff check src/ tests/ # Runs when the repository is tagged for release; see the workflows section # below for trigger logic. @@ -133,7 +130,7 @@ workflows: - deploy: filters: tags: - only: /[0-9]{4}.[0-9]{1,2}.[0-9]+/ # Calver: YYYY.M.MINOR + only: /[0-9]{4}.[0-9]{1,2}.[0-9]+/ # Calver: YYYY.M.MINOR branches: # Ignore all branches; this workflow should only run for tags. ignore: /.*/ From 10d8a1d4723dae581ddb14237d4caf8fcda280b1 Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Tue, 20 Aug 2024 10:26:24 -0700 Subject: [PATCH 37/40] ruff fix --- tests/test_experiment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_experiment.py b/tests/test_experiment.py index c1f37ba9..0c0e153e 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -10,6 +10,7 @@ klar_android_metrics, klar_ios_metrics, ) +from metric_config_parser import AnalysisUnit from mozanalysis.config import ApplicationNotFound, ConfigLoader from mozanalysis.experiment import ( AnalysisWindow, @@ -21,7 +22,6 @@ from mozanalysis.exposure import ExposureSignal from mozanalysis.metrics import AnalysisBasis, DataSource, Metric from mozanalysis.segments import Segment, SegmentDataSource -from metric_config_parser import AnalysisUnit def test_time_limits_validates(): From c46e72525359f444b1ce625baa265709fc618d6e Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Tue, 20 Aug 2024 10:50:16 -0700 Subject: [PATCH 38/40] missed 
.value --- src/mozanalysis/experiment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mozanalysis/experiment.py b/src/mozanalysis/experiment.py index 39b540be..6813307e 100644 --- a/src/mozanalysis/experiment.py +++ b/src/mozanalysis/experiment.py @@ -505,7 +505,7 @@ def build_enrollments_query( SELECT se.*, - e.* EXCEPT ({self.analysis_unit}, branch) + e.* EXCEPT ({self.analysis_unit.value}, branch) FROM segmented_enrollments se LEFT JOIN exposures e USING ({self.analysis_unit.value}, branch) From 1f41806792032c6580c919d393e06cc6149f9b00 Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Thu, 22 Aug 2024 08:14:28 -0700 Subject: [PATCH 39/40] Update metrics.py --- src/mozanalysis/metrics.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/mozanalysis/metrics.py b/src/mozanalysis/metrics.py index 85bc0791..bfaf551c 100644 --- a/src/mozanalysis/metrics.py +++ b/src/mozanalysis/metrics.py @@ -187,7 +187,7 @@ def build_query( experiment_slug: str, from_expr_dataset: str | None = None, analysis_basis: AnalysisBasis = AnalysisBasis.ENROLLMENTS, - experimental_unit: AnalysisUnit = AnalysisUnit.CLIENT, + analysis_unit: AnalysisUnit = AnalysisUnit.CLIENT, exposure_signal=None, ) -> str: """Return a nearly-self contained SQL query. @@ -195,12 +195,12 @@ def build_query( This query does not define ``enrollments`` but otherwise could be executed to query all metrics from this data source. 
""" - if experimental_unit == AnalysisUnit.CLIENT: + if analysis_unit == AnalysisUnit.CLIENT: ds_id = self.client_id_column - elif experimental_unit == AnalysisUnit.PROFILE_GROUP: + elif analysis_unit == AnalysisUnit.PROFILE_GROUP: ds_id = self.group_id_column else: - assert_never(experimental_unit) + assert_never(analysis_unit) return """SELECT e.{id_column}, @@ -243,7 +243,7 @@ def build_query( submission_date=self.submission_date_column, experiment_slug=experiment_slug, ), - id_column=experimental_unit.value, + id_column=analysis_unit.value, ) def build_query_targets( @@ -254,7 +254,7 @@ def build_query_targets( analysis_length: int, from_expr_dataset: str | None = None, continuous_enrollment: bool = False, - experimental_unit: AnalysisUnit = AnalysisUnit.CLIENT, + analysis_unit: AnalysisUnit = AnalysisUnit.CLIENT, ) -> str: """Return a nearly-self contained SQL query that constructs the metrics query for targeting historical data without @@ -263,7 +263,7 @@ def build_query_targets( This query does not define ``targets`` but otherwise could be executed to query all metrics from this data source. 
""" - if experimental_unit != AnalysisUnit.CLIENT: + if analysis_unit != AnalysisUnit.CLIENT: raise IncompatibleAnalysisUnit( "`build_query_targets` currently only supports client_id analysis" ) From b8ec9954823f6081e4c11ffaafc6dbb75e915b3a Mon Sep 17 00:00:00 2001 From: Daniel Berry Date: Thu, 22 Aug 2024 08:23:08 -0700 Subject: [PATCH 40/40] removed analysis_unit converter --- src/mozanalysis/experiment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mozanalysis/experiment.py b/src/mozanalysis/experiment.py index 6813307e..34b3b001 100644 --- a/src/mozanalysis/experiment.py +++ b/src/mozanalysis/experiment.py @@ -145,7 +145,7 @@ class Experiment: analysis_unit = attr.ib( type=AnalysisUnit, default=AnalysisUnit.CLIENT, - converter=lambda s: AnalysisUnit(s), # allows callers to pass a string + validator=attr.validators.instance_of(AnalysisUnit), ) def get_app_name(self):