From f93613f2c737839e9426d7a88a1cfe81c3d5c22d Mon Sep 17 00:00:00 2001 From: Steffen Schneider Date: Tue, 25 Jul 2023 13:26:50 +0200 Subject: [PATCH 01/14] Started implementing improved goodness of fit implementation --- cebra/integrations/sklearn/metrics.py | 77 +++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/cebra/integrations/sklearn/metrics.py b/cebra/integrations/sklearn/metrics.py index ccecaa11..9ef2efb9 100644 --- a/cebra/integrations/sklearn/metrics.py +++ b/cebra/integrations/sklearn/metrics.py @@ -108,6 +108,83 @@ def infonce_loss( return avg_loss +def goodness_of_fit_score( + cebra_model: cebra_sklearn_cebra.CEBRA, + X: Union[npt.NDArray, torch.Tensor], + *y, + session_id: Optional[int] = None, + num_batches: int = 500, + correct_by_batchsize: bool = False, +) -> float: + """Compute the InfoNCE loss on a *single session* dataset on the model. + + Args: + cebra_model: The model to use to compute the InfoNCE loss on the samples. + X: A 2D data matrix, corresponding to a *single session* recording. + y: An arbitrary amount of continuous indices passed as 2D matrices, and up to one + discrete index passed as a 1D array. Each index has to match the length of ``X``. + session_id: The session ID, an :py:class:`int` between 0 and :py:attr:`cebra.CEBRA.num_sessions` + for multisession, set to ``None`` for single session. + num_batches: The number of iterations to consider to evaluate the model on the new data. + Higher values will give a more accurate estimate. Set it to at least 500 iterations. + """ + loss = infonce_loss(cebra_model=cebra_model, + X=X, + *y, + session_id=session_id, + num_batches=500, + correct_by_batchsize=False) + return infonce_to_goodness_of_fit(loss, cebra_model) + + +def goodness_of_fit_score(model): + infonce = np.array(model.state_dict_["log"]["total"]) + return infonce_to_goodness_of_fit(infonce, model) + + +def infonce_to_goodness_of_fit(infonce: Union[float, Iterable[float]], + model: cebra.CEBRA) -> np.ndarray: + """Given a trained CEBRA model, return goodness of fit metric + + The goodness of fit ranges from 0 (lowest meaningful value) + to a positive number with the unit "bits", the higher the + better. + + Values lower than 0 bits are possible, but these only occur + due to numerical effects. A perfectly collapsed embedding + (e.g., because the data cannot be fit with the provided + auxiliary variables) will have a goodness of fit of 0. + + The conversion between the generalized InfoNCE metric that + CEBRA is trained with and the goodness of fit computed with this + function is + + .. math:: + + S = \log N - \text{InfoNCE} + + Args: + model: The trained CEBRA model + + Returns: + Numpy array containing the goodness of fit + values, measured in bits + + Raises: + ``RuntimeError``, if provided model is not + fit to data. 
+ """ + if not hasattr(model, "state_dict_"): + raise RuntimeError("Fit the CEBRA model first.") + + nats_to_bits = np.log2(np.e) + num_sessions = model.num_sessions_ + if num_sessions is None: + num_sessions = 1 + chance_level = np.log(model.batch_size * (model.num_sessions_ or 1)) + return (chance_level - infonce) * nats_to_bits + + def _consistency_scores( embeddings: List[Union[npt.NDArray, torch.Tensor]], datasets: List[Union[int, str]], From d87153521c3afda13f1541d5e0fcf52649e8e842 Mon Sep 17 00:00:00 2001 From: Steffen Schneider Date: Sun, 27 Oct 2024 18:50:33 +0100 Subject: [PATCH 02/14] add tests and improve implementation --- cebra/integrations/sklearn/metrics.py | 70 ++++++++++++++++++++------- tests/test_sklearn_metrics.py | 64 ++++++++++++++++++++++++ 2 files changed, 116 insertions(+), 18 deletions(-) diff --git a/cebra/integrations/sklearn/metrics.py b/cebra/integrations/sklearn/metrics.py index 9ef2efb9..41dc67ff 100644 --- a/cebra/integrations/sklearn/metrics.py +++ b/cebra/integrations/sklearn/metrics.py @@ -108,16 +108,15 @@ def infonce_loss( return avg_loss -def goodness_of_fit_score( - cebra_model: cebra_sklearn_cebra.CEBRA, - X: Union[npt.NDArray, torch.Tensor], - *y, - session_id: Optional[int] = None, - num_batches: int = 500, - correct_by_batchsize: bool = False, -) -> float: +def goodness_of_fit_score(cebra_model: cebra_sklearn_cebra.CEBRA, + X: Union[npt.NDArray, torch.Tensor], + *y, + session_id: Optional[int] = None, + num_batches: int = 500) -> float: """Compute the InfoNCE loss on a *single session* dataset on the model. + This function uses the :func:`infonce_loss` function to compute the InfoNCE loss. + Args: cebra_model: The model to use to compute the InfoNCE loss on the samples. X: A 2D data matrix, corresponding to a *single session* recording. @@ -127,23 +126,60 @@ def goodness_of_fit_score( for multisession, set to ``None`` for single session. num_batches: The number of iterations to consider to evaluate the model on the new data. Higher values will give a more accurate estimate. Set it to at least 500 iterations. + + Returns: + The average GoF score estimated over ``num_batches`` batches from the data distribution. + + Related: + :func:`infonce_to_goodness_of_fit` + + Example: + + >>> import cebra + >>> import numpy as np + >>> neural_data = np.random.uniform(0, 1, (1000, 20)) + >>> cebra_model = cebra.CEBRA(max_iterations=10) + >>> cebra_model.fit(neural_data) + CEBRA(max_iterations=10) + >>> gof = cebra.goodness_of_fit_score(cebra_model, neural_data) """ - loss = infonce_loss(cebra_model=cebra_model, - X=X, + loss = infonce_loss(cebra_model, + X, *y, session_id=session_id, - num_batches=500, + num_batches=num_batches, correct_by_batchsize=False) return infonce_to_goodness_of_fit(loss, cebra_model) -def goodness_of_fit_score(model): +def goodness_of_fit_history(model): + """Return the history of the goodness of fit score. + + Args: + model: A trained CEBRA model. + + Returns: + A numpy array containing the goodness of fit values, measured in bits. 
+ + Related: + :func:`infonce_to_goodness_of_fit` + + Example: + + >>> import cebra + >>> import numpy as np + >>> neural_data = np.random.uniform(0, 1, (1000, 20)) + >>> cebra_model = cebra.CEBRA(max_iterations=10) + >>> cebra_model.fit(neural_data) + CEBRA(max_iterations=10) + >>> gof_history = cebra.goodness_of_fit_history(cebra_model) + """ infonce = np.array(model.state_dict_["log"]["total"]) return infonce_to_goodness_of_fit(infonce, model) def infonce_to_goodness_of_fit(infonce: Union[float, Iterable[float]], - model: cebra.CEBRA) -> np.ndarray: + model: cebra_sklearn_cebra.CEBRA) -> np.ndarray: """Given a trained CEBRA model, return goodness of fit metric The goodness of fit ranges from 0 (lowest meaningful value) @@ -161,18 +197,16 @@ def infonce_to_goodness_of_fit(infonce: Union[float, Iterable[float]], .. math:: - S = \log N - \text{InfoNCE} + S = \\log N - \\text{InfoNCE} Args: model: The trained CEBRA model Returns: - Numpy array containing the goodness of fit - values, measured in bits + Numpy array containing the goodness of fit values, measured in bits Raises: - ``RuntimeError``, if provided model is not - fit to data. + ``RuntimeError``, if provided model is not fit to data. """ if not hasattr(model, "state_dict_"): raise RuntimeError("Fit the CEBRA model first.") diff --git a/tests/test_sklearn_metrics.py b/tests/test_sklearn_metrics.py index 58e12010..eb4d8420 100644 --- a/tests/test_sklearn_metrics.py +++ b/tests/test_sklearn_metrics.py @@ -383,3 +383,67 @@ def test_sklearn_runs_consistency(): with pytest.raises(ValueError, match="Invalid.*embeddings"): _, _, _ = cebra_sklearn_metrics.consistency_score( invalid_embeddings_runs, between="runs") + + +@pytest.mark.parametrize("seed", [42, 24, 10]) +def test_goodness_of_fit_score(seed): + """ + Ensure that the GoF score is close to 0 for a model fit on random data. + """ + cebra_model = cebra_sklearn_cebra.CEBRA( + model_architecture="offset1-model", + max_iterations=5, + batch_size=512, + ) + X = torch.tensor(np.random.uniform(0, 1, (5000, 50))) + y = torch.tensor(np.random.uniform(0, 1, (5000, 5))) + cebra_model.fit(X, y) + score = cebra_sklearn_metrics.goodness_of_fit_score(cebra_model, + X, + y, + session_id=0, + num_batches=500) + assert isinstance(score, float) + assert np.isclose(score, 0, atol=0.01) + + +@pytest.mark.parametrize("seed", [42, 24, 10]) +def test_goodness_of_fit_history(seed): + """ + Ensure that the GoF score is higher for a model fit on data with underlying + structure than for a model fit on random data. + """ + + # Generate data + generator = torch.Generator().manual_seed(seed) + X = torch.rand(1000, 50, dtype=torch.float32, generator=generator) + y_random = torch.rand(len(X), 5, dtype=torch.float32, generator=generator) + linear_map = torch.randn(50, 5, dtype=torch.float32, generator=generator) + y_linear = X @ linear_map + + def _fit_and_get_history(X, y): + cebra_model = cebra_sklearn_cebra.CEBRA( + model_architecture="offset1-model", + max_iterations=150, + batch_size=512, + device="cpu") + cebra_model.fit(X, y) + history = cebra_sklearn_metrics.goodness_of_fit_history(cebra_model) + # NOTE(stes): Ignore the first 5 iterations, they can have nonsensical values + # due to numerical issues. 
+ return history[5:] + + history_random = _fit_and_get_history(X, y_random) + history_linear = _fit_and_get_history(X, y_linear) + + assert isinstance(history_random, np.ndarray) + assert history_random.shape[0] > 0 + # NOTE(stes): Ignore the first 5 iterations, they can have nonsensical values + # due to numerical issues. + history_random_non_negative = history_random[history_random >= 0] + np.testing.assert_allclose(history_random_non_negative, 0, atol=0.05) + + assert isinstance(history_linear, np.ndarray) + assert history_linear.shape[0] > 0 + + assert np.all(history_linear[-20:] > history_random[-20:]) From 17d31e5ace235fd3070103d168ed3fd047058194 Mon Sep 17 00:00:00 2001 From: Steffen Schneider Date: Sun, 27 Oct 2024 19:00:40 +0100 Subject: [PATCH 03/14] Fix examples --- cebra/integrations/sklearn/metrics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cebra/integrations/sklearn/metrics.py b/cebra/integrations/sklearn/metrics.py index 41dc67ff..4b3c08ea 100644 --- a/cebra/integrations/sklearn/metrics.py +++ b/cebra/integrations/sklearn/metrics.py @@ -141,7 +141,7 @@ def goodness_of_fit_score(cebra_model: cebra_sklearn_cebra.CEBRA, >>> cebra_model = cebra.CEBRA(max_iterations=10) >>> cebra_model.fit(neural_data) CEBRA(max_iterations=10) - >>> gof = cebra.goodness_of_fit_score(cebra_model, neural_data) + >>> gof = cebra.sklearn.metrics.goodness_of_fit_score(cebra_model, neural_data) """ loss = infonce_loss(cebra_model, X, @@ -172,7 +172,7 @@ def goodness_of_fit_history(model): >>> cebra_model = cebra.CEBRA(max_iterations=10) >>> cebra_model.fit(neural_data) CEBRA(max_iterations=10) - >>> gof_history = cebra.goodness_of_fit_history(cebra_model) + >>> gof_history = cebra.sklearn.metrics.goodness_of_fit_history(cebra_model) """ infonce = np.array(model.state_dict_["log"]["total"]) return infonce_to_goodness_of_fit(infonce, model) @@ -215,7 +215,7 @@ def infonce_to_goodness_of_fit(infonce: Union[float, Iterable[float]], num_sessions = model.num_sessions_ if num_sessions is None: num_sessions = 1 - chance_level = np.log(model.batch_size * (model.num_sessions_ or 1)) + chance_level = np.log(model.batch_size * num_sessions) return (chance_level - infonce) * nats_to_bits From 4f155d83b9e86e157ee3ebbd6075cbecd148a19f Mon Sep 17 00:00:00 2001 From: Steffen Schneider Date: Sun, 27 Oct 2024 19:02:10 +0100 Subject: [PATCH 04/14] Fix docstring error --- cebra/integrations/sklearn/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cebra/integrations/sklearn/metrics.py b/cebra/integrations/sklearn/metrics.py index 4b3c08ea..29f7715b 100644 --- a/cebra/integrations/sklearn/metrics.py +++ b/cebra/integrations/sklearn/metrics.py @@ -206,7 +206,7 @@ def infonce_to_goodness_of_fit(infonce: Union[float, Iterable[float]], Numpy array containing the goodness of fit values, measured in bits Raises: - ``RuntimeError``, if provided model is not fit to data. + RuntimeError: If the provided model is not fit to data. 
""" if not hasattr(model, "state_dict_"): raise RuntimeError("Fit the CEBRA model first.") From afe25e68b11d9f7eafa667c73e9057bb41f21086 Mon Sep 17 00:00:00 2001 From: Steffen Schneider Date: Mon, 16 Dec 2024 12:12:51 -0500 Subject: [PATCH 05/14] Handle batch size = None for goodness of fit computation --- cebra/integrations/sklearn/metrics.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cebra/integrations/sklearn/metrics.py b/cebra/integrations/sklearn/metrics.py index 29f7715b..9a1dd5a6 100644 --- a/cebra/integrations/sklearn/metrics.py +++ b/cebra/integrations/sklearn/metrics.py @@ -138,7 +138,7 @@ def goodness_of_fit_score(cebra_model: cebra_sklearn_cebra.CEBRA, >>> import cebra >>> import numpy as np >>> neural_data = np.random.uniform(0, 1, (1000, 20)) - >>> cebra_model = cebra.CEBRA(max_iterations=10) + >>> cebra_model = cebra.CEBRA(max_iterations=10, batch_size = 512) >>> cebra_model.fit(neural_data) CEBRA(max_iterations=10) >>> gof = cebra.sklearn.metrics.goodness_of_fit_score(cebra_model, neural_data) @@ -169,7 +169,7 @@ def goodness_of_fit_history(model): >>> import cebra >>> import numpy as np >>> neural_data = np.random.uniform(0, 1, (1000, 20)) - >>> cebra_model = cebra.CEBRA(max_iterations=10) + >>> cebra_model = cebra.CEBRA(max_iterations=10, batch_size = 512) >>> cebra_model.fit(neural_data) CEBRA(max_iterations=10) >>> gof_history = cebra.sklearn.metrics.goodness_of_fit_history(cebra_model) @@ -210,6 +210,11 @@ def infonce_to_goodness_of_fit(infonce: Union[float, Iterable[float]], """ if not hasattr(model, "state_dict_"): raise RuntimeError("Fit the CEBRA model first.") + if model.batch_size is None: + raise ValueError( + "Computing the goodness of fit is not yet supported for " + "models trained on the full dataset (batchsize = None). " + ) nats_to_bits = np.log2(np.e) num_sessions = model.num_sessions_ From caba8c50d1f3a099dc4d6a09ff72f8f9968df2a2 Mon Sep 17 00:00:00 2001 From: Steffen Schneider Date: Mon, 16 Dec 2024 09:28:52 -0800 Subject: [PATCH 06/14] adapt GoF implementation --- cebra/integrations/sklearn/metrics.py | 42 ++++++++++++++++++--------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/cebra/integrations/sklearn/metrics.py b/cebra/integrations/sklearn/metrics.py index 9a1dd5a6..46e3b8ca 100644 --- a/cebra/integrations/sklearn/metrics.py +++ b/cebra/integrations/sklearn/metrics.py @@ -178,9 +178,11 @@ def goodness_of_fit_history(model): return infonce_to_goodness_of_fit(infonce, model) -def infonce_to_goodness_of_fit(infonce: Union[float, Iterable[float]], - model: cebra_sklearn_cebra.CEBRA) -> np.ndarray: - """Given a trained CEBRA model, return goodness of fit metric +def infonce_to_goodness_of_fit(infonce: Union[float, np.ndarray], + model: Optional[cebra_sklearn_cebra.CEBRA] = None, + batch_size: Optional[int] = None, + num_sessions: Optional[int] = None) -> Union[float, np.ndarray]: + """Given a trained CEBRA model, return goodness of fit metric. The goodness of fit ranges from 0 (lowest meaningful value) to a positive number with the unit "bits", the higher the @@ -199,27 +201,41 @@ def infonce_to_goodness_of_fit(infonce: Union[float, Iterable[float]], S = \\log N - \\text{InfoNCE} + To use this function, either provide a trained CEBRA model or the + batch size and number of sessions. + Args: + infonce: The InfoNCE loss, either a single value or an iterable of values. model: The trained CEBRA model + batch_size: The batch size used to train the model. 
+ num_sessions: The number of sessions used to train the model. Returns: Numpy array containing the goodness of fit values, measured in bits Raises: RuntimeError: If the provided model is not fit to data. + ValueError: If both ``model`` and ``(batch_size, num_sessions)`` are provided. """ - if not hasattr(model, "state_dict_"): - raise RuntimeError("Fit the CEBRA model first.") - if model.batch_size is None: - raise ValueError( - "Computing the goodness of fit is not yet supported for " - "models trained on the full dataset (batchsize = None). " - ) + if model is not None: + if batch_size is not None or num_sessions is not None: + raise ValueError("batch_size and num_sessions should not be provided if model is provided.") + if not hasattr(model, "state_dict_"): + raise RuntimeError("Fit the CEBRA model first.") + if model.batch_size is None: + raise ValueError( + "Computing the goodness of fit is not yet supported for " + "models trained on the full dataset (batchsize = None). " + ) + batch_size = model.batch_size + num_sessions = model.num_sessions_ + if num_sessions is None: + num_sessions = 1 + else: + if batch_size is None or num_sessions is None: + raise ValueError("batch_size should be provided if model is not provided.") nats_to_bits = np.log2(np.e) - num_sessions = model.num_sessions_ - if num_sessions is None: - num_sessions = 1 chance_level = np.log(model.batch_size * num_sessions) return (chance_level - infonce) * nats_to_bits From 3d05a1862c8570ed8999d0a20da1964eedc039f2 Mon Sep 17 00:00:00 2001 From: Steffen Schneider Date: Tue, 17 Dec 2024 04:51:10 -0500 Subject: [PATCH 07/14] Fix docstring tests --- cebra/integrations/sklearn/metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cebra/integrations/sklearn/metrics.py b/cebra/integrations/sklearn/metrics.py index 46e3b8ca..bf1be3df 100644 --- a/cebra/integrations/sklearn/metrics.py +++ b/cebra/integrations/sklearn/metrics.py @@ -140,7 +140,7 @@ def goodness_of_fit_score(cebra_model: cebra_sklearn_cebra.CEBRA, >>> neural_data = np.random.uniform(0, 1, (1000, 20)) >>> cebra_model = cebra.CEBRA(max_iterations=10, batch_size = 512) >>> cebra_model.fit(neural_data) - CEBRA(max_iterations=10) + CEBRA(batch_size=512, max_iterations=10) >>> gof = cebra.sklearn.metrics.goodness_of_fit_score(cebra_model, neural_data) """ loss = infonce_loss(cebra_model, @@ -171,7 +171,7 @@ def goodness_of_fit_history(model): >>> neural_data = np.random.uniform(0, 1, (1000, 20)) >>> cebra_model = cebra.CEBRA(max_iterations=10, batch_size = 512) >>> cebra_model.fit(neural_data) - CEBRA(max_iterations=10) + CEBRA(batch_size=512, max_iterations=10) >>> gof_history = cebra.sklearn.metrics.goodness_of_fit_history(cebra_model) """ infonce = np.array(model.state_dict_["log"]["total"]) From e577b5ae0ed72b767faf718f21c4e0ac73050391 Mon Sep 17 00:00:00 2001 From: Steffen Schneider Date: Tue, 21 Jan 2025 22:59:32 +0100 Subject: [PATCH 08/14] Update docstring for goodness_of_fit_score MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Célia Benquet <32598028+CeliaBenquet@users.noreply.github.com> --- cebra/integrations/sklearn/metrics.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cebra/integrations/sklearn/metrics.py b/cebra/integrations/sklearn/metrics.py index bf1be3df..93ad983f 100644 --- a/cebra/integrations/sklearn/metrics.py +++ b/cebra/integrations/sklearn/metrics.py @@ -113,9 +113,11 @@ def goodness_of_fit_score(cebra_model: 
cebra_sklearn_cebra.CEBRA, *y, session_id: Optional[int] = None, num_batches: int = 500) -> float: - """Compute the InfoNCE loss on a *single session* dataset on the model. + """Compute the goodness of fit score on a *single session* dataset on the model. - This function uses the :func:`infonce_loss` function to compute the InfoNCE loss. + This function uses the :func:`infonce_loss` function to compute the InfoNCE loss + for a given `cebra_model` and the :func:`infonce_to_goodness_of_fit` function + to derive the goodness of fit from the InfoNCE loss. Args: cebra_model: The model to use to compute the InfoNCE loss on the samples. From cab2b2b4d85559d0def9b4e93eb62abd4821ccc3 Mon Sep 17 00:00:00 2001 From: Steffen Schneider Date: Tue, 21 Jan 2025 23:00:02 +0100 Subject: [PATCH 09/14] add annotations to goodness_of_fit_history MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Célia Benquet <32598028+CeliaBenquet@users.noreply.github.com> --- cebra/integrations/sklearn/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cebra/integrations/sklearn/metrics.py b/cebra/integrations/sklearn/metrics.py index 93ad983f..4f30ba01 100644 --- a/cebra/integrations/sklearn/metrics.py +++ b/cebra/integrations/sklearn/metrics.py @@ -154,7 +154,7 @@ def goodness_of_fit_score(cebra_model: cebra_sklearn_cebra.CEBRA, return infonce_to_goodness_of_fit(loss, cebra_model) -def goodness_of_fit_history(model): +def goodness_of_fit_history(model: cebra_sklearn_cebra.CEBRA) -> np.ndarray: """Return the history of the goodness of fit score. Args: From 1d4276918488602833a5fb464f478c6940603433 Mon Sep 17 00:00:00 2001 From: Steffen Schneider Date: Tue, 21 Jan 2025 23:00:16 +0100 Subject: [PATCH 10/14] fix typo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Célia Benquet <32598028+CeliaBenquet@users.noreply.github.com> --- cebra/integrations/sklearn/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cebra/integrations/sklearn/metrics.py b/cebra/integrations/sklearn/metrics.py index 4f30ba01..d5f9e2c2 100644 --- a/cebra/integrations/sklearn/metrics.py +++ b/cebra/integrations/sklearn/metrics.py @@ -208,7 +208,7 @@ def infonce_to_goodness_of_fit(infonce: Union[float, np.ndarray], Args: infonce: The InfoNCE loss, either a single value or an iterable of values. - model: The trained CEBRA model + model: The trained CEBRA model. batch_size: The batch size used to train the model. num_sessions: The number of sessions used to train the model. 
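As a worked example of the conversion documented in the docstring above — an illustration only, not part of the patch series — the following minimal NumPy sketch writes out S = log N − InfoNCE and its nats-to-bits scaling. The helper name `infonce_to_bits` and the example values (batch_size=512, a single session, an InfoNCE loss of 5.5 nats) are assumptions chosen for illustration.

# Illustrative sketch (not from the patches): the nats-to-bits conversion
# described in the infonce_to_goodness_of_fit docstring, in plain NumPy.
import numpy as np

def infonce_to_bits(infonce, batch_size=512, num_sessions=1):
    # S = log N - InfoNCE, where N = batch_size * num_sessions,
    # then converted from nats to bits via log2(e).
    chance_level = np.log(batch_size * num_sessions)
    return (chance_level - infonce) * np.log2(np.e)

print(infonce_to_bits(5.5))          # ~1.07 bits above chance
print(infonce_to_bits(np.log(512)))  # 0.0 bits: loss exactly at chance level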
From d6f70e49197d629f4fe0f61e904a039c6a2a68c4 Mon Sep 17 00:00:00 2001 From: Steffen Schneider Date: Tue, 21 Jan 2025 23:00:33 +0100 Subject: [PATCH 11/14] improve err message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Célia Benquet <32598028+CeliaBenquet@users.noreply.github.com> --- cebra/integrations/sklearn/metrics.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cebra/integrations/sklearn/metrics.py b/cebra/integrations/sklearn/metrics.py index d5f9e2c2..a49dd32a 100644 --- a/cebra/integrations/sklearn/metrics.py +++ b/cebra/integrations/sklearn/metrics.py @@ -235,7 +235,10 @@ def infonce_to_goodness_of_fit(infonce: Union[float, np.ndarray], num_sessions = 1 else: if batch_size is None or num_sessions is None: - raise ValueError("batch_size should be provided if model is not provided.") + raise ValueError( + f"batch_size ({batch_size}) and num_sessions ({num_sessions})" + f"should be provided if model is not provided." + ) nats_to_bits = np.log2(np.e) chance_level = np.log(model.batch_size * num_sessions) From bf8694436906fed31bf16a213086787f7f86e5f5 Mon Sep 17 00:00:00 2001 From: Steffen Schneider Date: Sat, 25 Jan 2025 00:14:19 +0100 Subject: [PATCH 12/14] make numerical test less conversative --- tests/test_sklearn_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_sklearn_metrics.py b/tests/test_sklearn_metrics.py index eb4d8420..ab37e291 100644 --- a/tests/test_sklearn_metrics.py +++ b/tests/test_sklearn_metrics.py @@ -441,7 +441,7 @@ def _fit_and_get_history(X, y): # NOTE(stes): Ignore the first 5 iterations, they can have nonsensical values # due to numerical issues. history_random_non_negative = history_random[history_random >= 0] - np.testing.assert_allclose(history_random_non_negative, 0, atol=0.05) + np.testing.assert_allclose(history_random_non_negative, 0, atol=0.075) assert isinstance(history_linear, np.ndarray) assert history_linear.shape[0] > 0 From fd8e7cdaedf2ab692e1e2d225c6ae1aabae414e1 Mon Sep 17 00:00:00 2001 From: Steffen Schneider Date: Sat, 25 Jan 2025 17:26:40 +0100 Subject: [PATCH 13/14] Add tests for exception handling --- tests/test_sklearn_metrics.py | 68 +++++++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 2 deletions(-) diff --git a/tests/test_sklearn_metrics.py b/tests/test_sklearn_metrics.py index ab37e291..3a3de159 100644 --- a/tests/test_sklearn_metrics.py +++ b/tests/test_sklearn_metrics.py @@ -395,8 +395,9 @@ def test_goodness_of_fit_score(seed): max_iterations=5, batch_size=512, ) - X = torch.tensor(np.random.uniform(0, 1, (5000, 50))) - y = torch.tensor(np.random.uniform(0, 1, (5000, 5))) + generator = torch.Generator().manual_seed(seed) + X = torch.rand(5000, 50, dtype=torch.float32, generator=generator) + y = torch.rand(5000, 5, dtype=torch.float32, generator=generator) cebra_model.fit(X, y) score = cebra_sklearn_metrics.goodness_of_fit_score(cebra_model, X, @@ -447,3 +448,66 @@ def _fit_and_get_history(X, y): assert history_linear.shape[0] > 0 assert np.all(history_linear[-20:] > history_random[-20:]) + + +@pytest.mark.parametrize("seed", [42, 24, 10]) +def test_infonce_to_goodness_of_fit(seed): + """Test the conversion from InfoNCE loss to goodness of fit metric.""" + # Test with model + cebra_model = cebra_sklearn_cebra.CEBRA( + model_architecture="offset10-model", + max_iterations=5, + batch_size=128, + ) + generator = torch.Generator().manual_seed(seed) + X = torch.rand(1000, 50, dtype=torch.float32, 
generator=generator) + cebra_model.fit(X) + + # Test single value + gof = cebra_sklearn_metrics.infonce_to_goodness_of_fit(1.0, + model=cebra_model) + assert isinstance(gof, float) + + # Test array of values + infonce_values = np.array([1.0, 2.0, 3.0]) + gof_array = cebra_sklearn_metrics.infonce_to_goodness_of_fit( + infonce_values, model=cebra_model) + assert isinstance(gof_array, np.ndarray) + assert gof_array.shape == infonce_values.shape + + # Test with explicit batch_size and num_sessions + gof = cebra_sklearn_metrics.infonce_to_goodness_of_fit(1.0, + batch_size=128, + num_sessions=1) + assert isinstance(gof, float) + + # Test error cases + with pytest.raises(ValueError, match="batch_size.*should not be provided"): + cebra_sklearn_metrics.infonce_to_goodness_of_fit(1.0, + model=cebra_model, + batch_size=128) + + with pytest.raises(ValueError, match="batch_size.*should not be provided"): + cebra_sklearn_metrics.infonce_to_goodness_of_fit(1.0, + model=cebra_model, + num_sessions=1) + + # Test with unfitted model + unfitted_model = cebra_sklearn_cebra.CEBRA() + with pytest.raises(RuntimeError, match="Fit the CEBRA model first"): + cebra_sklearn_metrics.infonce_to_goodness_of_fit(1.0, + model=unfitted_model) + + # Test with model having batch_size=None + none_batch_model = cebra_sklearn_cebra.CEBRA(batch_size=None) + none_batch_model.fit(X) + with pytest.raises(ValueError, match="Computing the goodness of fit"): + cebra_sklearn_metrics.infonce_to_goodness_of_fit(1.0, + model=none_batch_model) + + # Test missing batch_size or num_sessions when model is None + with pytest.raises(ValueError, match="batch_size.*and num_sessions"): + cebra_sklearn_metrics.infonce_to_goodness_of_fit(1.0, batch_size=128) + + with pytest.raises(ValueError, match="batch_size.*and num_sessions"): + cebra_sklearn_metrics.infonce_to_goodness_of_fit(1.0, num_sessions=1) From 3771990f3e9345fa523dac1d478051a73965be67 Mon Sep 17 00:00:00 2001 From: Steffen Schneider Date: Sun, 2 Feb 2025 16:50:41 +0100 Subject: [PATCH 14/14] fix tests --- cebra/integrations/sklearn/metrics.py | 32 ++++++++++++++++----------- tests/test_sklearn_metrics.py | 5 +++-- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/cebra/integrations/sklearn/metrics.py b/cebra/integrations/sklearn/metrics.py index a49dd32a..0af44ecb 100644 --- a/cebra/integrations/sklearn/metrics.py +++ b/cebra/integrations/sklearn/metrics.py @@ -116,7 +116,7 @@ def goodness_of_fit_score(cebra_model: cebra_sklearn_cebra.CEBRA, """Compute the goodness of fit score on a *single session* dataset on the model. This function uses the :func:`infonce_loss` function to compute the InfoNCE loss - for a given `cebra_model` and the :func:`infonce_to_goodness_of_fit` function + for a given `cebra_model` and the :func:`infonce_to_goodness_of_fit` function to derive the goodness of fit from the InfoNCE loss. Args: @@ -180,10 +180,11 @@ def goodness_of_fit_history(model: cebra_sklearn_cebra.CEBRA) -> np.ndarray: return infonce_to_goodness_of_fit(infonce, model) -def infonce_to_goodness_of_fit(infonce: Union[float, np.ndarray], - model: Optional[cebra_sklearn_cebra.CEBRA] = None, - batch_size: Optional[int] = None, - num_sessions: Optional[int] = None) -> Union[float, np.ndarray]: +def infonce_to_goodness_of_fit( + infonce: Union[float, np.ndarray], + model: Optional[cebra_sklearn_cebra.CEBRA] = None, + batch_size: Optional[int] = None, + num_sessions: Optional[int] = None) -> Union[float, np.ndarray]: """Given a trained CEBRA model, return goodness of fit metric. 
The goodness of fit ranges from 0 (lowest meaningful value) @@ -208,7 +209,7 @@ def infonce_to_goodness_of_fit(infonce: Union[float, np.ndarray], Args: infonce: The InfoNCE loss, either a single value or an iterable of values. - model: The trained CEBRA model. + model: The trained CEBRA model. batch_size: The batch size used to train the model. num_sessions: The number of sessions used to train the model. @@ -221,27 +222,32 @@ def infonce_to_goodness_of_fit(infonce: Union[float, np.ndarray], """ if model is not None: if batch_size is not None or num_sessions is not None: - raise ValueError("batch_size and num_sessions should not be provided if model is provided.") + raise ValueError( + "batch_size and num_sessions should not be provided if model is provided." + ) if not hasattr(model, "state_dict_"): raise RuntimeError("Fit the CEBRA model first.") if model.batch_size is None: raise ValueError( "Computing the goodness of fit is not yet supported for " - "models trained on the full dataset (batchsize = None). " - ) + "models trained on the full dataset (batchsize = None). ") batch_size = model.batch_size num_sessions = model.num_sessions_ if num_sessions is None: num_sessions = 1 + + if model.batch_size is None: + raise ValueError( + "Computing the goodness of fit is not yet supported for " + "models trained on the full dataset (batchsize = None). ") else: if batch_size is None or num_sessions is None: raise ValueError( - f"batch_size ({batch_size}) and num_sessions ({num_sessions})" - f"should be provided if model is not provided." - ) + f"batch_size ({batch_size}) and num_sessions ({num_sessions})" + f"should be provided if model is not provided.") nats_to_bits = np.log2(np.e) - chance_level = np.log(model.batch_size * num_sessions) + chance_level = np.log(batch_size * num_sessions) return (chance_level - infonce) * nats_to_bits diff --git a/tests/test_sklearn_metrics.py b/tests/test_sklearn_metrics.py index 3a3de159..4e765ba7 100644 --- a/tests/test_sklearn_metrics.py +++ b/tests/test_sklearn_metrics.py @@ -493,13 +493,14 @@ def test_infonce_to_goodness_of_fit(seed): num_sessions=1) # Test with unfitted model - unfitted_model = cebra_sklearn_cebra.CEBRA() + unfitted_model = cebra_sklearn_cebra.CEBRA(max_iterations=5) with pytest.raises(RuntimeError, match="Fit the CEBRA model first"): cebra_sklearn_metrics.infonce_to_goodness_of_fit(1.0, model=unfitted_model) # Test with model having batch_size=None - none_batch_model = cebra_sklearn_cebra.CEBRA(batch_size=None) + none_batch_model = cebra_sklearn_cebra.CEBRA(batch_size=None, + max_iterations=5) none_batch_model.fit(X) with pytest.raises(ValueError, match="Computing the goodness of fit"): cebra_sklearn_metrics.infonce_to_goodness_of_fit(1.0,
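Taken together, the series adds three functions to cebra/integrations/sklearn/metrics.py: goodness_of_fit_score, goodness_of_fit_history, and infonce_to_goodness_of_fit. The sketch below shows how they fit together once all fourteen patches are applied; it mirrors the docstring examples and tests above, while the import alias and the example InfoNCE value of 5.5 nats are illustrative assumptions.

# Usage sketch assuming the patched cebra package is installed.
import numpy as np
import cebra
from cebra.integrations.sklearn import metrics as cebra_sklearn_metrics

neural_data = np.random.uniform(0, 1, (1000, 20))
cebra_model = cebra.CEBRA(max_iterations=10, batch_size=512)
cebra_model.fit(neural_data)

# Goodness of fit of the fitted model on a dataset, in bits
# (averages InfoNCE over num_batches batches, then converts).
gof = cebra_sklearn_metrics.goodness_of_fit_score(
    cebra_model, neural_data, num_batches=500)

# Goodness of fit over the course of training, derived from the
# InfoNCE values logged in model.state_dict_["log"]["total"].
gof_history = cebra_sklearn_metrics.goodness_of_fit_history(cebra_model)

# Direct conversion of a recorded InfoNCE loss without a model object,
# via the explicit batch_size/num_sessions path added in patch 06.
gof_from_loss = cebra_sklearn_metrics.infonce_to_goodness_of_fit(
    5.5, batch_size=512, num_sessions=1)

A score near 0 bits indicates an embedding no better than chance, as in the random-data test above, while data with recoverable structure should yield positive values.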