From c96615b8f18b6da5f8d8e2c92f6819f04fdea77a Mon Sep 17 00:00:00 2001
From: noemibuehrer <noeminaijia.buehrer@uzh.ch>
Date: Mon, 2 Mar 2026 11:15:43 +0100
Subject: [PATCH] fix: correct TI evaluation and leave out uncertainty
 reporting

---
 src/lyscripts/compute/evidence.py | 25 +++++++++----------------
 src/lyscripts/evaluate.py         | 25 +++++++++----------------
 2 files changed, 18 insertions(+), 32 deletions(-)

diff --git a/src/lyscripts/compute/evidence.py b/src/lyscripts/compute/evidence.py
index 1cd51e2..ffce481 100644
--- a/src/lyscripts/compute/evidence.py
+++ b/src/lyscripts/compute/evidence.py
@@ -48,23 +48,17 @@ def comp_bic(log_probs: np.ndarray, num_params: int, num_data: int) -> float:
 def compute_evidence(
     temp_schedule: np.ndarray,
     log_probs: np.ndarray,
-    num: int = 1000,
-) -> tuple[float, float]:
-    """Compute the evidence and its standard deviation.
+) -> float:
+    """Compute the evidence.
 
     Given a ``temp_schedule`` of inverse temperatures and corresponding sets of
-    ``log_probs``, draw ``num`` "paths" of log-probabilities and compute the evidence
-    for each using trapezoidal integration.
-
-    The evidence is then the mean of those ``num`` integrations, while the error is
-    their standard deviation.
+    ``log_probs``, take the mean over all samples at each inverse temperature to
+    approximate the expectation value under the corresponding power posterior. The
+    evidence is then the trapezoidal integral of these expectation values over the
+    ``temp_schedule``.
     """
-    integrals = np.zeros(shape=num)
-    for i in range(num):
-        rand_idx = RNG.choice(log_probs.shape[1], size=log_probs.shape[0])
-        drawn_accuracy = log_probs[np.arange(log_probs.shape[0]), rand_idx].copy()
-        integrals[i] = trapezoid(y=drawn_accuracy, x=temp_schedule)
-    return np.mean(integrals), np.std(integrals)
+    a_mc = np.mean(log_probs, axis=1)
+    return trapezoid(y=a_mc, x=temp_schedule)
 
 
 def compute_ti_results(
@@ -95,9 +89,8 @@ def compute_ti_results(
         )
         ti_log_probs[i] = reader.get_blobs(flat=True)["log_prob"]
 
-    evidence, evidence_std = compute_evidence(temp_schedule, ti_log_probs)
+    evidence = compute_evidence(temp_schedule, ti_log_probs)
     metrics["evidence"] = evidence
-    metrics["evidence_std"] = evidence_std
 
     return temp_schedule, ti_log_probs
 
diff --git a/src/lyscripts/evaluate.py b/src/lyscripts/evaluate.py
index d09c5dd..5f9dbd0 100644
--- a/src/lyscripts/evaluate.py
+++ b/src/lyscripts/evaluate.py
@@ -90,23 +90,17 @@ def comp_bic(log_probs: np.ndarray, num_params: int, num_data: int) -> float:
 def compute_evidence(
     temp_schedule: np.ndarray,
     log_probs: np.ndarray,
-    num: int = 1000,
-) -> tuple[float, float]:
-    """Compute the evidence and its standard deviation.
+) -> float:
+    """Compute the evidence.
 
     Given a ``temp_schedule`` of inverse temperatures and corresponding sets of
-    ``log_probs``, draw ``num`` "paths" of log-probabilities and compute the evidence
-    for each using trapezoidal integration.
-
-    The evidence is then the mean of those ``num`` integrations, while the error is
-    their standard deviation.
+    ``log_probs``, take the mean over all samples at each inverse temperature to
+    approximate the expectation value under the corresponding power posterior. The
+    evidence is then the trapezoidal integral of these expectation values over the
+    ``temp_schedule``.
     """
-    integrals = np.zeros(shape=num)
-    for i in range(num):
-        rand_idx = RNG.choice(log_probs.shape[1], size=log_probs.shape[0])
-        drawn_accuracy = log_probs[np.arange(log_probs.shape[0]), rand_idx].copy()
-        integrals[i] = trapezoid(y=drawn_accuracy, x=temp_schedule)
-    return np.mean(integrals), np.std(integrals)
+    a_mc = np.mean(log_probs, axis=1)
+    return trapezoid(y=a_mc, x=temp_schedule)
 
 
 def compute_ti_results(
@@ -134,9 +128,8 @@ def compute_ti_results(
         reader = emcee.backends.HDFBackend(model, name=f"ti/{run}", read_only=True)
         ti_log_probs[i] = reader.get_blobs(flat=True)
 
-    evidence, evidence_std = compute_evidence(temp_schedule, ti_log_probs)
+    evidence = compute_evidence(temp_schedule, ti_log_probs)
     metrics["evidence"] = evidence
-    metrics["evidence_std"] = evidence_std
 
     return temp_schedule, ti_log_probs