From d9715f97a131fb3de3004475435fc7f2cc1f8cfa Mon Sep 17 00:00:00 2001
From: Stefan V
Date: Sat, 1 Feb 2025 20:12:51 +0100
Subject: [PATCH 1/5] updated version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index e3efbd1..2c06164 100644
--- a/setup.py
+++ b/setup.py
@@ -9,7 +9,7 @@
 from shutil import which
 import subprocess
 
-VERSION = "1.1.0"  # current package version
+VERSION = "1.1.1"  # current package version
 IS_WINDOWS = (platform.system() == 'Windows')  # get the operating system
 
 STATE_SIZE = 8  # the compiled code supports MHNs with maximum size of 32 * STATE_SIZE

From df0222ac1c4ddc1b6601449f7dc4f5ff9d66b358 Mon Sep 17 00:00:00 2001
From: Linda Hu
Date: Tue, 1 Apr 2025 08:19:03 +0000
Subject: [PATCH 2/5] set parameters that break the cuda implementation

---
 test/test_state_space_restriction.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/test/test_state_space_restriction.py b/test/test_state_space_restriction.py
index f2be894..944e13e 100644
--- a/test/test_state_space_restriction.py
+++ b/test/test_state_space_restriction.py
@@ -123,13 +123,21 @@ def setUp(self) -> None:
             self.skipTest("CUDA not available for testing")
 
     def test_compare_with_cython(self):
-        n = 40
-        sample_num = 30
-        theta = ModelConstruction.random_theta(n)
-        random_sample = np.random.choice([0, 1], (sample_num, n), p=[0.7, 0.3]).astype(np.int32)
-        # make sure that there are mutations in two different "parts" of the "State" C struct
-        random_sample[:, 1] = 1
-        random_sample[:, -3] = 1
+        # n = 40
+        # sample_num = 30
+        # theta = ModelConstruction.random_theta(n)
+        # random_sample = np.random.choice([0, 1], (sample_num, n), p=[0.7, 0.3]).astype(np.int32)
+        # # make sure that there are mutations in two different "parts" of the "State" C struct
+        # random_sample[:, 1] = 1
+        # random_sample[:, -3] = 1
+        theta = np.array([[0, 0], [0, 1]], dtype=float)
+        random_sample = np.array(
+            [
+                [0, 0],
+                [1, 1],
+            ],
+            dtype=np.int32,
+        )
         state_containers = StateContainer(random_sample)
         gradient1, score1 = likelihood_cmhn.cpu_gradient_and_score(theta, state_containers)
         gradient2, score2 = likelihood_cmhn.cuda_gradient_and_score(theta, state_containers)

From fe8b810f49844321e661125f3a9caada8bff2fe0 Mon Sep 17 00:00:00 2001
From: Linda Hu
Date: Wed, 9 Apr 2025 11:22:12 +0000
Subject: [PATCH 3/5] set patch_size to 0 if mutation_num == 1 and if
 patch_size == 0, only add one value

---
 mhn/training/cuda_likelihood.cu      | 16 ++++++++++++----
 test/test_state_space_restriction.py | 24 +++++++++---------------
 2 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/mhn/training/cuda_likelihood.cu b/mhn/training/cuda_likelihood.cu
index 9627dc7..fac7cdb 100644
--- a/mhn/training/cuda_likelihood.cu
+++ b/mhn/training/cuda_likelihood.cu
@@ -156,8 +156,14 @@ __global__ void cuda_restricted_kronvec(const double* __restrict__ ptheta, const
     // patch_size is important for later for the case i == j in the shuffle algorithm
     // as we do not actually shuffle the data in px in this implementation (only implicitly), we have to keep track of some indices
     // and which entries have to be computed together in the case i == j. Those two indices are (x_index) and (x_index + patch_size)
-    // ("patch_size", as over all, the entries that have to be computed together occur in patches of size 2**(count_before_i))
-    const int patch_size = 1 << count_before_i;
+    // ("patch_size", as over all, the entries that have to be computed together occur in patches of size 2**(count_before_i)
+
+    int patch_size;
+    if (count_before_i < 0) {
+        patch_size = 0;
+    } else {
+        patch_size = 1 << count_before_i;
+    }
     int x_index = (cuda_index >> count_before_i) * 2 * patch_size + (cuda_index & (patch_size - 1));
 
     // for each iteration of this while loop, we compute the output values for indices (x_index) and (x_index + patch_size)
@@ -238,10 +244,12 @@ __global__ void cuda_restricted_kronvec(const double* __restrict__ ptheta, const
         else {
             pout[x_index] += theta_product * px[x_index];
             // multiply the last theta to this entry, as the thetas needed for both entries only differ in this last one
-            pout[x_index + patch_size] += theta_product * px[x_index + patch_size] * theta;
+            if (patch_size > 0) {
+                pout[x_index + patch_size] += theta_product * px[x_index + patch_size] * theta;
+            }
         }
 
-        
+
        // if patch_size is bigger than stride, we have to do corrections to the indices
        if (stride < patch_size) {
            // check if the current index is inside an odd patch, if so, jump to the next one
diff --git a/test/test_state_space_restriction.py b/test/test_state_space_restriction.py
index 944e13e..1763b70 100644
--- a/test/test_state_space_restriction.py
+++ b/test/test_state_space_restriction.py
@@ -123,21 +123,15 @@ def setUp(self) -> None:
             self.skipTest("CUDA not available for testing")
 
     def test_compare_with_cython(self):
-        # n = 40
-        # sample_num = 30
-        # theta = ModelConstruction.random_theta(n)
-        # random_sample = np.random.choice([0, 1], (sample_num, n), p=[0.7, 0.3]).astype(np.int32)
-        # # make sure that there are mutations in two different "parts" of the "State" C struct
-        # random_sample[:, 1] = 1
-        # random_sample[:, -3] = 1
-        theta = np.array([[0, 0], [0, 1]], dtype=float)
-        random_sample = np.array(
-            [
-                [0, 0],
-                [1, 1],
-            ],
-            dtype=np.int32,
-        )
+        n = 40
+        sample_num = 30
+        theta = ModelConstruction.random_theta(n)
+        random_sample = np.random.choice([0, 1], (sample_num, n), p=[0.7, 0.3]).astype(np.int32)
+        # make sure that there are mutations in two different "parts" of the "State" C struct
+        random_sample[:, 1] = 1
+        random_sample[:, -3] = 1
+        # also test with an empty sample
+        random_sample[-1, :] = 0
         state_containers = StateContainer(random_sample)
         gradient1, score1 = likelihood_cmhn.cpu_gradient_and_score(theta, state_containers)
         gradient2, score2 = likelihood_cmhn.cuda_gradient_and_score(theta, state_containers)

From 20e17dfe43d74de1c254d10c420f1f67584b7d0e Mon Sep 17 00:00:00 2001
From: Stefan V
Date: Wed, 9 Apr 2025 22:05:52 +0200
Subject: [PATCH 4/5] put no mutation check at the beginning of the kernel to
 simplify logic and save compute

---
 mhn/training/cuda_likelihood.cu | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/mhn/training/cuda_likelihood.cu b/mhn/training/cuda_likelihood.cu
index fac7cdb..70f4499 100644
--- a/mhn/training/cuda_likelihood.cu
+++ b/mhn/training/cuda_likelihood.cu
@@ -125,4 +125,16 @@ __global__ void cuda_restricted_kronvec(const double* __restrict__ ptheta, const
     const int stride = blockDim.x * gridDim.x;
     const int cuda_index = blockIdx.x * blockDim.x + threadIdx.x;
 
+    // special case: if there are no mutations, the later logic won't work, instead simply multiply px[0] with theta_ii and subtract it from pout[0]
+    if (mutation_num == 0) {
+        // as the transition rate matrix is here a simple scalar, if diag is false, there is nothing to do
+        if (!diag)
+            return;
+
+        if (cuda_index == 0)
+            pout[0] -= ptheta[i * n + i] * px[0];
+
+        return;
+    }
+
     // in the following 1 << i is equivalent to 2^i, x >> i is equivalent to x // 2^i, x & ((1 << i) - 1) is equivalent to x % 2^i
@@ -157,13 +169,7 @@ __global__ void cuda_restricted_kronvec(const double* __restrict__ ptheta, const
     // as we do not actually shuffle the data in px in this implementation (only implicitly), we have to keep track of some indices
     // and which entries have to be computed together in the case i == j. Those two indices are (x_index) and (x_index + patch_size)
     // ("patch_size", as over all, the entries that have to be computed together occur in patches of size 2**(count_before_i)
-
-    int patch_size;
-    if (count_before_i < 0) {
-        patch_size = 0;
-    } else {
-        patch_size = 1 << count_before_i;
-    }
+    const int patch_size = 1 << count_before_i;
     int x_index = (cuda_index >> count_before_i) * 2 * patch_size + (cuda_index & (patch_size - 1));
 
     // for each iteration of this while loop, we compute the output values for indices (x_index) and (x_index + patch_size)
@@ -244,9 +250,7 @@ __global__ void cuda_restricted_kronvec(const double* __restrict__ ptheta, const
         else {
             pout[x_index] += theta_product * px[x_index];
             // multiply the last theta to this entry, as the thetas needed for both entries only differ in this last one
-            if (patch_size > 0) {
-                pout[x_index + patch_size] += theta_product * px[x_index + patch_size] * theta;
-            }
+            pout[x_index + patch_size] += theta_product * px[x_index + patch_size] * theta;
         }
 

From 454daa08dd52c4200114ba6ce3357f439a783eef Mon Sep 17 00:00:00 2001
From: Stefan V
Date: Fri, 11 Apr 2025 15:56:08 +0200
Subject: [PATCH 5/5] some really minor typo corrections in comments

---
 mhn/training/cuda_likelihood.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mhn/training/cuda_likelihood.cu b/mhn/training/cuda_likelihood.cu
index 70f4499..cff08e5 100644
--- a/mhn/training/cuda_likelihood.cu
+++ b/mhn/training/cuda_likelihood.cu
@@ -168,7 +168,7 @@ __global__ void cuda_restricted_kronvec(const double* __restrict__ ptheta, const
     // patch_size is important for later for the case i == j in the shuffle algorithm
     // as we do not actually shuffle the data in px in this implementation (only implicitly), we have to keep track of some indices
     // and which entries have to be computed together in the case i == j. Those two indices are (x_index) and (x_index + patch_size)
-    // ("patch_size", as over all, the entries that have to be computed together occur in patches of size 2**(count_before_i)
+    // ("patch_size", as over all, the entries that have to be computed together occur in patches of size 2**(count_before_i))
     const int patch_size = 1 << count_before_i;
     int x_index = (cuda_index >> count_before_i) * 2 * patch_size + (cuda_index & (patch_size - 1));
 
@@ -245,7 +245,7 @@ __global__ void cuda_restricted_kronvec(const double* __restrict__ ptheta, const
                 }
             }
         }
-        // if the ith gene is not mutated the two entries do not have to be computated together, this is why we could choose
+        // if the ith gene is not mutated the two entries do not have to be computed together, this is why we could choose
         // count_before_i independently from the given state
         else {
             pout[x_index] += theta_product * px[x_index];
@@ -666,7 +666,7 @@ void DLL_PREFIX cuda_restricted_gradient(const double *ptheta, const State *stat
     // use the shuffle trick for a more efficient computation of the gradient
     for (int j = 0; j < n; j++) {
         // confusion warning: the p0_pD here has nothing to do with p0 or pD
-        // in this section p0_pD is used again, because we need an allocated array and p0_pD isnt needed anymore so we can just use that as memory
+        // in this section p0_pD is used again, because we need an allocated array and p0_pD isn't needed anymore so we can just use that as memory
         if (state_copy & 1) {
             shuffle<<<block_num, thread_num>>>(old_vec, shuffled_vec, nx);
             if (i == j) {