StefanDevAccount · StefanDevAccount · Apr 11, 2025 · Feb 1, 2025 · Apr 1, 2025 · Apr 9, 2025
diff --git a/mhn/training/cuda_likelihood.cu b/mhn/training/cuda_likelihood.cu
@@ -125,6 +125,18 @@ __global__ void cuda_restricted_kronvec(const double* __restrict__ ptheta, const
     const int stride = blockDim.x * gridDim.x;
     const int cuda_index = blockIdx.x * blockDim.x + threadIdx.x;
 
+    // special case: if there are no mutations, the later logic won't work, instead simply multiply px[0] with theta_ii and subtract it from pout[0]
+    if (mutation_num == 0) {
+        // as the transition rate matrix is here a simple scalar, if diag is false, there is nothing to do
+        if (!diag)
+            return;
+
+        if (cuda_index == 0)
+            pout[0] -= ptheta[i * n + i] * px[0];
+
+        return;
+    }
+
     // in the following 1 << i is equivalent to 2^i, x >> i is equivalent to x // 2^i, x & ((1<<i)-1) to x % 2^i
     const int nx = 1 << mutation_num;
 
@@ -233,15 +245,15 @@ __global__ void cuda_restricted_kronvec(const double* __restrict__ ptheta, const
                 }
             }
         }
-        // if the ith gene is not mutated the two entries do not have to be computated together, this is why we could choose
+        // if the ith gene is not mutated the two entries do not have to be computed together, this is why we could choose
         // count_before_i independently from the given state
         else {
             pout[x_index] += theta_product * px[x_index];
             // multiply the last theta to this entry, as the thetas needed for both entries only differ in this last one
             pout[x_index + patch_size] += theta_product * px[x_index + patch_size] * theta;
         }
 
-
+        
         // if patch_size is bigger than stride, we have to do corrections to the indices
         if (stride < patch_size) {
             // check if the current index is inside an odd patch, if so, jump to the next one
@@ -654,7 +666,7 @@ void DLL_PREFIX cuda_restricted_gradient(const double *ptheta, const State *stat
         // use the shuffle trick for a more efficient computation of the gradient
         for (int j = 0; j < n; j++) {
             // confusion warning: the p0_pD here has nothing to do with p0 or pD
-            // in this section p0_pD is used again, because we need an allocated array and p0_pD isnt needed anymore so we can just use that as memory
+            // in this section p0_pD is used again, because we need an allocated array and p0_pD isn't needed anymore so we can just use that as memory
             if (state_copy & 1) {
                 shuffle<<<block_num, thread_num>>>(old_vec, shuffled_vec, nx);
                 if (i == j) {

diff --git a/setup.py b/setup.py
@@ -9,7 +9,7 @@
 from shutil import which
 import subprocess
 
-VERSION = "1.1.0"                                  # current package version
+VERSION = "1.1.1"                                  # current package version
 
 IS_WINDOWS = (platform.system() == 'Windows')      # get the operating system
 STATE_SIZE = 8                                     # the compiled code supports MHNs with maximum size of 32 * STATE_SIZE

diff --git a/test/test_state_space_restriction.py b/test/test_state_space_restriction.py
@@ -130,6 +130,8 @@ def test_compare_with_cython(self):
         # make sure that there are mutations in two different "parts" of the "State" C struct
         random_sample[:, 1] = 1
         random_sample[:, -3] = 1
+        # also test with an empty sample
+        random_sample[-1, :] = 0
         state_containers = StateContainer(random_sample)
         gradient1, score1 = likelihood_cmhn.cpu_gradient_and_score(theta, state_containers)
         gradient2, score2 = likelihood_cmhn.cuda_gradient_and_score(theta, state_containers)