Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions mhn/training/cuda_likelihood.cu
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,18 @@ __global__ void cuda_restricted_kronvec(const double* __restrict__ ptheta, const
const int stride = blockDim.x * gridDim.x;
const int cuda_index = blockIdx.x * blockDim.x + threadIdx.x;

// special case: if there are no mutations, the later logic won't work, instead simply multiply px[0] with theta_ii and subtract it from pout[0]
if (mutation_num == 0) {
// as the transition rate matrix is here a simple scalar, if diag is false, there is nothing to do
if (!diag)
return;

if (cuda_index == 0)
pout[0] -= ptheta[i * n + i] * px[0];

return;
}

// in the following 1 << i is equivalent to 2^i, x >> i is equivalent to x // 2^i, x & ((1<<i)-1) to x % 2^i
const int nx = 1 << mutation_num;

Expand Down Expand Up @@ -233,15 +245,15 @@ __global__ void cuda_restricted_kronvec(const double* __restrict__ ptheta, const
}
}
}
// if the ith gene is not mutated the two entries do not have to be computated together, this is why we could choose
// if the ith gene is not mutated the two entries do not have to be computed together, this is why we could choose
// count_before_i independently from the given state
else {
pout[x_index] += theta_product * px[x_index];
// multiply the last theta to this entry, as the thetas needed for both entries only differ in this last one
pout[x_index + patch_size] += theta_product * px[x_index + patch_size] * theta;
}


// if patch_size is bigger than stride, we have to do corrections to the indices
if (stride < patch_size) {
// check if the current index is inside an odd patch, if so, jump to the next one
Expand Down Expand Up @@ -654,7 +666,7 @@ void DLL_PREFIX cuda_restricted_gradient(const double *ptheta, const State *stat
// use the shuffle trick for a more efficient computation of the gradient
for (int j = 0; j < n; j++) {
// confusion warning: the p0_pD here has nothing to do with p0 or pD
// in this section p0_pD is used again, because we need an allocated array and p0_pD isnt needed anymore so we can just use that as memory
// in this section p0_pD is used again, because we need an allocated array and p0_pD isn't needed anymore so we can just use that as memory
if (state_copy & 1) {
shuffle<<<block_num, thread_num>>>(old_vec, shuffled_vec, nx);
if (i == j) {
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from shutil import which
import subprocess

VERSION = "1.1.0" # current package version
VERSION = "1.1.1" # current package version

IS_WINDOWS = (platform.system() == 'Windows') # get the operating system
STATE_SIZE = 8 # the compiled code supports MHNs with maximum size of 32 * STATE_SIZE
Expand Down
2 changes: 2 additions & 0 deletions test/test_state_space_restriction.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,8 @@ def test_compare_with_cython(self):
# make sure that there are mutations in two different "parts" of the "State" C struct
random_sample[:, 1] = 1
random_sample[:, -3] = 1
# also test with an empty sample
random_sample[-1, :] = 0
state_containers = StateContainer(random_sample)
gradient1, score1 = likelihood_cmhn.cpu_gradient_and_score(theta, state_containers)
gradient2, score2 = likelihood_cmhn.cuda_gradient_and_score(theta, state_containers)
Expand Down