From d9715f97a131fb3de3004475435fc7f2cc1f8cfa Mon Sep 17 00:00:00 2001
From: Stefan V
Date: Sat, 1 Feb 2025 20:12:51 +0100
Subject: [PATCH 1/5] updated version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index e3efbd1..2c06164 100644
--- a/setup.py
+++ b/setup.py
@@ -9,7 +9,7 @@
 from shutil import which
 import subprocess
 
-VERSION = "1.1.0"  # current package version
+VERSION = "1.1.1"  # current package version
 IS_WINDOWS = (platform.system() == 'Windows')  # get the operating system
 
 STATE_SIZE = 8  # the compiled code supports MHNs with maximum size of 32 * STATE_SIZE

From df0222ac1c4ddc1b6601449f7dc4f5ff9d66b358 Mon Sep 17 00:00:00 2001
From: Linda Hu
Date: Tue, 1 Apr 2025 08:19:03 +0000
Subject: [PATCH 2/5] set parameters that break the cuda implementation

---
 test/test_state_space_restriction.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/test/test_state_space_restriction.py b/test/test_state_space_restriction.py
index f2be894..944e13e 100644
--- a/test/test_state_space_restriction.py
+++ b/test/test_state_space_restriction.py
@@ -123,13 +123,21 @@ def setUp(self) -> None:
             self.skipTest("CUDA not available for testing")
 
     def test_compare_with_cython(self):
-        n = 40
-        sample_num = 30
-        theta = ModelConstruction.random_theta(n)
-        random_sample = np.random.choice([0, 1], (sample_num, n), p=[0.7, 0.3]).astype(np.int32)
-        # make sure that there are mutations in two different "parts" of the "State" C struct
-        random_sample[:, 1] = 1
-        random_sample[:, -3] = 1
+        # n = 40
+        # sample_num = 30
+        # theta = ModelConstruction.random_theta(n)
+        # random_sample = np.random.choice([0, 1], (sample_num, n), p=[0.7, 0.3]).astype(np.int32)
+        # # make sure that there are mutations in two different "parts" of the "State" C struct
+        # random_sample[:, 1] = 1
+        # random_sample[:, -3] = 1
+        theta = np.array([[0, 0], [0, 1]], dtype=float)
+        random_sample = np.array(
+            [
+                [0, 0],
+                [1, 1],
+            ],
+            dtype=np.int32,
+        )
         state_containers = StateContainer(random_sample)
         gradient1, score1 = likelihood_cmhn.cpu_gradient_and_score(theta, state_containers)
         gradient2, score2 = likelihood_cmhn.cuda_gradient_and_score(theta, state_containers)

From fe8b810f49844321e661125f3a9caada8bff2fe0 Mon Sep 17 00:00:00 2001
From: Linda Hu
Date: Wed, 9 Apr 2025 11:22:12 +0000
Subject: [PATCH 3/5] set patch_size to 0 if mutation_num == 1 and if
 patch_size == 0, only add one value

---
 mhn/training/cuda_likelihood.cu      | 16 ++++++++++++----
 test/test_state_space_restriction.py | 24 +++++++++---------------
 2 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/mhn/training/cuda_likelihood.cu b/mhn/training/cuda_likelihood.cu
index 9627dc7..fac7cdb 100644
--- a/mhn/training/cuda_likelihood.cu
+++ b/mhn/training/cuda_likelihood.cu
@@ -156,8 +156,14 @@ __global__ void cuda_restricted_kronvec(const double* __restrict__ ptheta, const
     // patch_size is important for later for the case i == j in the shuffle algorithm
     // as we do not actually shuffle the data in px in this implementation (only implicitly), we have to keep track of some indices
     // and which entries have to be computed together in the case i == j. Those two indices are (x_index) and (x_index + patch_size)
-    // ("patch_size", as over all, the entries that have to be computed together occur in patches of size 2**(count_before_i))
-    const int patch_size = 1 << count_before_i;
+    // ("patch_size", as over all, the entries that have to be computed together occur in patches of size 2**(count_before_i)
+
+    int patch_size;
+    if (count_before_i < 0) {
+        patch_size = 0;
+    } else {
+        patch_size = 1 << count_before_i;
+    }
     int x_index = (cuda_index >> count_before_i) * 2 * patch_size + (cuda_index & (patch_size - 1));
 
     // for each iteration of this while loop, we compute the output values for indices (x_index) and (x_index + patch_size)
@@ -238,10 +244,12 @@ __global__ void cuda_restricted_kronvec(const double* __restrict__ ptheta, const
         else {
             pout[x_index] += theta_product * px[x_index];
             // multiply the last theta to this entry, as the thetas needed for both entries only differ in this last one
-            pout[x_index + patch_size] += theta_product * px[x_index + patch_size] * theta;
+            if (patch_size > 0) {
+                pout[x_index + patch_size] += theta_product * px[x_index + patch_size] * theta;
+            }
         }
 
-        
+
        // if patch_size is bigger than stride, we have to do corrections to the indices
        if (stride < patch_size) {
            // check if the current index is inside an odd patch, if so, jump to the next one
diff --git a/test/test_state_space_restriction.py b/test/test_state_space_restriction.py
index 944e13e..1763b70 100644
--- a/test/test_state_space_restriction.py
+++ b/test/test_state_space_restriction.py
@@ -123,21 +123,15 @@ def setUp(self) -> None:
             self.skipTest("CUDA not available for testing")
 
     def test_compare_with_cython(self):
-        # n = 40
-        # sample_num = 30
-        # theta = ModelConstruction.random_theta(n)
-        # random_sample = np.random.choice([0, 1], (sample_num, n), p=[0.7, 0.3]).astype(np.int32)
-        # # make sure that there are mutations in two different "parts" of the "State" C struct
-        # random_sample[:, 1] = 1
-        # random_sample[:, -3] = 1
-        theta = np.array([[0, 0], [0, 1]], dtype=float)
-        random_sample = np.array(
-            [
-                [0, 0],
-                [1, 1],
-            ],
-            dtype=np.int32,
-        )
+        n = 40
+        sample_num = 30
+        theta = ModelConstruction.random_theta(n)
+        random_sample = np.random.choice([0, 1], (sample_num, n), p=[0.7, 0.3]).astype(np.int32)
+        # make sure that there are mutations in two different "parts" of the "State" C struct
+        random_sample[:, 1] = 1
+        random_sample[:, -3] = 1
+        # also test with an empty sample
+        random_sample[-1, :] = 0
         state_containers = StateContainer(random_sample)
         gradient1, score1 = likelihood_cmhn.cpu_gradient_and_score(theta, state_containers)
         gradient2, score2 = likelihood_cmhn.cuda_gradient_and_score(theta, state_containers)

From 20e17dfe43d74de1c254d10c420f1f67584b7d0e Mon Sep 17 00:00:00 2001
From: Stefan V
Date: Wed, 9 Apr 2025 22:05:52 +0200
Subject: [PATCH 4/5] put no mutation check at the beginning of the kernel to
 simplify logic and save compute

---
 mhn/training/cuda_likelihood.cu | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/mhn/training/cuda_likelihood.cu b/mhn/training/cuda_likelihood.cu
index fac7cdb..70f4499 100644
--- a/mhn/training/cuda_likelihood.cu
+++ b/mhn/training/cuda_likelihood.cu
@@ -125,4 +125,16 @@ __global__ void cuda_restricted_kronvec(const double* __restrict__ ptheta, const
     const int stride = blockDim.x * gridDim.x;
     const int cuda_index = blockIdx.x * blockDim.x + threadIdx.x;
 
+    // special case: if there are no mutations, the later logic won't work, instead simply multiply px[0] with theta_ii and subtract it from pout[0]
+    if (mutation_num == 0) {
+        // as the transition rate matrix is here a simple scalar, if diag is false, there is nothing to do
+        if (!diag)
+            return;
+
+        if (cuda_index == 0)
+            pout[0] -= ptheta[i * n + i] * px[0];
+
+        return;
+    }
+
     // in the following 1 << i is equivalent to 2^i, x >> i is equivalent to x // 2^i, x & ((1 << i) - 1) is equivalent to x % 2^i
@@ -157,13 +169,7 @@ __global__ void cuda_restricted_kronvec(const double* __restrict__ ptheta, const
     // as we do not actually shuffle the data in px in this implementation (only implicitly), we have to keep track of some indices
     // and which entries have to be computed together in the case i == j. Those two indices are (x_index) and (x_index + patch_size)
     // ("patch_size", as over all, the entries that have to be computed together occur in patches of size 2**(count_before_i)
-
-    int patch_size;
-    if (count_before_i < 0) {
-        patch_size = 0;
-    } else {
-        patch_size = 1 << count_before_i;
-    }
+    const int patch_size = 1 << count_before_i;
     int x_index = (cuda_index >> count_before_i) * 2 * patch_size + (cuda_index & (patch_size - 1));
 
     // for each iteration of this while loop, we compute the output values for indices (x_index) and (x_index + patch_size)
@@ -244,9 +250,7 @@ __global__ void cuda_restricted_kronvec(const double* __restrict__ ptheta, const
         else {
             pout[x_index] += theta_product * px[x_index];
             // multiply the last theta to this entry, as the thetas needed for both entries only differ in this last one
-            if (patch_size > 0) {
-                pout[x_index + patch_size] += theta_product * px[x_index + patch_size] * theta;
-            }
+            pout[x_index + patch_size] += theta_product * px[x_index + patch_size] * theta;
         }
 

From 454daa08dd52c4200114ba6ce3357f439a783eef Mon Sep 17 00:00:00 2001
From: Stefan V
Date: Fri, 11 Apr 2025 15:56:08 +0200
Subject: [PATCH 5/5] some really minor typo corrections in comments

---
 mhn/training/cuda_likelihood.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mhn/training/cuda_likelihood.cu b/mhn/training/cuda_likelihood.cu
index 70f4499..cff08e5 100644
--- a/mhn/training/cuda_likelihood.cu
+++ b/mhn/training/cuda_likelihood.cu
@@ -168,7 +168,7 @@ __global__ void cuda_restricted_kronvec(const double* __restrict__ ptheta, const
     // patch_size is important for later for the case i == j in the shuffle algorithm
     // as we do not actually shuffle the data in px in this implementation (only implicitly), we have to keep track of some indices
     // and which entries have to be computed together in the case i == j. Those two indices are (x_index) and (x_index + patch_size)
-    // ("patch_size", as over all, the entries that have to be computed together occur in patches of size 2**(count_before_i)
+    // ("patch_size", as over all, the entries that have to be computed together occur in patches of size 2**(count_before_i))
     const int patch_size = 1 << count_before_i;
     int x_index = (cuda_index >> count_before_i) * 2 * patch_size + (cuda_index & (patch_size - 1));
 
@@ -245,7 +245,7 @@ __global__ void cuda_restricted_kronvec(const double* __restrict__ ptheta, const
                 }
             }
         }
-        // if the ith gene is not mutated the two entries do not have to be computated together, this is why we could choose
+        // if the ith gene is not mutated the two entries do not have to be computed together, this is why we could choose
         // count_before_i independently from the given state
         else {
             pout[x_index] += theta_product * px[x_index];
@@ -666,7 +666,7 @@ void DLL_PREFIX cuda_restricted_gradient(const double *ptheta, const State *stat
     // use the shuffle trick for a more efficient computation of the gradient
     for (int j = 0; j < n; j++) {
         // confusion warning: the p0_pD here has nothing to do with p0 or pD
-        // in this section p0_pD is used again, because we need an allocated array and p0_pD isnt needed anymore so we can just use that as memory
+        // in this section p0_pD is used again, because we need an allocated array and p0_pD isn't needed anymore so we can just use that as memory
         if (state_copy & 1) {
             shuffle<<<block_num, thread_num>>>(old_vec, shuffled_vec, nx);
             if (i == j) {