From 6394d61262b9b37353b8a415d541c3e9e28a1d31 Mon Sep 17 00:00:00 2001 From: Ryan O'Dea <70209371+ryan-odea@users.noreply.github.com> Date: Thu, 19 Feb 2026 13:30:08 +0100 Subject: [PATCH 01/16] skeletonize --- CompRiskReg/__init__.py | 0 CompRiskReg/_crr_kernels.py | 0 CompRiskReg/_cuminc_kernels.py | 0 CompRiskReg/_km.py | 0 CompRiskReg/crr.py | 0 CompRiskReg/cuminc.py | 0 6 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 CompRiskReg/__init__.py create mode 100644 CompRiskReg/_crr_kernels.py create mode 100644 CompRiskReg/_cuminc_kernels.py create mode 100644 CompRiskReg/_km.py create mode 100644 CompRiskReg/crr.py create mode 100644 CompRiskReg/cuminc.py diff --git a/CompRiskReg/__init__.py b/CompRiskReg/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/CompRiskReg/_crr_kernels.py b/CompRiskReg/_crr_kernels.py new file mode 100644 index 0000000..e69de29 diff --git a/CompRiskReg/_cuminc_kernels.py b/CompRiskReg/_cuminc_kernels.py new file mode 100644 index 0000000..e69de29 diff --git a/CompRiskReg/_km.py b/CompRiskReg/_km.py new file mode 100644 index 0000000..e69de29 diff --git a/CompRiskReg/crr.py b/CompRiskReg/crr.py new file mode 100644 index 0000000..e69de29 diff --git a/CompRiskReg/cuminc.py b/CompRiskReg/cuminc.py new file mode 100644 index 0000000..e69de29 From 0f439602c8c532034b2fae2be1ecd9618100d422 Mon Sep 17 00:00:00 2001 From: Ryan O'Dea <70209371+ryan-odea@users.noreply.github.com> Date: Thu, 19 Feb 2026 13:48:27 +0100 Subject: [PATCH 02/16] crr translation --- CompRiskReg/_crr_kernels.py | 611 ++++++++++++++++++++++++++++++++++++ 1 file changed, 611 insertions(+) diff --git a/CompRiskReg/_crr_kernels.py b/CompRiskReg/_crr_kernels.py index e69de29..77877f5 100644 --- a/CompRiskReg/_crr_kernels.py +++ b/CompRiskReg/_crr_kernels.py @@ -0,0 +1,611 @@ +from typing import Optional, Tuple + +import numpy as np + + +def _covt( + j: int, + k: int, + x: Optional[np.ndarray], + cov2: Optional[np.ndarray], + tf: Optional[np.ndarray], + b: np.ndarray, +) -> Tuple[float, np.ndarray]: + """ + Compute linear predictor and effective covariate vector for subject j at failure-time index k. + + :param j: Subject index. + :param k: Failure-time index (0-based into tf rows). + :param x: Fixed covariate matrix of shape (n_subjects, ncov), or None. + :param cov2: Time-varying covariate matrix of shape (n_subjects, ncov2), or None. + :param tf: Time functions evaluated at unique failure times, shape (n_times, ncov2), or None. + :param b: Current coefficient vector of shape (ncov + ncov2,). + :returns: Tuple of: + - **wk** (*float*) -- Linear predictor eta for subject j at time k. + - **xbt** (*np.ndarray*) -- Effective covariate vector of shape (ncov + ncov2,). + """ + + ncov = x.shape[1] if x is not None else 0 + ncov2 = cov2.shape[1] if cov2 is not None else 0 + np_ = ncov + ncov2 + xbt = np.zeros(np_) + wk = 0.0 + + # Fixed covariates + if ncov > 0: + xbt[:ncov] = x[j, :] + wk += np.dot(xbt[:ncov], b[:ncov]) + + # Time-varying covariates: cov2[j, i] * tf[k, i] + if ncov2 > 0: + xbt[ncov:] = cov2[j, :] * tf[k, :] + wk += np.dot(xbt[ncov:], b[ncov:]) + + return wk, xbt + + +from typing import Optional, Tuple + +import numpy as np + + +def crrfsv( + t2: np.ndarray, + ici: np.ndarray, + x: Optional[np.ndarray], + ncov: int, + np_: int, + cov2: Optional[np.ndarray], + ncov2: int, + tf: Optional[np.ndarray], + ndf: int, + wt: np.ndarray, + ncg: int, + icg: np.ndarray, + b: np.ndarray, +) -> Tuple[float, np.ndarray, np.ndarray]: + """ + Compute log pseudo-partial likelihood, score vector, and observed information matrix. + + :param t2: Sorted event times of shape (n,). + :param ici: Status codes of shape (n,); 0=censored, 1=cause of interest, 2=competing event. + :param x: Fixed covariate matrix of shape (n, ncov), or None. + :param ncov: Number of fixed covariates. + :param np_: Total number of parameters (ncov + ncov2). + :param cov2: Time-varying covariate matrix of shape (n, ncov2), or None. + :param ncov2: Number of time-varying covariates. + :param tf: Time functions evaluated at unique failure times, shape (ndf, ncov2), or None. + :param ndf: Number of unique failure times. + :param wt: IPCW weight matrix of shape (ncg, n). + :param ncg: Number of censoring groups. + :param icg: 1-based censoring group assignments of shape (n,). + :param b: Current coefficient vector of shape (np_,). + :returns: Tuple of: + - **lik** (*float*) -- Negative log pseudo-partial likelihood. + - **s** (*np.ndarray*) -- Score vector of shape (np_,). + - **v** (*np.ndarray*) -- Observed information matrix of shape (np_, np_). + """ + n = len(t2) + lik = 0.0 + s = np.zeros(np_) + v = np.zeros((np_, np_)) + + iuc = n - 1 + ldf = ndf + + while iuc >= 0: + found = False + for i in range(iuc, -1, -1): + if ici[i] == 1: + cft = t2[i] + iuc = i + found = True + break + if not found: + break + + ldf -= 1 + twf = 0.0 + + itmp = 0 + for i in range(iuc, -1, -1): + if t2[i] < cft: + itmp = i + 1 + break + itmp = i + if ici[i] == 1: + wk, xbt = _covt(i, ldf, x, cov2, tf, b) + twf += 1.0 + lik -= wk + s -= xbt + iuc = itmp + + xb1 = 0.0 + xb1o = 0.0 + xb = np.zeros(np_) + vt = np.zeros((np_, np_)) + + for i in range(n): + if t2[i] < cft: + if ici[i] <= 1: + continue + wk, xbt = _covt(i, ldf, x, cov2, tf, b) + twt = np.exp(wk) * wt[icg[i] - 1, iuc] / wt[icg[i] - 1, i] + else: + wk, xbt = _covt(i, ldf, x, cov2, tf, b) + twt = np.exp(wk) + + xb1 += twt + xb += twt * xbt + + xbt_c = xbt - xb / xb1 + if xb1o > 0.0: + ratio = xb1 * twt / xb1o + for kk in range(np_): + for jj in range(kk, np_): + vt[kk, jj] += ratio * xbt_c[kk] * xbt_c[jj] + xb1o = xb1 + + lik += twf * np.log(xb1) + twt_f = twf / xb1 + s += twt_f * xb + for i in range(np_): + for j in range(i, np_): + v[i, j] += twt_f * vt[i, j] + v[j, i] = v[i, j] + + iuc -= 1 + + return lik, s, v + + +def crrf( + t2: np.ndarray, + ici: np.ndarray, + x: Optional[np.ndarray], + ncov: int, + np_: int, + cov2: Optional[np.ndarray], + ncov2: int, + tf: Optional[np.ndarray], + ndf: int, + wt: np.ndarray, + ncg: int, + icg: np.ndarray, + b: np.ndarray, +) -> float: + """ + Compute log pseudo-partial likelihood only, without score or information matrix. + + Used during backtracking line search where only the likelihood value is needed. + Shares the same parameter interface as :func:`crrfsv`. + + :param t2: Sorted event times of shape (n,). + :param ici: Status codes of shape (n,); 0=censored, 1=cause of interest, 2=competing event. + :param x: Fixed covariate matrix of shape (n, ncov), or None. + :param ncov: Number of fixed covariates. + :param np_: Total number of parameters (ncov + ncov2). + :param cov2: Time-varying covariate matrix of shape (n, ncov2), or None. + :param ncov2: Number of time-varying covariates. + :param tf: Time functions evaluated at unique failure times, shape (ndf, ncov2), or None. + :param ndf: Number of unique failure times. + :param wt: IPCW weight matrix of shape (ncg, n). + :param ncg: Number of censoring groups. + :param icg: 1-based censoring group assignments of shape (n,). + :param b: Current coefficient vector of shape (np_,). + :returns: Negative log pseudo-partial likelihood. + :rtype: float + """ + n = len(t2) + lik = 0.0 + iuc = n - 1 + ldf = ndf + + while iuc >= 0: + found = False + for i in range(iuc, -1, -1): + if ici[i] == 1: + cft = t2[i] + iuc = i + found = True + break + if not found: + break + + ldf -= 1 + twf = 0.0 + + itmp = 0 + for i in range(iuc, -1, -1): + if t2[i] < cft: + itmp = i + 1 + break + itmp = i + if ici[i] == 1: + wk, _ = _covt(i, ldf, x, cov2, tf, b) + twf += 1.0 + lik -= wk + iuc = itmp + + xb1 = 0.0 + for i in range(n): + if t2[i] < cft: + if ici[i] <= 1: + continue + wk, _ = _covt(i, ldf, x, cov2, tf, b) + twt = np.exp(wk) * wt[icg[i] - 1, iuc] / wt[icg[i] - 1, i] + else: + wk, _ = _covt(i, ldf, x, cov2, tf, b) + twt = np.exp(wk) + xb1 += twt + + lik += twf * np.log(xb1) + iuc -= 1 + + return lik + + +def crrvv( + t2: np.ndarray, + ici: np.ndarray, + x: Optional[np.ndarray], + ncov: int, + np_: int, + cov2: Optional[np.ndarray], + ncov2: int, + tf: Optional[np.ndarray], + ndf: int, + wt: np.ndarray, + ncg: int, + icg: np.ndarray, + b: np.ndarray, +) -> Tuple[np.ndarray, np.ndarray]: + """ + Compute robust sandwich variance estimate. + + :param t2: Sorted event times of shape (n,). + :param ici: Status codes of shape (n,); 0=censored, 1=cause of interest, 2=competing event. + :param x: Fixed covariate matrix of shape (n, ncov), or None. + :param ncov: Number of fixed covariates. + :param np_: Total number of parameters (ncov + ncov2). + :param cov2: Time-varying covariate matrix of shape (n, ncov2), or None. + :param ncov2: Number of time-varying covariates. + :param tf: Time functions evaluated at unique failure times, shape (ndf, ncov2), or None. + :param ndf: Number of unique failure times. + :param wt: IPCW weight matrix of shape (ncg, n). + :param ncg: Number of censoring groups. + :param icg: 1-based censoring group assignments of shape (n,). + :param b: Current coefficient vector of shape (np_,). + :returns: Tuple of: + - **v** (*np.ndarray*) -- Observed information matrix (bread) of shape (np_, np_). + - **v2** (*np.ndarray*) -- Meat of the sandwich estimator of shape (np_, np_). + """ + n = len(t2) + v = np.zeros((np_, np_)) + v2 = np.zeros((np_, np_)) + + icrsk = np.zeros(ncg, dtype=np.int64) + for i in range(n): + icrsk[icg[i] - 1] += 1 + + xb_mat = np.zeros((n, np_ + 1)) + + ldf = 0 + cft = min(-1.0, t2[0] * (1.0 - 1e-5)) + + for i in range(n): + if ici[i] != 1: + continue + if t2[i] > cft: + cft = t2[i] + ldf += 1 + + for j in range(n): + if t2[j] < t2[i]: + if ici[j] <= 1: + continue + wk, xbt = _covt(j, ldf - 1, x, cov2, tf, b) + twt = np.exp(wk) * wt[icg[j] - 1, i] / wt[icg[j] - 1, j] + else: + wk, xbt = _covt(j, ldf - 1, x, cov2, tf, b) + twt = np.exp(wk) + xb_mat[i, 0] += twt + xb_mat[i, 1:] += twt * xbt + + lc = 0 + ldf2 = 0 + cft2 = min(-1.0, t2[0] * (1.0 - 1e-5)) + + icrsk[:] = 0 + for i in range(n): + icrsk[icg[i] - 1] += 1 + + ss2 = np.zeros((np_, ncg)) + qu = np.zeros((np_, ncg)) + + for i in range(n): + st = np.zeros(np_) + + ldf_local = 0 + cft_local = min(-1.0, t2[0] * (1.0 - 1e-5)) + + for j in range(n): + if ici[j] != 1: + continue + if t2[j] > cft_local: + cft_local = t2[j] + ldf_local += 1 + + if t2[j] <= t2[i]: + wk, xbt = _covt(i, ldf_local - 1, x, cov2, tf, b) + twt = np.exp(wk) + elif t2[i] < t2[j] and ici[i] > 1: + wk, xbt = _covt(i, ldf_local - 1, x, cov2, tf, b) + twt = np.exp(wk) * wt[icg[i] - 1, j] / wt[icg[i] - 1, i] + else: + continue + + xbar = xb_mat[j, 1:] / xb_mat[j, 0] + st -= (xbt - xbar) * twt / xb_mat[j, 0] + + if ici[i] == 1: + if t2[i] > cft2: + cft2 = t2[i] + ldf2 += 1 + + wk, xbt_i = _covt(i, ldf2 - 1, x, cov2, tf, b) + xbar_i = xb_mat[i, 1:] / xb_mat[i, 0] + st += xbt_i - xbar_i + + vt_local = np.zeros((np_, np_)) + for j in range(n): + if t2[j] < t2[i]: + if ici[j] <= 1: + continue + wk_j, xbt_j = _covt(j, ldf2 - 1, x, cov2, tf, b) + twt_j = np.exp(wk_j) * wt[icg[j] - 1, i] / wt[icg[j] - 1, j] + else: + wk_j, xbt_j = _covt(j, ldf2 - 1, x, cov2, tf, b) + twt_j = np.exp(wk_j) + + xbt_c = xbt_j - xbar_i + for k1 in range(np_): + for k2 in range(k1, np_): + vt_local[k1, k2] += twt_j * xbt_c[k1] * xbt_c[k2] + + for j1 in range(np_): + for j2 in range(j1, np_): + v[j1, j2] += vt_local[j1, j2] / xb_mat[i, 0] + + iflg = 1 if (i == 0 or t2[i] > t2[i - 1]) else 0 + + if iflg == 1: + qu[:, :] = 0.0 + + local_ldf = ldf2 + local_cft = cft2 + + for j1 in range(lc, n): + if ici[j1] != 1: + continue + if t2[j1] > local_cft: + local_cft = t2[j1] + local_ldf += 1 + + ss4 = np.zeros(ncg) + ss3 = np.zeros((np_, ncg)) + + for j2 in range(n): + if t2[j2] >= t2[i]: + break + if ici[j2] <= 1: + continue + wk_j2, xbt_j2 = _covt(j2, local_ldf - 1, x, cov2, tf, b) + twt_j2 = np.exp(wk_j2) * wt[icg[j2] - 1, j1] / wt[icg[j2] - 1, j2] + g = icg[j2] - 1 + ss4[g] += twt_j2 + ss3[:, g] += xbt_j2 * twt_j2 + + g1 = icg[j1] - 1 + for k in range(np_): + qu[k, g1] += ( + ss3[k, g1] - xb_mat[j1, k + 1] * ss4[g1] / xb_mat[j1, 0] + ) / xb_mat[j1, 0] + + for j in range(i, n): + if t2[j] > t2[i]: + break + if ici[j] == 0: + g = icg[j] - 1 + icrsk_sq = float(icrsk[g]) * float(icrsk[g]) + ss2[:, g] -= qu[:, g] / icrsk_sq + + st_psi = ss2[:, icg[i] - 1].copy() + if ici[i] == 0: + g = icg[i] - 1 + st_psi += qu[:, g] / icrsk[g] + + st += st_psi + for j1 in range(np_): + for j2 in range(j1, np_): + v2[j1, j2] += st[j1] * st[j2] + + if i < n - 1 and t2[i + 1] > t2[i]: + for j in range(lc, i + 1): + icrsk[icg[j] - 1] -= 1 + lc = i + 1 + + for j1 in range(np_ - 1): + for j2 in range(j1 + 1, np_): + v[j2, j1] = v[j1, j2] + v2[j2, j1] = v2[j1, j2] + + return v, v2 + + +def crrsr( + t2: np.ndarray, + ici: np.ndarray, + x: Optional[np.ndarray], + ncov: int, + np_: int, + cov2: Optional[np.ndarray], + ncov2: int, + tf: Optional[np.ndarray], + ndf: int, + wt: np.ndarray, + ncg: int, + icg: np.ndarray, + b: np.ndarray, +) -> np.ndarray: + """ + Compute score residuals at each unique failure time. + + :param t2: Sorted event times of shape (n,). + :param ici: Status codes of shape (n,); 0=censored, 1=cause of interest, 2=competing event. + :param x: Fixed covariate matrix of shape (n, ncov), or None. + :param ncov: Number of fixed covariates. + :param np_: Total number of parameters (ncov + ncov2). + :param cov2: Time-varying covariate matrix of shape (n, ncov2), or None. + :param ncov2: Number of time-varying covariates. + :param tf: Time functions evaluated at unique failure times, shape (ndf, ncov2), or None. + :param ndf: Number of unique failure times. + :param wt: IPCW weight matrix of shape (ncg, n). + :param ncg: Number of censoring groups. + :param icg: 1-based censoring group assignments of shape (n,). + :param b: Current coefficient vector of shape (np_,). + :returns: Score residual matrix of shape (np_, ndf), one column per unique failure time. + :rtype: np.ndarray + """ + n = len(t2) + res = np.zeros((np_, ndf)) + + iuc = n - 1 + ldf = ndf + + while iuc >= 0: + found = False + for i in range(iuc, -1, -1): + if ici[i] == 1: + cft = t2[i] + iuc = i + found = True + break + if not found: + break + + ldf -= 1 + twf = 0.0 + + itmp = 0 + for i in range(iuc, -1, -1): + if t2[i] < cft: + itmp = i + 1 + break + itmp = i + if ici[i] == 1: + wk, xbt = _covt(i, ldf, x, cov2, tf, b) + twf += 1.0 + res[:, ldf] += xbt + iuc = itmp + + xb1 = 0.0 + xb = np.zeros(np_) + for i in range(n): + if t2[i] < cft: + if ici[i] <= 1: + continue + wk, xbt = _covt(i, ldf, x, cov2, tf, b) + twt = np.exp(wk) * wt[icg[i] - 1, iuc] / wt[icg[i] - 1, i] + else: + wk, xbt = _covt(i, ldf, x, cov2, tf, b) + twt = np.exp(wk) + xb1 += twt + xb += twt * xbt + + res[:, ldf] += (-twf / xb1) * xb + iuc -= 1 + + return res + + +def crrfit( + t2: np.ndarray, + ici: np.ndarray, + x: Optional[np.ndarray], + ncov: int, + np_: int, + cov2: Optional[np.ndarray], + ncov2: int, + tf: Optional[np.ndarray], + ndf: int, + wt: np.ndarray, + ncg: int, + icg: np.ndarray, + b: np.ndarray, +) -> np.ndarray: + """ + Estimate baseline subdistribution hazard increments at each unique failure time. + + :param t2: Sorted event times of shape (n,). + :param ici: Status codes of shape (n,); 0=censored, 1=cause of interest, 2=competing event. + :param x: Fixed covariate matrix of shape (n, ncov), or None. + :param ncov: Number of fixed covariates. + :param np_: Total number of parameters (ncov + ncov2). + :param cov2: Time-varying covariate matrix of shape (n, ncov2), or None. + :param ncov2: Number of time-varying covariates. + :param tf: Time functions evaluated at unique failure times, shape (ndf, ncov2), or None. + :param ndf: Number of unique failure times. + :param wt: IPCW weight matrix of shape (ncg, n). + :param ncg: Number of censoring groups. + :param icg: 1-based censoring group assignments of shape (n,). + :param b: Current coefficient vector of shape (np_,). + :returns: Baseline subdistribution hazard increments dLambda_0 of shape (ndf,), + one value per unique failure time. + :rtype: np.ndarray + """ + n = len(t2) + res = np.zeros(ndf) + + iuc = 0 + ldf = 0 + + while iuc < n: + found = False + for i in range(iuc, n): + if ici[i] == 1: + cft = t2[i] + iuc = i + found = True + break + if not found: + break + + ldf += 1 + twf = 0.0 + + itmp = iuc + for i in range(iuc, n): + if t2[i] > cft: + break + itmp = i + if ici[i] == 1: + twf += 1.0 + iuc = itmp + + xb1 = 0.0 + for i in range(n): + if t2[i] < cft: + if ici[i] <= 1: + continue + wk, _ = _covt(i, ldf - 1, x, cov2, tf, b) + twt = np.exp(wk) * wt[icg[i] - 1, iuc] / wt[icg[i] - 1, i] + else: + wk, _ = _covt(i, ldf - 1, x, cov2, tf, b) + twt = np.exp(wk) + xb1 += twt + + res[ldf - 1] = twf / xb1 + iuc = itmp + 1 + + return res From 6154b9e468b36ca00d1f5616d59fb011f27e8548 Mon Sep 17 00:00:00 2001 From: Ryan O'Dea <70209371+ryan-odea@users.noreply.github.com> Date: Thu, 19 Feb 2026 13:50:47 +0100 Subject: [PATCH 03/16] cuminc translation --- CompRiskReg/_cuminc_kernels.py | 334 +++++++++++++++++++++++++++++++++ 1 file changed, 334 insertions(+) diff --git a/CompRiskReg/_cuminc_kernels.py b/CompRiskReg/_cuminc_kernels.py index e69de29..1592ae7 100644 --- a/CompRiskReg/_cuminc_kernels.py +++ b/CompRiskReg/_cuminc_kernels.py @@ -0,0 +1,334 @@ +from typing import Tuple + +import numpy as np + + +def tpoi(x: np.ndarray, tp: np.ndarray) -> np.ndarray: + """ + Map query times to step-function knot indices. + + :param x: Sorted knot times of shape (n,). + :param tp: Sorted query times of shape (ntp,). + :returns: 1-based index of the last knot time <= tp[i] for each query, + 0 if tp[i] is before x[0]. Shape (ntp,). + :rtype: np.ndarray + """ + ind = np.searchsorted(x, tp, side="right").astype(np.int32) + return ind + + +def cinc( + y: np.ndarray, + ic: np.ndarray, + icc: np.ndarray, + n: int, +) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Aalen-Johansen cumulative incidence function (CIF) estimator for a single group and cause. + + :param y: Sorted failure times of shape (n,). + :param ic: Overall event indicator of shape (n,); 1=any event, 0=censored. + :param icc: Cause-of-interest indicator of shape (n,); 1=cause of interest, 0=other/censored. + :param n: Number of observations. + :returns: Tuple of: + - **x_out** (*np.ndarray*) -- Step-function time knots. + - **f_out** (*np.ndarray*) -- CIF estimates at each knot. + - **v_out** (*np.ndarray*) -- Variance estimates at each knot. + """ + max_out = 2 * n + 2 + x_out = np.zeros(max_out) + f_out = np.zeros(max_out) + v_out = np.zeros(max_out) + + fk = 1.0 + v1 = v2_ = v3 = 0.0 + lcnt = 0 + rs = float(n) + + x_out[0] = 0.0 + f_out[0] = 0.0 + v_out[0] = 0.0 + + ll = 0 + while ll < n: + l = ll + while l + 1 < n and y[l + 1] == y[ll]: + l += 1 + + nd1 = 0 + nd2 = 0 + for i in range(ll, l + 1): + nd1 += icc[i] + nd2 += ic[i] - icc[i] + nd = nd1 + nd2 + + if nd > 0: + fkn = fk * (rs - nd) / rs + + if nd1 > 0: + lcnt += 2 + f_out[lcnt - 1] = f_out[lcnt - 2] + f_out[lcnt] = f_out[lcnt - 1] + fk * nd1 / rs + + if nd2 > 0 and fkn > 0.0: + t5 = 1.0 + if nd2 > 1: + t5 = 1.0 - (nd2 - 1.0) / (rs - 1.0) + t6 = fk * fk * t5 * nd2 / (rs * rs) + t3 = 1.0 / fkn + t4 = f_out[lcnt] / fkn + v1 += t4 * t4 * t6 + v2_ += t3 * t4 * t6 + v3 += t3 * t3 * t6 + + if nd1 > 0: + t5 = 1.0 + if nd1 > 1: + t5 = 1.0 - (nd1 - 1.0) / (rs - 1.0) + t6 = fk * fk * t5 * nd1 / (rs * rs) + t3 = 0.0 + if fkn > 0.0: + t3 = 1.0 / fkn + t4 = 1.0 + t3 * f_out[lcnt] + v1 += t4 * t4 * t6 + v2_ += t3 * t4 * t6 + v3 += t3 * t3 * t6 + + t2_val = f_out[lcnt] + x_out[lcnt - 1] = y[l] + x_out[lcnt] = y[l] + v_out[lcnt - 1] = v_out[lcnt - 2] + v_out[lcnt] = v1 + t2_val * t2_val * v3 - 2.0 * t2_val * v2_ + + fk = fkn + + rs = float(n - l - 1) + ll = l + 1 + + lcnt += 1 + x_out[lcnt] = y[n - 1] + f_out[lcnt] = f_out[lcnt - 1] + v_out[lcnt] = v_out[lcnt - 1] + + return x_out[: lcnt + 1], f_out[: lcnt + 1], v_out[: lcnt + 1] + + +def _crst( + y: np.ndarray, + m: np.ndarray, + ig: np.ndarray, + n: int, + ng: int, + rho: float, +) -> Tuple[np.ndarray, np.ndarray]: + """ + Compute Gray test score and variance for a single stratum. + + :param y: Sorted event times for this stratum of shape (n,). + :param m: Event type indicators of shape (n,); 0=censored, 1=cause of interest, 2=competing. + :param ig: 1-based group labels of shape (n,). + :param n: Number of observations in this stratum. + :param ng: Number of groups. + :param rho: Weight exponent for the (1 - F)^rho test family. + :returns: Tuple of: + - **s** (*np.ndarray*) -- Score vector of shape (ng-1,). + - **v** (*np.ndarray*) -- Packed lower-triangle variance of shape (ng*(ng-1)//2,). + """ + ng1 = ng - 1 + ng2 = ng * ng1 // 2 + + rs = np.zeros(ng, dtype=np.int64) + for i in range(n): + rs[ig[i] - 1] += 1 + + s = np.zeros(ng1) + v = np.zeros(ng2) + + f1m = np.zeros(ng) + f1 = np.zeros(ng) + skmm = np.ones(ng) + skm = np.ones(ng) + v3 = np.zeros(ng) + v2 = np.zeros((ng1, ng)) + c_mat = np.zeros((ng, ng)) + a_mat = np.zeros((ng, ng)) + + fm = 0.0 + f_val = 0.0 + + ll = 0 + while ll < n: + lu = ll + while lu + 1 < n and y[lu + 1] <= y[ll]: + lu += 1 + + d = np.zeros((3, ng), dtype=np.int64) + for i in range(ll, lu + 1): + j = ig[i] - 1 + k = m[i] + d[k, j] += 1 + + nd1 = int(np.sum(d[1, :])) + nd2 = int(np.sum(d[2, :])) + + if nd1 == 0 and nd2 == 0: + for i in range(ll, lu + 1): + rs[ig[i] - 1] -= 1 + fm = f_val + f1m[:] = f1 + skmm[:] = skm + ll = lu + 1 + continue + + tr = 0.0 + tq = 0.0 + for i in range(ng): + if rs[i] <= 0: + continue + td = float(d[1, i] + d[2, i]) + skm[i] = skmm[i] * (rs[i] - td) / rs[i] + f1[i] = f1m[i] + skmm[i] * d[1, i] / rs[i] + tr += float(rs[i]) / skmm[i] + tq += rs[i] * (1.0 - f1m[i]) / skmm[i] + + f_val = fm + nd1 / tr + fb = (1.0 - fm) ** rho + + a_mat[:, :] = 0.0 + for i in range(ng): + if rs[i] <= 0: + continue + t1 = float(rs[i]) / skmm[i] + a_mat[i, i] = fb * t1 * (1.0 - t1 / tr) + if a_mat[i, i] != 0.0: + c_mat[i, i] += a_mat[i, i] * nd1 / (tr * (1.0 - fm)) + for j in range(i + 1, ng): + if rs[j] <= 0: + continue + a_mat[i, j] = -fb * t1 * rs[j] / (skmm[j] * tr) + if a_mat[i, j] != 0.0: + c_mat[i, j] += a_mat[i, j] * nd1 / (tr * (1.0 - fm)) + + for i in range(1, ng): + for j in range(i): + a_mat[i, j] = a_mat[j, i] + c_mat[i, j] = c_mat[j, i] + + for i in range(ng1): + if rs[i] <= 0: + continue + s[i] += fb * (d[1, i] - nd1 * rs[i] * (1.0 - f1m[i]) / (skmm[i] * tq)) + + if nd1 > 0: + for k in range(ng): + if rs[k] <= 0: + continue + t4 = 1.0 + if skm[k] > 0.0: + t4 = 1.0 - (1.0 - f_val) / skm[k] + t5 = 1.0 + if nd1 > 1: + t5 = 1.0 - (nd1 - 1.0) / (tr * skmm[k] - 1.0) + t3 = t5 * skmm[k] * nd1 / (tr * rs[k]) + v3[k] += t4 * t4 * t3 + for i in range(ng1): + t1 = a_mat[i, k] - t4 * c_mat[i, k] + v2[i, k] += t1 * t4 * t3 + for j in range(i + 1): + l = i * (i + 1) // 2 + j + t2_val = a_mat[j, k] - t4 * c_mat[j, k] + v[l] += t1 * t2_val * t3 + + if nd2 > 0: + for k in range(ng): + if skm[k] <= 0.0 or d[2, k] <= 0: + continue + t4 = (1.0 - f_val) / skm[k] + t5 = 1.0 + if d[2, k] > 1: + t5 = 1.0 - (d[2, k] - 1.0) / (rs[k] - 1.0) + t6 = float(rs[k]) + t3 = t5 * (skmm[k] ** 2) * d[2, k] / (t6 * t6) + v3[k] += t4 * t4 * t3 + for i in range(ng1): + t1 = t4 * c_mat[i, k] + v2[i, k] -= t1 * t4 * t3 + for j in range(i + 1): + l = i * (i + 1) // 2 + j + t2_val = t4 * c_mat[j, k] + v[l] += t1 * t2_val * t3 + + if lu >= n - 1: + break + + for i in range(ll, lu + 1): + rs[ig[i] - 1] -= 1 + fm = f_val + f1m[:] = f1 + skmm[:] = skm + ll = lu + 1 + + l = 0 + for i in range(ng1): + for j in range(i + 1): + for k in range(ng): + v[l] += c_mat[i, k] * c_mat[j, k] * v3[k] + v[l] += c_mat[i, k] * v2[j, k] + v[l] += c_mat[j, k] * v2[i, k] + l += 1 + + return s, v + + +def crstm( + y: np.ndarray, + m: np.ndarray, + ig: np.ndarray, + ist: np.ndarray, + no: int, + rho: float, + nst: int, + ng: int, +) -> Tuple[np.ndarray, np.ndarray]: + """ + Stratified K-sample Gray test for equality of cause-specific CIFs across groups. + + :param y: Sorted failure times of shape (no,). + :param m: Event type indicators of shape (no,); 0=censored, 1=cause of interest, 2=competing. + :param ig: 1-based group labels of shape (no,). + :param ist: 1-based stratum labels of shape (no,). + :param no: Total number of observations. + :param rho: Weight exponent for the (1 - F)^rho test family. + :param nst: Number of strata. + :param ng: Number of groups. + :returns: Tuple of: + - **s** (*np.ndarray*) -- Stratified score vector of shape (ng-1,). + - **vs** (*np.ndarray*) -- Full symmetric variance matrix of shape (ng-1, ng-1). + """ + ng1 = ng - 1 + ng2 = ng * ng1 // 2 + + s = np.zeros(ng1) + v = np.zeros(ng2) + + for ks in range(1, nst + 1): + mask = ist == ks + ys = y[mask] + ms = m[mask] + igs = ig[mask] + ns = len(ys) + + st, vt = _crst(ys, ms, igs, ns, ng, rho) + + s += st + v += vt + + vs = np.zeros((ng1, ng1)) + l = 0 + for i in range(ng1): + for j in range(i + 1): + vs[i, j] = v[l] + vs[j, i] = v[l] + l += 1 + + return s, vs From fc170b9cf560dff34c048f0c4a33d59a0d50f937 Mon Sep 17 00:00:00 2001 From: Ryan O'Dea <70209371+ryan-odea@users.noreply.github.com> Date: Thu, 19 Feb 2026 13:56:14 +0100 Subject: [PATCH 04/16] manual km --- CompRiskReg/_km.py | 70 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/CompRiskReg/_km.py b/CompRiskReg/_km.py index e69de29..aff34c1 100644 --- a/CompRiskReg/_km.py +++ b/CompRiskReg/_km.py @@ -0,0 +1,70 @@ +import numpy as np + + +def km_censoring_weights( + ftime: np.ndarray, + cenind: np.ndarray, + cengroup: np.ndarray, + n_groups: int, +) -> np.ndarray: + """ + Compute Kaplan-Meier censoring weight matrix. + + For each censoring group, fits a KM estimator to the censoring distribution + and evaluates it left-continuously at each observed time in ftime. + + :param ftime: Failure/censoring times of shape (n,), sorted ascending. + :param cenind: Censoring indicator of shape (n,); 1=censored, 0=failed (any cause). + :param cengroup: 1-based censoring group membership of shape (n,), + with values in [1, n_groups]. + :param n_groups: Number of distinct censoring groups. + :returns: KM censoring survival matrix of shape (n_groups, n), where + entry [k-1, i] = P(C >= ftime[i]) for group k, evaluated left-continuously. + :rtype: np.ndarray + """ + n = len(ftime) + uuu = np.zeros((n_groups, n), dtype=np.float64) + + for k in range(1, n_groups + 1): + mask = cengroup == k + ft_k = ftime[mask] + ci_k = cenind[mask] + + if len(ft_k) == 0: + uuu[k - 1, :] = 1.0 + continue + + unique_times = np.unique(ft_k) + surv_times = [] + surv_vals = [] + n_risk = len(ft_k) + surv = 1.0 + + for t in unique_times: + at_t = ft_k == t + d = np.sum(ci_k[at_t]) + if n_risk > 0 and d > 0: + surv *= 1.0 - d / n_risk + surv_times.append(t) + surv_vals.append(surv) + n_risk -= np.sum(at_t) + + surv_times = np.array(surv_times) + surv_vals = np.array(surv_vals) + + eps = 100.0 * np.finfo(float).eps + query = ftime * (1.0 - eps) + + t_aug = np.concatenate( + [ + [min(0.0, surv_times[0]) - 10.0 * np.finfo(float).eps], + surv_times, + [surv_times[-1] * (1.0 + 10.0 * np.finfo(float).eps)], + ] + ) + s_aug = np.concatenate([[1.0], surv_vals, [0.0]]) + idx = np.searchsorted(t_aug, query, side="right") - 1 + idx = np.clip(idx, 0, len(s_aug) - 1) + uuu[k - 1, :] = s_aug[idx] + + return uuu From bcf2f782dec52d740f42483a3a2e547a3523720d Mon Sep 17 00:00:00 2001 From: Ryan O'Dea <70209371+ryan-odea@users.noreply.github.com> Date: Thu, 19 Feb 2026 14:28:37 +0100 Subject: [PATCH 05/16] translate crr --- CompRiskReg/crr.py | 453 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 453 insertions(+) diff --git a/CompRiskReg/crr.py b/CompRiskReg/crr.py index e69de29..d87b7b6 100644 --- a/CompRiskReg/crr.py +++ b/CompRiskReg/crr.py @@ -0,0 +1,453 @@ +from dataclasses import dataclass, field +from typing import Any, Dict, Optional + +import numpy as np +from scipy.stats import norm + +from ._crr_kernels import crrf, crrfit, crrfsv, crrsr, crrvv +from ._km import km_censoring_weights + + +@dataclass +class CrrResult: + """ + Container for crr() output. + + :param coef: Estimated coefficient vector of shape (np_,). + :param loglik: Maximized log pseudo-partial likelihood. + :param score: Score vector at convergence of shape (np_,). + :param inf: Observed information matrix of shape (np_, np_). + :param var: Robust sandwich variance matrix of shape (np_, np_). + :param res: Score residual matrix of shape (ndf, np_), or None if variance=False. + :param uftime: Unique failure times of shape (ndf,). + :param bfitj: Baseline subdistribution hazard increments of shape (ndf,). + :param tfs: Time function matrix evaluated at unique failure times, + shape (ndf, ncov2), or None if no time-varying covariates. + :param converged: Whether the Newton-Raphson algorithm converged. + :param n: Number of observations used after removing missing values. + :param n_missing: Number of observations removed due to missing values. + :param loglik_null: Log pseudo-partial likelihood at the null (zero) coefficients. + :param invinf: Inverse of the observed information matrix of shape (np_, np_). + :param names: Covariate names, or None if not provided. + """ + + coef: np.ndarray + loglik: float + score: np.ndarray + inf: np.ndarray + var: np.ndarray + res: Optional[np.ndarray] + uftime: np.ndarray + bfitj: np.ndarray + tfs: Optional[np.ndarray] + converged: bool + n: int + n_missing: int + loglik_null: float + invinf: np.ndarray + names: Optional[list] = field(default=None) + + def predict( + self, + cov1: Optional[np.ndarray] = None, + cov2: Optional[np.ndarray] = None, + ) -> np.ndarray: + """ + Predict subdistribution CIF for given covariate values. + + :param cov1: Fixed covariate values of shape (ncov1,) or (nsubj, ncov1). + :param cov2: Time-varying covariate values of shape (ncov2,) or (nsubj, ncov2). + :returns: Predicted CIF matrix of shape (ndf, 1 + nsubj); first column + is uftime, remaining columns are predicted CIF per subject. + :rtype: np.ndarray + """ + np_ = len(self.coef) + tfs = self.tfs + + if tfs is None or (isinstance(tfs, np.ndarray) and tfs.size <= 1): + cov1 = np.atleast_2d(np.asarray(cov1, dtype=np.float64)) + nsubj = cov1.shape[0] + lhat = np.zeros((len(self.uftime), nsubj)) + for j in range(nsubj): + lp = np.sum(cov1[j] * self.coef) + lhat[:, j] = np.cumsum(np.exp(lp) * self.bfitj) + else: + ncov_tfs = tfs.shape[1] if tfs.ndim > 1 else 1 + if np_ == ncov_tfs: + cov2 = np.atleast_2d(np.asarray(cov2, dtype=np.float64)) + nsubj = cov2.shape[0] + lhat = np.zeros((len(self.uftime), nsubj)) + for j in range(nsubj): + lp = tfs @ (cov2[j] * self.coef) + lhat[:, j] = np.cumsum(np.exp(lp) * self.bfitj) + else: + cov1 = np.atleast_2d(np.asarray(cov1, dtype=np.float64)) + cov2 = np.atleast_2d(np.asarray(cov2, dtype=np.float64)) + nsubj = cov1.shape[0] + nc1 = cov1.shape[1] + lhat = np.zeros((len(self.uftime), nsubj)) + for j in range(nsubj): + lp_fixed = np.sum(cov1[j] * self.coef[:nc1]) + lp_tv = tfs @ (cov2[j] * self.coef[nc1:]) + lhat[:, j] = np.cumsum(np.exp(lp_fixed + lp_tv) * self.bfitj) + + return np.column_stack([self.uftime, 1.0 - np.exp(-lhat)]) + + def summary(self, conf_int: float = 0.95) -> Dict[str, Any]: + """ + Produce summary statistics table. + + :param conf_int: Confidence level for the interval, default 0.95. + :returns: Dictionary with keys: + + - **coef** (*np.ndarray*) -- Table of shape (np_, 5): coefficient, + exp(coef), standard error, z-score, p-value. + - **conf_int** (*np.ndarray*) -- Table of shape (np_, 4): exp(coef), + exp(-coef), lower CI, upper CI. + - **n** (*int*) -- Number of observations. + - **n_missing** (*int*) -- Number of missing observations dropped. + - **loglik** (*float*) -- Maximized log pseudo-partial likelihood. + - **logtest** (*float*) -- -2 * (loglik_null - loglik) likelihood ratio statistic. + - **df** (*int*) -- Degrees of freedom (number of coefficients). + - **converged** (*bool*) -- Whether Newton-Raphson converged. + - **names** (*list or None*) -- Covariate names. + :rtype: dict + """ + beta = self.coef + se = np.sqrt(np.diag(self.var)) + z = beta / se + pval = 2.0 * (1.0 - norm.cdf(np.abs(z))) + coef_table = np.column_stack([beta, np.exp(beta), se, z, pval]) + + a = (1.0 - conf_int) / 2.0 + z_val = norm.ppf([a, 1.0 - a]) + ci_table = np.column_stack( + [ + np.exp(beta), + np.exp(-beta), + np.exp(beta + z_val[0] * se), + np.exp(beta + z_val[1] * se), + ] + ) + + return { + "coef": coef_table, + "conf_int": ci_table, + "n": self.n, + "n_missing": self.n_missing, + "loglik": self.loglik, + "logtest": -2.0 * (self.loglik_null - self.loglik), + "df": len(beta), + "converged": self.converged, + "names": self.names, + } + + +def crr( + ftime: np.ndarray, + fstatus: np.ndarray, + cov1: Optional[np.ndarray] = None, + cov2: Optional[np.ndarray] = None, + tf=None, + cengroup: Optional[np.ndarray] = None, + failcode: int = 1, + cencode: int = 0, + gtol: float = 1e-6, + maxiter: int = 10, + init: Optional[np.ndarray] = None, + variance: bool = True, +) -> CrrResult: + """ + Fit a competing risks regression model using the Fine-Gray subdistribution hazard. + + :param ftime: Failure/censoring times of shape (n,). + :param fstatus: Status codes of shape (n,). + :param cov1: Fixed covariate matrix of shape (n, ncov1), or None. + :param cov2: Time-varying covariate matrix of shape (n, ncov2), or None. + :param tf: Callable mapping a vector of times to a matrix of shape (ndf, ncov2); + required when cov2 is provided. + :param cengroup: Censoring group indicator of shape (n,), or None for a single group. + :param failcode: Status code identifying the failure of interest, default 1. + :param cencode: Status code identifying censoring, default 0. + :param gtol: Gradient convergence tolerance for Newton-Raphson, default 1e-6. + :param maxiter: Maximum number of Newton-Raphson iterations, default 10. + :param init: Initial coefficient vector of shape (np_,), or None for zero initialization. + :param variance: Whether to compute the robust sandwich variance, default True. + :returns: Fitted model result. + :rtype: CrrResult + :raises ValueError: If inputs are inconsistent or contain invalid values. + """ + ftime = np.asarray(ftime, dtype=np.float64).ravel() + fstatus = np.asarray(fstatus).ravel() + n_orig = len(ftime) + + if n_orig == 0: + raise ValueError("ftime must have at least one observation") + if len(fstatus) != n_orig: + raise ValueError("ftime and fstatus must have the same length") + if np.any(np.isinf(ftime)): + raise ValueError("ftime contains Inf values") + if cov1 is None and cov2 is None: + raise ValueError("at least one of cov1 or cov2 must be provided") + if cov2 is not None and tf is None: + raise ValueError("tf function is required when cov2 is provided") + + if cengroup is None: + cengroup = np.ones(n_orig, dtype=np.float64) + else: + cengroup = np.asarray(cengroup, dtype=np.float64).ravel() + + arrays = [ftime, fstatus.astype(float), cengroup] + if cov1 is not None: + cov1 = np.atleast_2d(np.asarray(cov1, dtype=np.float64)) + if cov1.shape[0] == 1 and cov1.shape[1] == n_orig: + cov1 = cov1.T + nc1 = cov1.shape[1] + arrays.append(cov1) + else: + nc1 = 0 + + if cov2 is not None: + cov2 = np.atleast_2d(np.asarray(cov2, dtype=np.float64)) + if cov2.shape[0] == 1 and cov2.shape[1] == n_orig: + cov2 = cov2.T + nc2 = cov2.shape[1] + arrays.append(cov2) + else: + nc2 = 0 + + if cov1 is not None and np.any(np.isinf(cov1)): + raise ValueError("cov1 contains Inf values") + if cov2 is not None and np.any(np.isinf(cov2)): + raise ValueError("cov2 contains Inf values") + + keep = np.ones(n_orig, dtype=bool) + for arr in arrays: + if arr.ndim == 1: + keep &= ~np.isnan(arr) + else: + keep &= ~np.any(np.isnan(arr), axis=1) + + nmis = n_orig - int(np.sum(keep)) + if nmis > 0: + ftime = ftime[keep] + fstatus = fstatus[keep] + cengroup = cengroup[keep] + if cov1 is not None: + cov1 = cov1[keep] + if cov2 is not None: + cov2 = cov2[keep] + + order = np.argsort(ftime, kind="stable") + ftime = ftime[order] + fstatus = fstatus[order] + cengroup = cengroup[order] + if cov1 is not None: + cov1 = cov1[order] + if cov2 is not None: + cov2 = cov2[order] + + n = len(ftime) + + cenind = np.where(fstatus == cencode, 1, 0) + fstatus_coded = np.where(fstatus == failcode, 1, 2 * (1 - cenind)).astype(np.int32) + + ucg = np.sort(np.unique(cengroup)) + ncg = len(ucg) + cg_map = {v: i + 1 for i, v in enumerate(ucg)} + cengroup_int = np.array([cg_map[v] for v in cengroup], dtype=np.int32) + + uuu = km_censoring_weights(ftime, cenind, cengroup_int, ncg) + + uft = np.sort(np.unique(ftime[fstatus_coded == 1])) + ndf = len(uft) + if ndf == 0: + raise ValueError("no failures of the specified type (failcode=%s)" % failcode) + + np_ = nc1 + nc2 + ncov = nc1 + ncov2 = nc2 + + x = np.asarray(cov1, dtype=np.float64) if nc1 > 0 else None + if nc2 > 0: + cov2_arr = np.asarray(cov2, dtype=np.float64) + tfs_arr = np.atleast_2d(np.asarray(tf(uft), dtype=np.float64)) + else: + cov2_arr = None + tfs_arr = None + + wt = uuu + b = ( + np.asarray(init, dtype=np.float64).copy() + if init is not None + else np.zeros(np_, dtype=np.float64) + ) + + stepf = 0.5 + converge = False + lik_val = 0.0 + + for ll in range(maxiter + 1): + lik_val, s_val, v_val = crrfsv( + ftime, + fstatus_coded, + x, + ncov, + np_, + cov2_arr, + ncov2, + tfs_arr, + ndf, + wt, + ncg, + cengroup_int, + b, + ) + + if ( + np.max(np.abs(s_val) * np.maximum(np.abs(b), 1.0)) + < np.maximum(np.abs(lik_val), 1.0) * gtol + ): + converge = True + break + + if ll == maxiter: + break + + try: + sc = -np.linalg.solve(v_val, s_val) + except np.linalg.LinAlgError: + break + + bn = b + sc + fbn = crrf( + ftime, + fstatus_coded, + x, + ncov, + np_, + cov2_arr, + ncov2, + tfs_arr, + ndf, + wt, + ncg, + cengroup_int, + bn, + ) + + i = 0 + while np.isnan(fbn) or fbn > lik_val + 1e-4 * np.sum(sc * s_val): + i += 1 + sc *= stepf + bn = b + sc + fbn = crrf( + ftime, + fstatus_coded, + x, + ncov, + np_, + cov2_arr, + ncov2, + tfs_arr, + ndf, + wt, + ncg, + cengroup_int, + bn, + ) + if i > 20: + break + + if i > 20: + break + + b = bn.copy() + + if variance: + h0, v2_mat = crrvv( + ftime, + fstatus_coded, + x, + ncov, + np_, + cov2_arr, + ncov2, + tfs_arr, + ndf, + wt, + ncg, + cengroup_int, + b, + ) + h_inv = np.linalg.inv(h0) + var_mat = h_inv @ v2_mat @ h_inv.T + res_mat = crrsr( + ftime, + fstatus_coded, + x, + ncov, + np_, + cov2_arr, + ncov2, + tfs_arr, + ndf, + wt, + ncg, + cengroup_int, + b, + ).T + else: + var_mat = np.full((np_, np_), np.nan) + h0 = np.full((np_, np_), np.nan) + h_inv = np.full((np_, np_), np.nan) + res_mat = None + + b0 = np.zeros(np_, dtype=np.float64) + fb0 = crrf( + ftime, + fstatus_coded, + x, + ncov, + np_, + cov2_arr, + ncov2, + tfs_arr, + ndf, + wt, + ncg, + cengroup_int, + b0, + ) + bj_out = crrfit( + ftime, + fstatus_coded, + x, + ncov, + np_, + cov2_arr, + ncov2, + tfs_arr, + ndf, + wt, + ncg, + cengroup_int, + b, + ) + + return CrrResult( + coef=b, + loglik=-lik_val, + score=-s_val, + inf=h0, + var=var_mat, + res=res_mat, + uftime=uft, + bfitj=bj_out, + tfs=tfs_arr if nc2 > 0 else None, + converged=converge, + n=n, + n_missing=nmis, + loglik_null=-fb0, + invinf=h_inv, + ) From 6a2b06cc197ff0a021c556c9cbf0babf396464a4 Mon Sep 17 00:00:00 2001 From: Ryan O'Dea <70209371+ryan-odea@users.noreply.github.com> Date: Thu, 19 Feb 2026 14:28:49 +0100 Subject: [PATCH 06/16] translate cuminc --- CompRiskReg/cuminc.py | 201 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 201 insertions(+) diff --git a/CompRiskReg/cuminc.py b/CompRiskReg/cuminc.py index e69de29..cb55a8a 100644 --- a/CompRiskReg/cuminc.py +++ b/CompRiskReg/cuminc.py @@ -0,0 +1,201 @@ +from typing import Any, Dict, Optional + +import numpy as np + +from ._cuminc_kernels import cinc, crstm, tpoi + + +def cuminc( + ftime: np.ndarray, + fstatus: np.ndarray, + group: Optional[np.ndarray] = None, + strata: Optional[np.ndarray] = None, + rho: float = 0.0, + cencode: float = 0, +) -> Dict[str, Any]: + """ + Estimate cumulative incidence functions and optionally test equality across groups. + + For each group-cause combination, computes the Aalen-Johansen CIF estimate. + If more than one group is present, also computes the Gray K-sample test statistic + for each cause. + + :param ftime: Failure/censoring times of shape (n,). + :param fstatus: Status codes of shape (n,); cencode indicates censoring, + all other values identify distinct causes. + :param group: Group labels of shape (n,), or None for a single group. + :param strata: Strata labels of shape (n,) for stratified tests, or None + for unstratified. + :param rho: Exponent for the (1 - F)^rho weight family, default 0. + :param cencode: Value of fstatus identifying censored observations, default 0. + :returns: Dictionary with the following entries: + + - Each group-cause combination key (e.g. ``"a 1"``) maps to a dict with + keys ``"time"`` (*np.ndarray*), ``"est"`` (*np.ndarray*), and + ``"var"`` (*np.ndarray*). + - ``"Tests"`` (*np.ndarray*) -- present only when ng > 1; shape (nc, 3) + with columns [test statistic, p-value, degrees of freedom]. + - ``"_test_labels"`` -- cause labels corresponding to rows of ``"Tests"``. + :rtype: dict + :raises ValueError: If inputs are inconsistent or contain invalid values. + """ + ftime = np.asarray(ftime, dtype=np.float64) + fstatus = np.asarray(fstatus) + + n = len(ftime) + if n == 0: + raise ValueError("ftime must have at least one observation") + if len(fstatus) != n: + raise ValueError("ftime and fstatus must have the same length") + if np.any(np.isinf(ftime)): + raise ValueError("ftime contains Inf values") + + if group is None: + group = np.ones(n, dtype=object) + else: + group = np.asarray(group) + + if strata is None: + strata = np.ones(n, dtype=int) + else: + strata = np.asarray(strata) + + keep = np.ones(n, dtype=bool) + for arr in [ftime, fstatus, group, strata]: + try: + keep &= ~np.isnan(arr.astype(float)) + except (ValueError, TypeError): + keep &= np.array( + [x is not None and x != "nan" and str(x) != "nan" for x in arr] + ) + + nmis = n - np.sum(keep) + if nmis > 0: + ftime = ftime[keep] + fstatus = fstatus[keep] + group = group[keep] + strata = strata[keep] + + no = len(ftime) + + order = np.argsort(ftime, kind="stable") + ftime = ftime[order] + fstatus = fstatus[order] + group = group[order] + strata = strata[order] + + ugg = [] + seen = set() + for g in group: + if g not in seen: + ugg.append(g) + seen.add(g) + group_counts = {g: np.sum(group == g) for g in ugg} + ugg = [g for g in ugg if group_counts[g] > 0] + group_map = {g: i + 1 for i, g in enumerate(ugg)} + ig = np.array([group_map[g] for g in group], dtype=np.int32) + + ustrata = sorted(set(strata)) + strata_map = {s: i + 1 for i, s in enumerate(ustrata)} + ist = np.array([strata_map[s] for s in strata], dtype=np.int32) + nst = len(ustrata) + + censind = np.where(fstatus == cencode, 0, 1).astype(np.int32) + + uclab = sorted(set(fstatus[censind == 1])) + nc = len(uclab) + ng = len(ugg) + + result = {} + stat = np.zeros(nc, dtype=np.float64) + + for ii, cause in enumerate(uclab): + causeind = np.where(fstatus == cause, 1, 0).astype(np.int32) + + for jj, grp in enumerate(ugg): + name = f"{grp} {cause}" + + cgmask = group == grp + ncg = int(np.sum(cgmask)) + ft_g = np.ascontiguousarray(ftime[cgmask], dtype=np.float64) + ci_g = np.ascontiguousarray(censind[cgmask], dtype=np.int32) + cc_g = np.ascontiguousarray(causeind[cgmask], dtype=np.int32) + + x_out, f_out, v_out = cinc(ft_g, ci_g, cc_g, ncg) + result[name] = {"time": x_out, "est": f_out, "var": v_out} + + if ng > 1: + m_test = (2 * censind - causeind).astype(np.int32) + + s_out, vs_out = crstm( + np.ascontiguousarray(ftime, dtype=np.float64), + np.ascontiguousarray(m_test, dtype=np.int32), + np.ascontiguousarray(ig, dtype=np.int32), + np.ascontiguousarray(ist, dtype=np.int32), + no, + rho, + nst, + ng, + ) + + stat[ii] = -1.0 + try: + if np.linalg.matrix_rank(vs_out) == ng - 1: + stat[ii] = float(s_out @ np.linalg.inv(vs_out) @ s_out) + except np.linalg.LinAlgError: + pass + + if ng > 1: + from scipy.stats import chi2 + + pv = np.array([1.0 - chi2.cdf(s, ng - 1) if s >= 0 else np.nan for s in stat]) + result["Tests"] = np.column_stack([stat, pv, np.full(nc, ng - 1)]) + result["_test_labels"] = uclab + + return result + + +def timepoints( + w: Dict[str, Any], + times: np.ndarray, +) -> Dict[str, Any]: + """ + Evaluate cumulative incidence estimates at specific time points. + + :param w: Output dictionary from :func:`cuminc`. + :param times: Time points at which to evaluate the CIF, of shape (nt,). + :returns: Dictionary with keys: + + - **est** (*np.ndarray*) -- CIF estimates of shape (ng, nt). + - **var** (*np.ndarray*) -- Variance estimates of shape (ng, nt). + - **row_names** (*list*) -- Group-cause combination labels. + - **col_names** (*np.ndarray*) -- Evaluated time points of shape (nt,). + :rtype: dict + """ + times = np.sort(np.unique(np.asarray(times, dtype=np.float64))) + nt = len(times) + + curve_names = [k for k in w if k != "Tests" and not k.startswith("_")] + ng = len(curve_names) + + est = np.full((ng, nt), np.nan, dtype=np.float64) + var = np.full((ng, nt), np.nan, dtype=np.float64) + + for i, name in enumerate(curve_names): + curve = w[name] + if curve.get("est") is None: + continue + + x = curve["time"] + ind = tpoi( + np.ascontiguousarray(x, dtype=np.float64), + np.ascontiguousarray(times, dtype=np.float64), + ) + + for j in range(nt): + if ind[j] > 0: + est[i, j] = curve["est"][ind[j] - 1] + if "var" in curve and len(curve["var"]) > 0: + var[i, j] = curve["var"][ind[j] - 1] + + return {"est": est, "var": var, "row_names": curve_names, "col_names": times} From 915bd62d871dc00b6fd0c2dfb6b0e17a1bbc4f02 Mon Sep 17 00:00:00 2001 From: Ryan O'Dea <70209371+ryan-odea@users.noreply.github.com> Date: Thu, 19 Feb 2026 14:28:57 +0100 Subject: [PATCH 07/16] expose package --- CompRiskReg/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CompRiskReg/__init__.py b/CompRiskReg/__init__.py index e69de29..7956af6 100644 --- a/CompRiskReg/__init__.py +++ b/CompRiskReg/__init__.py @@ -0,0 +1,5 @@ +from .crr import crr +from .cuminc import cuminc, timepoints + +__all__ = ["crr", "timepoints", "cuminc"] +__version__ = "0.0.1" From 72fb3cbe81fbacd88aa71a03c407cac116c3daee Mon Sep 17 00:00:00 2001 From: Ryan O'Dea <70209371+ryan-odea@users.noreply.github.com> Date: Thu, 19 Feb 2026 14:29:16 +0100 Subject: [PATCH 08/16] make test workflow --- .github/workflows/test.yml | 42 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..ea5672a --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,42 @@ +name: Test CompRiskReg with R +on: + push: + branches: + - main + pull_request: + branches: + - main + +jobs: + test-python-only: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12"] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - run: pip install -e ".[dev]" + - run: pytest -m "not requires_r" + + test-with-r: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.9", "3.12"] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - uses: r-lib/actions/setup-r@v2 + with: + r-version: "release" + - name: Install R cmprsk + run: Rscript -e "install.packages('cmprsk', repos='https://cloud.r-project.org')" + - run: pip install -e ".[dev]" + - run: pytest -m "requires_r" \ No newline at end of file From a2b368a30bec77f9355727ea61764c3aebc79754 Mon Sep 17 00:00:00 2001 From: Ryan O'Dea <70209371+ryan-odea@users.noreply.github.com> Date: Thu, 19 Feb 2026 14:29:59 +0100 Subject: [PATCH 09/16] Update .gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index b7faf40..5864926 100644 --- a/.gitignore +++ b/.gitignore @@ -205,3 +205,5 @@ cython_debug/ marimo/_static/ marimo/_lsp/ __marimo__/ +uv.lock +.DS_store From 93917c5a17e5deed7bd29fb7a504b12302032234 Mon Sep 17 00:00:00 2001 From: Ryan O'Dea <70209371+ryan-odea@users.noreply.github.com> Date: Thu, 19 Feb 2026 15:56:05 +0100 Subject: [PATCH 10/16] protection against div 0 --- CompRiskReg/_crr_kernels.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/CompRiskReg/_crr_kernels.py b/CompRiskReg/_crr_kernels.py index 77877f5..56356b5 100644 --- a/CompRiskReg/_crr_kernels.py +++ b/CompRiskReg/_crr_kernels.py @@ -138,12 +138,13 @@ def crrfsv( xb1 += twt xb += twt * xbt - xbt_c = xbt - xb / xb1 - if xb1o > 0.0: - ratio = xb1 * twt / xb1o - for kk in range(np_): - for jj in range(kk, np_): - vt[kk, jj] += ratio * xbt_c[kk] * xbt_c[jj] + if xb1 > 0.0: + xbt_c = xbt - xb / xb1 + if xb1o > 0.0: + ratio = xb1 * twt / xb1o + for kk in range(np_): + for jj in range(kk, np_): + vt[kk, jj] += ratio * xbt_c[kk] * xbt_c[jj] xb1o = xb1 lik += twf * np.log(xb1) From 3ce36bfa6bac18b851fad5076f4cc39d16f5c419 Mon Sep 17 00:00:00 2001 From: Ryan O'Dea <70209371+ryan-odea@users.noreply.github.com> Date: Thu, 19 Feb 2026 15:56:35 +0100 Subject: [PATCH 11/16] Update pyproject.toml --- pyproject.toml | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 09460e2..8cee593 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,8 +7,22 @@ name = "CompRiskReg" version = "0.0.1" description = "Competing Risks Regression and Cumulative Incidence Estimation" license = { file = "LICENSE" } +authors = [ + { name = "Ryan O'Dea", email = "ryan.odea@psi.com" }, +] +maintainers = [ + { name = "Ryan O'Dea", email = "ryan.odea@psi.com" }, +] requires-python = ">=3.8" -dependencies = ["numpy>=1.20"] +dependencies = ["numpy>=1.20", "scipy>=1.8.0"] + +keywords = [ + "biostatistics", + "survival", + "competing-events", + "cmprsk", + "cumulative-incidence" +] classifiers = [ "Development Status :: 1 - Planning", @@ -27,8 +41,14 @@ classifiers = [ "Typing :: Typed", ] - +[tool.setuptools.packages.find] +where = ["."] +include = ["CompRiskReg*"] [project.optional-dependencies] test = ["pytest"] +[tool.pytest.ini_options] +markers = [ + "requires_r: marks tests that require R with the cmprsk package installed", +] From 94a0ab94f67a05fa25678e1e47a6acb6795827b4 Mon Sep 17 00:00:00 2001 From: Ryan O'Dea <70209371+ryan-odea@users.noreply.github.com> Date: Thu, 19 Feb 2026 15:56:47 +0100 Subject: [PATCH 12/16] translate tests --- tests/conftest.py | 104 ++++++++++++++++++ tests/test_crr.py | 247 +++++++++++++++++++++++++++++++++++++++++++ tests/test_cuminc.py | 155 +++++++++++++++++++++++++++ 3 files changed, 506 insertions(+) create mode 100644 tests/conftest.py create mode 100644 tests/test_crr.py create mode 100644 tests/test_cuminc.py diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..fbfac0f --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,104 @@ +import json +import subprocess +import pytest + +_R_DATA_SCRIPT = r""" +library(cmprsk) +set.seed(10) +ss <- rexp(100) +cc <- sample(0:2, 100, replace=TRUE) +cv <- matrix(runif(300), nrow=100) +cv[c(3,51,77), 1] <- NA +gg <- factor(sample(0:2, 100, replace=TRUE), 0:2, c('a','b','c')) +strt <- sample(1:2, 100, replace=TRUE) +dd <- data.frame(ss=runif(100, 0, 5)) +gg2 <- as.character(gg) +gg2[sample(100, 5, replace=FALSE)] <- NA + +out <- list( + ss = as.list(ss), + cc = as.list(cc), + cv1 = as.list(cv[,1]), + cv2 = as.list(cv[,2]), + cv3 = as.list(cv[,3]), + gg = as.list(as.character(gg)), + strt = as.list(strt), + dd_ss = as.list(dd$ss), + gg2 = as.list(gg2) +) +cat(jsonlite::toJSON(out, null="null", na="null", auto_unbox=TRUE)) +""" + + +def _r_available() -> bool: + """ + Check whether Rscript is available and the cmprsk package is installed. + + :returns: True if both conditions are met, False otherwise. + :rtype: bool + """ + try: + result = subprocess.run( + ["Rscript", "-e", "library(cmprsk)"], + capture_output=True, + timeout=15, + ) + return result.returncode == 0 + except (FileNotFoundError, subprocess.TimeoutExpired): + return False + + +def pytest_configure(config): + config.addinivalue_line( + "markers", + "requires_r: marks tests that require R with the cmprsk package installed", + ) + + +def pytest_collection_modifyitems(config, items): + """Skip requires_r tests automatically when R is not available on PATH.""" + if _r_available(): + return + skip = pytest.mark.skip(reason="R with cmprsk not available") + for item in items: + if item.get_closest_marker("requires_r"): + item.add_marker(skip) + + +@pytest.fixture(scope="session") +def r_data(): + """ + Generate test data by running the cmprsk example script in R. + + Returns a dictionary with keys: ss, cc, cv1, cv2, cv3, gg, strt, dd_ss, gg2. + NaN entries (from NA in R) are preserved as float('nan') for numeric arrays + and None for the gg2 string array. + """ + result = subprocess.run( + ["Rscript", "--vanilla", "-e", _R_DATA_SCRIPT], + capture_output=True, + text=True, + timeout=30, + ) + if result.returncode != 0: + pytest.skip(f"R data generation failed: {result.stderr}") + + raw = json.loads(result.stdout) + + def _to_numeric(lst): + out = [] + for v in lst: + out.append(float("nan") if v is None else float(v)) + return out + + return { + "ss": _to_numeric(raw["ss"]), + "cc": [int(v) for v in raw["cc"]], + "cv1": _to_numeric(raw["cv1"]), + "cv2": _to_numeric(raw["cv2"]), + "cv3": _to_numeric(raw["cv3"]), + "gg": list(raw["gg"]), + "strt": [int(v) for v in raw["strt"]], + "dd_ss": _to_numeric(raw["dd_ss"]), + "gg2": [None if v is None else str(v) for v in raw["gg2"]], + } diff --git a/tests/test_crr.py b/tests/test_crr.py new file mode 100644 index 0000000..6e09474 --- /dev/null +++ b/tests/test_crr.py @@ -0,0 +1,247 @@ +import numpy as np +import pytest + +from CompRiskReg import crr +from CompRiskReg._km import km_censoring_weights + + +def _cv_matrix(r_data: dict) -> np.ndarray: + """ + Reconstruct the covariate matrix from R test data. + + :param r_data: Dictionary loaded from the r_data fixture. + :returns: Covariate matrix of shape (n, 3). + :rtype: np.ndarray + """ + return np.column_stack( + [ + np.array(r_data["cv1"], dtype=np.float64), + np.array(r_data["cv2"], dtype=np.float64), + np.array(r_data["cv3"], dtype=np.float64), + ] + ) + + +@pytest.mark.requires_r +class TestCrrBasic: + """crr: basic 3-covariate model.""" + + def test_convergence_and_coefs(self, r_data): + ss = np.array(r_data["ss"]) + cc = np.array(r_data["cc"]) + cv = _cv_matrix(r_data) + + xx = crr(ss, cc, cv) + assert xx.converged + assert xx.n_missing == 3 + np.testing.assert_allclose(xx.coef, [-0.02689, 0.98601, 0.00354], atol=1e-4) + np.testing.assert_allclose( + np.sqrt(np.diag(xx.var)), [0.5788, 0.5409, 0.5005], atol=1e-3 + ) + + +@pytest.mark.requires_r +class TestCrrTimeVarying: + """crr: with time-varying covariates and cengroup.""" + + def test_coefs(self, r_data): + ss = np.array(r_data["ss"]) + cc = np.array(r_data["cc"]) + cv = _cv_matrix(r_data) + cov2_mat = np.column_stack([cv[:, 0], cv[:, 0]]) + tf_func = lambda uft: np.column_stack([uft, uft**2]) + + ww = crr(ss, cc, cv, cov2_mat, tf=tf_func, cengroup=cv[:, 2]) + assert ww.converged + assert ww.n_missing == 3 + np.testing.assert_allclose( + ww.coef, + [1.68937, 1.56093, -0.39161, -3.15281, 0.70563], + atol=1e-3, + ) + np.testing.assert_allclose( + np.sqrt(np.diag(ww.var)), + [1.2551, 0.5753, 0.5465, 2.4617, 0.9357], + atol=1e-3, + ) + + +@pytest.mark.requires_r +class TestCrrPredict: + """crr: predict method.""" + + def test_predict_values(self, r_data): + ss = np.array(r_data["ss"]) + cc = np.array(r_data["cc"]) + cv = _cv_matrix(r_data) + cov2_mat = np.column_stack([cv[:, 0], cv[:, 0]]) + tf_func = lambda uft: np.column_stack([uft, uft**2]) + + ww = crr(ss, cc, cv, cov2_mat, tf=tf_func, cengroup=cv[:, 2]) + wp = ww.predict( + cov1=np.array([[1, 1, 1], [0, 0, 0]]), + cov2=np.array([[1, 1], [0, 0]]), + ) + + np.testing.assert_allclose(wp[0, 1], 0.034917, atol=1e-4) + np.testing.assert_allclose(wp[0, 2], 0.002521, atol=1e-4) + np.testing.assert_allclose(wp[-1, 1], 0.984495, atol=1e-3) + np.testing.assert_allclose(wp[-1, 2], 0.921125, atol=1e-3) + + +@pytest.mark.requires_r +class TestCrrFailcode: + """crr: different failcode/cencode.""" + + def test_failcode2(self, r_data): + ss = np.array(r_data["ss"]) + cc = np.array(r_data["cc"]) + cv = _cv_matrix(r_data) + cov2_mat = np.column_stack([cv[:, 0], cv[:, 0]]) + tf_func = lambda uft: np.column_stack([uft, uft**2]) + + ww = crr(ss, cc, cv, cov2_mat, tf=tf_func, cengroup=cv[:, 2], failcode=2) + assert ww.converged + np.testing.assert_allclose( + ww.coef, + [0.51727, 0.26965, -0.54690, -1.15366, 0.36355], + atol=1e-3, + ) + + def test_cencode2(self, r_data): + ss = np.array(r_data["ss"]) + cc = np.array(r_data["cc"]) + cv = _cv_matrix(r_data) + cov2_mat = np.column_stack([cv[:, 0], cv[:, 0]]) + tf_func = lambda uft: np.column_stack([uft, uft**2]) + + ww = crr(ss, cc, cv, cov2_mat, tf=tf_func, cengroup=cv[:, 2], cencode=2) + assert ww.converged + np.testing.assert_allclose( + ww.coef, + [1.68937, 1.56093, -0.39161, -3.15281, 0.70563], + atol=1e-3, + ) + + +@pytest.mark.requires_r +class TestCrrSingleCovariate: + """crr: single covariate.""" + + def test_single_cov(self, r_data): + ss = np.array(r_data["ss"]) + cc = np.array(r_data["cc"]) + cv = _cv_matrix(r_data) + + ww = crr(ss, cc, cv[:, 0:1]) + assert ww.converged + np.testing.assert_allclose(ww.coef, [0.14979], atol=1e-3) + np.testing.assert_allclose(np.sqrt(np.diag(ww.var)), [0.5805], atol=1e-3) + + def test_cov2_only(self, r_data): + ss = np.array(r_data["ss"]) + cc = np.array(r_data["cc"]) + cv = _cv_matrix(r_data) + + ww = crr(ss, cc, cov2=cv[:, 0:1], tf=lambda x: x.reshape(-1, 1)) + assert ww.converged + np.testing.assert_allclose(ww.coef, [-0.05430], atol=1e-4) + np.testing.assert_allclose(np.sqrt(np.diag(ww.var)), [0.3800], atol=1e-3) + + +@pytest.mark.requires_r +class TestCrrSummary: + """crr: summary method.""" + + def test_summary(self, r_data): + ss = np.array(r_data["ss"]) + cc = np.array(r_data["cc"]) + cv = _cv_matrix(r_data) + + ww = crr(ss, cc, cov2=cv[:, 0:1], tf=lambda x: x.reshape(-1, 1)) + s = ww.summary() + assert s["n"] == 97 + assert s["n_missing"] == 3 + np.testing.assert_allclose(s["loglik"], -130.63, atol=0.1) + + +class TestCrrInputValidation: + """crr: input validation and edge cases -- no R required.""" + + def test_empty_input(self): + with pytest.raises(ValueError, match="at least one observation"): + crr([], [], cov1=np.zeros((0, 1))) + + def test_no_covariates(self): + with pytest.raises(ValueError, match="at least one of cov1 or cov2"): + crr([1, 2, 3], [1, 0, 2]) + + def test_inf_in_ftime(self): + with pytest.raises(ValueError, match="Inf"): + crr([1, np.inf, 3], [1, 0, 2], cov1=np.ones((3, 1))) + + def test_inf_in_cov1(self): + with pytest.raises(ValueError, match="cov1 contains Inf"): + crr([1, 2, 3], [1, 0, 2], cov1=np.array([[1], [np.inf], [0]])) + + def test_length_mismatch(self): + with pytest.raises(ValueError, match="same length"): + crr([1, 2, 3], [1, 0], cov1=np.ones((3, 1))) + + def test_no_failures(self): + with pytest.raises(ValueError, match="no failures"): + crr([1, 2, 3], [0, 0, 0], cov1=np.ones((3, 1))) + + def test_cov2_without_tf(self): + with pytest.raises(ValueError, match="tf function is required"): + crr([1, 2, 3], [1, 0, 2], cov2=np.ones((3, 1))) + + @pytest.mark.requires_r + def test_variance_false(self, r_data): + """variance=False should still return coefs but var is NaN.""" + ss = np.array(r_data["ss"]) + cc = np.array(r_data["cc"]) + cv = _cv_matrix(r_data) + + ww = crr(ss, cc, cv, variance=False) + assert ww.converged + np.testing.assert_allclose(ww.coef, [-0.02689, 0.98601, 0.00354], atol=1e-4) + assert np.all(np.isnan(ww.var)) + + @pytest.mark.requires_r + def test_km_matches_r_survfit(self, r_data): + """KM censoring weights must match R's survfit-based values.""" + ss = np.array(r_data["ss"]) + cc = np.array(r_data["cc"]) + cv = _cv_matrix(r_data) + + keep = np.ones(len(ss), dtype=bool) + for arr in [ss, np.array(cc, dtype=float), np.ones(len(ss)), cv]: + if arr.ndim == 1: + keep &= ~np.isnan(arr) + else: + keep &= ~np.any(np.isnan(arr), axis=1) + ss2 = ss[keep] + cc2 = np.array(cc)[keep] + order = np.argsort(ss2, kind="stable") + ss2 = ss2[order] + cc2 = cc2[order] + + cenind = np.where(cc2 == 0, 1, 0) + cengroup = np.ones(len(ss2), dtype=np.int32) + uuu = km_censoring_weights(ss2, cenind, cengroup, 1) + + # Values derived from R survfit(Surv(ss2, cenind) ~ 1) on the same data + r_first5 = [ + 1.0, + 1.0, + 0.98958333333333337034, + 0.98958333333333337034, + 0.97905585106382986282, + ] + r_last3 = [0.37599366978102594095, 0.37599366978102594095, 0.18799683489051297047] + + np.testing.assert_allclose(uuu[0, :5], r_first5, atol=1e-12) + np.testing.assert_allclose(uuu[0, -3:], r_last3, atol=1e-12) + assert uuu[0, 0] == 1.0 + assert uuu.shape[1] == len(ss2) diff --git a/tests/test_cuminc.py b/tests/test_cuminc.py new file mode 100644 index 0000000..06c2fbb --- /dev/null +++ b/tests/test_cuminc.py @@ -0,0 +1,155 @@ +import numpy as np +import pytest + +from CompRiskReg import cuminc, timepoints + + +@pytest.mark.requires_r +class TestCumincNoGroup: + """cuminc with no group argument (single group).""" + + def test_basic(self, r_data): + ss = np.array(r_data["ss"]) + cc = np.array(r_data["cc"]) + xx = cuminc(ss, cc) + assert "Tests" not in xx + assert len([k for k in xx if not k.startswith("_")]) == 2 + + def test_timepoints(self, r_data): + ss = np.array(r_data["ss"]) + cc = np.array(r_data["cc"]) + xx = cuminc(ss, cc) + tp = timepoints(xx, [1, 2, 3, 4, 5]) + + np.testing.assert_allclose( + tp["est"][0], + [0.1957379, 0.3985624, 0.4444948, 0.5252162, 0.5252162], + atol=1e-6, + ) + np.testing.assert_allclose( + tp["est"][1], + [0.1866538, 0.3555730, 0.4478767, 0.4747838, 0.4747838], + atol=1e-6, + ) + np.testing.assert_allclose( + tp["var"][0], + [0.001874372, 0.003472507, 0.004005942, 0.007496865, 0.007496865], + atol=1e-6, + ) + np.testing.assert_allclose( + tp["var"][1], + [0.001834993, 0.003242584, 0.004320712, 0.004600893, 0.004600893], + atol=1e-6, + ) + + +@pytest.mark.requires_r +class TestCumincWithGroup: + """cuminc with group and strata.""" + + def test_group_strata(self, r_data): + ss = np.array(r_data["ss"]) + cc = np.array(r_data["cc"]) + gg = np.array(r_data["gg"]) + strt = np.array(r_data["strt"]) + + xx = cuminc(ss, cc, gg, strt) + assert "Tests" in xx + + tests = xx["Tests"] + np.testing.assert_allclose(tests[0, 0], 0.8372, atol=1e-3) + np.testing.assert_allclose(tests[0, 1], 0.6580, atol=1e-3) + np.testing.assert_allclose(tests[1, 0], 0.5817, atol=1e-3) + np.testing.assert_allclose(tests[1, 1], 0.7476, atol=1e-3) + + def test_group_strata_timepoints(self, r_data): + ss = np.array(r_data["ss"]) + cc = np.array(r_data["cc"]) + gg = np.array(r_data["gg"]) + strt = np.array(r_data["strt"]) + + xx = cuminc(ss, cc, gg, strt) + tp = timepoints(xx, [1, 2, 3, 4, 5]) + names = tp["row_names"] + + a1 = names.index("a 1") + b1 = names.index("b 1") + c1 = names.index("c 1") + + np.testing.assert_allclose( + tp["est"][a1], + [0.2247026, 0.4761877, 0.4761877, 0.4761877, 0.4761877], + atol=1e-5, + ) + np.testing.assert_allclose(tp["est"][b1, :2], [0.2143791, 0.4001698], atol=1e-5) + np.testing.assert_allclose(tp["est"][c1, :2], [0.1574019, 0.3229729], atol=1e-5) + + +@pytest.mark.requires_r +class TestCumincDifferentData: + """cuminc with dd$ss (different failure times).""" + + def test_dd_ss(self, r_data): + dd_ss = np.array(r_data["dd_ss"]) + cc = np.array(r_data["cc"]) + gg = np.array(r_data["gg"]) + strt = np.array(r_data["strt"]) + + xx = cuminc(dd_ss, cc, gg, strt) + tests = xx["Tests"] + np.testing.assert_allclose(tests[0, 0], 3.6466, atol=1e-3) + np.testing.assert_allclose(tests[1, 0], 0.8494, atol=1e-3) + + tp = timepoints(xx, [1, 2, 3]) + names = tp["row_names"] + a1 = names.index("a 1") + b1 = names.index("b 1") + c1 = names.index("c 1") + + np.testing.assert_allclose(tp["est"][a1, 0], 0.05804, atol=1e-4) + np.testing.assert_allclose(tp["est"][b1, 0], 0.05573, atol=1e-4) + np.testing.assert_allclose(tp["est"][c1, 0], 0.04545, atol=1e-4) + + +class TestCumincInputValidation: + """cuminc: input validation and edge cases -- no R required.""" + + def test_empty_input(self): + with pytest.raises(ValueError, match="at least one observation"): + cuminc([], []) + + def test_length_mismatch(self): + with pytest.raises(ValueError, match="same length"): + cuminc([1, 2, 3], [1, 0]) + + def test_inf_in_ftime(self): + with pytest.raises(ValueError, match="Inf"): + cuminc([1, np.inf, 3], [1, 0, 2]) + + def test_all_censored(self): + """All censored -- should produce no cause curves.""" + result = cuminc([1, 2, 3], [0, 0, 0]) + curve_names = [k for k in result if k != "Tests" and not k.startswith("_")] + assert len(curve_names) == 0 + + @pytest.mark.requires_r + def test_no_trailing_zeros(self, r_data): + """Verify curve arrays do not have trailing zero padding.""" + ss = np.array(r_data["ss"]) + cc = np.array(r_data["cc"]) + xx = cuminc(ss, cc) + for key in xx: + if key.startswith("_"): + continue + assert xx[key]["time"][-1] > 0 + + @pytest.mark.requires_r + def test_missing_values(self, r_data): + """cuminc handles None values in group variable.""" + ss = np.array(r_data["ss"]) + cc = np.array(r_data["cc"]) + gg2 = np.array(r_data["gg2"], dtype=object) + + result = cuminc(ss, cc, gg2) + curve_names = [k for k in result if k != "Tests" and not k.startswith("_")] + assert len(curve_names) > 0 From 0038c1d5b24cc83f4860f455efd9877fb7c185e6 Mon Sep 17 00:00:00 2001 From: Ryan O'Dea <70209371+ryan-odea@users.noreply.github.com> Date: Thu, 19 Feb 2026 15:56:52 +0100 Subject: [PATCH 13/16] Update README.md --- README.md | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e2ad56c..f233174 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,79 @@ # CompRiskReg -todo + +A Python package for competing risks regression and cumulative incidence estimation. Implements and translates the methods from the R [`cmprsk`](https://cran.r-project.org/package=cmprsk) package. + +## Installation + +```bash +pip install CompRiskReg +``` + +## Overview + +In survival analysis with competing events, standard Kaplan-Meier and Cox regression can be misleading because they treat competing events as independent censoring. This package provides two core methods designed for this setting: + +- **Cumulative Incidence Function (CIF)** — Aalen-Johansen estimator with Gray's K-sample test for comparing groups +- **Fine-Gray regression** — Subdistribution hazard model for covariate-adjusted inference on the CIF + +## API and Use + +### `cuminc` + +Estimates the CIF for each group-cause combination and optionally tests equality across groups using Gray's K-sample test. + +```python +import numpy as np +from CompRiskReg import cuminc, timepoints + +rng = np.random.default_rng(42) +n = 200 +ftime = rng.exponential(scale=1.0, size=n) +fstatus = rng.choice([0, 1, 2], size=n, p=[0.3, 0.4, 0.3]) +group = rng.choice(["A", "B"], size=n) + +result = cuminc(ftime, fstatus, group=group) + +# Access CIF estimates for group A, cause 1 +curve = result["A 1"] +print(curve["time"]) # event times +print(curve["est"]) # CIF estimates +print(curve["var"]) # variance estimates + +# Gray's test comparing groups (present when >1 group) +print(result["Tests"]) # columns: [test stat, p-value, df] +``` + +### `crr` + +Fits the Fine-Gray subdistribution hazard model. + +```python +from CompRiskReg import crr + +n = 200 +ftime = rng.exponential(scale=1.0, size=n) +fstatus = rng.choice([0, 1, 2], size=n, p=[0.3, 0.4, 0.3]) +cov1 = rng.standard_normal(size=(n, 2)) + +fit = crr(ftime, fstatus, cov1=cov1, failcode=1, cencode=0) + +print(fit.coef) # estimated coefficients +print(fit.converged) # whether Newton-Raphson converged + +# Summary table (coef, exp(coef), SE, z, p-value) +summary = fit.summary() +print(summary["coef"]) +print(summary["conf_int"]) + +# Predict CIF for new subjects +new_cov = np.array([[0.5, -1.0], [1.0, 0.0]]) +pred = fit.predict(cov1=new_cov) +``` + +`crr` also supports time-varying covariates via the `cov2` and `tf` arguments, where `tf` is a callable that maps a vector of times to a covariate matrix. + +## Dependencies + +- Python >= 3.8 +- NumPy >= 1.20 +- SciPy >= 1.8.0 From 42e20727be89b0e000729369af0cbbab31e8e8ce Mon Sep 17 00:00:00 2001 From: Ryan O'Dea <70209371+ryan-odea@users.noreply.github.com> Date: Thu, 19 Feb 2026 15:57:57 +0100 Subject: [PATCH 14/16] release versioning --- CompRiskReg/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CompRiskReg/__init__.py b/CompRiskReg/__init__.py index 7956af6..766d7fc 100644 --- a/CompRiskReg/__init__.py +++ b/CompRiskReg/__init__.py @@ -2,4 +2,4 @@ from .cuminc import cuminc, timepoints __all__ = ["crr", "timepoints", "cuminc"] -__version__ = "0.0.1" +__version__ = "1.0.0" diff --git a/pyproject.toml b/pyproject.toml index 8cee593..34d8395 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "CompRiskReg" -version = "0.0.1" +version = "1.0.0" description = "Competing Risks Regression and Cumulative Incidence Estimation" license = { file = "LICENSE" } authors = [ From b5fe1d43cd603f128b3a3f409e8ba2aa13c4e200 Mon Sep 17 00:00:00 2001 From: Ryan O'Dea <70209371+ryan-odea@users.noreply.github.com> Date: Thu, 19 Feb 2026 16:01:09 +0100 Subject: [PATCH 15/16] Update test.yml --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ea5672a..13eb527 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -19,7 +19,7 @@ jobs: - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - - run: pip install -e ".[dev]" + - run: pip install -e ".[test]" - run: pytest -m "not requires_r" test-with-r: From 9fbf6d6f0eb720c599255262012711d28be4d8c8 Mon Sep 17 00:00:00 2001 From: Ryan O'Dea <70209371+ryan-odea@users.noreply.github.com> Date: Thu, 19 Feb 2026 16:03:02 +0100 Subject: [PATCH 16/16] expand grid --- .github/workflows/test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 13eb527..08f721a 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -13,7 +13,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.9", "3.10", "3.11", "3.12"] + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 @@ -27,7 +27,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.9", "3.12"] + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"] steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 @@ -38,5 +38,5 @@ jobs: r-version: "release" - name: Install R cmprsk run: Rscript -e "install.packages('cmprsk', repos='https://cloud.r-project.org')" - - run: pip install -e ".[dev]" + - run: pip install -e ".[test]" - run: pytest -m "requires_r" \ No newline at end of file