RaredonLab · huangyaqing-123 · Jul 11, 2025 · Jul 11, 2025
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -1,2 +1,3 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
+^LICENSE\.md$
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -24,7 +24,8 @@ Imports:
     viridis,
     stats,
     effsize,
-    rlang
+    tidyverse,
+    purrr
 Suggests: 
     testthat (>= 3.0.0)
 Config/testthat/edition: 3

diff --git a/LICENSE b/LICENSE
@@ -1,21 +1,2 @@
-MIT License
-
-Copyright (c) 2025 Raredon Lab
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+YEAR: 2025
+COPYRIGHT HOLDER: Raredon Lab
diff --git a/LICENSE.md b/LICENSE.md
@@ -0,0 +1,21 @@
+# MIT License
+
+Copyright (c) 2025 Raredon Lab
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/NAMESPACE b/NAMESPACE
@@ -12,11 +12,14 @@ import(cowplot)
 import(ggplot2)
 import(matrixStats)
 import(readxl)
-import(stats)
 import(tidyverse)
 import(viridis)
+importFrom(dplyr,"%>%")
 importFrom(dplyr,bind_rows)
-importFrom(dplyr,filter)
-importFrom(dplyr,pull)
 importFrom(effsize,cohen.d)
-importFrom(rlang,sym)
+importFrom(matrixStats,rowMaxs)
+importFrom(matrixStats,rowMins)
+importFrom(purrr,map)
+importFrom(stats,cmdscale)
+importFrom(stats,dist)
+importFrom(stats,na.omit)
diff --git a/R/CalculatePercentage.R b/R/CalculatePercentage.R
@@ -1,8 +1,12 @@
 #' Calculate the percentage of cells in activation status
+#'
+#' This function calculates the percentage of cells in ON (scale > 0) and OFF (scale < 0)
+#' activation states within each group defined by `group_var`. If exactly two groups
+#' are provided, it also computes Cohen's d effect size between their activation values.
 #' @name CalculatePercentage
-#' @importFrom dplyr filter pull bind_rows
-#' @importFrom rlang sym
+#' @importFrom dplyr bind_rows
 #' @importFrom effsize cohen.d
+#' @importFrom stats na.omit
 #' @param to.plot A data frame containing at least a `scale` column and a grouping column.
 #' @param group_var A string specifying the grouping variable (e.g., "genotype", "treatment").
 #' @return A data frame with the percentage of ON/OFF cells and Cohen's d (if applicable).
@@ -11,36 +15,47 @@
 #' CalculatePercentage(fake_to_plot, "genotype")
 #' @export
 CalculatePercentage <- function(to.plot, group_var){
+  # Fail early if the required `scale` column is missing
   stopifnot("scale" %in% names(to.plot))
 
-  group_sym <- sym(group_var)
+  # Drop NA group labels; which() below also keeps NA-labelled rows out of every group
   groups <- unique(na.omit(to.plot[[group_var]]))
   results <- list()
 
   for (g in groups) {
-    subset_data <- dplyr::filter(to.plot, !!group_sym == g)
+    subset_data <- to.plot[which(to.plot[[group_var]] == g), , drop = FALSE]
     total <- nrow(subset_data)
 
+    # Count cells in ON (scale > 0) and OFF (scale < 0) status, ignoring NA scale values
     on <- sum(subset_data[["scale"]] > 0, na.rm = TRUE)
     off <- sum(subset_data[["scale"]] < 0, na.rm = TRUE)
 
+    # Convert the counts to percentages of all cells in this group
     results[[as.character(g)]] <- list(
       percentage_on = round(100 * on / total, 2),
       percentage_off = round(100 * off / total, 2)
     )
   }
 
+  # With exactly two groups, also report Cohen's d (an effect size) between their scale values
   if (length(groups) == 2) {
     g1 <- groups[1]
     g2 <- groups[2]
-    vec1 <- pull(dplyr::filter(to.plot, !!group_sym == g1), scale)
-    vec2 <- pull(dplyr::filter(to.plot, !!group_sym == g2), scale)
+    vec1 <- to.plot[["scale"]][which(to.plot[[group_var]] == g1)]
+    vec2 <- to.plot[["scale"]][which(to.plot[[group_var]] == g2)]
+
+    # Compute Cohen's d between the two numeric vectors and keep only the point estimate
     cohens_d_val <- cohen.d(vec1, vec2)$estimate
+    # |d value|: 0 - 0.2, effect size is negligible
+    # |d value|: 0.2 - 0.5: small effect
+    # |d value|: 0.5 - 0.8: medium effect
+    # |d value|: > 0.8: large effect
 
     results[[as.character(g1)]]$cohens_d <- cohens_d_val
     results[[as.character(g2)]]$cohens_d <- cohens_d_val
   }
 
+  # Stack the per-group lists into one data frame, with the group name in column `group`
   df <- bind_rows(results, .id = "group")
   return(df)
 }
diff --git a/R/ComputeCellData.R b/R/ComputeCellData.R
@@ -1,145 +1,130 @@
-#' A function for scRNA sequencing pathway analysis
-#'
-#' This function computes cell status for a given pathway in single-cell RNA-seq data,
+#' Computes cell status for a given pathway in single-cell RNA-seq data,
 #' based on the distance between genes in a specified pathway. The distance is computed
 #' for each batch of cells, and classical multidimensional scaling (MDS) is used to
 #' visualize the pathway expression across cells.
 #'
 #' @name ComputeCellData
 #' @import Seurat
+#' @importFrom matrixStats rowMins rowMaxs
+#' @importFrom stats dist cmdscale
+#' @importFrom dplyr %>%
+#' @importFrom purrr map
 #' @import tidyverse
 #' @import viridis
-#' @import matrixStats
-#' @import stats
+#'
 #' @param x A `Seurat` object containing single-cell RNA sequencing data.
-#' @param pathway A `character` string specifying the pathway name.
-#' @param distance.method A `character` string specifying the distance method.
-#' Options include: "manhattan", "euclidean", "canberra", "binary", "minkowski".
-#' @return A data frame representing the multidimensional scaling (MDS) results
-#' for the cells based on the pathway expression.
+#' @param pathway A `character` string specifying the pathway name. This should match a pathway used by `LoadPathway()`.
+#' @param distance.method A `character` string specifying the distance metric to use.
+#' Options include: `"manhattan"`, `"euclidean"`, `"canberra"`, `"binary"`, `"minkowski"`
+#' @param batch.size An `integer` specifying the number of cells to process per batch. Default is 1000.
+#' @param scale.data A `logical` indicating whether to use scaled data (`scale.data = TRUE`) or normalized data. Default is `TRUE`.
+#'
+#' @return A data frame of MDS results with normalized values per cell, suitable for thresholding or visualization.
+#'
 #' @examples
-#' data(fake_test_object) # load the fake test data
-#' ComputeCellData(fake_test_object, "Wnt", "manhattan")
+#' data(fake_test_object)
+#' ComputeCellData(fake_test_object, pathway = "Wnt", distance.method = "manhattan", batch.size = 2000)
+#'
 #' @export
-ComputeCellData <- function(x, pathway, distance.method){
+ComputeCellData <- function(x, pathway, distance.method, batch.size = 1000, scale.data = TRUE){
 
   # Get pathway data
   pathwaydata <- LoadPathway(pathway)
   names <- c(pathwaydata[[1]])
 
-  # Ensure only valid genes are used
+  # Use only genes present in Seurat object
   valid_names <- intersect(names, rownames(x))
   if (length(valid_names) == 0) {
-    stop("No matching genes found in the Seurat object for the given pathway.")
+    stop("No valid pathway genes found in the Seurat object.")
   }
+  x <- ScaleData(x, features = valid_names)
+
+  # Extract the pathway genes from the requested slot (scaled or normalized data)
+  slot_use <- if (scale.data) "scale.data" else "data"
+  expr_data <- GetAssayData(x, assay = "RNA", slot = slot_use)[valid_names, , drop = FALSE]
 
   # Pathway max and min
   pathway.stat <- PathwayMaxMin(x, pathway)
 
-  # Gel all cells
-  all_cells <- Cells(x)
+  # Get cell identifiers (column names of the expression matrix)
+  cell_id <- colnames(expr_data)
 
+  # Shuffle cell identifiers so each batch is a random sample of cells
+  shuffled_cell_id <- sample(cell_id)
+
+  # Split shuffled identifiers into batches of at most `batch.size` cells
   # Define batch size
-  batch_size <-1000
-  # test batch_size = 1 store the output, -> identical or not?
+  stopifnot(is.numeric(batch.size), length(batch.size) == 1L, batch.size >= 1)
+  batches <- split(shuffled_cell_id, ceiling(seq_along(shuffled_cell_id) / batch.size))
+
+  # Subset expression data into chunks based on sampled indices
+  expr_chunks <- lapply(batches, function(cols) expr_data[, cols, drop = FALSE])
 
-  # Determine the number of iterations
-  num_batches <- ceiling(length(all_cells) / batch_size)
+  # For each expression chunk, compute distances and a one-dimensional MDS embedding
   # Initialize list to store results
   batch_results <- list()
 
-  # Loop through batches of 500 cells
+  # Loop through the batches (each holds at most `batch.size` cells)
-  for (i in seq_len(num_batches)) {
-
-    # Ensure there are remaining cells to sample
-    if (length(all_cells) == 0) break
-
-    # Sample cells
-    sample_cells <- sample(all_cells, min(batch_size, length(all_cells)))
-    if (length(sample_cells) == 0) next  # Avoid errors if no cells left
-
-    # Subset Seurat object
-    x_batch <- subset(x, cells = sample_cells)
-    DefaultAssay(x_batch) <- "RNA"  # Ensure correct assay
-
-    # Extract expression data
-    temp.data.batch <- x_batch[valid_names, ] # when n= 1, it is a vecor
-    # if temp.data.batch > 2 more rows
-    # if temp.data.batch = 1 row
-    # if temp.data.batch = 0, stop
-    # Convert to data frame to avoid vector issues when n = 1
-    if (is.vector(temp.data.batch)) {
-      temp.data.batch <- as.data.frame(t(temp.data.batch))
-    } else {
-      temp.data.batch <- as.data.frame(temp.data.batch@assays[["RNA"]]$data)
-    }
+  for (i in seq_along(batches)) {
 
-    # Check if temp.data.batch is empty
-    if (nrow(temp.data.batch) == 0) {
-      warning("Batch", i, "has no valid data. Skipping...")
-      next
-    }
+    message("Processing batch ", i)
 
-    # Merge pathway stats with expression data
-    # Ensure they have the same columes
-    common_rows <- intersect(rownames(pathway.stat), rownames(temp.data.batch))
-    pathway.stat <- pathway.stat[common_rows, , drop = FALSE]
-    temp.data.batch <- temp.data.batch[common_rows, , drop = FALSE]
+    # Extract this batch's chunk (own name, so the full matrix is not shadowed) and convert it
+    expr_chunk <- expr_chunks[[i]]
+    temp.data.batch <- as.data.frame(expr_chunk)
 
+    # Column-bind the pathway max/min statistics with this batch's cells
     pathwaytempdata <- cbind(pathway.stat, temp.data.batch)
 
-    # Ensure there are at least two columns for distance computation
+    # Check for enough cells (columns)
     if (ncol(pathwaytempdata) < 2) {
-      warning("Batch", i, "does not have enough features for distance calculation. Skipping...")
+      warning("Batch ", i, " does not have enough cells for distance calculation. Skipping...")
       next
     }
 
-    # Compute Manhattan distance
-    # distance.method <- 'manhattan'
+    # Distance between columns (transpose so each column becomes a row for dist())
     message("Computing distance...")
-    d <- dist(t(pathwaytempdata), method = distance.method) # should we use scaled data?
+    d <- dist(t(pathwaytempdata), method = distance.method)
     # "manhattan" is sum of absolute differences (city block distance), good for sparse data (gene expression)
-    # "euclidean" is stratight-line distance, is useful for PCA clustering
+    # "euclidean" is straight-line distance, is useful for PCA clustering
     # "canberra" is weighted distance, is also good for sparse data and when values have very different scales
     # "binary" is distance based on presence/absence (0/1)
     # "minkowski" is generalization of euclidean & manhattan, tunable using p parameter
     # choose "manhattan" as it works well for high-dimensional data and less sensitive to large outliers than euclidean distance
 
-    # Perform classical multidimensional scaling (MDS)
-    message("running mds ...")
+    # Classical MDS down to a single dimension (k = 1)
+    message("Running MDS ...")
     fit <- cmdscale(d, eig = TRUE, k = 1)
-    message("mds finished")
+    message("MDS finished")
 
-
-    # Transform to data frame
+    # Min-max normalize the MDS values to [0, 1] within this batch
     temp.data.mds <- as.data.frame(fit$points)
     colnames(temp.data.mds) <- "V1"
-
-    # Normalize the MDS data safely
     V1_min <- min(temp.data.mds$V1, na.rm = TRUE)
     V1_max <- max(temp.data.mds$V1, na.rm = TRUE)
 
     if (V1_max == V1_min) {
-      temp.data.mds$normalized <- 0  # Avoid division by zero
+      temp.data.mds$normalized <- 0  # constant embedding: avoid division by zero
     } else {
       temp.data.mds$normalized <- (temp.data.mds$V1 - V1_min) / (V1_max - V1_min)
     }
 
-    # Store MDS results for each batch
+    # Store this batch's result
     batch_results[[i]] <- temp.data.mds
 
-    # Print progress
-    cat("Batch", i, "processed with", length(sample_cells), "cells\n")
-
-    # Remove used cells to avoid duplication in the next iteration
-    all_cells <- setdiff(all_cells, sample_cells)
+    # Report progress
+    cat("Batch", i, "processed with", ncol(expr_chunk), "cells\n")
   }
+
   final_mds <- do.call(rbind, batch_results)  # Merge all batch MDS results
 
   return(final_mds)
 }
 
-# we need to re-scale
-# help function -> documentation
-# clear all R environment -> test_script see if works
-# document()
+# using sample
+# barcode list (randomization)
+# list of data chunk
+# make these list independent
+# short loop
+# lapply, sapply (list-wide operation)
+# https://www.r-bloggers.com/2022/03/complete-tutorial-on-using-apply-functions-in-r/
diff --git a/R/LoadPathway.R b/R/LoadPathway.R
@@ -2,7 +2,7 @@
 #'
 #' This function reads pathway data from the package's built-in Excel file.
 #' @name LoadPathway
-#' @param pathway The name of the pathway interested.
+#' @param pathway A `character` string specifying the pathway name.
 #' @return A data frame with pathway data.
 #' @examples
 #' LoadPathway("Wnt")