RaredonLab · huangyaqing-123 · May 28, 2025 · Feb 11, 2025 · Feb 11, 2025 · May 21, 2025
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -0,0 +1,2 @@
+^.*\.Rproj$
+^\.Rproj\.user$
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,32 @@
+Package: PathwayEmbed
+Title: Tools for Pathway-Level Embedding and Visualization in Single-Cell Data
+Version: 0.0.0.9000
+Authors@R: 
+    person("Yaqing", "Huang", email = "yaqing.huang@yale.edu", role = c("aut", "cre"))
+Description: Provides tools for analyzing and visualizing pathway-level activity 
+    in single-cell RNA-seq data. Includes functions for computing cell-wise pathway scores, 
+    visualizing transduction states, calculating activation percentages, 
+    and integrating pathway data with Seurat objects.
+License: MIT + file LICENSE
+Encoding: UTF-8
+Roxygen: list(markdown = TRUE)
+RoxygenNote: 7.3.2
+Depends: 
+    R (>= 3.5)
+Imports:
+    readxl,
+    Seurat,
+    RColorBrewer,
+    ggplot2,
+    cowplot,
+    dplyr,
+    matrixStats,
+    viridis,
+    stats,
+    effsize,
+    rlang
+Suggests: 
+    testthat (>= 3.0.0)
+Config/testthat/edition: 3
+LazyData: true
+
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2024 Raredon Lab
+Copyright (c) 2025 Raredon Lab
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,22 @@
+# Generated by roxygen2: do not edit by hand
+
+export(CalculatePercentage)
+export(ComputeCellData)
+export(LoadPathway)
+export(PathwayMaxMin)
+export(PlotPathway)
+export(PreparePlotData)
+import(RColorBrewer)
+import(Seurat)
+import(cowplot)
+import(ggplot2)
+import(matrixStats)
+import(readxl)
+import(stats)
+import(tidyverse)
+import(viridis)
+importFrom(dplyr,bind_rows)
+importFrom(dplyr,filter)
+importFrom(dplyr,pull)
+importFrom(effsize,cohen.d)
+importFrom(rlang,sym)
diff --git a/PathwayEmbed.Rproj b/PathwayEmbed.Rproj
@@ -0,0 +1,23 @@
+Version: 1.0
+ProjectId: 0c111876-39b0-460f-a888-db107bec1084
+
+RestoreWorkspace: No
+SaveWorkspace: No
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+AutoAppendNewline: Yes
+StripTrailingWhitespace: Yes
+LineEndingConversion: Posix
+
+BuildType: Package
+PackageUseDevtools: Yes
+PackageInstallArgs: --no-multiarch --with-keep.source
+PackageRoxygenize: rd,collate,namespace
diff --git a/R/CalculatePercentage.R b/R/CalculatePercentage.R
@@ -0,0 +1,46 @@
+#' Calculate the percentage of cells in activation status
+#' @name CalculatePercentage
+#' @importFrom dplyr filter pull bind_rows
+#' @importFrom rlang sym
+#' @importFrom effsize cohen.d
+#' @param to.plot A data frame containing at least a `scale` column and a grouping column.
+#' @param group_var A string specifying the grouping variable (e.g., "genotype", "treatment").
+#' @return A data frame with the percentage of ON/OFF cells and Cohen's d (if applicable).
+#' @examples
+#' data(fake_to_plot)
+#' CalculatePercentage(fake_to_plot, "genotype")
+#' @export
+CalculatePercentage <- function(to.plot, group_var){
+  stopifnot("scale" %in% names(to.plot))
+
+  group_sym <- sym(group_var)
+  groups <- unique(na.omit(to.plot[[group_var]]))
+  results <- list()
+
+  for (g in groups) {
+    subset_data <- dplyr::filter(to.plot, !!group_sym == g)
+    total <- nrow(subset_data)
+
+    on <- sum(subset_data[["scale"]] > 0, na.rm = TRUE)
+    off <- sum(subset_data[["scale"]] < 0, na.rm = TRUE)
+
+    results[[as.character(g)]] <- list(
+      percentage_on = round(100 * on / total, 2),
+      percentage_off = round(100 * off / total, 2)
+    )
+  }
+
+  if (length(groups) == 2) {
+    g1 <- groups[1]
+    g2 <- groups[2]
+    vec1 <- pull(dplyr::filter(to.plot, !!group_sym == g1), scale)
+    vec2 <- pull(dplyr::filter(to.plot, !!group_sym == g2), scale)
+    cohens_d_val <- cohen.d(vec1, vec2)$estimate
+
+    results[[as.character(g1)]]$cohens_d <- cohens_d_val
+    results[[as.character(g2)]]$cohens_d <- cohens_d_val
+  }
+
+  df <- bind_rows(results, .id = "group")
+  return(df)
+}
diff --git a/R/ComputeCellData.R b/R/ComputeCellData.R
@@ -0,0 +1,145 @@
+#' A function for scRNA sequencing pathway analysis
+#'
+#' This function computes cell status for a given pathway in single-cell RNA-seq data,
+#' based on the distance between genes in a specified pathway. The distance is computed
+#' for each batch of cells, and classical multidimensional scaling (MDS) is used to
+#' visualize the pathway expression across cells.
+#'
+#' @name ComputeCellData
+#' @import Seurat
+#' @import tidyverse
+#' @import viridis
+#' @import matrixStats
+#' @import stats
+#' @param x A `Seurat` object containing single-cell RNA sequencing data.
+#' @param pathway A `character` string specifying the pathway name.
+#' @param distance.method A `character` string specifying the distance method.
+#' Options include: "manhattan", "euclidean", "canberra", "binary", "minkowski".
+#' @return A data frame representing the multidimensional scaling (MDS) results
+#' for the cells based on the pathway expression.
+#' @examples
+#' data(fake_test_object) # load the fake test data
+#' ComputeCellData(fake_test_object, "Wnt", "manhattan")
+#' @export
+ComputeCellData <- function(x, pathway, distance.method){
+
+  # Get pathway data
+  pathwaydata <- LoadPathway(pathway)
+  names <- c(pathwaydata[[1]])
+
+  # Ensure only valid genes are used
+  valid_names <- intersect(names, rownames(x))
+  if (length(valid_names) == 0) {
+    stop("No matching genes found in the Seurat object for the given pathway.")
+  }
+
+  # Pathway max and min
+  pathway.stat <- PathwayMaxMin(x, pathway)
+
+  # Gel all cells
+  all_cells <- Cells(x)
+
+  # Define batch size
+  batch_size <-1000
+  # test batch_size = 1 store the output, -> identical or not?
+
+  # Determine the number of iterations
+  num_batches <- ceiling(length(all_cells) / batch_size)
+  # Initialize list to store results
+  batch_results <- list()
+
+  # Loop through batches of 500 cells
+  for (i in seq_len(num_batches)) {
+
+    # Ensure there are remaining cells to sample
+    if (length(all_cells) == 0) break
+
+    # Sample cells
+    sample_cells <- sample(all_cells, min(batch_size, length(all_cells)))
+    if (length(sample_cells) == 0) next  # Avoid errors if no cells left
+
+    # Subset Seurat object
+    x_batch <- subset(x, cells = sample_cells)
+    DefaultAssay(x_batch) <- "RNA"  # Ensure correct assay
+
+    # Extract expression data
+    temp.data.batch <- x_batch[valid_names, ] # when n= 1, it is a vecor
+    # if temp.data.batch > 2 more rows
+    # if temp.data.batch = 1 row
+    # if temp.data.batch = 0, stop
+    # Convert to data frame to avoid vector issues when n = 1
+    if (is.vector(temp.data.batch)) {
+      temp.data.batch <- as.data.frame(t(temp.data.batch))
+    } else {
+      temp.data.batch <- as.data.frame(temp.data.batch@assays[["RNA"]]$data)
+    }
+
+    # Check if temp.data.batch is empty
+    if (nrow(temp.data.batch) == 0) {
+      warning("Batch", i, "has no valid data. Skipping...")
+      next
+    }
+
+    # Merge pathway stats with expression data
+    # Ensure they have the same columes
+    common_rows <- intersect(rownames(pathway.stat), rownames(temp.data.batch))
+    pathway.stat <- pathway.stat[common_rows, , drop = FALSE]
+    temp.data.batch <- temp.data.batch[common_rows, , drop = FALSE]
+
+    pathwaytempdata <- cbind(pathway.stat, temp.data.batch)
+
+    # Ensure there are at least two columns for distance computation
+    if (ncol(pathwaytempdata) < 2) {
+      warning("Batch", i, "does not have enough features for distance calculation. Skipping...")
+      next
+    }
+
+    # Compute Manhattan distance
+    # distance.method <- 'manhattan'
+    message("Computing distance...")
+    d <- dist(t(pathwaytempdata), method = distance.method) # should we use scaled data?
+    # "manhattan" is sum of absolute differences (city block distance), good for sparse data (gene expression)
+    # "euclidean" is stratight-line distance, is useful for PCA clustering
+    # "canberra" is weighted distance, is also good for sparse data and when values have very different scales
+    # "binary" is distance based on presence/absence (0/1)
+    # "minkowski" is generalization of euclidean & manhattan, tunable using p parameter
+    # choose "manhattan" as it works well for high-dimensional data and less sensitive to large outliers than euclidean distance
+
+    # Perform classical multidimensional scaling (MDS)
+    message("running mds ...")
+    fit <- cmdscale(d, eig = TRUE, k = 1)
+    message("mds finished")
+
+
+    # Transform to data frame
+    temp.data.mds <- as.data.frame(fit$points)
+    colnames(temp.data.mds) <- "V1"
+
+    # Normalize the MDS data safely
+    V1_min <- min(temp.data.mds$V1, na.rm = TRUE)
+    V1_max <- max(temp.data.mds$V1, na.rm = TRUE)
+
+    if (V1_max == V1_min) {
+      temp.data.mds$normalized <- 0  # Avoid division by zero
+    } else {
+      temp.data.mds$normalized <- (temp.data.mds$V1 - V1_min) / (V1_max - V1_min)
+    }
+
+    # Store MDS results for each batch
+    batch_results[[i]] <- temp.data.mds
+
+    # Print progress
+    cat("Batch", i, "processed with", length(sample_cells), "cells\n")
+
+    # Remove used cells to avoid duplication in the next iteration
+    all_cells <- setdiff(all_cells, sample_cells)
+  }
+  final_mds <- do.call(rbind, batch_results)  # Merge all batch MDS results
+
+  return(final_mds)
+}
+
+# we need to re-scale
+# help function -> documentation
+# clear all R environment -> test_script see if works
+# document()
diff --git a/R/LoadPathway.R b/R/LoadPathway.R
@@ -0,0 +1,26 @@
+## Pathway Data Extraction from Exceldataset
+#'
+#' This function reads pathway data from the package's built-in Excel file.
+#' @name LoadPathway
+#' @param pathway The name of the pathway interested.
+#' @return A data frame with pathway data.
+#' @examples
+#' LoadPathway("Wnt")
+#' @import readxl
+#' @export
+LoadPathway <- function(pathway) {
+  file_path <- system.file("extdata", "Pathway_Embedding.xlsx", package = "PathwayEmbed")
+
+  if (file_path == "") {
+    stop("Pathway data file not found. Ensure the package is installed correctly.")
+  }
+
+  # Read the specified sheet
+  data <- readxl::read_excel(file_path, sheet = pathway)
+  # extract the molecules in the pathway
+  pathway.molecules <- c(data[["Molecules"]])
+  # extract the coefficients of the molecules in the pathway
+  pathway.coefficients <- as.numeric(c(data[["Coefficients"]]))
+
+  return(data)
+}
diff --git a/R/PathwayMaxMin.R b/R/PathwayMaxMin.R
@@ -0,0 +1,69 @@
+#' A function for scRNA sequencing pathway analysis
+#' @name PathwayMaxMin
+#' @import Seurat
+#' @import tidyverse
+#' @import viridis
+#' @import matrixStats
+#'
+#' @param x A Seurat Object.
+#' @param pathway The name of the pathway.
+#' @return The value for Pathway on and off (max and min value for features)
+#' @examples
+#' data(fake_test_object) # load the fake test data
+#' PathwayMaxMin(fake_test_object, "Wnt")
+#' @export
+PathwayMaxMin <- function(x, pathway){
+
+  # Define pathway parameters using LoadPathway
+  pathwaydata <- LoadPathway(pathway) # load pathway data
+  names <- c(pathwaydata[[1]]) # molecule names
+  pathway.on <- as.numeric(c(pathwaydata[[2]])) # coefficients
+  names(pathway.on) <- names
+  pathway.off <- -pathway.on # define off status
+
+  # Extract normalized RNA expression data for the pathway genes
+  temp.data <- x[names, ]
+  data.temp <- as.data.frame(temp.data@assays[["RNA"]]$data) # Seurat version
+  # "sometimes is counts not data
+
+  # Max and min value for genes in the pathway
+  # Compute row-wise min and max values
+  ranges <- cbind(
+    rowMins(as.matrix(data.temp), na.rm = FALSE),
+    rowMaxs(as.matrix(data.temp), na.rm = FALSE)
+  )
+
+  # Scale the ON/OFF states to the extrema of these ranges for each features
+  for (i in seq_along(pathway.on)) {  # safer than 1:length(pathway.on)
+    feature_name <- names(pathway.on[i])
+
+    if (!feature_name %in% rownames(ranges)) {
+      warning(paste("Feature", feature_name, "not found in ranges!"))
+      next  # Skip iteration if feature is missing
+    }
+    if (pathway.on[i] < 0) {
+      pathway.on[i] <- ranges[feature_name, 1]  # min for ON
+    } else {
+      pathway.on[i] <- ranges[feature_name, 2]  # max for ON
+    }
+  }
+  for (i in seq_along(pathway.off)) {  # Safer indexing
+    feature_name <- names(pathway.off[i])  # Get feature name
+
+    if (!feature_name %in% rownames(ranges)) {  # Check if feature exists in ranges
+      warning(paste("Feature", feature_name, "not found in ranges! Skipping..."))
+      next  # Skip to the next iteration if missing
+    }
+
+    # Assign min or max based on value
+    pathway.off[i] <- ifelse(pathway.off[i] < 0,
+                             ranges[feature_name, 1],  # Min for OFF
+                             ranges[feature_name, 2])  # Max for OFF
+  }
+
+
+  # Bind on and off states
+  pathway.stat <- data.frame(pathway.on,pathway.off)
+
+  return(pathway.stat)
+}