Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
^.*\.Rproj$
^\.Rproj\.user$
32 changes: 32 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
Package: PathwayEmbed
Title: Tools for Pathway-Level Embedding and Visualization in Single-Cell Data
Version: 0.0.0.9000
Authors@R:
person("Yaqing", "Huang", email = "yaqing.huang@yale.edu", role = c("aut", "cre"))
Description: Provides tools for analyzing and visualizing pathway-level activity
in single-cell RNA-seq data. Includes functions for computing cell-wise pathway scores,
visualizing transduction states, calculating activation percentages,
and integrating pathway data with Seurat objects.
License: MIT + file LICENSE
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.2
Depends:
R (>= 3.5)
Imports:
readxl,
Seurat,
RColorBrewer,
ggplot2,
cowplot,
dplyr,
matrixStats,
viridis,
stats,
effsize,
rlang
Suggests:
testthat (>= 3.0.0)
Config/testthat/edition: 3
LazyData: true

2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2024 Raredon Lab
Copyright (c) 2025 Raredon Lab

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
22 changes: 22 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Generated by roxygen2: do not edit by hand

export(CalculatePercentage)
export(ComputeCellData)
export(LoadPathway)
export(PathwayMaxMin)
export(PlotPathway)
export(PreparePlotData)
import(RColorBrewer)
import(Seurat)
import(cowplot)
import(ggplot2)
import(matrixStats)
import(readxl)
import(stats)
import(tidyverse)
import(viridis)
importFrom(dplyr,bind_rows)
importFrom(dplyr,filter)
importFrom(dplyr,pull)
importFrom(effsize,cohen.d)
importFrom(rlang,sym)
23 changes: 23 additions & 0 deletions PathwayEmbed.Rproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
Version: 1.0
ProjectId: 0c111876-39b0-460f-a888-db107bec1084

RestoreWorkspace: No
SaveWorkspace: No
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX

AutoAppendNewline: Yes
StripTrailingWhitespace: Yes
LineEndingConversion: Posix

BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace
46 changes: 46 additions & 0 deletions R/CalculatePercentage.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#' Calculate the percentage of cells in activation status
#' @name CalculatePercentage
#' @importFrom dplyr filter pull bind_rows
#' @importFrom rlang sym
#' @importFrom effsize cohen.d
#' @param to.plot A data frame containing at least a `scale` column and a grouping column.
#' @param group_var A string specifying the grouping variable (e.g., "genotype", "treatment").
#' @return A data frame with the percentage of ON/OFF cells and Cohen's d (if applicable).
#' @examples
#' data(fake_to_plot)
#' CalculatePercentage(fake_to_plot, "genotype")
#' @export
CalculatePercentage <- function(to.plot, group_var){
stopifnot("scale" %in% names(to.plot))

group_sym <- sym(group_var)
groups <- unique(na.omit(to.plot[[group_var]]))
results <- list()

for (g in groups) {
subset_data <- dplyr::filter(to.plot, !!group_sym == g)
total <- nrow(subset_data)

on <- sum(subset_data[["scale"]] > 0, na.rm = TRUE)
off <- sum(subset_data[["scale"]] < 0, na.rm = TRUE)

results[[as.character(g)]] <- list(
percentage_on = round(100 * on / total, 2),
percentage_off = round(100 * off / total, 2)
)
}

if (length(groups) == 2) {
g1 <- groups[1]
g2 <- groups[2]
vec1 <- pull(dplyr::filter(to.plot, !!group_sym == g1), scale)
vec2 <- pull(dplyr::filter(to.plot, !!group_sym == g2), scale)
cohens_d_val <- cohen.d(vec1, vec2)$estimate

results[[as.character(g1)]]$cohens_d <- cohens_d_val
results[[as.character(g2)]]$cohens_d <- cohens_d_val
}

df <- bind_rows(results, .id = "group")
return(df)
}
145 changes: 145 additions & 0 deletions R/ComputeCellData.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#' A function for scRNA sequencing pathway analysis
#'
#' This function computes cell status for a given pathway in single-cell RNA-seq data,
#' based on the distance between genes in a specified pathway. The distance is computed
#' for each batch of cells, and classical multidimensional scaling (MDS) is used to
#' visualize the pathway expression across cells.
#'
#' @name ComputeCellData
#' @import Seurat
#' @import tidyverse
#' @import viridis
#' @import matrixStats
#' @import stats
#' @param x A `Seurat` object containing single-cell RNA sequencing data.
#' @param pathway A `character` string specifying the pathway name.
#' @param distance.method A `character` string specifying the distance method.
#' Options include: "manhattan", "euclidean", "canberra", "binary", "minkowski".
#' @return A data frame representing the multidimensional scaling (MDS) results
#' for the cells based on the pathway expression.
#' @examples
#' data(fake_test_object) # load the fake test data
#' ComputeCellData(fake_test_object, "Wnt", "manhattan")
#' @export
ComputeCellData <- function(x, pathway, distance.method){

# Get pathway data
pathwaydata <- LoadPathway(pathway)
names <- c(pathwaydata[[1]])

# Ensure only valid genes are used
valid_names <- intersect(names, rownames(x))
if (length(valid_names) == 0) {
stop("No matching genes found in the Seurat object for the given pathway.")
}

# Pathway max and min
pathway.stat <- PathwayMaxMin(x, pathway)

# Gel all cells
all_cells <- Cells(x)

# Define batch size
batch_size <-1000
# test batch_size = 1 store the output, -> identical or not?

# Determine the number of iterations
num_batches <- ceiling(length(all_cells) / batch_size)
# Initialize list to store results
batch_results <- list()

# Loop through batches of 500 cells
for (i in seq_len(num_batches)) {

# Ensure there are remaining cells to sample
if (length(all_cells) == 0) break

# Sample cells
sample_cells <- sample(all_cells, min(batch_size, length(all_cells)))
if (length(sample_cells) == 0) next # Avoid errors if no cells left

# Subset Seurat object
x_batch <- subset(x, cells = sample_cells)
DefaultAssay(x_batch) <- "RNA" # Ensure correct assay

# Extract expression data
temp.data.batch <- x_batch[valid_names, ] # when n= 1, it is a vecor
# if temp.data.batch > 2 more rows
# if temp.data.batch = 1 row
# if temp.data.batch = 0, stop
# Convert to data frame to avoid vector issues when n = 1
if (is.vector(temp.data.batch)) {
temp.data.batch <- as.data.frame(t(temp.data.batch))
} else {
temp.data.batch <- as.data.frame(temp.data.batch@assays[["RNA"]]$data)
}

# Check if temp.data.batch is empty
if (nrow(temp.data.batch) == 0) {
warning("Batch", i, "has no valid data. Skipping...")
next
}

# Merge pathway stats with expression data
# Ensure they have the same columes
common_rows <- intersect(rownames(pathway.stat), rownames(temp.data.batch))
pathway.stat <- pathway.stat[common_rows, , drop = FALSE]
temp.data.batch <- temp.data.batch[common_rows, , drop = FALSE]

pathwaytempdata <- cbind(pathway.stat, temp.data.batch)

# Ensure there are at least two columns for distance computation
if (ncol(pathwaytempdata) < 2) {
warning("Batch", i, "does not have enough features for distance calculation. Skipping...")
next
}

# Compute Manhattan distance
# distance.method <- 'manhattan'
message("Computing distance...")
d <- dist(t(pathwaytempdata), method = distance.method) # should we use scaled data?
# "manhattan" is sum of absolute differences (city block distance), good for sparse data (gene expression)
# "euclidean" is stratight-line distance, is useful for PCA clustering
# "canberra" is weighted distance, is also good for sparse data and when values have very different scales
# "binary" is distance based on presence/absence (0/1)
# "minkowski" is generalization of euclidean & manhattan, tunable using p parameter
# choose "manhattan" as it works well for high-dimensional data and less sensitive to large outliers than euclidean distance

# Perform classical multidimensional scaling (MDS)
message("running mds ...")
fit <- cmdscale(d, eig = TRUE, k = 1)
message("mds finished")


# Transform to data frame
temp.data.mds <- as.data.frame(fit$points)
colnames(temp.data.mds) <- "V1"

# Normalize the MDS data safely
V1_min <- min(temp.data.mds$V1, na.rm = TRUE)
V1_max <- max(temp.data.mds$V1, na.rm = TRUE)

if (V1_max == V1_min) {
temp.data.mds$normalized <- 0 # Avoid division by zero
} else {
temp.data.mds$normalized <- (temp.data.mds$V1 - V1_min) / (V1_max - V1_min)
}

# Store MDS results for each batch
batch_results[[i]] <- temp.data.mds

# Print progress
cat("Batch", i, "processed with", length(sample_cells), "cells\n")

# Remove used cells to avoid duplication in the next iteration
all_cells <- setdiff(all_cells, sample_cells)
}
final_mds <- do.call(rbind, batch_results) # Merge all batch MDS results

return(final_mds)
}

# we need to re-scale
# help function -> documentation
# clear all R environment -> test_script see if works
# document()
26 changes: 26 additions & 0 deletions R/LoadPathway.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
## Pathway Data Extraction from Exceldataset
#'
#' This function reads pathway data from the package's built-in Excel file.
#' @name LoadPathway
#' @param pathway The name of the pathway interested.
#' @return A data frame with pathway data.
#' @examples
#' LoadPathway("Wnt")
#' @import readxl
#' @export
LoadPathway <- function(pathway) {
file_path <- system.file("extdata", "Pathway_Embedding.xlsx", package = "PathwayEmbed")

if (file_path == "") {
stop("Pathway data file not found. Ensure the package is installed correctly.")
}

# Read the specified sheet
data <- readxl::read_excel(file_path, sheet = pathway)
# extract the molecules in the pathway
pathway.molecules <- c(data[["Molecules"]])
# extract the coefficients of the molecules in the pathway
pathway.coefficients <- as.numeric(c(data[["Coefficients"]]))

return(data)
}
69 changes: 69 additions & 0 deletions R/PathwayMaxMin.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#' A function for scRNA sequencing pathway analysis
#' @name PathwayMaxMin
#' @import Seurat
#' @import tidyverse
#' @import viridis
#' @import matrixStats
#'
#' @param x A Seurat Object.
#' @param pathway The name of the pathway.
#' @return The value for Pathway on and off (max and min value for features)
#' @examples
#' data(fake_test_object) # load the fake test data
#' PathwayMaxMin(fake_test_object, "Wnt")
#' @export
PathwayMaxMin <- function(x, pathway){

# Define pathway parameters using LoadPathway
pathwaydata <- LoadPathway(pathway) # load pathway data
names <- c(pathwaydata[[1]]) # molecule names
pathway.on <- as.numeric(c(pathwaydata[[2]])) # coefficients
names(pathway.on) <- names
pathway.off <- -pathway.on # define off status

# Extract normalized RNA expression data for the pathway genes
temp.data <- x[names, ]
data.temp <- as.data.frame(temp.data@assays[["RNA"]]$data) # Seurat version
# "sometimes is counts not data

# Max and min value for genes in the pathway
# Compute row-wise min and max values
ranges <- cbind(
rowMins(as.matrix(data.temp), na.rm = FALSE),
rowMaxs(as.matrix(data.temp), na.rm = FALSE)
)

# Scale the ON/OFF states to the extrema of these ranges for each features
for (i in seq_along(pathway.on)) { # safer than 1:length(pathway.on)
feature_name <- names(pathway.on[i])

if (!feature_name %in% rownames(ranges)) {
warning(paste("Feature", feature_name, "not found in ranges!"))
next # Skip iteration if feature is missing
}
if (pathway.on[i] < 0) {
pathway.on[i] <- ranges[feature_name, 1] # min for ON
} else {
pathway.on[i] <- ranges[feature_name, 2] # max for ON
}
}
for (i in seq_along(pathway.off)) { # Safer indexing
feature_name <- names(pathway.off[i]) # Get feature name

if (!feature_name %in% rownames(ranges)) { # Check if feature exists in ranges
warning(paste("Feature", feature_name, "not found in ranges! Skipping..."))
next # Skip to the next iteration if missing
}

# Assign min or max based on value
pathway.off[i] <- ifelse(pathway.off[i] < 0,
ranges[feature_name, 1], # Min for OFF
ranges[feature_name, 2]) # Max for OFF
}


# Bind on and off states
pathway.stat <- data.frame(pathway.on,pathway.off)

return(pathway.stat)
}
Loading