Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 42 additions & 33 deletions tiny/rna/tiny-deseq.r
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,6 @@ Optional arguments:
# 8170: https://github.com/wch/r-source/blob/tags/R-4-1-1/src/main/options.c#L626
options(warning.length = min(getOption("warning.length") + nchar(usage), 8170))

# Suppress conversion to scientific notation
options(scipen = 999)


#### ---- Validate commandline arguments ---- ####

Expand Down Expand Up @@ -68,23 +65,33 @@ if (count_file %in% all_args[all_args != '--input-file']){

#### ---- Read in inputs ---- ####


## Resolve relative paths and perform tilde expansion for input file path
## Read unsanitized CSV header and save sample column names for use in final outputs
count_file <- normalizePath(count_file)
header_row <- read.csv(count_file, check.names = FALSE, row.names = 1, nrows = 1)

## DESeq2 prefers R-safe column names. We want to preserve original sample names to use in final outputs
orig_sample_names <- colnames(read.csv(count_file, check.names = FALSE, row.names = 1, nrows = 1))[0:-3]

## Read counts CSV with sanitized column names for handling. Subset classes and aliases.
## Read counts CSV with R-safe column names for handling.
counts <- read.csv(count_file)

## Set rownames to concatenated Feature ID and Tag, then drop these columns
rownames(counts) <- split(counts[, c('Feature.ID', 'Tag')], seq(nrow(counts)))
counts <- subset(counts, select = -c(Feature.ID, Tag))
## Returns vec with names(vec) holding R-safe names
with_names <- function(vec){
names(vec) <- make.names(vec)
return(vec)
}

## Original column names
idx_cols <- with_names(c('Feature ID', 'Tag'))
meta_cols <- with_names(c('Feature Name', 'Feature Class'))
char_cols <- with_names(unique(c(idx_cols, meta_cols)))
sample_cols <- with_names(colnames(header_row)[!(colnames(header_row) %in% char_cols)])

## Concatenate index column contents, assign the result to rownames, then drop
rownames(counts) <- split(counts[names(idx_cols)], seq(nrow(counts)))
counts <- counts[!(colnames(counts) %in% names(idx_cols))]

## Copy feature metadata and drop these columns, leaving only integer columns
metadata <- data.frame("Feature Name" = counts[[1]], "Feature Class" = counts[[2]], row.names = rownames(counts), check.names = FALSE)
counts <- data.frame(sapply(counts[0:-2], as.integer), row.names = rownames(counts))
metadata <- data.frame(counts[names(meta_cols)])
counts <- counts[!(colnames(counts) %in% names(meta_cols))]
counts[] <- lapply(counts, as.integer)

## Remove rows containing all zeros if user requests
if (drop_zero){
Expand All @@ -96,8 +103,7 @@ if (drop_zero){


## Get sample conditions as a named vector where names are R safe names
sampleConditions <- sapply(strsplit(as.character(orig_sample_names), '_rep_'), '[[', 1)
names(sampleConditions) <- make.names(sampleConditions)
sampleConditions <- with_names(sapply(strsplit(as.character(sample_cols), '_rep_'), '[[', 1))

## Ensure control group is present in samples, if specified
if (has_control && !(control_grp %in% sampleConditions)){
Expand All @@ -109,15 +115,26 @@ library(DESeq2)
library(ggplot2)
library(utils)

## Create the deseqdataset
sample_table <- data.frame(row.names=names(sample_cols), condition=factor(names(sampleConditions)))
deseq_ds <- DESeq2::DESeqDataSetFromMatrix(countData = counts, colData = sample_table, design = ~ condition)


#### ---- Run DESeq2 & write outputs ---- ####


## Returns a new dataframe with metadata columns prepended
df_with_metadata <- function(classless_df){
return(data.frame(
merge(metadata, data.frame(classless_df), by=0),
df_with_metadata <- function(df_no_metadata){
with_md <- data.frame(
merge(metadata, df_no_metadata, by=0),
row.names = "Row.names",
check.names = FALSE
))
)
colnames(with_md) <- c(meta_cols, colnames(df_no_metadata))
return(with_md)
}

## Splits concatenated index cols and restores them as normal cols
restore_multiindex <- function(base_df){
base_df[] <- sapply(base_df, as.character)
base_matrix <- as.matrix(base_df)
Expand All @@ -128,19 +145,12 @@ restore_multiindex <- function(base_df){

# Assign the MultiIndex columns and drop rownames
multiidx_mx <- cbind(split_mx, base_matrix)
colnames(multiidx_mx) <- c("Feature ID", "Tag", colnames(base_matrix))
colnames(multiidx_mx) <- c(idx_cols, colnames(base_matrix))
rownames(multiidx_mx) <- NULL

return(multiidx_mx)
}

## Create the deseqdataset
sample_table <- data.frame(row.names=names(orig_sample_names), condition=factor(names(sampleConditions)))
deseq_ds <- DESeq2::DESeqDataSetFromMatrix(countData = counts, colData = sample_table, design = ~ condition)


#### ---- Run DESeq2 & write outputs ---- ####


## Create the DESeq Object
deseq_run <- DESeq2::DESeq(deseq_ds)
Expand All @@ -159,13 +169,12 @@ if (plot_pca){

## Get normalized counts and write them to CSV with original sample names in header
deseq_counts <- df_with_metadata(DESeq2::counts(deseq_run, normalized=TRUE))
colnames(deseq_counts)[0:-2] <- orig_sample_names
colnames(deseq_counts) <- c(meta_cols, sample_cols)
write.csv(
restore_multiindex(deseq_counts),
paste(out_pref, "norm_counts.csv", sep="_"),
row.names=FALSE,
quote=1:4,
na=""
)

if (has_control){
Expand All @@ -186,10 +195,10 @@ write_dge_table <- function (dge_df, cond1, cond2){
paste(out_pref,
"cond1", cond1,
"cond2", cond2,
"deseq_table.csv", sep="_"),
"deseq_table.csv",
sep="_"),
row.names=FALSE,
quote=1:4,
na=""
)
}

Expand All @@ -198,7 +207,7 @@ for (i in seq_len(nrow(all_comparisons))){
comparison <- all_comparisons[i,]

deseq_res <- DESeq2::results(deseq_run, c("condition", comparison[2], comparison[1]))
result_df <- df_with_metadata(deseq_res[order(deseq_res$padj),])
result_df <- df_with_metadata(data.frame(deseq_res[order(deseq_res$padj),]))

# Resolve original condition names for use in output filename
cond1 <- sampleConditions[[comparison[1]]]
Expand Down