diff --git a/tiny/rna/tiny-deseq.r b/tiny/rna/tiny-deseq.r index 6085adf1..07ef86e1 100755 --- a/tiny/rna/tiny-deseq.r +++ b/tiny/rna/tiny-deseq.r @@ -32,9 +32,6 @@ Optional arguments: # 8170: https://github.com/wch/r-source/blob/tags/R-4-1-1/src/main/options.c#L626 options(warning.length = min(getOption("warning.length") + nchar(usage), 8170)) -# Suppress conversion to scientific notation -options(scipen = 999) - #### ---- Validate commandline arguments ---- #### @@ -68,23 +65,33 @@ if (count_file %in% all_args[all_args != '--input-file']){ #### ---- Read in inputs ---- #### - -## Resolve relative paths and perform tilde expansion for input file path +## Read unsanitized CSV header and save sample column names for use in final outputs count_file <- normalizePath(count_file) +header_row <- read.csv(count_file, check.names = FALSE, row.names = 1, nrows = 1) -## DESeq2 prefers R-safe column names. We want to preserve original sample names to use in final outputs -orig_sample_names <- colnames(read.csv(count_file, check.names = FALSE, row.names = 1, nrows = 1))[0:-3] - -## Read counts CSV with sanitized column names for handling. Subset classes and aliases. +## Read counts CSV with R-safe column names for handling. counts <- read.csv(count_file) -## Set rownames to concatenated Feature ID and Tag, then drop these columns -rownames(counts) <- split(counts[, c('Feature.ID', 'Tag')], seq(nrow(counts))) -counts <- subset(counts, select = -c(Feature.ID, Tag)) +## Returns vec with names(vec) holding R-safe names +with_names <- function(vec){ + names(vec) <- make.names(vec) + return(vec) +} + +## Original column names +idx_cols <- with_names(c('Feature ID', 'Tag')) +meta_cols <- with_names(c('Feature Name', 'Feature Class')) +char_cols <- with_names(unique(c(idx_cols, meta_cols))) +sample_cols <- with_names(colnames(header_row)[!(colnames(header_row) %in% char_cols)]) + +## Concatenate index column contents, assign the result to rownames, then drop +rownames(counts) <- split(counts[names(idx_cols)], seq(nrow(counts))) +counts <- counts[!(colnames(counts) %in% names(idx_cols))] ## Copy feature metadata and drop these columns, leaving only integer columns -metadata <- data.frame("Feature Name" = counts[[1]], "Feature Class" = counts[[2]], row.names = rownames(counts), check.names = FALSE) -counts <- data.frame(sapply(counts[0:-2], as.integer), row.names = rownames(counts)) +metadata <- data.frame(counts[names(meta_cols)]) +counts <- counts[!(colnames(counts) %in% names(meta_cols))] +counts[] <- lapply(counts, as.integer) ## Remove rows containing all zeros if user requests if (drop_zero){ @@ -96,8 +103,7 @@ if (drop_zero){ ## Get sample conditions as a named vector where names are R safe names -sampleConditions <- sapply(strsplit(as.character(orig_sample_names), '_rep_'), '[[', 1) -names(sampleConditions) <- make.names(sampleConditions) +sampleConditions <- with_names(sapply(strsplit(as.character(sample_cols), '_rep_'), '[[', 1)) ## Ensure control group is present in samples, if specified if (has_control && !(control_grp %in% sampleConditions)){ @@ -109,15 +115,26 @@ library(DESeq2) library(ggplot2) library(utils) +## Create the deseqdataset +sample_table <- data.frame(row.names=names(sample_cols), condition=factor(names(sampleConditions))) +deseq_ds <- DESeq2::DESeqDataSetFromMatrix(countData = counts, colData = sample_table, design = ~ condition) + + +#### ---- Run DESeq2 & write outputs ---- #### + + ## Returns a new dataframe with metadata columns prepended -df_with_metadata <- function(classless_df){ - return(data.frame( - merge(metadata, data.frame(classless_df), by=0), +df_with_metadata <- function(df_no_metadata){ + with_md <- data.frame( + merge(metadata, df_no_metadata, by=0), row.names = "Row.names", check.names = FALSE - )) + ) + colnames(with_md) <- c(meta_cols, colnames(df_no_metadata)) + return(with_md) } +## Splits concatenated index cols and restores them as normal cols restore_multiindex <- function(base_df){ base_df[] <- sapply(base_df, as.character) base_matrix <- as.matrix(base_df) @@ -128,19 +145,12 @@ restore_multiindex <- function(base_df){ # Assign the MultiIndex columns and drop rownames multiidx_mx <- cbind(split_mx, base_matrix) - colnames(multiidx_mx) <- c("Feature ID", "Tag", colnames(base_matrix)) + colnames(multiidx_mx) <- c(idx_cols, colnames(base_matrix)) rownames(multiidx_mx) <- NULL return(multiidx_mx) } -## Create the deseqdataset -sample_table <- data.frame(row.names=names(orig_sample_names), condition=factor(names(sampleConditions))) -deseq_ds <- DESeq2::DESeqDataSetFromMatrix(countData = counts, colData = sample_table, design = ~ condition) - - -#### ---- Run DESeq2 & write outputs ---- #### - ## Create the DESeq Object deseq_run <- DESeq2::DESeq(deseq_ds) @@ -159,13 +169,12 @@ if (plot_pca){ ## Get normalized counts and write them to CSV with original sample names in header deseq_counts <- df_with_metadata(DESeq2::counts(deseq_run, normalized=TRUE)) -colnames(deseq_counts)[0:-2] <- orig_sample_names +colnames(deseq_counts) <- c(meta_cols, sample_cols) write.csv( restore_multiindex(deseq_counts), paste(out_pref, "norm_counts.csv", sep="_"), row.names=FALSE, quote=1:4, - na="" ) if (has_control){ @@ -186,10 +195,10 @@ write_dge_table <- function (dge_df, cond1, cond2){ paste(out_pref, "cond1", cond1, "cond2", cond2, - "deseq_table.csv", sep="_"), + "deseq_table.csv", + sep="_"), row.names=FALSE, quote=1:4, - na="" ) } @@ -198,7 +207,7 @@ for (i in seq_len(nrow(all_comparisons))){ comparison <- all_comparisons[i,] deseq_res <- DESeq2::results(deseq_run, c("condition", comparison[2], comparison[1])) - result_df <- df_with_metadata(deseq_res[order(deseq_res$padj),]) + result_df <- df_with_metadata(data.frame(deseq_res[order(deseq_res$padj),])) # Resolve original condition names for use in output filename cond1 <- sampleConditions[[comparison[1]]]