From 7e0521e909424048beed8011862aec8ba7c25760 Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Thu, 22 Sep 2022 12:22:31 -0600
Subject: [PATCH 1/2] Preliminary draft fix for table formatting issues.
 There's likely a cleaner/better way, but for now this works

---
 tiny/rna/tiny-deseq.r | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tiny/rna/tiny-deseq.r b/tiny/rna/tiny-deseq.r
index 6085adf1..9661bd07 100755
--- a/tiny/rna/tiny-deseq.r
+++ b/tiny/rna/tiny-deseq.r
@@ -110,7 +110,10 @@ library(ggplot2)
 library(utils)
 
 ## Returns a new dataframe with metadata columns prepended
+## Columns are rounded according to host preference
 df_with_metadata <- function(classless_df){
+  prec <- getOption('digits')
+  classless_df <- data.frame(rbind(format(round(classless_df, prec), digits=prec, nsmall=prec)))
   return(data.frame(
     merge(metadata, data.frame(classless_df), by=0),
     row.names = "Row.names",
@@ -165,7 +168,6 @@ write.csv(
   paste(out_pref, "norm_counts.csv", sep="_"),
   row.names=FALSE,
   quote=1:4,
-  na=""
 )
 
 if (has_control){
@@ -198,7 +200,7 @@ for (i in seq_len(nrow(all_comparisons))){
   comparison <- all_comparisons[i,]
 
   deseq_res <- DESeq2::results(deseq_run, c("condition", comparison[2], comparison[1]))
-  result_df <- df_with_metadata(deseq_res[order(deseq_res$padj),])
+  result_df <- df_with_metadata(data.frame(deseq_res[order(deseq_res$padj),]))
 
   # Resolve original condition names for use in output filename
   cond1 <- sampleConditions[[comparison[1]]]

From ea64f6970df646e15b89a0acad0cd07e1aa9f2dd Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Tue, 4 Oct 2022 15:37:02 -0700
Subject: [PATCH 2/2] Removed the option for maximum scipen so that scientific
 notation is triggered normally. Also refactored so that columns are
 referenced by name rather than by range. This should make it much easier to
 adapt the script to future changes in multi-index and metadata columns

---
 tiny/rna/tiny-deseq.r | 75 +++++++++++++++++++++++--------------------
 1 file changed, 41 insertions(+), 34 deletions(-)

diff --git a/tiny/rna/tiny-deseq.r b/tiny/rna/tiny-deseq.r
index 9661bd07..07ef86e1 100755
--- a/tiny/rna/tiny-deseq.r
+++ b/tiny/rna/tiny-deseq.r
@@ -32,9 +32,6 @@ Optional arguments:
 # 8170: https://github.com/wch/r-source/blob/tags/R-4-1-1/src/main/options.c#L626
 options(warning.length = min(getOption("warning.length") + nchar(usage), 8170))
 
-# Suppress conversion to scientific notation
-options(scipen = 999)
-
 
 #### ---- Validate commandline arguments ---- ####
 
@@ -68,23 +65,33 @@ if (count_file %in% all_args[all_args != '--input-file']){
 
 #### ---- Read in inputs ---- ####
 
-
-## Resolve relative paths and perform tilde expansion for input file path
+## Read unsanitized CSV header and save sample column names for use in final outputs
 count_file <- normalizePath(count_file)
+header_row <- read.csv(count_file, check.names = FALSE, row.names = 1, nrows = 1)
 
-## DESeq2 prefers R-safe column names. We want to preserve original sample names to use in final outputs
-orig_sample_names <- colnames(read.csv(count_file, check.names = FALSE, row.names = 1, nrows = 1))[0:-3]
-
-## Read counts CSV with sanitized column names for handling. Subset classes and aliases.
+## Read counts CSV with R-safe column names for handling.
 counts <- read.csv(count_file)
 
-## Set rownames to concatenated Feature ID and Tag, then drop these columns
-rownames(counts) <- split(counts[, c('Feature.ID', 'Tag')], seq(nrow(counts)))
-counts <- subset(counts, select = -c(Feature.ID, Tag))
+## Returns vec with names(vec) holding R-safe names
+with_names <- function(vec){
+  names(vec) <- make.names(vec)
+  return(vec)
+}
+
+## Original column names
+idx_cols <- with_names(c('Feature ID', 'Tag'))
+meta_cols <- with_names(c('Feature Name', 'Feature Class'))
+char_cols <- with_names(unique(c(idx_cols, meta_cols)))
+sample_cols <- with_names(colnames(header_row)[!(colnames(header_row) %in% char_cols)])
+
+## Concatenate index column contents, assign the result to rownames, then drop
+rownames(counts) <- split(counts[names(idx_cols)], seq(nrow(counts)))
+counts <- counts[!(colnames(counts) %in% names(idx_cols))]
 
 ## Copy feature metadata and drop these columns, leaving only integer columns
-metadata <- data.frame("Feature Name" = counts[[1]], "Feature Class" = counts[[2]], row.names = rownames(counts), check.names = FALSE)
-counts <- data.frame(sapply(counts[0:-2], as.integer), row.names = rownames(counts))
+metadata <- data.frame(counts[names(meta_cols)])
+counts <- counts[!(colnames(counts) %in% names(meta_cols))]
+counts[] <- lapply(counts, as.integer)
 
 ## Remove rows containing all zeros if user requests
 if (drop_zero){
@@ -96,8 +103,7 @@ if (drop_zero){
 
 
 ## Get sample conditions as a named vector where names are R safe names
-sampleConditions <- sapply(strsplit(as.character(orig_sample_names), '_rep_'), '[[', 1)
-names(sampleConditions) <- make.names(sampleConditions)
+sampleConditions <- with_names(sapply(strsplit(as.character(sample_cols), '_rep_'), '[[', 1))
 
 ## Ensure control group is present in samples, if specified
 if (has_control && !(control_grp %in% sampleConditions)){
@@ -109,18 +115,26 @@ library(DESeq2)
 library(ggplot2)
 library(utils)
 
+## Create the deseqdataset
+sample_table <- data.frame(row.names=names(sample_cols), condition=factor(names(sampleConditions)))
+deseq_ds <- DESeq2::DESeqDataSetFromMatrix(countData = counts, colData = sample_table, design = ~ condition)
+
+
+#### ---- Run DESeq2 & write outputs ---- ####
+
+
 ## Returns a new dataframe with metadata columns prepended
-## Columns are rounded according to host preference
-df_with_metadata <- function(classless_df){
-  prec <- getOption('digits')
-  classless_df <- data.frame(rbind(format(round(classless_df, prec), digits=prec, nsmall=prec)))
-  return(data.frame(
-    merge(metadata, data.frame(classless_df), by=0),
+df_with_metadata <- function(df_no_metadata){
+  with_md <- data.frame(
+    merge(metadata, df_no_metadata, by=0),
     row.names = "Row.names",
     check.names = FALSE
-  ))
+  )
+  colnames(with_md) <- c(meta_cols, colnames(df_no_metadata))
+  return(with_md)
 }
 
+## Splits concatenated index cols and restores them as normal cols
 restore_multiindex <- function(base_df){
   base_df[] <- sapply(base_df, as.character)
   base_matrix <- as.matrix(base_df)
@@ -131,19 +145,12 @@ restore_multiindex <- function(base_df){
 
   # Assign the MultiIndex columns and drop rownames
   multiidx_mx <- cbind(split_mx, base_matrix)
-  colnames(multiidx_mx) <- c("Feature ID", "Tag", colnames(base_matrix))
+  colnames(multiidx_mx) <- c(idx_cols, colnames(base_matrix))
   rownames(multiidx_mx) <- NULL
 
   return(multiidx_mx)
 }
 
-## Create the deseqdataset
-sample_table <- data.frame(row.names=names(orig_sample_names), condition=factor(names(sampleConditions)))
-deseq_ds <- DESeq2::DESeqDataSetFromMatrix(countData = counts, colData = sample_table, design = ~ condition)
-
-
-#### ---- Run DESeq2 & write outputs ---- ####
-
 
 ## Create the DESeq Object
 deseq_run <- DESeq2::DESeq(deseq_ds)
@@ -162,7 +169,7 @@ if (plot_pca){
 
 ## Get normalized counts and write them to CSV with original sample names in header
 deseq_counts <- df_with_metadata(DESeq2::counts(deseq_run, normalized=TRUE))
-colnames(deseq_counts)[0:-2] <- orig_sample_names
+colnames(deseq_counts) <- c(meta_cols, sample_cols)
 write.csv(
   restore_multiindex(deseq_counts),
   paste(out_pref, "norm_counts.csv", sep="_"),
@@ -188,10 +195,10 @@ write_dge_table <- function (dge_df, cond1, cond2){
     paste(out_pref,
           "cond1", cond1,
           "cond2", cond2,
-          "deseq_table.csv", sep="_"),
+          "deseq_table.csv",
+          sep="_"),
     row.names=FALSE,
     quote=1:4,
-    na=""
   )
 }