Merged

18 commits
27f32d0
Adding config directory creation as the first step of the workflow
AlexTate May 26, 2023
a6cdcf2
Extending write_processed_config() to update the `processed_run_confi…
AlexTate May 26, 2023
49dbb62
Updating ResumeConfig._add_timestamps() to create a timestamped confi…
AlexTate May 26, 2023
3a4c38e
A processed and timestamped resume run config is now written to disk …
AlexTate May 26, 2023
c122ccd
Adding "config" to the step listing in ResumeConfig classes so that t…
AlexTate May 26, 2023
51207c3
After further consideration, I'm updating tiny resume commands to wri…
AlexTate May 29, 2023
b7eec86
Each configuration class has a new function: save_run_profile(). This…
AlexTate May 29, 2023
756725c
Refactoring the functions that load config files to be more consisten…
AlexTate May 30, 2023
71449d2
Updating ResumeConfig.load_paths_config() for backward compatibility …
AlexTate May 30, 2023
22af457
Updating Run Configs for new v1.5 spec. Also noticed that the Samples…
AlexTate May 30, 2023
409dbdb
Bugfix for YamlShop.add_mapping(). Keys aren't listed in doc.ca.items…
AlexTate May 30, 2023
c533310
Updating ResumeCounterConfig._rebuild_entry_inputs(). These file list…
AlexTate May 30, 2023
3295596
Minor update to follow recommended usage of the csv module when passi…
AlexTate May 30, 2023
3a04b4d
Minor consistency update for the FeatureSelector class and the timest…
AlexTate May 30, 2023
49752ba
Updating PathsSheet to remove the Samples Sheet from inputs that are …
AlexTate May 30, 2023
f6af963
Removing a redundant code block that snuck into a previous commit
AlexTate May 30, 2023
b12568a
Updating documentation for the new auto-doc approach
AlexTate May 30, 2023
ce37eb6
Merge branch 'master' into issue-311
AlexTate May 30, 2023
20 changes: 12 additions & 8 deletions START_HERE/run_config.yml
@@ -20,7 +20,7 @@
user:
run_date: ~
run_time: ~
paths_config: ./paths.yml
paths_config: paths.yml

##-- The label for final outputs --##
##-- If none provided, the default of user_tinyrna will be used --##
@@ -310,6 +310,7 @@ dir_name_tiny-count: tiny-count
dir_name_tiny-deseq: tiny-deseq
dir_name_tiny-plot: tiny-plot
dir_name_logs: logs
dir_name_config: config


######################### AUTOMATICALLY GENERATED CONFIGURATIONS #########################
@@ -320,7 +321,7 @@ dir_name_logs: logs
#
###########################################################################################

version: 1.4.0
version: 1.5.0

######--------------------------- DERIVED FROM PATHS FILE ---------------------------######
#
@@ -332,10 +333,10 @@ run_directory: ~
tmp_directory: ~
features_csv: { }
samples_csv: { }
paths_file: { }
gff_files: [ ]
run_bowtie_build: false
reference_genome_files: [ ]
bt_index_files: [ ]
plot_style_sheet: ~
adapter_fasta: ~
ebwt: ~
@@ -356,10 +357,6 @@ in_fq: [ ]
# output reports
fastp_report_titles: [ ]

###-- Utilized by bowtie --###
# bowtie index files
bt_index_files: [ ]

##-- Utilized by tiny-deseq.r --##
# The control for comparison. If unspecified, all comparisons are made
control_condition:
@@ -383,4 +380,11 @@ run_deseq: True
##-- Utilized by tiny-plot --##
# Filters for class scatter plots
plot_class_scatter_filter_include: []
plot_class_scatter_filter_exclude: []
plot_class_scatter_filter_exclude: []

##-- Used to populate the directory defined in dir_name_config --##
##-- CWL spec doesn't provide a way to get this info from within the workflow --##
processed_run_config: {}

##-- This is the paths_config key converted to a CWL file object for handling --##
paths_file: {}
2 changes: 1 addition & 1 deletion START_HERE/samples.csv
@@ -1,4 +1,4 @@
Input Files,Sample/Group Name,Replicate number,Control,Normalization
Input Files,Sample/Group Name,Replicate Number,Control,Normalization
./fastq_files/cond1_rep1.fastq.gz,condition1,1,TRUE,
./fastq_files/cond1_rep2.fastq.gz,condition1,2,,
./fastq_files/cond1_rep3.fastq.gz,condition1,3,,
2 changes: 1 addition & 1 deletion START_HERE/tinyRNA_TUTORIAL.md
@@ -32,7 +32,7 @@ And when you're done, you can close your terminal or use `conda deactivate` to r
The output you see on your terminal is from `cwltool`, which coordinates the execution of the workflow CWL. The terminal output from individual steps is redirected to a logfile for later reference.

### File outputs
When the analysis is complete you'll notice a new folder has appeared whose name contains the date and time of the run. Inside you'll find subdirectories containing the file and terminal outputs for each step, and the processed Run Config file for auto-documentation of the run.
When the analysis is complete you'll notice a new timestamped folder has appeared. Inside you'll find subdirectories containing the file outputs for each step, and processed copies of your configuration files which serve as auto-documentation of the run. These configuration copies also allow for repeat analyses using the existing file outputs.

### Bowtie indexes
Bowtie indexes were built during this run because `paths.yml` didn't define an `ebwt` prefix. Now you'll see that `ebwt` points to the freshly built indexes in your run directory, which means indexes won't be rebuilt during any subsequent runs that use this `paths.yml` file. If you need to rebuild your indexes, simply delete the value to the right of `ebwt` in `paths.yml`.
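The rebuild decision described above can be sketched in Python. This is a hypothetical helper, not tinyRNA's actual implementation:

```python
def should_build_indexes(paths_config: dict) -> bool:
    """Return True when no ebwt prefix is set, i.e. indexes must be (re)built.

    Sketch of the behavior described above; the real check may differ.
    """
    # An empty or missing value to the right of `ebwt` triggers a rebuild
    return not paths_config.get("ebwt")
```

Deleting the value to the right of `ebwt` makes this check succeed again, so the next run rebuilds the indexes.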
25 changes: 10 additions & 15 deletions doc/Pipeline.md
@@ -18,9 +18,10 @@ tiny replot --config processed_run_config.yml
The `tiny run` command performs a comprehensive analysis of your [input files](../README.md#requirements-for-user-provided-input-files) according to the preferences defined in your [configuration files](Configuration.md).

## Resuming a Prior Analysis
The tiny-count and tiny-plot steps offer a wide variety of options for refining your analysis. You might find that repeat analyses are required while tuning these options to your goals. However, the earlier pipeline steps (fastp, tiny-collapse, and bowtie) handle the largest volume of data and are resource intensive, so you can save time by reusing their outputs for subsequent analyses. One could do so by running the later steps individually (e.g. using commands `tiny-count`, `tiny-deseq.r`, and `tiny-plot`), but assembling their commandline inputs by hand is labor-intensive and prone to spelling mistakes.
The tiny-count and tiny-plot steps offer many options for refining your analysis. You might find that repeat analyses are required while tuning these options to your goals. However, the earlier pipeline steps (fastp, tiny-collapse, and bowtie) handle the largest volume of data and are resource intensive, so you can save time by reusing their outputs for subsequent analyses.

The commands `tiny recount` and `tiny replot` allow the workflow to be resumed using outputs from a prior run. The Run Directory for each end-to-end analysis will contain the run's four primary configuration files, and these files can be freely edited to change the resume run's behavior without sacrificing auto-documentation.

The commands `tiny recount` and `tiny replot` seek to solve this problem. As discussed in the [Run Config documentation](Configuration.md#the-processed-run-config), the Run Directory for each end-to-end analysis will contain a processed Run Config, and this is the file that determines the behavior of a resume run.

<figure align="center">
<figcaption><b>tiny recount</b></figcaption>
@@ -29,25 +30,19 @@ The commands `tiny recount` and `tiny replot` seek to solve this problem. As dis
<img src="../images/replot.png" width="65%" alt="replot"/>
</figure>


You can modify the behavior of a resume run by changing settings in:
- The **processed** Run Config
- The **original** Features Sheet that was used for the end-to-end run (as indicated by `features_csv` in the processed Run Config)
- The **original** Paths File (as indicated by `paths_config` in the processed Run Config)

### The Steps
1. Make and save the desired changes in the files above
2. In your terminal, `cd` to the Run Directory of the end-to-end run you wish to resume
1. Make and save changes to the configuration files within the target Run Directory
2. In your terminal, `cd` to the target Run Directory
3. Run the desired resume command
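The steps above can be sketched as a small Python wrapper. The config filename and command layout here are assumptions for illustration, not tinyRNA's documented interface:

```python
import os
import subprocess


def build_resume_command(step: str) -> list:
    """Assemble a resume invocation; 'run_config.yml' is an assumed filename."""
    assert step in ("recount", "replot")
    return ["tiny", step, "--config", "run_config.yml"]


def resume(run_directory: str, step: str = "recount"):
    """Step 2: enter the target Run Directory; step 3: run the resume command."""
    os.chdir(run_directory)
    return subprocess.run(build_resume_command(step), check=True)
```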

### A Note on File Inputs
File inputs are sourced from the **original** output subdirectories of prior steps in the target Run Directory. For `tiny replot`, this means that files from previous executions of `tiny recount` will **not** be used as inputs; only the original end-to-end outputs are used.
### Auto-Documentation
Among the subdirectories produced in your Run Directory after an end-to-end run, you'll find a directory named "config" which holds a copy of the run's four primary configuration files. These files serve as documentation for the run and, unlike those found at the root of the Run Directory, they should not be modified. A timestamped "config" directory is created after each resume run to similarly document the configurations that were used.

### Where to Find Outputs from Resume Runs
### Resume Run Outputs
Output subdirectories for resume runs can be found alongside the originals, and will have a timestamp appended to their name to differentiate them.
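The timestamp suffix might be produced like this; the exact format string is an assumption:

```python
from datetime import datetime


def timestamped(dir_name: str, when: datetime) -> str:
    """Append a run timestamp to an output directory name (format assumed)."""
    return f"{dir_name}_{when.strftime('%Y-%m-%d_%H-%M-%S')}"
```

Because the fields are zero-padded, the resulting names sort chronologically when sorted lexicographically.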

### Auto-Documentation of Resume Runs
A new processed Run Config will be saved in the Run Directory at the beginning of each resume run. It will be labelled with the same timestamp used in the resume run's other outputs to differentiate it. It includes the changes to your Paths File and Run Config. A copy of your Features Sheet is saved to the timestamped tiny-count output directory during `tiny recount` runs.
### Repeated Analyses
If a `recount` run is performed and a `replot` is performed later in the same Run Directory, then only the outputs of the `recount` run are used for generating the plots. If multiple `recount` runs precede the `replot` then the most recent outputs are used.
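Picking the most recent recount outputs for a later `replot` could be sketched as follows; directory names here are hypothetical:

```python
def latest_count_dir(dir_names: list) -> str:
    """Prefer the newest timestamped tiny-count directory over the original.

    Zero-padded timestamps sort lexicographically, so max() picks the latest.
    """
    recounts = [d for d in dir_names if d.startswith("tiny-count_")]
    return max(recounts) if recounts else "tiny-count"
```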

## Parallelization
Most steps in the pipeline run in parallel to minimize runtimes. This is particularly advantageous for multiprocessor systems like server environments. However, parallelization isn't always beneficial. If your computer doesn't have enough free memory, or if you have a large sample file set and/or reference genome, parallel execution might push your machine to its limits. When this happens you might see memory errors or your computer may become unresponsive. In these cases it makes more sense to run resource intensive steps one at a time, in serial, rather than in parallel. To do so, set `run_parallel: false` in your Run Config. This will affect fastp, tiny-collapse, and bowtie since these steps typically handle the largest volumes of data.
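The serial-versus-parallel trade-off can be illustrated with a toy runner. This is not tinyRNA's scheduler; only the `run_parallel` flag mirrors the Run Config key:

```python
from concurrent.futures import ThreadPoolExecutor


def run_steps(step_fn, inputs, run_parallel: bool):
    """Run one step per input, either all at once or one at a time."""
    if run_parallel:
        with ThreadPoolExecutor() as pool:   # lower wall time, higher peak load
            return list(pool.map(step_fn, inputs))
    return [step_fn(x) for x in inputs]      # serial: gentler on memory
```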
Binary file modified images/recount.png
Binary file modified images/replot.png
18 changes: 11 additions & 7 deletions tests/testdata/config_files/run_config_template.yml
@@ -310,6 +310,7 @@ dir_name_tiny-count: tiny-count
dir_name_tiny-deseq: tiny-deseq
dir_name_tiny-plot: tiny-plot
dir_name_logs: logs
dir_name_config: config


######################### AUTOMATICALLY GENERATED CONFIGURATIONS #########################
@@ -320,7 +321,7 @@ dir_name_logs: logs
#
###########################################################################################

version: 1.4.0
version: 1.5.0

######--------------------------- DERIVED FROM PATHS FILE ---------------------------######
#
@@ -332,10 +333,10 @@ run_directory: ~
tmp_directory: ~
features_csv: { }
samples_csv: { }
paths_file: { }
gff_files: [ ]
run_bowtie_build: false
reference_genome_files: [ ]
bt_index_files: [ ]
plot_style_sheet: ~
adapter_fasta: ~
ebwt: ~
@@ -356,10 +357,6 @@ in_fq: [ ]
# output reports
fastp_report_titles: [ ]

###-- Utilized by bowtie --###
# bowtie index files
bt_index_files: [ ]

##-- Utilized by tiny-deseq.r --##
# The control for comparison. If unspecified, all comparisons are made
control_condition:
@@ -383,4 +380,11 @@ run_deseq: True
##-- Utilized by tiny-plot --##
# Filters for class scatter plots
plot_class_scatter_filter_include: []
plot_class_scatter_filter_exclude: []
plot_class_scatter_filter_exclude: []

##-- Used to populate the directory defined in dir_name_config --##
##-- CWL spec doesn't provide a way to get this info from within the workflow --##
processed_run_config: {}

##-- This is the paths_config key converted to a CWL file object for handling --##
paths_file: {}
14 changes: 14 additions & 0 deletions tiny/cwl/workflows/tinyrna_wf.cwl
@@ -15,6 +15,7 @@ inputs:
# multi input
threads: int?
run_name: string
processed_run_config: File
sample_basenames: string[]

# bowtie build
@@ -117,6 +118,7 @@ inputs:
dir_name_tiny-count: string
dir_name_tiny-deseq: string
dir_name_tiny-plot: string
dir_name_config: string

steps:

@@ -281,6 +283,14 @@ steps:
- sample_avg_scatter_by_dge
- sample_avg_scatter_by_dge_class

organize_config:
run: ../tools/make-subdir.cwl
in:
dir_files:
source: [ processed_run_config, paths_file, samples_csv, features_csv, plot_style_sheet ]
dir_name: dir_name_config
out: [ subdir ]

organize_bt_indexes:
run: ../tools/make-subdir.cwl
when: $(inputs.run_bowtie_build)
@@ -353,6 +363,10 @@ steps:
outputs:

# Subdirectory outputs
config_out_dir:
type: Directory
outputSource: organize_config/subdir

bt_build_out_dir:
type: Directory?
outputSource: organize_bt_indexes/subdir
32 changes: 16 additions & 16 deletions tiny/entry.py
@@ -189,32 +189,32 @@ def resume(tinyrna_cwl_path: str, config_file: str, step: str) -> None:

"""

# Maps step to Configuration class
entry_config = {
# Map step to Configuration class
resume_config_class = {
"tiny-count": ResumeCounterConfig,
"tiny-plot": ResumePlotterConfig
}
}[step]

print(f"Resuming pipeline execution at the {step} step...")

# Make appropriate config and workflow for this step; write modified workflow to disk
config = entry_config[step](config_file, f"{tinyrna_cwl_path}/workflows/tinyrna_wf.cwl")
resume_wf = f"{tinyrna_cwl_path}/workflows/tiny-resume.cwl"
config.write_workflow(resume_wf)
# The resume workflow is dynamically generated from the run workflow
base_workflow = f"{tinyrna_cwl_path}/workflows/tinyrna_wf.cwl" # The workflow to derive from
workflow_dyna = f"{tinyrna_cwl_path}/workflows/tiny-resume.cwl" # The dynamically generated workflow to write

config_object = resume_config_class(config_file, base_workflow)
config_object.write_processed_config(config_file)
config_object.write_workflow(workflow_dyna)

if config['run_native']:
# We can pass our config object directly without writing to disk first
run_cwltool_native(config, resume_wf)
if config_object['run_native']:
# Can pass the config object directly but still write to disk for autodocumentation
run_cwltool_native(config_object, workflow_dyna)
else:
# Processed Run Config must be written to disk first
resume_conf_file = config.get_outfile_path()
config.write_processed_config(resume_conf_file)
run_cwltool_subprocess(config, resume_wf)
run_cwltool_subprocess(config_object, workflow_dyna)

if os.path.isfile(resume_wf):
if os.path.isfile(workflow_dyna):
# We don't want the generated workflow to be returned by a call to setup-cwl
os.remove(resume_wf)

os.remove(workflow_dyna)

def run_cwltool_subprocess(config_object: 'ConfigBase', workflow: str, run_directory='.') -> int:
"""Executes the workflow using a command line invocation of cwltool
7 changes: 4 additions & 3 deletions tiny/rna/compatibility.py
@@ -68,9 +68,10 @@ def add_mapping(doc: CommentedMap, prec_key, key_obj):

# Comments & linebreaks are often (but not always!) attached to
# the preceding key. Move them down to the new key.
inherit_prev = doc.ca.items[prec_key][2]
doc.ca.items[key] = [None, None, inherit_prev, None]
doc.ca.items[prec_key][2] = None
if prec_key in doc.ca.items:
inherit_prev = doc.ca.items[prec_key][2]
doc.ca.items[key] = [None, None, inherit_prev, None]
doc.ca.items[prec_key][2] = None
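The guard added above can be exercised with a plain-dict stand-in for ruamel's comment table. This is a simulation, not the real `CommentedMap`:

```python
def move_comment(comment_items: dict, prec_key, new_key):
    """Transfer the trailing comment from prec_key to new_key, if one is recorded.

    comment_items stands in for doc.ca.items: key -> 4-slot comment list.
    """
    if prec_key in comment_items:  # the bugfix: keys aren't always listed
        inherited = comment_items[prec_key][2]
        comment_items[new_key] = [None, None, inherited, None]
        comment_items[prec_key][2] = None
```

Before the fix, a `prec_key` with no comment entry would raise a `KeyError`; the membership check makes the transfer a no-op instead.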


class RunConfigCompatibility:
Expand Down