diff --git a/README.md b/README.md index 21dfe51c..704ed172 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ tinyRNA is a set of tools to simplify the analysis of next-generation sequencing ### The Current Workflow -![tinyRNA basic pipeline](images/tinyrna-workflow_current.png) +![tinyRNA basic pipeline](images/tinyrna_workflow_current.png) ## tinyRNA Installation @@ -161,7 +161,7 @@ At the core of tinyRNA is tiny-count, a highly flexible counting utility that al A wrapper R script for DESeq2 facilitates DGE analysis of counted sample files. ### `tiny-plot` -The results of feature counting and DGE are visualized with high resolution plot PDFs. User-defined plot styles are also supported via a Matplotlib stylesheet. +The results of feature counting and DGE analysis are visualized with high resolution plot PDFs. User-defined plot styles are also supported via a Matplotlib stylesheet. [Full documentation for tiny-plot can be found here.](doc/tiny-plot.md) @@ -276,13 +276,14 @@ Simple static plots are generated from the outputs of tiny-count and tiny-deseq. tiny-deseq.r will produce a standard **PCA plot** from variance stabilizing transformed feature counts. This output is controlled by the `dge_pca_plot` key in the Run Config and by your experiment design. DGE outputs, including the PCA plot, will not be produced for experiments with less than 1 degree of freedom. ### Reducing Storage Usage -The files produced by certain steps can be very large and after several runs this may present significant storage usage. You can remove the following subdirectories from a Run Directory to free up space, but **you will no longer be able to perform repeat analyses within it (i.e. `tiny recount` or `tiny replot`)**: +The files produced by certain steps can be very large and after several runs this may present significant storage usage. 
You can remove the following subdirectories from a Run Directory to free up space, but **you will no longer be able to perform recount analyses within it** (i.e. `tiny recount`): - fastp (though we recommend keeping the reports) - collapser - bowtie Cleanup commands will be added to tinyRNA in a future release, but for now the following command will remove commonly large files while preserving report files: ```shell +# Execute within the Run Directory you want to clean rm {fastp/*.fastq,{collapser,bowtie}/*.fa,bowtie/*.sam} ``` diff --git a/START_HERE/paths.yml b/START_HERE/paths.yml index 782f2fb3..0246ca0e 100644 --- a/START_HERE/paths.yml +++ b/START_HERE/paths.yml @@ -22,7 +22,7 @@ gff_files: #- path: # alias: [ ] -##-- The final output directory for files produced by the pipeline --# +##-- The suffix to use in the final output directory name (optional) --# run_directory: run_directory ##-- The directory for temporary files. Determined by cwltool if blank. --## diff --git a/START_HERE/run_config.yml b/START_HERE/run_config.yml index eea5a674..50c49570 100644 --- a/START_HERE/run_config.yml +++ b/START_HERE/run_config.yml @@ -1 +1,392 @@ -######----------------------------- tinyRNA Configuration -----------------------------###### # # In this file you can specify your configuration preferences for the workflow and # each workflow step. # # If you want to use DEFAULT settings for the workflow, all you need to do is provide the path # to your Samples Sheet and Features Sheet in your Paths File, then make sure that the # 'paths_config' setting below points to your Paths File. # # We suggest that you also: # 1. Add a username to identify the person performing runs, if desired for record keeping # 2. Add a run directory name in your Paths File. If not provided, "run_directory" is used # 3. Add a run name to label your run directory and run-specific summary reports. # If not provided, user_tinyrna will be used. 
# # This file will be further processed at run time to generate the appropriate pipeline # settings for each workflow step. A copy of this processed configuration will be stored # in your run directory. # ######-------------------------------------------------------------------------------###### user: NewUser run_date: ~ run_time: ~ paths_config: ./paths.yml ##-- The label for final outputs --## ##-- If none provided, the default of user_tinyrna will be used --## run_name: tinyrna ##-- Number of threads to use when a step supports multi-threading --## ##-- For best performance, this should be equal to your computer's processor core count --## threads: 4 ##-- Control the amount of information printed to terminal: debug, normal, quiet --## verbosity: normal ##-- If True: process fastp, tiny-collapse, and bowtie in parallel per-library --## run_parallel: true ##-- (EXPERIMENTAL) If True: execute the pipeline using native cwltool Python --## run_native: false ######------------------------- BOWTIE INDEX BUILD OPTIONS --------------------------###### # # If you do not already have bowtie indexes, they can be built for you by setting # run_bowtie_build (above) to true and adding your reference genome file(s) to your # paths_config file. # # We have specified default parameters for small RNA data based on our own "best practices". # You can change the parameters here. # ######-------------------------------------------------------------------------------###### ##-- SA is sampled every 2^offRate BWT chars (default: 5) offrate: ~ ##-- Convert Ns in reference to As --## ntoa: false ##-- Don't build .3/.4.ebwt (packed reference) portion --## noref: false ##-- Number of chars consumed in initial lookup (default: 10) --## ftabchars: ~ ######---------------------TRIMMING AND QUALITY FILTER OPTIONS ----------------------###### # # We use the program fastp to perform: adapter trimming (req), quality filtering (on), # and QC analysis for an output QC report. 
See https://github.com/OpenGene/fastp for more # information on the fastp tool. We have limited the options available to those appropriate # for small RNA sequencing data. If you require an addition option, create an issue on the # pipeline github: https://github.com/MontgomeryLab/tinyrna # # We have specified default parameters for small RNA data based on our own "best practices". # You can change the parameters here. # ######-------------------------------------------------------------------------------###### ##-- Adapter sequence to trim --## adapter_sequence: 'auto' ##-- Minumum & maximum accepted lengths after trimming --## length_required: 15 length_limit: 35 ##-- Minimum average score for a read to pass quality filter --## average_qual: 25 ##-- Minimum phred score for a base to pass quality filter --## qualified_quality_phred: 20 ##-- Minimum % of bases that can be below minimum phred score (above) --## unqualified_percent_limit: 10 ##-- Minimum allowed number of bases --## n_base_limit: 1 ##-- Compression level for gzip output --## compression: 4 ###-- Unused optional inputs: Remove '#' in front to use --### ##-- Trim poly x tails of a given length --## # trim_poly_x: false # poly_x_min_len: 0 ##-- Trim n bases from the front/tail of a read --## # trim_front1: 0 # trim_tail1: 0 ##-- Is the data phred 64? --## # fp_phred64: False ##-- Turn on overrepresentation sampling analysis --## # overrepresentation_sampling: 0 # overrepresentation_analysis: false ##-- If true: don't overwrite the files --## # dont_overwrite: false ##-- If true: disable these options --## # disable_quality_filtering: false # disable_length_filtering: false # disable_adapter_trimming: false ######--------------------------- READ COLLAPSER OPTIONS ----------------------------###### # # We use a custom Python utility for collapsing duplicate reads. # We recommend using the default (keep all reads, or threshold: 0). # Sequences <= threshold will not be included in downstream steps. 
# Trimming takes place prior to counting/collapsing. # # We have specified default parameters for small RNA data based on our own "best practices". # You can change the parameters here. # ######-------------------------------------------------------------------------------###### ##-- Trim the specified number of bases from the 5' end of each sequence --## 5p_trim: 0 ##-- Trim the specified number of bases from the 3' end of each sequence --## 3p_trim: 0 ##-- Sequences with count <= threshold will be placed in a separate low_counts fasta --## threshold: 0 ##-- If True: outputs will be gzip compressed --## compress: False ######-------------------------- BOWTIE ALIGNMENT OPTIONS ---------------------------###### # # We use bowtie for read alignment to a genome. # # We have specified default parameters for small RNA data based on our own "best practices". # You can change the parameters here. # ######-------------------------------------------------------------------------------###### ##-- Report end-to-end hits w/ <=v mismatches; ignore qualities --## end_to_end: 0 ##-- Report all alignments per read (much slower than low -k) --## all_aln: True ##-- Seed for random number generator --## seed: 0 ##-- Suppress SAM records for unaligned reads --## no_unal: True ##-- Use shared mem for index; many bowtie's can share --## ##-- Note: this requires further configuration of your OS --## ##-- http://bowtie-bio.sourceforge.net/manual.shtml#bowtie-options-shmem --## shared_memory: False ###-- Unused option inputs: Remove '#' in front to use --### ##-- Hits are guaranteed best stratum, sorted; ties broken by quality --## #best: False ##-- Hits in sub-optimal strata aren't reported (requires best, ^^^^) --## #strata: False ##-- Max mismatches in seed (can be 0-3, default: -n 2) --## #seedmms: 2 ##-- Seed length for seedmms (default: 28) --## #seedlen: 28 ##-- Do not align to reverse-compliment reference --## # norc: False ##-- Do not align to forward reference --## # nofw: False 
##-- Input quals are Phred+64 (same as --solexa1.3-quals) --## # bt_phred64: False ##-- Report up to good alignments per read (default: 1) --## # k_aln ##-- Number of bases to trim from 5' or 3' end of reads --## # trim5: 0 # trim3: 0 ##-- Input quals are from GA Pipeline ver. < 1.3 --## # solexa: false ##-- Input quals are from GA Pipeline ver. >= 1.3 --## # solexa13: false ######--------------------------- FEATURE COUNTER OPTIONS ---------------------------###### # # We use a custom Python utility that utilizes HTSeq's Genomic Array of Sets and GFF reader # to count small RNA reads. Selection rules are defined in your Features Sheet. # ######-------------------------------------------------------------------------------###### ##-- If True: show all parsed features in the counts csv, regardless of count/identity --## counter_all_features: False ##-- If True: counts will be normalized by genomic hits AND selected feature count --## ##-- If False: counts will only be normalized by genomic hits --## counter_normalize_by_hits: True ##-- If True: a decollapsed copy of each SAM file will be produced (useful for IGV) --## counter_decollapse: False ##-- Select the StepVector implementation that is used. Options: HTSeq or Cython --## counter_stepvector: 'Cython' ##-- If True: produce diagnostic logs to indicate what was eliminated and why --## counter_diags: False ######--------------------------- DIFFERENTIAL EXPRESSION ---------------------------###### # # Differential expression analysis is performed using the DESeq2 R library. 
# ######-------------------------------------------------------------------------------###### ##-- If True: produce a principal component analysis plot from the input dataset --## dge_pca_plot: True ##-- If True: before analysis, drop features which have a zero count across all samples --## dge_drop_zero: False ######-------------------------------- PLOTTING OPTIONS -----------------------------###### # # We use a custom Python script for creating all plots. If you wish to use another matplotlib # stylesheet you can specify that in the Paths File. # # We have specified default parameters for small RNA data based on our own "best practices". # You can change the parameters here. # ######-------------------------------------------------------------------------------###### ##-- Enable plots by uncommenting (removing the '#') for the desired plot type --## ##-- Disable plots by commenting (adding a '#') for the undesired plot type --## plot_requests: - 'len_dist' - 'rule_charts' - 'class_charts' - 'replicate_scatter' - 'sample_avg_scatter_by_dge' - 'sample_avg_scatter_by_dge_class' ##-- You can set a custom P value to use in DGE scatter plots. Default: 0.05 --## plot_pval: ~ ##-- If True: scatter plot points will be vectorized. 
If False, only points are raster --## plot_vector_points: False ##-- Optionally set the min and/or max lengths for len_dist plots; auto if unset --## plot_len_dist_min: plot_len_dist_max: ##-- Optionally set the log2 min and/or max view limits for scatter_by_dge plots; auto if unset --## plot_dge_scatter_min: plot_dge_scatter_max: ##-- Use this label in class plots for counts assigned by rules lacking a classifier --## plot_unknown_class: "_UNKNOWN_" ##-- Use this label in class plots for unassigned counts --## plot_unassigned_class: "_UNASSIGNED_" ##-- Optionally filter the classes in class scatter plots --## plot_class_scatter_filter: style: include # Choose: include or exclude classes: [] # Add classes between [ and ], separated by comma ######----------------------------- OUTPUT DIRECTORIES ------------------------------###### # # Outputs for each step are organized into their own subdirectories in your run # directory. You can set these folder names here. # ######-------------------------------------------------------------------------------###### dir_name_bt_build: bowtie-build dir_name_fastp: fastp dir_name_tiny-collapse: tiny-collapse dir_name_bowtie: bowtie dir_name_tiny-count: tiny-count dir_name_tiny-deseq: tiny-deseq dir_name_tiny-plot: tiny-plot dir_name_logs: logs ######################### AUTOMATICALLY GENERATED CONFIGURATIONS ######################### # # Do not make any changes to the following sections. These options are automatically # generated using your Paths File, your Samples and Features sheets, and the above # settings in this file. 
# ########################################################################################### version: 1.2.2 ######--------------------------- DERIVED FROM PATHS FILE ---------------------------###### # # The following configuration settings are automatically derived from the Paths File # ######-------------------------------------------------------------------------------###### run_directory: ~ tmp_directory: ~ features_csv: { } samples_csv: { } paths_file: { } gff_files: [ ] run_bowtie_build: false reference_genome_files: [ ] plot_style_sheet: ~ adapter_fasta: ~ ebwt: ~ ######------------------------- DERIVED FROM SAMPLES SHEET --------------------------###### # # The following configuration settings are automatically derived from the Samples Sheet # ######-------------------------------------------------------------------------------###### ##-- Utilized by fastp, tiny-collapse, and bowtie --## sample_basenames: [ ] ##-- Utilized by fastp --## # input fastq files in_fq: [ ] # output reports fastp_report_titles: [ ] ###-- Utilized by bowtie --### # bowtie index files bt_index_files: [ ] ##-- Utilized by tiny-deseq.r --## # The control for comparison. 
If unspecified, all comparisons are made control_condition: # If the experiment design yields less than one degree of freedom, tiny-deseq.r is skipped run_deseq: True ######------------------------- DERIVED FROM FEATURES SHEET -------------------------###### # # The following configuration settings are automatically derived from the Features Sheet # ######-------------------------------------------------------------------------------###### ######--------------------------- DERIVED FROM RUN CONFIG ---------------------------###### # # The following configuration settings are automatically derived from this file # ######-------------------------------------------------------------------------------###### ##-- Utilized by tiny-plot --## # Filters for class scatter plots plot_class_scatter_filter_include: [] plot_class_scatter_filter_exclude: [] \ No newline at end of file +######----------------------------- tinyRNA Configuration -----------------------------###### +# +# In this file you can specify your configuration preferences for the workflow. +# +# If you want to use DEFAULT settings for the workflow, all you need to do is provide the path +# to your Samples Sheet and Features Sheet in your Paths File, then make sure that the +# 'paths_config' setting below points to your Paths File. +# +# We suggest that you also: +# 1. Add a `user` below to identify the person performing runs. +# 2. Add a `run_name` below to prefix your run directory and summary reports. +# If not provided, user_tinyrna is used. +# +# This file will be processed at run time to generate the appropriate settings +# for each workflow step. A copy of the processed configuration will be saved +# in your run directory for auto-documentation. 
+# +######-------------------------------------------------------------------------------###### + +user: +run_date: ~ +run_time: ~ +paths_config: ./paths.yml + +##-- The label for final outputs --## +##-- If none provided, the default of user_tinyrna will be used --## +run_name: + +##-- Number of threads to use when a step supports multi-threading --## +##-- For best performance, this should be equal to your computer's processor core count --## +threads: 4 + +##-- Control the amount of information printed to terminal: debug, normal, quiet --## +verbosity: normal + +##-- If True: process fastp, tiny-collapse, and bowtie in parallel per-library --## +run_parallel: true + +##-- (EXPERIMENTAL) If True: execute the pipeline using native cwltool Python --## +run_native: false + +######------------------------- BOWTIE INDEX BUILD OPTIONS --------------------------###### +# +# If you do not already have bowtie indexes, they can be built for you by setting +# run_bowtie_build (above) to true and adding your reference genome file(s) to your +# paths_config file. +# +# We have specified default parameters for small RNA data based on our own "best practices". +# You can change the parameters here. +# +######-------------------------------------------------------------------------------###### + + +##-- SA is sampled every 2^offRate BWT chars (default: 5) +offrate: ~ + +##-- Convert Ns in reference to As --## +ntoa: false + +##-- Don't build .3/.4.ebwt (packed reference) portion --## +noref: false + +##-- Number of chars consumed in initial lookup (default: 10) --## +ftabchars: ~ + + +######---------------------TRIMMING AND QUALITY FILTER OPTIONS ----------------------###### +# +# We use the program fastp to perform: adapter trimming (req), quality filtering (on), +# and QC analysis for an output QC report. See https://github.com/OpenGene/fastp for more +# information on the fastp tool. We have limited the options available to those appropriate +# for small RNA sequencing data. 
If you require an additional option, create an issue on the +# pipeline github: https://github.com/MontgomeryLab/tinyrna +# +# We have specified default parameters for small RNA data based on our own "best practices". +# You can change the parameters here. +# +######-------------------------------------------------------------------------------###### + + +##-- Adapter sequence to trim --## +adapter_sequence: 'auto' + +##-- Minimum & maximum accepted lengths after trimming --## +length_required: 15 +length_limit: 35 + +##-- Minimum average score for a read to pass quality filter --## +average_qual: 25 + +##-- Minimum phred score for a base to pass quality filter --## +qualified_quality_phred: 20 + +##-- Maximum % of bases that can be below minimum phred score (above) --## +unqualified_percent_limit: 10 + +##-- Maximum allowed number of N bases --## +n_base_limit: 1 + +##-- Compression level for gzip output --## +compression: 4 + +###-- Unused optional inputs: Remove '#' in front to use --### +##-- Trim poly x tails of a given length --## +# trim_poly_x: false +# poly_x_min_len: 0 + +##-- Trim n bases from the front/tail of a read --## +# trim_front1: 0 +# trim_tail1: 0 + +##-- Is the data phred 64? --## +# fp_phred64: False + +##-- Turn on overrepresentation sampling analysis --## +# overrepresentation_sampling: 0 +# overrepresentation_analysis: false + +##-- If true: don't overwrite the files --## +# dont_overwrite: false + +##-- If true: disable these options --## +# disable_quality_filtering: false +# disable_length_filtering: false +# disable_adapter_trimming: false + + +######--------------------------- READ COLLAPSER OPTIONS ----------------------------###### +# +# We use a custom Python utility for collapsing duplicate reads. +# We recommend using the default (keep all reads, or threshold: 0). +# Sequences <= threshold will not be included in downstream steps. +# Trimming takes place prior to counting/collapsing. 
+# +# We have specified default parameters for small RNA data based on our own "best practices". +# You can change the parameters here. +# +######-------------------------------------------------------------------------------###### + +##-- Trim the specified number of bases from the 5' end of each sequence --## +5p_trim: 0 + +##-- Trim the specified number of bases from the 3' end of each sequence --## +3p_trim: 0 + +##-- Sequences with count <= threshold will be placed in a separate low_counts fasta --## +threshold: 0 + +##-- If True: outputs will be gzip compressed --## +compress: False + + +######-------------------------- BOWTIE ALIGNMENT OPTIONS ---------------------------###### +# +# We use bowtie for read alignment to a genome. +# +# We have specified default parameters for small RNA data based on our own "best practices". +# You can change the parameters here. +# +######-------------------------------------------------------------------------------###### + + +##-- Report end-to-end hits w/ <=v mismatches; ignore qualities (option -v) --## +end_to_end: 0 + +##-- Report all alignments per read (much slower than low -k) (option --all) --## +all_aln: True + +##-- Seed for random number generator (option --seed) --## +seed: 0 + +##-- Suppress SAM records for unaligned reads (option --no-unal) --## +no_unal: True + +##-- Use shared mem for index; many bowtie's can share (option --shmem) --## +##-- Note: this requires further configuration of your OS --## +##-- http://bowtie-bio.sourceforge.net/manual.shtml#bowtie-options-shmem --## +shared_memory: False + +###-- Unused option inputs: Remove '#' in front to use --### +##-- Hits are guaranteed best stratum, sorted; ties broken by quality (option --best) --## +#best: False + +##-- Hits in sub-optimal strata aren't reported (requires best, ^^^^) (option --strata) --## +#strata: False + +##-- Max mismatches in seed (can be 0-3, default: -n 2) (option --seedmms) --## +#seedmms: 2 + +##-- Seed length for seedmms 
(default: 28) (option --seedlen) --## +#seedlen: 28 + +##-- Do not align to the reverse-complement/forward reference strand (options --norc and --nofw) --## +#norc: False +#nofw: False + +##-- Input quals are Phred+64 (same as --solexa1.3-quals) (option --phred64-quals) --## +#bt_phred64: False + +##-- Report up to N good alignments per read (default: 1) (option -k) --## +#k_aln: 1 + +##-- Suppress all alignments for a read if more than N exist (default: no limit) (option -m) --## +#suppress_aln: 10 + +##-- Trim bases from 5' (left) or 3' (right) end of reads (options --trim5 and --trim3) --## +#trim5: 0 +#trim3: 0 + +##-- Input quals are from GA Pipeline ver. < 1.3 (option --solexa-quals) --## +#solexa: false + +##-- Input quals are from GA Pipeline ver. >= 1.3 (option --solexa1.3-quals) --## +#solexa13: false + + +######--------------------------- FEATURE COUNTER OPTIONS ---------------------------###### +# +# We use a custom Python utility that utilizes HTSeq's Genomic Array of Sets and GFF reader +# to count small RNA reads. Selection rules are defined in your Features Sheet. +# +######-------------------------------------------------------------------------------###### + + +##-- If True: show all parsed features in the counts csv, regardless of count/identity --## +counter_all_features: False + +##-- If True: counts will be normalized by genomic hits AND selected feature count --## +##-- If False: counts will only be normalized by genomic hits --## +counter_normalize_by_hits: True + +##-- If True: a decollapsed copy of each SAM file will be produced (useful for IGV) --## +counter_decollapse: False + +##-- Select the StepVector implementation that is used. 
Options: HTSeq or Cython --## +counter_stepvector: 'Cython' + +##-- If True: produce diagnostic logs to indicate what was eliminated and why --## +counter_diags: False + + +######--------------------------- DIFFERENTIAL EXPRESSION ---------------------------###### +# +# Differential expression analysis is performed using the DESeq2 R library. +# +######-------------------------------------------------------------------------------###### + + +##-- If True: produce a principal component analysis plot from the input dataset --## +dge_pca_plot: True + +##-- If True: before analysis, drop features which have a zero count across all samples --## +dge_drop_zero: False + + +######-------------------------------- PLOTTING OPTIONS -----------------------------###### +# +# We use a custom Python script for creating all plots. If you wish to use another matplotlib +# stylesheet you can specify that in the Paths File. +# +# We have specified default parameters for small RNA data based on our own "best practices". +# You can change the parameters here. +# +######-------------------------------------------------------------------------------###### + + +##-- Enable plots by uncommenting (removing the '#') for the desired plot type --## +##-- Disable plots by commenting (adding a '#') for the undesired plot type --## +plot_requests: + - 'len_dist' + - 'rule_charts' + - 'class_charts' + - 'replicate_scatter' + - 'sample_avg_scatter_by_dge' + - 'sample_avg_scatter_by_dge_class' + +##-- You can set a custom P value to use in DGE scatter plots. Default: 0.05 --## +plot_pval: ~ + +##-- If True: scatter plot points will be vectorized. 
If False, only points are raster --## +plot_vector_points: False + +##-- Optionally set the min and/or max lengths for len_dist plots; auto if unset --## +plot_len_dist_min: +plot_len_dist_max: + +##-- Optionally set the log2 min and/or max view limits for scatter_by_dge plots; auto if unset --## +plot_dge_scatter_min: +plot_dge_scatter_max: + +##-- Use this label in class plots for counts assigned by rules lacking a classifier --## +plot_unknown_class: "_UNKNOWN_" + +##-- Use this label in class plots for unassigned counts --## +plot_unassigned_class: "_UNASSIGNED_" + +##-- Optionally filter the classes in class scatter plots --## +plot_class_scatter_filter: + style: include # Choose: include or exclude + classes: [] # Add classes between [ and ], separated by comma + + +######----------------------------- OUTPUT DIRECTORIES ------------------------------###### +# +# Outputs for each step are organized into their own subdirectories in your run +# directory. You can set these folder names here. +# +######-------------------------------------------------------------------------------###### + + +dir_name_bt_build: bowtie-build +dir_name_fastp: fastp +dir_name_tiny-collapse: tiny-collapse +dir_name_bowtie: bowtie +dir_name_tiny-count: tiny-count +dir_name_tiny-deseq: tiny-deseq +dir_name_tiny-plot: tiny-plot +dir_name_logs: logs + + +######################### AUTOMATICALLY GENERATED CONFIGURATIONS ######################### +# +# Do not make any changes to the following sections. These options are automatically +# generated using your Paths File, your Samples and Features sheets, and the above +# settings in this file. 
+# +########################################################################################### + +version: 1.2.2 + +######--------------------------- DERIVED FROM PATHS FILE ---------------------------###### +# +# The following configuration settings are automatically derived from the Paths File +# +######-------------------------------------------------------------------------------###### + +run_directory: ~ +tmp_directory: ~ +features_csv: { } +samples_csv: { } +paths_file: { } +gff_files: [ ] +run_bowtie_build: false +reference_genome_files: [ ] +plot_style_sheet: ~ +adapter_fasta: ~ +ebwt: ~ + + +######------------------------- DERIVED FROM SAMPLES SHEET --------------------------###### +# +# The following configuration settings are automatically derived from the Samples Sheet +# +######-------------------------------------------------------------------------------###### + +##-- Utilized by fastp, tiny-collapse, and bowtie --## +sample_basenames: [ ] + +##-- Utilized by fastp --## +# input fastq files +in_fq: [ ] +# output reports +fastp_report_titles: [ ] + +###-- Utilized by bowtie --### +# bowtie index files +bt_index_files: [ ] + +##-- Utilized by tiny-deseq.r --## +# The control for comparison. 
If unspecified, all comparisons are made +control_condition: +# If the experiment design yields less than one degree of freedom, tiny-deseq.r is skipped +run_deseq: True + +######------------------------- DERIVED FROM FEATURES SHEET -------------------------###### +# +# The following configuration settings are automatically derived from the Features Sheet +# +######-------------------------------------------------------------------------------###### + + + +######--------------------------- DERIVED FROM RUN CONFIG ---------------------------###### +# +# The following configuration settings are automatically derived from this file +# +######-------------------------------------------------------------------------------###### + +##-- Utilized by tiny-plot --## +# Filters for class scatter plots +plot_class_scatter_filter_include: [] +plot_class_scatter_filter_exclude: [] \ No newline at end of file diff --git a/doc/Configuration.md b/doc/Configuration.md index 94c23f66..ab26bfa6 100644 --- a/doc/Configuration.md +++ b/doc/Configuration.md @@ -92,7 +92,7 @@ When the pipeline starts up, tinyRNA will process the Run Config based on the co ## Paths File Details ### GFF Files -GFF annotations are required by tinyRNA. For each file, you can optionally provide an `alias` which is a list of attributes to represent each feature in the Feature Name column of output counts tables. Each entry under the `gff_files` parameter must look something like the following mock example: +GFF annotations are optional but recommended. If not provided, tiny-count will perform [sequence-based counting](tiny-count.md#sequence-based-counting-mode) rather than feature-based counting. For each file, you can optionally provide an `alias` which is a list of attributes to represent each feature in the Feature Name column of output counts tables. 
Each entry under the `gff_files` parameter must look something like the following mock example: ```yaml - path: 'a/path/to/your/file.gff' # 0 spaces before - alias: [optional, list, of attributes] # 2 spaces before alias @@ -112,9 +112,11 @@ Once your indexes have been built, your Paths File will be modified such that `e ### The Run Directory The final output directory name has three components: -- The `run_name` defined in your Run Config -- The date and time at pipeline startup -- The `run_directory` basename defined in your Paths File +1. The `run_name` defined in your Run Config +2. The date and time at pipeline startup +3. The basename of `run_directory` defined in your Paths File + +The `run_directory` suffix in the Paths File supports subdirectories; if provided, the final output directory will be named as indicated above, but the subdirectory structure specified in `run_directory` will be retained. ## Samples Sheet Details | _Column:_ | FASTQ/SAM Files | Sample/Group Name | Replicate Number | Control | Normalization | @@ -122,14 +124,16 @@ The final output directory name has three components: | _Example:_ | cond1_rep1.fastq.gz | condition1 | 1 | True | RPM | ### Assigning the Control Group -Assigning the control group allows the proper DGE comparisons to be made and plotted. The Control column is where you'll make this indication by writing `true` on any corresponding row. Regardless of the number of replicates in each group, only one associated row needs to have this indication. Do not write `false` or anything else for the other groups; this column should only be used to indicate the affirmative. +Assigning the control group allows the proper DGE comparisons to be made and plotted. The Control column is where you'll make this indication by writing `true` on any corresponding row. Regardless of the number of replicates in each group, only one row needs to have this indication. + +tinyRNA doesn't support experiments with more than one control condition. 
However, if you omit all control condition labels then every possible comparison will be made which should include the desired comparisons. ### Applying Custom Normalization Custom normalization can be applied at the conclusion of feature counting using the Normalization column. Unlike the Control column, values in the Normalization column apply to the specific library that they share a row with. Supported values are: - **Blank or 1**: no normalization is applied to the corresponding library -- **Any number**: the corresponding library's counts are divided by this number +- **Any number**: the corresponding library's counts are divided by this number (useful for spike-in normalization) - **RPM or rpm**: the corresponding library's counts are divided by (its mapped read count / 1,000,000) >**NOTE**: These normalizations operate independently of tiny-count's --normalize-by-hits commandline option. The former is concerned with per-library normalization, whereas the latter is concerned with normalization by selected feature count at each locus ([more info](tiny-count.md#count-normalization)). The commandline option does not enable or disable the normalizations detailed above. @@ -138,17 +142,37 @@ Supported values are: DESeq2 requires that your experiment design has at least one degree of freedom. If your experiment doesn't include at least one sample group with more than one replicate, tiny-deseq.r will be skipped and DGE related plots will not be produced. ## Features Sheet Details -| _Column:_ | Select for... | with value... | Classify as... 
| Source Filter | Type Filter | Hierarchy | Strand | 5' End Nucleotide | Length | Overlap | -|------------|---------------|---------------|----------------|----------------|-------------|-----------|--------|-------------------|--------|-------------| -| _Example:_ | Class | miRNA | miRNA | | | 1 | sense | all | all | 5' anchored | - -The Features Sheet allows you to define selection rules that determine how features are chosen when multiple features are found overlap an alignment locus. Selected features are "assigned" a portion of the reads associated with the alignment. - -Selection first takes place against the feature attributes defined in your GFF files, and is directed by defining the attribute you want to be considered (Select for...) and the acceptable values for that attribute (with value...). - -Rules that match features in the first stage of selection will be used in a second stage which evaluates alignment vs. feature interval overlap. These matches are sorted by hierarchy value and passed to the third and final stage of selection which examines characteristics of the alignment itself: strand relative to the feature of interest, 5' end nucleotide, and length. - -See [tiny-count's documentation](tiny-count.md#feature-selection) for an explanation of each column. +![Features Sheet Header](../images/features_sheet_header.png) + +The Features Sheet allows you to define selection rules that control how reads are assigned to features. We refer to each row as a rule, and to each column as a selector. `Classify as...` isn't a selector because it is used for labelling and subsetting matches rather than determining them. See [tiny-count's documentation](tiny-count.md#feature-selection) for an explanation of the selection process and the role that each selector plays. + +### Selector Formats +Selectors in the Features Sheet can be specified as a single value, a list of comma separated values, a range, or a wildcard. The supported formats vary from selector to selector.
For list and range formats, just one of the specified values has to match for the target to be selected. Wildcard formats can be implicitly defined with a blank cell, or explicitly defined using the example keywords below. + +| Selector | Wildcard | Single | List | Range | +|:----------------|:--------:|:------:|:----:|:-----:| +| `Select for...` | ✓ | ✓ | | | +| `with value...` | ✓ | ✓ | | | +| `Source Filter` | ✓ | ✓ | ✓ | | +| `Type Filter` | ✓ | ✓ | ✓ | | +| `Hierarchy` | | ✓ | | | +| `Overlap` | ✓ | ✓ | | | +| `Strand` | ✓ | ✓ | | | +| `5' nt` | ✓ | ✓ | ✓ | | +| `Length` | ✓ | ✓ | ✓ | ✓ | + +Examples: +- **Wildcard** †: `any`, `all`, `*`, or a blank cell +- **Single**: `G` or `22` +- **List**: `C,G,U` or `25, 26` (spaces do not matter) +- **Range**: `20-25` +- **Mixed** §: `19, 21-23, 25-30` + +† the `Strand` selector also supports `both`
+§ only supported by the `Length` selector + +### Case Sensitivity +All selectors are case-insensitive. ## Plot Stylesheet Details Matplotlib uses key-value "rc parameters" to allow for customization of its properties and styles, and one way these parameters can be specified is with a [matplotlibrc file](https://matplotlib.org/3.4.3/tutorials/introductory/customizing.html#a-sample-matplotlibrc-file), which we simply refer to as the Plot Stylesheet. You can obtain a copy of the default stylesheet used by tiny-plot with the command `tiny get-templates`. Please keep in mind that tiny-plot overrides these defaults for a few specific elements of certain plots. Feel free to reach out if there is a plot style you wish to override but find you are unable to. \ No newline at end of file diff --git a/doc/Pipeline.md b/doc/Pipeline.md index 92a617aa..1fcdf495 100644 --- a/doc/Pipeline.md +++ b/doc/Pipeline.md @@ -32,7 +32,8 @@ The commands `tiny recount` and `tiny replot` seek to solve this problem. As dis You can modify the behavior of a resume run by changing settings in: - The **processed** Run Config -- The **original** Features Sheet that was used for the end-to-end run (as indicated by the `features_csv` key in the **processed** Run Config) +- The **original** Features Sheet that was used for the end-to-end run (as indicated by `features_csv` in the processed Run Config) +- The **original** Paths File (as indicated by `paths_config` in the processed Run Config) ### The Steps 1. Make and save the desired changes in the files above @@ -45,6 +46,9 @@ File inputs are sourced from the **original** output subdirectories of prior ste ### Where to Find Outputs from Resume Runs Output subdirectories for resume runs can be found alongside the originals, and will have a timestamp appended to their name to differentiate them. +### Auto-Documentation of Resume Runs +A new processed Run Config will be saved in the Run Directory at the beginning of each resume run. 
It will be labelled with the same timestamp used in the resume run's other outputs to differentiate it. It includes the changes to your Paths File and Run Config. A copy of your Features Sheet is saved to the timestamped tiny-count output directory during `tiny recount` runs. + ## Parallelization Most steps in the pipeline run in parallel to minimize runtimes. This is particularly advantageous for multiprocessor systems like server environments. However, parallelization isn't always beneficial. If your computer doesn't have enough free memory, or if you have a large sample file set and/or reference genome, parallel execution might push your machine to its limits. When this happens you might see memory errors or your computer may become unresponsive. In these cases it makes more sense to run resource intensive steps one at a time, in serial, rather than in parallel. To do so, set `run_parallel: false` in your Run Config. This will affect fastp, tiny-collapse, and bowtie since these steps typically handle the largest volumes of data. diff --git a/doc/tiny-count.md b/doc/tiny-count.md index c23300f8..219af31a 100644 --- a/doc/tiny-count.md +++ b/doc/tiny-count.md @@ -15,7 +15,7 @@ While third-party SAM files from non-collapsed reads are supported, there are so # Feature Selection -We provide a Features Sheet (`features.csv`) in which you can define selection rules to more accurately capture counts for the small RNAs of interest. The parameters for these rules include attributes commonly used in the classification of small RNAs, such as length, strandedness, and 5' nucleotide. +We provide a [Features Sheet](Configuration.md#features-sheet-details) (`features.csv`) in which you can define selection rules to more accurately capture counts for the small RNAs of interest. The parameters for these rules include attributes commonly used in the classification of small RNAs, such as length, strandedness, and 5' nucleotide. 
>**Important**: candidate features do not receive counts if they do not pass the selection process described below @@ -25,39 +25,33 @@ Selection occurs in three stages, with the output of each stage as input to the 3. Finally, features are selected for read assignment based on the small RNA attributes of the alignment locus. Once reads are assigned to a feature, they are excluded from matches with larger hierarchy values. ![Feature Selection Diagram](../images/tiny-count_selection.png) + +## Sequence-Based Counting Mode +If GFF files aren't specified in your Paths File, Stage 1 selection is skipped and reads are counted by sequence rather than by feature. Reference sequence names and lengths are determined from the `@SQ` headers of input SAM files. In Stages 2 and 3, these reference sequences behave like features that had matched every rule in Stage 1. ## Stage 1: Feature Attribute Parameters -| _features.csv columns:_ | Select for... | with value... | Classify as... | Source Filter | Type Filter | -|-------------------------|---------------|---------------|----------------|---------------|-------------| - -Each feature's column 9 attributes are searched for the key-value combinations defined in the `Select for...` and `with value...` columns. These matches are then filtered based on their source (GFF column 2) and type (GFF column 3) according to the corresponding rule's `Source Filter`, and `Type Filter`. Features, and the rules they matched, are retained for later evaluation at alignment loci in Stages 2 and 3. - -#### Feature Classification -You can optionally specify a classifier for each rule. These classifiers are later used to group and label counts in class-related plots. Features that match rules with a classifier are counted separately; the classifier becomes part of the feature's ID to create a distinct "sub-feature", and these sub-features continue to be treated as distinct in downstream DGE analysis. 
Classified features receive counts exclusively from the rule(s) which hold the same `Classify as...` value. Counts from multiple rules can be pooled by using the same classifier. In the final counts table, this value is displayed in the Classifier column of features matching the rule, and each feature-classifier pair is shown on its own row. +| _Features Sheet Selectors:_ | Select for... | with value... | Classify as... | Source Filter | Type Filter | +|-----------------------------|---------------|---------------|----------------|---------------|-------------| -#### Source and Type Filters -These are inclusive filters, meaning a feature's source/type must match one of the values in these columns to pass Stage 1 selection. If these fields are left empty, or any wildcard value is used, then the feature's source/type is not evaluated. +This stage deals with GFF parsing. Features are selected and classified based on their source, type, and attributes (GFF columns 2, 3, and 9) using the selectors listed above. If you do not have or wish to use GFF annotations for your experiment, [see sequence-based counting](#sequence-based-counting-mode). -#### Value Lists -Attribute keys are allowed to have multiple comma separated values, and these values are treated as a list; only one of the listed values needs to match the `with value...` to be considered a valid match to the rule. For example, if a rule contained `Class` and `WAGO` in these columns, then a feature with attributes `... ;Class=CSR,WAGO; ...` would be considered a match for the rule. Value lists can also be used in the `Source/Type Filter` fields. +Each feature's attributes (column 9) are searched for pairs matching each rule's `Select for...` and `with value...` selectors. Feature matches are then filtered based on the source (column 2) and type (column 3) requirements of their matching rules' `Source Filter` and `Type Filter` selectors. ->**Tip**: The rules defined in your Features Sheet are case-insensitive. 
You do not need to match the capitalization of your target attributes. +#### Feature Classification +You can optionally specify a classifier for each rule using the `Classify as...` column. In tiny-count, the classifier is used to subset reads from individual features. If a feature matches multiple rules with different classifiers, then each classification of that feature is counted separately, per the matching rule, and will have its own entry in the output counts table. If the rules instead share the same classifier, then counts contributed from each rule are pooled under the same classification of that feature. Each subclassification of a feature is treated as a distinct "feature" during DGE analysis in tiny-deseq, and in tiny-plot the classifier is used to group and label counts in class-related plots. -#### Wildcard Support -Wildcard values (`all`, `*`, or an empty cell) can be used in the `Select for...`, `with value...`, `Source Filter`, and `Type Filter` fields. With this functionality you can evaluate features for the presence of an attribute key without regarding its values, or you can check all attribute keys for the presence of a specific value, or you can skip Stage 1 selection altogether to permit the evaluation of the complete feature set in Stage 2. In the later case, feature-rule matching pairs still serve as the basis for selection; each rule still applies only to its matching subset from previous Stages. +#### Attribute Value Lists +Feature attributes (GFF column 9) with multiple comma separated values are treated as a list. Only one of the listed values needs to match the `with value...` selector for the feature to be considered a match to the rule. For example, if a rule has `Select for: Class` `with value: WAGO` in these columns, then a feature with attribute `Class=CSR,WAGO` would be considered a match for the rule. 
## Stage 2: Overlap and Hierarchy Parameters -| _features.csv columns:_ | Hierarchy | Overlap | -|-------------------------|-----------|---------| +| _Features Sheet Selectors:_ | Hierarchy | Overlap | +|-----------------------------|-----------|---------| Features overlapping a read alignment are selected based on their overlap characteristics. These matches are then sorted by hierarchy value before proceeding to Stage 3. ### Overlap This column allows you to specify the extent of overlap required for candidate feature selection. In order to be a candidate, a feature must reside on the same chromosome as the alignment and overlap its interval by at least 1 nucleotide. A shared strand is not required. See the [Strand](#strand) section in Stage 3 for selection by strand. -#### Unstranded Features - If these features match rules with `5' anchored` and `3' anchored` overlap selectors, they will be downgraded to `anchored` selectors. Alignments overlapping these features are evaluated for shared start and/or end coordinates, but 5' and 3' ends are not distinguished. - #### Selector Demonstration The following table provides a description and illustration of the available overlap selectors. All matches apply to features on either strand, i.e. matches shown below the antisense strand also apply, as shown, to the feature on the sense strand, and vice versa. @@ -73,6 +67,25 @@ The following table provides a description and illustration of the available ove :people_holding_hands: Illustration colors have been selected for colorblindness accessibility. +#### Shift Parameters +An optional shift parameter can be provided for each overlap selector which changes the position of the 5' and/or 3' terminus of its feature matches. The shifted interval replaces the original for the given match, and its candidature and selection are based on this new interval. A feature matching both shifted and unshifted rules will retain its original interval for non-shifted matches.
+ + +The shift parameter can be specified as: +``` +selector, M, N + M = shift value for 5' end + N = shift value for 3' end +``` + +- Positive values shift the specified end in the 3' direction +- Negative values shift the specified end in the 5' direction +- If either parameter is provided, the other must also be provided +- Zero is also an accepted shift value + +#### Unstranded Features + If these features match rules with `5' anchored` and `3' anchored` overlap selectors, they will be downgraded to `anchored` selectors. Alignments overlapping these features are evaluated for shared start and/or end coordinates, but 5' and 3' ends are not distinguished. + ### Hierarchy Each rule must be assigned a hierarchy value. This value is used to sort Stage 2 matches so that matches with smaller hierarchy values take precedence in Stage 3. - Each feature can have multiple hierarchy values if it matched more than one rule during Stage 1 selection @@ -86,8 +99,8 @@ You can use larger hierarchy values to exclude features that are not of interest >**Example:** suppose you have a miRNA locus embedded within a coding gene locus (within an intron for example). By assigning a hierarchy of 1 to miRNA and a hierarchy of 2 to coding genes, all small RNA counts from sequences matching to the miRNA would be excluded from total counts for the coding gene. Reversing the hierarchy such that miRNA had a hierarchy of 2 and coding genes had a hierarchy of 1 would instead exclude reads from sequences matching to the coding gene from total counts for the miRNA. If a hierarchy of 1 was assigned to both miRNAs and coding genes, counts for sequences matching both features would be split between them. 
## Stage 3: Alignment Attribute Parameters -| _features.csv columns:_ | Strand | 5' End Nucleotide | Length | -|-------------------------|--------|-------------------|--------| +| _Features Sheet Selectors:_ | Strand | 5' End Nucleotide | Length | +|-----------------------------|--------|-------------------|--------| The final stage of selection is concerned with the small RNA attributes of each alignment locus. Candidates are evaluated in order of hierarchy value where smaller values take precedence. Once a match has been found, reads are excluded from remaining candidates with larger hierarchy values. @@ -102,17 +115,19 @@ These features will match all strand selectors regardless of the alignment's str ### 5' End Nucleotide and Length -| Parameter | Single | List | Range | Wildcard | -|------------|:------:|:----:|:-----:|:--------:| -| 5' end nt | ✓ | ✓ | | ✓ | -| Length | ✓ | ✓ | ✓ | ✓ | +| Selector | Wildcard | Single | List | Range | +|---------:|:--------:|:------:|:----:|:-----:| +| 5' nt | ✓ | ✓ | ✓ | | +| Length | ✓ | ✓ | ✓ | ✓ | Examples: +- **Wildcard**: `any`, `all`, `*`, or a blank cell - **Single**: `G` or `22` - **List**: `C,G,U` or `25, 26` (spaces do not matter) - **Range**: `20-25` -- **Wildcard**: `all` -- **Mixed**: `19, 21-23, 25-30` +- **Mixed** †: `19, 21-23, 25-30` + +† only supported by the `Length` selector >**Tip:** you may specify U and T bases in your rules. Uracil bases will be converted to thymine when your Features Sheet is loaded. N bases are also allowed.
diff --git a/doc/tiny-plot.md b/doc/tiny-plot.md index 551f52af..bf907efa 100644 --- a/doc/tiny-plot.md +++ b/doc/tiny-plot.md @@ -32,7 +32,7 @@ Two plots are produced for each replicate: - Distribution of _Assigned Reads_, which are counted at each alignment where at least one overlapping feature passed selection and was assigned a portion of the sequence's original counts #### Length Bounds -Lengths are plotted over a continuous range, even if an intermediate length was not observed, and the bounds of this range can be assigned automatically or manually. Manual lengths can be assigned using [plot_len_dist_min and plot_len_dist_max](Parameters.md#bounds-for-len_dist-charts). +Lengths are plotted over a continuous range, even if an intermediate length was not observed, and the bounds of this range can be assigned automatically or manually. Manual lengths can be assigned using [plot_len_dist_min and plot_len_dist_max](Parameters.md#bounds-for-lendist-charts). When tiny-plot is called as a step in a pipeline run, min and max bounds are determined independently in the following order of priority: 1. Manual assignment in the Run Config @@ -109,7 +109,7 @@ The P value cutoff [can be changed](Parameters.md#p-value) (default: 0.05). The control condition is plotted on the x-axis, but it must be specified in your Samples Sheet prior to running an end-to-end or `tiny recount` analysis. If using `tiny replot`, is not possible to change a no-control experiment to a control experiment and have these changes reflected in these plots. This is because tiny-deseq.r must be aware of the control condition in order to perform the proper directional comparisons. #### View Limits -Both the lower and upper bound of the plot's axes [can be set manually](Parameters.md#view-limits). Unspecified bounds are automatically calculated to fit the data. +Both the lower and upper bound of the plot's axes [can be set manually](Parameters.md#bounds-for-lendist-charts). 
Unspecified bounds are automatically calculated to fit the data. diff --git a/images/features_sheet_header.png b/images/features_sheet_header.png new file mode 100644 index 00000000..0a868931 Binary files /dev/null and b/images/features_sheet_header.png differ diff --git a/images/overlap_selectors/anchored.png b/images/overlap_selectors/anchored.png index 29f38393..3b2e0aa9 100644 Binary files a/images/overlap_selectors/anchored.png and b/images/overlap_selectors/anchored.png differ diff --git a/images/tinyrna-workflow_current.png b/images/tinyrna-workflow_current.png deleted file mode 100644 index e59701c6..00000000 Binary files a/images/tinyrna-workflow_current.png and /dev/null differ diff --git a/images/tinyrna_workflow_current.png b/images/tinyrna_workflow_current.png new file mode 100644 index 00000000..f4958d88 Binary files /dev/null and b/images/tinyrna_workflow_current.png differ diff --git a/tests/testdata/config_files/paths.yml b/tests/testdata/config_files/paths.yml index 92cc0da4..46c14bf2 100644 --- a/tests/testdata/config_files/paths.yml +++ b/tests/testdata/config_files/paths.yml @@ -22,7 +22,7 @@ gff_files: #- path: # alias: [ ] -##-- The final output directory for files produced by the pipeline --# +##-- The suffix to use in the final output directory name (optional) --# run_directory: run_directory ##-- The directory for temporary files. Determined by cwltool if blank. --## diff --git a/tests/testdata/config_files/run_config_template.yml b/tests/testdata/config_files/run_config_template.yml index 44338dd0..b8d9bcf4 100644 --- a/tests/testdata/config_files/run_config_template.yml +++ b/tests/testdata/config_files/run_config_template.yml @@ -1,21 +1,19 @@ ######----------------------------- tinyRNA Configuration -----------------------------###### # -# In this file you can specify your configuration preferences for the workflow and -# each workflow step. +# In this file you can specify your configuration preferences for the workflow. 
# # If you want to use DEFAULT settings for the workflow, all you need to do is provide the path # to your Samples Sheet and Features Sheet in your Paths File, then make sure that the # 'paths_config' setting below points to your Paths File. # # We suggest that you also: -# 1. Add a username to identify the person performing runs, if desired for record keeping -# 2. Add a run directory name in your Paths File. If not provided, "run_directory" is used -# 3. Add a run name to label your run directory and run-specific summary reports. -# If not provided, user_tinyrna will be used. +# 1. Add a `user` below to identify the person performing runs. +# 2. Add a `run_name` below to prefix your run directory and summary reports. +# If not provided, user_tinyrna is used. # -# This file will be further processed at run time to generate the appropriate pipeline -# settings for each workflow step. A copy of this processed configuration will be stored -# in your run directory. +# This file will be processed at run time to generate the appropriate settings +# for each workflow step. A copy of the processed configuration will be saved +# in your run directory for auto-documentation. 
# ######-------------------------------------------------------------------------------###### @@ -162,57 +160,58 @@ compress: False ######-------------------------------------------------------------------------------###### -##-- Report end-to-end hits w/ <=v mismatches; ignore qualities --## +##-- Report end-to-end hits w/ <=v mismatches; ignore qualities (option -v) --## end_to_end: 0 -##-- Report all alignments per read (much slower than low -k) --## +##-- Report all alignments per read (much slower than low -k) (option --all) --## all_aln: True -##-- Seed for random number generator --## +##-- Seed for random number generator (option --seed) --## seed: 0 -##-- Suppress SAM records for unaligned reads --## +##-- Suppress SAM records for unaligned reads (option --no-unal) --## no_unal: True -##-- Use shared mem for index; many bowtie's can share --## +##-- Use shared mem for index; many bowtie's can share (option --shmem) --## ##-- Note: this requires further configuration of your OS --## ##-- http://bowtie-bio.sourceforge.net/manual.shtml#bowtie-options-shmem --## shared_memory: False ###-- Unused option inputs: Remove '#' in front to use --### -##-- Hits are guaranteed best stratum, sorted; ties broken by quality --## +##-- Hits are guaranteed best stratum, sorted; ties broken by quality (option --best) --## #best: False -##-- Hits in sub-optimal strata aren't reported (requires best, ^^^^) --## +##-- Hits in sub-optimal strata aren't reported (requires best, ^^^^) (option --strata) --## #strata: False -##-- Max mismatches in seed (can be 0-3, default: -n 2) --## +##-- Max mismatches in seed (can be 0-3, default: -n 2) (option --seedmms) --## #seedmms: 2 -##-- Seed length for seedmms (default: 28) --## +##-- Seed length for seedmms (default: 28) (option --seedlen) --## #seedlen: 28 -##-- Do not align to reverse-compliment reference --## -# norc: False +##-- Do not align to forward/reverse-complement reference strand (options --norc and --nofw) --## +#norc: 
False +#nofw: False -##-- Do not align to forward reference --## -# nofw: False +##-- Input quals are Phred+64 (same as --solexa1.3-quals) (option --phred64-quals) --## +#bt_phred64: False -##-- Input quals are Phred+64 (same as --solexa1.3-quals) --## -# bt_phred64: False +##-- Report up to good alignments per read (default: 1) (option -k) --## +#k_aln -##-- Report up to good alignments per read (default: 1) --## -# k_aln +##-- Suppress all alignments if > exist (default: no limit) (option -m) --## +#suppress_aln: 10 -##-- Number of bases to trim from 5' or 3' end of reads --## -# trim5: 0 -# trim3: 0 +##-- Trim bases from 5' (left) or 3' (right) end of reads (options --trim5 and --trim3) --## +#trim5: 0 +#trim3: 0 -##-- Input quals are from GA Pipeline ver. < 1.3 --## -# solexa: false +##-- Input quals are from GA Pipeline ver. < 1.3 (option --solexa-quals) --## +#solexa: false -##-- Input quals are from GA Pipeline ver. >= 1.3 --## -# solexa13: false +##-- Input quals are from GA Pipeline ver. 
>= 1.3 (option --solexa1.3-quals) --## +#solexa13: false ######--------------------------- FEATURE COUNTER OPTIONS ---------------------------###### diff --git a/tests/unit_tests_configuration.py b/tests/unit_tests_configuration.py index 54ae4ce1..b9da1a9a 100644 --- a/tests/unit_tests_configuration.py +++ b/tests/unit_tests_configuration.py @@ -399,6 +399,10 @@ def test_get_gff_config_merge_alias_attr(self): class ConfigurationTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.file = './testdata/config_files/run_config_template.yml' + """Does get_templates copy the expected number of files for each context?""" def test_get_templates_contexts(self): @@ -442,6 +446,65 @@ def test_get_templates_conflicts(self): self.assertSetEqual(exp_set, act_set) + """Does setup_pipeline() create the proper run_name prefix?""" + + def test_setup_pipeline_run_name(self): + # user_vals = ('"user"', None, "''", 0) + # run_vals = ('"run"', None, "''", 0) + # for a, b in itertools.product(user_vals, run_vals): + # print("({'user': %6s, 'run_name': %5s}, )" % (a, b)) + + cases = [ + ({'user': "user", 'run_name': "run"}, "run_{ts}"), + ({'user': "user", 'run_name': None}, "user_tinyrna_{ts}"), + ({'user': "user", 'run_name': ''}, "user_tinyrna_{ts}"), + ({'user': "user", 'run_name': 0}, "0_{ts}"), + ({'user': None, 'run_name': "run"}, "run_{ts}"), + ({'user': None, 'run_name': None}, "tinyrna_{ts}"), + ({'user': None, 'run_name': ''}, "tinyrna_{ts}"), + ({'user': None, 'run_name': 0}, "0_{ts}"), + ({'user': '', 'run_name': "run"}, "run_{ts}"), + ({'user': '', 'run_name': None}, "tinyrna_{ts}"), + ({'user': '', 'run_name': ''}, "tinyrna_{ts}"), + ({'user': '', 'run_name': 0}, "0_{ts}"), + ({'user': 0, 'run_name': "run"}, "run_{ts}"), + ({'user': 0, 'run_name': None}, "0_tinyrna_{ts}"), + ({'user': 0, 'run_name': ''}, "0_tinyrna_{ts}"), + ({'user': 0, 'run_name': 0}, "0_{ts}"), + ] + + for inputs, output in cases: + config = Configuration(self.file, skip_setup=True) + 
config.config.update(inputs) + config.setup_pipeline() + + actual = config['run_name'] + expected = output.format(ts=config.dt) + self.assertEqual(actual, expected) + + """Does setup_pipeline() create the proper run_directory suffix?""" + + def test_setup_pipeline_run_directory(self): + run_name = "run" + cases = [ + ({'run_directory': "dir"}, "run_{ts}_dir"), + ({'run_directory': "path/dir"}, "path/run_{ts}_dir"), + ({'run_directory': "path/dir/"}, "path/run_{ts}_dir"), + ({'run_directory': "/"}, "run_{ts}"), + ({'run_directory': None}, "run_{ts}"), + ({'run_directory': ''}, "run_{ts}"), + ({'run_directory': 0}, "run_{ts}_0"), + ] + + for inputs, output in cases: + config = Configuration(self.file, skip_setup=True) + config.config.update(inputs) + config['run_name'] = run_name + config.setup_pipeline() + + actual = config['run_directory'] + expected = output.format(ts=config.dt) + self.assertEqual(actual, expected) if __name__ == '__main__': diff --git a/tiny/cwl/tools/bowtie.cwl b/tiny/cwl/tools/bowtie.cwl index cf9c7bbd..5c6206f2 100644 --- a/tiny/cwl/tools/bowtie.cwl +++ b/tiny/cwl/tools/bowtie.cwl @@ -18,7 +18,7 @@ inputs: type: string inputBinding: prefix: -x - position: 23 + position: 24 doc: "The basename of the index to be searched." # Only used by InitialWorkDirRequirement @@ -29,18 +29,22 @@ inputs: reads: type: File inputBinding: - position: 24 + position: 25 doc: "File containing unpaired reads" + sample_basename: + type: string? + doc: "The basename of the original sample file (not reads file basename)" + outfile: - type: string + type: string? inputBinding: - position: 25 + position: 26 + valueFrom: &out $(inputs.sample_basename + "_aligned_seqs.sam") + default: *out doc: "File to write hits to" - logfile: - type: string - doc: "File to write Bowtie's stdout and stderr streams to" + ### INPUT ### fastq: type: boolean? @@ -102,72 +106,79 @@ inputs: default: 0 doc: "report end-to-end hits w/ <=v mismatches; ignore qualities" - nofw: - type: boolean? 
- inputBinding: - prefix: --nofw - position: 8 - doc: "do not align to forward reference strand" - - norc: - type: boolean? - inputBinding: - prefix: --norc - position: 9 - doc: "do not align to reverse-complement reference strand" - seedmms: type: int? inputBinding: prefix: --seedmms - position: 10 + position: 8 doc: "max mismatches in seed (can be 0-3, default: -n 2)" seedlen: type: int? inputBinding: prefix: --seedlen - position: 11 + position: 9 doc: "seed length for --seedmms (default: 28)" - ### REPORTING ### - - best: + nofw: type: boolean? inputBinding: - prefix: --best - position: 12 - doc: "hits guaranteed best stratum; ties broken by quality" + prefix: --nofw + position: 10 + doc: "do not align to forward reference strand" - strata: + norc: type: boolean? inputBinding: - prefix: --strata - position: 13 - doc: "hits in sub-optimal strata aren't reported (requires --best)" + prefix: --norc + position: 11 + doc: "do not align to reverse-complement reference strand" + + ### REPORTING ### k_aln: type: int? inputBinding: prefix: -k - position: 14 + position: 12 doc: "report up to good alignments per read (default: 1)" all_aln: type: boolean? inputBinding: prefix: --all - position: 15 + position: 13 default: true doc: "report all alignments per read (much slower than low -k)" + suppress_aln: + type: int? + inputBinding: + prefix: -m + position: 14 + doc: "suppress all alignments if > exist (def: no limit)" + + best: + type: boolean? + inputBinding: + prefix: --best + position: 15 + doc: "hits guaranteed best stratum; ties broken by quality" + + strata: + type: boolean? + inputBinding: + prefix: --strata + position: 16 + doc: "hits in sub-optimal strata aren't reported (requires --best)" + ### OUTPUT ### time: type: boolean? inputBinding: prefix: -t - position: 16 + position: 17 default: true doc: "print wall-clock time taken by search phases" @@ -175,14 +186,16 @@ inputs: type: string? 
inputBinding: prefix: --un - position: 17 + position: 18 + valueFrom: &un $(inputs.sample_basename + "_unaligned_seqs.fa") + default: *un doc: "write unaligned reads/pairs to file(s) " no_unal: type: boolean? inputBinding: prefix: --no-unal - position: 18 + position: 19 default: true doc: "suppress SAM records for unaligned reads" @@ -192,7 +205,7 @@ inputs: type: boolean? inputBinding: prefix: --sam - position: 19 + position: 20 default: true doc: "write hits in SAM format" @@ -202,14 +215,14 @@ inputs: type: int? inputBinding: prefix: --threads - position: 20 + position: 21 doc: "number of alignment threads to launch (default: 1)" shared_memory: type: boolean? inputBinding: prefix: --shmem - position: 21 + position: 22 doc: "use shared mem for index; many bowtie's can share" ### OTHER ### @@ -218,19 +231,19 @@ inputs: type: int? inputBinding: prefix: --seed - position: 22 + position: 23 doc: "seed for random number generator" outputs: sam_out: type: File outputBinding: - glob: $(inputs.outfile) + glob: *out unal_seqs: type: File? outputBinding: - glob: $(inputs.un) + glob: *un console_output: type: stdout diff --git a/tiny/cwl/workflows/tinyrna_wf.cwl b/tiny/cwl/workflows/tinyrna_wf.cwl index 5c7dc6ec..ebe73510 100644 --- a/tiny/cwl/workflows/tinyrna_wf.cwl +++ b/tiny/cwl/workflows/tinyrna_wf.cwl @@ -69,6 +69,7 @@ inputs: seedlen: int? best: boolean? strata: boolean? + suppress_aln: int? k_aln: int? all_aln: boolean? no_unal: boolean? 
@@ -175,31 +176,29 @@ steps: scatter: [ reads, sample_basename ] scatterMethod: dotproduct in: - reads: preprocessing/uniq_seqs - sample_basename: sample_basenames + ebwt: ebwt bt_index_files: source: [ bt_build_optional/index_files, bt_index_files ] pickValue: first_non_null default: bt_index_files # To appease the workflow validator - ebwt: ebwt - outfile: { valueFrom: $(inputs.sample_basename + "_aligned_seqs.sam") } - logfile: { valueFrom: $(inputs.sample_basename + "_console_output.log") } + reads: preprocessing/uniq_seqs + sample_basename: sample_basenames trim5: trim5 trim3: trim3 phred64: bt_phred64 solexa: solexa solexa13: solexa13 end_to_end: end_to_end - nofw: nofw - norc: norc seedmms: seedmms seedlen: seedlen - best: best - strata: strata + nofw: nofw + norc: norc k_aln: k_aln all_aln: all_aln + suppress_aln: suppress_aln + best: best + strata: strata no_unal: no_unal - un: { valueFrom: $(inputs.sample_basename + "_unaligned_seqs.fa") } threads: threads shared_memory: shared_memory seed: seed diff --git a/tiny/entry.py b/tiny/entry.py index ac17c496..1b25e45b 100644 --- a/tiny/entry.py +++ b/tiny/entry.py @@ -151,7 +151,7 @@ def run(tinyrna_cwl_path: str, config_file: str) -> None: print("Running the end-to-end analysis...") # First get the configuration file set up for this run - config_object = Configuration(config_file, validate_inputs=True) + config_object = Configuration(config_file, validate_gffs=True) run_directory = config_object.create_run_directory() config_object.save_run_profile() @@ -263,7 +263,7 @@ def run_cwltool_native(config_object: 'ConfigBase', workflow: str, run_directory verbosity = config_object['verbosity'] def furnish_if_file_record(file_dict): - if isinstance(file_dict, dict) and file_dict.get('class', None) == 'File': + if isinstance(file_dict, dict) and file_dict.get('class') == 'File': file_dict['basename'] = os.path.basename(file_dict['path']) file_dict['location'] = file_dict['path'] file_dict['contents'] = None diff --git 
a/tiny/rna/compatibility.py b/tiny/rna/compatibility.py index fb09bb79..36ec5723 100644 --- a/tiny/rna/compatibility.py +++ b/tiny/rna/compatibility.py @@ -81,7 +81,7 @@ class RunConfigCompatibility: def __init__(self, config_obj: CommentedMap): self.config = config_obj.copy() - self.vstart = config_obj.get("version", "0.0.0").strip("v") # trust reported version for now + self.vstart = (config_obj.get("version") or "0.0.0").strip("v") # trust reported version for now definitions = resource_filename('tiny', 'templates') + "/compatibility/run_config_compatibility.yml" self.yaml = YAML() diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index e7f6a2d8..e4f9fc54 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -164,7 +164,7 @@ def setup_step_inputs(self): def setup_tiny_plot_inputs(): cs_filter = 'plot_class_scatter_filter' style_req = ['include', 'exclude'] - classes = self.get(cs_filter, {}).get('classes') # backward compatibility + classes = (self.get(cs_filter) or {}).get('classes') # backward compatibility if not classes: return # Validate filter style @@ -184,7 +184,7 @@ def create_run_directory(self) -> str: """Create the destination directory for pipeline outputs""" run_dir = self["run_directory"] if not os.path.isdir(run_dir): - os.mkdir(run_dir) + os.makedirs(run_dir) return run_dir @@ -208,31 +208,39 @@ class Configuration(ConfigBase): Ultimately, this class populates workflow settings and per-library settings. This is a convenience to the user as it is tedious to define inputs and outputs pertaining - to each workflow step. Settings are determined by the Paths, Samples, and Features Sheets. - Users may provide both relative and absolute paths + to each workflow step in a manner compatible with CWL. Settings are determined by the + Run Config, Paths File, Samples Sheet, and Features Sheet. + Users can provide both relative and absolute paths. 
IMPORTANT: Paths provided in any config file are evaluated relative to the containing config file. + Args: + config_file: The Run Config file path + validate_gffs: If true, validate GFFs (not all contexts need to) + skip_setup: If true, only load the Run Config and Paths File with + no further processing (useful for testing) + Attributes: paths: the configuration object from processing the paths_config file. This holds path info for other config files and prefixes, and is updated appropriately if 'run_bowtie_index' is set to 'true' """ - def __init__(self, config_file: str, validate_inputs=False): + def __init__(self, config_file: str, validate_gffs=False, skip_setup=False): # Parse YAML configuration file super().__init__(config_file, RunConfigCompatibility) self.paths = self.load_paths_config() self.absorb_paths_file() + if skip_setup: return self.setup_pipeline() self.setup_file_groups() self.setup_ebwt_idx() self.process_samples_sheet() self.process_features_sheet() self.setup_step_inputs() - if validate_inputs: self.validate_inputs() + if validate_gffs: self.validate_inputs() def load_paths_config(self): """Returns a PathsFile object and updates keys related to the Paths File path""" @@ -287,12 +295,17 @@ def setup_pipeline(self): self.dt = get_timestamp() self['run_date'], self['run_time'] = self.dt.split('_') + # Ensure compatible string joins while preserving 0 + for key in ('user', 'run_name', 'run_directory'): + self[key] = str(self[key]) if self[key] is not None else '' + default_run_name = '_'.join(x for x in [self['user'], "tinyrna"] if x) - self['run_name'] = self.get('run_name', default=default_run_name) + "_" + self.dt + self['run_name'] = f"{self['run_name'] or default_run_name}_{self.dt}" - # Create prefixed Run Directory name - run_dir_parent, run_dir = os.path.split(self['run_directory'].rstrip(os.sep)) - self['run_directory'] = self.joinpath(run_dir_parent, self['run_name'] + "_" + run_dir) + # Prefix Run Directory basename while preserving 
subdirectory structure + rd_head, rd_tail = os.path.split(self['run_directory'].rstrip(os.sep)) + basename = '_'.join(x for x in [self['run_name'], rd_tail] if x) + self['run_directory'] = self.joinpath(rd_head, basename) self.templates = resource_filename('tiny', 'templates/') @@ -441,7 +454,7 @@ def get_templates(context: str): 'tiny': tiny, 'tiny-count': tiny_count, 'tiny-plot': tiny_plot - }.get(context, None) + }.get(context) if files_to_copy is None: raise ValueError(f"Invalid template file context: {context}") @@ -534,8 +547,10 @@ def validate_paths(self): "The following parameters are required in {selfname}: {params}" \ .format(selfname=self.basename, params=', '.join(self.required)) - # Some entries in Paths File are omitted from tiny-count's working directory during - # pipeline runs. There is no advantage to checking file existence here vs. in load_* + # The availability of these file entries in the working directory will vary by step. + # This is determined by the step's CWL CommandLineTool specification. + # Instead of checking file existence within each step that uses this class, + # check only at pipeline startup and let the workflow runner worry about files from there. if self.in_pipeline: return for key in self.single: @@ -559,6 +574,13 @@ def check_backward_compatibility(self): "that you are using a Paths File from an earlier version of tinyRNA. Please " \ "check the release notes and update your configuration files." + missing_keys = [key for key in (*self.single, *self.groups, *self.prefix) + if key not in self.config] + + assert not missing_keys, \ + "The following expected keys were missing in {selfname}:\n\t{missing}" \ + .format(selfname=self.basename, missing="\n\t".join(missing_keys)) + def get_gff_config(self) -> Dict[str, list]: """Restructures GFF input info so that it can be more easily handled. 
To be clear, the Paths File YAML could be structured to match the desired output, @@ -574,8 +596,13 @@ def get_gff_config(self) -> Dict[str, list]: # Build dictionary of files and allowed aliases for gff in self['gff_files']: if not self.is_path_dict(gff): continue - path, aliases = gff['path'], gff.get('alias', ()) - gff_files[path].extend(filter(id_filter, aliases)) + alias = gff.get('alias') + path = gff['path'] + + # Allow for some user error in YAML syntax + if isinstance(alias, str): alias = [alias] + if not isinstance(alias, list): alias = [] + gff_files[path].extend(filter(id_filter, alias)) # Remove duplicate aliases per file, keep order for file, alias in gff_files.items(): @@ -603,7 +630,11 @@ def append_to(self, key: str, val: Any): items appended to the temporary list would otherwise be lost.""" assert key in self.groups, "Tried appending to a non-list type parameter" - target = self.config.get(key, []) + + target = self.config.get(key) + if not isinstance(target, list): + self.config[key] = target = [] + target.append(val) return target diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index c3adabe8..eefe28d8 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -297,7 +297,7 @@ def parse_GFF_attribute_string(attrStr, extra_return_first_value=False, gff_vers continue if (gff_version == 2) and attr.count('"') not in (0, 2): raise ValueError( - "The attribute string seems to contain mismatched quotes.") + "The attribute string seems to contain mismatched quotes.") mo = _re_attr_main.match(attr) if not mo: raise ValueError("Failure parsing GFF attribute line") @@ -693,7 +693,7 @@ def was_matched(self, untagged_id): def get_row_parent(self, feature_id: str, row_attrs: CaseInsensitiveAttrs) -> str: """Get the current feature's parent while cooperating with filtered features""" - parent_attr = row_attrs.get("Parent", [None]) + parent_attr = row_attrs.get("Parent") or [None] parent = 
parent_attr[0] if len(parent_attr) > 1: @@ -728,7 +728,7 @@ def add_alias(self, root_id: str, alias_keys: List[str], row_attrs: CaseInsensit for alias_key in alias_keys: for row_val in row_attrs.get(alias_key, ()): - self.alias[root_id].add(row_val) + if row_val: self.alias[root_id].add(row_val) def _finalize_aliases(self): self.alias = {feat: tuple(sorted(aliases, key=str.lower)) for feat, aliases in self.alias.items()} diff --git a/tiny/rna/plotterlib.py b/tiny/rna/plotterlib.py index b3def66a..ebebcd81 100644 --- a/tiny/rna/plotterlib.py +++ b/tiny/rna/plotterlib.py @@ -133,7 +133,6 @@ def barh_proportion(self, prop_ds: pd.Series, max_prop=1.0, scale=2, **kwargs) - # Create the plot and set plot attributes cbar = (prop_ds * 100).plot(kind='barh', ax=ax, color=bar_colors, sort_columns=False, **kwargs) - cbar.xaxis.set_major_formatter(tix.PercentFormatter()) cbar.set_xlabel('Percentage of reads') cbar.set_xlim(0, min([(max_prop * 100) + 10, 100])) diff --git a/tiny/templates/paths.yml b/tiny/templates/paths.yml index 2b70aba2..a05f83fc 100644 --- a/tiny/templates/paths.yml +++ b/tiny/templates/paths.yml @@ -22,7 +22,7 @@ gff_files: #- path: # alias: [ ] -##-- The final output directory for files produced by the pipeline --# +##-- The suffix to use in the final output directory name (optional) --# run_directory: run_directory ##-- The directory for temporary files. Determined by cwltool if blank. --## diff --git a/tiny/templates/run_config_template.yml b/tiny/templates/run_config_template.yml index 2985e676..a1609840 100644 --- a/tiny/templates/run_config_template.yml +++ b/tiny/templates/run_config_template.yml @@ -1,25 +1,23 @@ ######----------------------------- tinyRNA Configuration -----------------------------###### # -# In this file you can specify your configuration preferences for the workflow and -# each workflow step. +# In this file you can specify your configuration preferences for the workflow. 
# # If you want to use DEFAULT settings for the workflow, all you need to do is provide the path # to your Samples Sheet and Features Sheet in your Paths File, then make sure that the # 'paths_config' setting below points to your Paths File. # # We suggest that you also: -# 1. Add a username to identify the person performing runs, if desired for record keeping -# 2. Add a run directory name in your Paths File. If not provided, "run_directory" is used -# 3. Add a run name to label your run directory and run-specific summary reports. -# If not provided, user_tinyrna will be used. +# 1. Add a `user` below to identify the person performing runs. +# 2. Add a `run_name` below to prefix your run directory and summary reports. +# If not provided, user_tinyrna is used. # -# This file will be further processed at run time to generate the appropriate pipeline -# settings for each workflow step. A copy of this processed configuration will be stored -# in your run directory. +# This file will be processed at run time to generate the appropriate settings +# for each workflow step. A copy of the processed configuration will be saved +# in your run directory for auto-documentation. 
# ######-------------------------------------------------------------------------------###### -user: ~ +user: run_date: ~ run_time: ~ paths_config: paths.yml @@ -162,57 +160,58 @@ compress: False ######-------------------------------------------------------------------------------###### -##-- Report end-to-end hits w/ <=v mismatches; ignore qualities --## +##-- Report end-to-end hits w/ <=v mismatches; ignore qualities (option -v) --## end_to_end: 0 -##-- Report all alignments per read (much slower than low -k) --## +##-- Report all alignments per read (much slower than low -k) (option --all) --## all_aln: True -##-- Seed for random number generator --## +##-- Seed for random number generator (option --seed) --## seed: 0 -##-- Suppress SAM records for unaligned reads --## +##-- Suppress SAM records for unaligned reads (option --no-unal) --## no_unal: True -##-- Use shared mem for index; many bowtie's can share --## +##-- Use shared mem for index; many bowtie's can share (option --shmem) --## ##-- Note: this requires further configuration of your OS --## ##-- http://bowtie-bio.sourceforge.net/manual.shtml#bowtie-options-shmem --## shared_memory: False ###-- Unused option inputs: Remove '#' in front to use --### -##-- Hits are guaranteed best stratum, sorted; ties broken by quality --## +##-- Hits are guaranteed best stratum, sorted; ties broken by quality (option --best) --## #best: False -##-- Hits in sub-optimal strata aren't reported (requires best, ^^^^) --## +##-- Hits in sub-optimal strata aren't reported (requires best, ^^^^) (option --strata) --## #strata: False -##-- Max mismatches in seed (can be 0-3, default: -n 2) --## +##-- Max mismatches in seed (can be 0-3, default: -n 2) (option --seedmms) --## #seedmms: 2 -##-- Seed length for seedmms (default: 28) --## +##-- Seed length for seedmms (default: 28) (option --seedlen) --## #seedlen: 28 -##-- Do not align to reverse-compliment reference --## -# norc: False +##-- Do not align to 
forward/reverse-complement reference strand (options --norc and --nofw) --## +#norc: False +#nofw: False -##-- Do not align to forward reference --## -# nofw: False +##-- Input quals are Phred+64 (same as --solexa1.3-quals) (option --phred64-quals) --## +#bt_phred64: False -##-- Input quals are Phred+64 (same as --solexa1.3-quals) --## -# bt_phred64: False +##-- Report up to <int> good alignments per read (default: 1) (option -k) --## +#k_aln: 1 -##-- Report up to <int> good alignments per read (default: 1) --## -# k_aln +##-- Suppress all alignments if > <int> exist (default: no limit) (option -m) --## +#suppress_aln: 10 -##-- Number of bases to trim from 5' or 3' end of reads --## -# trim5: 0 -# trim3: 0 +##-- Trim bases from 5' (left) or 3' (right) end of reads (options --trim5 and --trim3) --## +#trim5: 0 +#trim3: 0 -##-- Input quals are from GA Pipeline ver. < 1.3 --## -# solexa: false +##-- Input quals are from GA Pipeline ver. < 1.3 (option --solexa-quals) --## +#solexa: false -##-- Input quals are from GA Pipeline ver. >= 1.3 --## -# solexa13: false +##-- Input quals are from GA Pipeline ver. >= 1.3 (option --solexa1.3-quals) --## +#solexa13: false ######--------------------------- FEATURE COUNTER OPTIONS ---------------------------######