diff --git a/README.md b/README.md index dbe419c4..a04e13d6 100644 --- a/README.md +++ b/README.md @@ -105,8 +105,8 @@ In most cases you will use this toolset as an end-to-end pipeline. This will run 2. The genome sequence of interest in fasta format. 3. Genome coordinates of small RNA features of interest in GFF format. 4. A completed Samples Sheet (`samples.csv`) with paths to the fastq files. -5. A completed Features Sheet (`features.csv`) with paths to the GFF file(s). -6. An updated Paths File (`paths.yml`) with the path to the genome sequence and/or your bowtie index prefix. +5. A completed Features Sheet (`features.csv`) with feature selection rules. +6. An updated Paths File (`paths.yml`) with paths to your GFF files, the genome sequence and/or your bowtie index prefix, as well as the paths to `samples.csv` and `features.csv`. 7. A Run Config file (`run_config.yml`) located in your working directory or the path to the file. The template provided does not need to be updated if you wish to use the default settings. To run an end-to-end analysis, be sure that you're working within the conda tinyrna environment ([instructions above](#usage)) in your terminal and optionally navigate to the location of your Run Config file. Then, simply run the following in your terminal: @@ -177,13 +177,13 @@ The tiny-count step produces a variety of outputs Custom Python scripts and HTSeq are used to generate a single table of feature counts which includes each counted library. Each matched feature is represented with the following metadata columns: - **_Feature ID_** is determined, in order of preference, by one of the following GFF column 9 attributes: `ID`, `gene_id`, `Parent`. - **_Classifier_** is determined by the rules in your Features Sheet. It is the _Classify as..._ value of each matching rule. Since multiple rules can match a feature, some Feature IDs will be listed multiple times with different classifiers. 
-- **_Feature Name_** displays aliases of your choice, as specified in the _Alias by..._ column of the Features Sheet. If _Alias by..._ is set to`ID`, the _Feature Name_ column is left empty. +- **_Feature Name_** displays aliases of your choice, as specified in the `alias` key under each GFF listed in your Paths File. If `alias` is set to `ID`, the _Feature Name_ column is left empty. -For example, if your Features Sheet has a rule which specifies _Alias by..._ `sequence_name`, _Classify as..._ `miRNA`, and the GFF entry for this feature has the following attributes column: +For example, if your Paths File has a GFF entry which specifies `alias: [sequence_name]`, and the corresponding GFF file has a feature with the following attributes column: ``` ... ID=406904;sequence_name=mir-1,hsa-miR-1; ... ``` -The row for this feature in the feature counts table would read: +And this feature matched a rule in your Features Sheet defining _Classify as..._ `miRNA`, then the entry for this feature in the final counts table would read: | Feature ID | Classifier | Feature Name | Group1_rep_1 | Group1_rep_2 | ... 
| |------------|------------|------------------|--------------|--------------|-----| diff --git a/START_HERE/features.csv b/START_HERE/features.csv index bd2cc089..58300b5a 100755 --- a/START_HERE/features.csv +++ b/START_HERE/features.csv @@ -1,7 +1,7 @@ -Select for...,with value...,Alias by...,Classify as...,Hierarchy,Strand,5' End Nucleotide,Length,Overlap,Feature Source -Class,mask,Alias,,1,both,all,all,Partial,./reference_data/ram1.gff3 -Class,miRNA,Alias,,2,sense,all,16-22,Full,./reference_data/ram1.gff3 -Class,piRNA,Alias,5pA,2,both,A,24-32,Full,./reference_data/ram1.gff3 -Class,piRNA,Alias,5pT,2,both,T,24-32,Full,./reference_data/ram1.gff3 -Class,siRNA,Alias,,2,both,all,15-22,Full,./reference_data/ram1.gff3 -Class,unk,Alias,,3,both,all,all,Full,./reference_data/ram1.gff3 \ No newline at end of file +Select for...,with value...,Classify as...,Hierarchy,Strand,5' End Nucleotide,Length,Overlap +Class,mask,,1,both,all,all,Partial +Class,miRNA,,2,sense,all,16-22,Full +Class,piRNA,5pA,2,both,A,24-32,Full +Class,piRNA,5pT,2,both,T,24-32,Full +Class,siRNA,,2,both,all,15-22,Full +Class,unk,,3,both,all,all,Full \ No newline at end of file diff --git a/START_HERE/paths.yml b/START_HERE/paths.yml index cfdd6281..444ac5c6 100644 --- a/START_HERE/paths.yml +++ b/START_HERE/paths.yml @@ -5,8 +5,9 @@ # # Directions: # 1. Fill out the Samples Sheet with files to process + naming scheme. [samples.csv] -# 2. Fill out the Features Sheet with reference files and selection rules [features.csv] -# 3. Set samples_csv and features_csv to point to these files +# 2. Fill out the Features Sheet with selection rules [features.csv] +# 3. Set samples_csv and features_csv (below) to point to these files +# 4. Add annotation files and per-file alias preferences to gff_files # ######-------------------------------------------------------------------------------###### @@ -14,6 +15,13 @@ samples_csv: ./samples.csv features_csv: ./features.csv +##-- Each entry: 1. the file, 2. 
(optional) list of attribute keys for feature aliases --## +gff_files: +- path: "./reference_data/ram1.gff3" + alias: [Alias] +#- path: +# alias: [ ] + ##-- The final output directory for files produced by the pipeline --# run_directory: run_directory diff --git a/START_HERE/run_config.yml b/START_HERE/run_config.yml index c05ac589..91536e0b 100644 --- a/START_HERE/run_config.yml +++ b/START_HERE/run_config.yml @@ -1,21 +1,21 @@ ######----------------------------- tinyRNA Configuration -----------------------------###### # -# In this file you may specify your configuration preferences for the workflow and +# In this file you can specify your configuration preferences for the workflow and # each workflow step. # # If you want to use DEFAULT settings for the workflow, all you need to do is provide the path -# to your Samples Sheet and Features Sheet in your Paths file, then make sure that the -# 'paths_config' setting below points to your Paths file. +# to your Samples Sheet and Features Sheet in your Paths File, then make sure that the +# 'paths_config' setting below points to your Paths File. # # We suggest that you also: # 1. Add a username to identify the person performing runs, if desired for record keeping -# 2. Add a run directory name in your Paths file. If not provided, "run_directory" is used +# 2. Add a run directory name in your Paths File. If not provided, "run_directory" is used # 3. Add a run name to label your run directory and run-specific summary reports. # If not provided, user_tinyrna will be used. # # This file will be further processed at run time to generate the appropriate pipeline # settings for each workflow step. A copy of this processed configuration will be stored -# in your run directory (as specified by your Paths configuration file). +# in your run directory. # ######-------------------------------------------------------------------------------###### @@ -48,7 +48,7 @@ run_native: false # paths_config file. 
# # We have specified default parameters for small RNA data based on our own "best practices". -# You may change the parameters here. +# You can change the parameters here. # ######-------------------------------------------------------------------------------###### @@ -75,7 +75,7 @@ ftabchars: ~ # pipeline github: https://github.com/MontgomeryLab/tinyrna # # We have specified default parameters for small RNA data based on our own "best practices". -# You may change the parameters here. +# You can change the parameters here. # ######-------------------------------------------------------------------------------###### @@ -135,7 +135,7 @@ compression: 4 # Trimming takes place prior to counting/collapsing. # # We have specified default parameters for small RNA data based on our own "best practices". -# You may change the parameters here. +# You can change the parameters here. # ######-------------------------------------------------------------------------------###### @@ -157,7 +157,7 @@ compress: False # We use bowtie for read alignment to a genome. # # We have specified default parameters for small RNA data based on our own "best practices". -# You may change the parameters here. +# You can change the parameters here. # ######-------------------------------------------------------------------------------###### @@ -263,12 +263,11 @@ dge_drop_zero: False ######-------------------------------- PLOTTING OPTIONS -----------------------------###### # -# We use a custom Python script for creating all plots. The default base style is called -# 'smrna-light'. If you wish to use another matplotlib stylesheet you may specify that in -# the Paths File. +# We use a custom Python script for creating all plots. If you wish to use another matplotlib +# stylesheet you can specify that in the Paths File. # # We have specified default parameters for small RNA data based on our own "best practices". -# You may change the parameters here. +# You can change the parameters here. 
# ######-------------------------------------------------------------------------------###### @@ -303,7 +302,7 @@ plot_unassigned_class: "_UNASSIGNED_" ######----------------------------- OUTPUT DIRECTORIES ------------------------------###### # # Outputs for each step are organized into their own subdirectories in your run -# directory. You may set these folder names here. +# directory. You can set these folder names here. # ######-------------------------------------------------------------------------------###### @@ -320,15 +319,15 @@ dir_name_plotter: plots ######################### AUTOMATICALLY GENERATED CONFIGURATIONS ######################### # # Do not make any changes to the following sections. These options are automatically -# generated using your Paths file, your Samples and Features sheets, and the above +# generated using your Paths File, your Samples and Features sheets, and the above # settings in this file. # ########################################################################################### -######--------------------------- DERIVED FROM PATHS SHEET --------------------------###### +######--------------------------- DERIVED FROM PATHS FILE ---------------------------###### # -# The following configuration settings are automatically derived from the sample sheet +# The following configuration settings are automatically derived from the Paths File # ######-------------------------------------------------------------------------------###### @@ -336,6 +335,8 @@ run_directory: ~ tmp_directory: ~ features_csv: { } samples_csv: { } +paths_file: { } +gff_files: [ ] run_bowtie_build: false reference_genome_files: [ ] plot_style_sheet: ~ @@ -343,9 +344,9 @@ adapter_fasta: ~ ebwt: ~ -######-------------------------- DERIVED FROM SAMPLE SHEET --------------------------###### +######------------------------- DERIVED FROM SAMPLES SHEET --------------------------###### # -# The following configuration settings are automatically derived from the sample 
sheet +# The following configuration settings are automatically derived from the Samples Sheet # ######-------------------------------------------------------------------------------###### @@ -370,10 +371,6 @@ run_deseq: True ######------------------------- DERIVED FROM FEATURES SHEET -------------------------###### # -# The following configuration settings are automatically derived from the sample sheet +# The following configuration settings are automatically derived from the Features Sheet # -######-------------------------------------------------------------------------------###### - -###-- Utilized by tiny-count --### -# a list of only unique GFF files -gff_files: [ ] \ No newline at end of file +######-------------------------------------------------------------------------------###### \ No newline at end of file diff --git a/doc/Configuration.md b/doc/Configuration.md index 0cd827a9..fa009aba 100644 --- a/doc/Configuration.md +++ b/doc/Configuration.md @@ -14,15 +14,15 @@ tiny get-template ## Overview ->**Tip**: Each of the following will allow you to map out paths to your input files for analysis. You can use either relative or absolute paths to do so. **Relative paths will be evaluated relative to the file in which they are defined.** This allows you to flexibly organize and share configurations between projects. +>**Tip**: You can use either relative or absolute paths for your file inputs. **Relative paths will be evaluated relative to the file in which they are defined.** This allows you to flexibly organize and share configurations between projects. #### Run Config -The overall behavior of the pipeline and its steps is determined by the Run Config file (`run_config.yml`). This YAML file can be edited using a simple text editor. Within it you must specify the location of your Paths file (`paths.yml`). All other settings are optional. [More info](#run-config-details). 
+The overall behavior of the pipeline and its steps is determined by the Run Config file (`run_config.yml`). This YAML file can be edited using a simple text editor. Within it you must specify the location of your Paths File (`paths.yml`). All other settings are optional. [More info](#run-config-details). #### Paths File -The locations of pipeline file inputs are defined in the Paths file (`paths.yml`). This YAML file includes paths to your Samples and Features Sheets, in addition to your bowtie index prefix (optional) and the final run directory name. The final run directory will contain all pipeline outputs. The directory name is prepended with the `run_name` and current date and time to keep outputs separate. [More info](#paths-file-details). +The locations of pipeline file inputs are defined in the Paths file (`paths.yml`). This YAML file includes paths to your configuration files, your GFF files, and your bowtie indexes and/or reference genome. [More info](#paths-file-details). #### Samples Sheet @@ -91,6 +91,15 @@ When the pipeline starts up, tinyRNA will process the Run Config based on the co ## Paths File Details +### GFF Files +GFF annotations are required by tinyRNA. For each file, you can optionally provide an `alias` which is a list of attributes to represent each feature in the Feature Name column of output counts tables. Each entry under the `gff_files` parameter must look something like the following mock example: +```yaml + - path: 'a/path/to/your/file.gff' # 0 spaces before - + alias: [optional, list, of attributes] # 2 spaces before alias + +# ^ Each new GFF path must begin with - +``` + ### Building Bowtie Indexes If you don't have bowtie indexes already built for your reference genome, tinyRNA can build them for you at the beginning of an end-to-end run and reuse them on subsequent runs with the same Paths File. 
@@ -101,6 +110,12 @@ To build bowtie indexes: Once your indexes have been built, your Paths File will be modified such that `ebwt` points to their location (prefix) within your Run Directory. This means that indexes will not be unnecessarily rebuilt on subsequent runs as long as the same Paths File is used. If you need them rebuilt, simply repeat steps 2 and 3 above. +### The Run Directory +The final output directory name has three components: +- The `run_name` defined in your Run Config +- The date and time at pipeline startup +- The `run_directory` basename defined in your Paths File + ## Samples Sheet Details | _Column:_ | Input FASTQ Files | Sample/Group Name | Replicate Number | Control | Normalization | |-----------:|---------------------|-------------------|------------------|---------|---------------| @@ -123,19 +138,17 @@ Supported values are: DESeq2 requires that your experiment design has at least one degree of freedom. If your experiment doesn't include at least one sample group with more than one replicate, tiny-deseq.r will be skipped and DGE related plots will not be produced. ## Features Sheet Details -| _Column:_ | Select for... | with value... | Alias by... | Classify as... | Hierarchy | Strand | 5' End Nucleotide | Length | Overlap | Feature Source | -|------------|---------------|---------------|-------------|----------------|-----------|--------|-------------------|--------|-------------|----------------| -| _Example:_ | Class | miRNA | Name | miRNA | 1 | sense | all | all | 5' anchored | ram1.gff3 | +| _Column:_ | Select for... | with value... | Classify as... 
| Hierarchy | Strand | 5' End Nucleotide | Length | Overlap | +|------------|---------------|---------------|----------------|-----------|--------|-------------------|--------|-------------| +| _Example:_ | Class | miRNA | miRNA | 1 | sense | all | all | 5' anchored | The Features Sheet allows you to define selection rules that determine how features are chosen when multiple features are found overlap an alignment locus. Selected features are "assigned" a portion of the reads associated with the alignment. -Rules apply to features parsed from **all** Feature Sources, with the exception of "Alias by..." which only applies to the Feature Source on the same row. Selection first takes place against feature attributes (GFF column 9), and is directed by defining the attribute you want to be considered (Select for...) and the acceptable values for that attribute (with value...). +Selection first takes place against the feature attributes defined in your GFF files, and is directed by defining the attribute you want to be considered (Select for...) and the acceptable values for that attribute (with value...). Rules that match features in the first stage of selection will be used in a second stage which evaluates alignment vs. feature interval overlap. These matches are sorted by hierarchy value and passed to the third and final stage of selection which examines characteristics of the alignment itself: strand relative to the feature of interest, 5' end nucleotide, and length. See [tiny-count's documentation](tiny-count.md#feature-selection) for an explanation of each column. ->**Tip**: Don't worry about having duplicate Feature Source entries. Each GFF file is parsed only once. 
- ## Plot Stylesheet Details Matplotlib uses key-value "rc parameters" to allow for customization of its properties and styles, and one way these parameters can be specified is with a [matplotlibrc file](https://matplotlib.org/3.4.3/tutorials/introductory/customizing.html#a-sample-matplotlibrc-file), which we simply refer to as the Plot Stylesheet. You can obtain a copy of the default stylesheet used by tiny-plot with the command `tiny get-template`. Please keep in mind that tiny-plot overrides these defaults for a few specific elements of certain plots. Feel free to reach out if there is a plot style you wish to override but find you are unable to. \ No newline at end of file diff --git a/doc/Parameters.md b/doc/Parameters.md index c19634fc..7801a36c 100644 --- a/doc/Parameters.md +++ b/doc/Parameters.md @@ -122,8 +122,8 @@ Diagnostic information will include intermediate alignment files for each librar ### Full tiny-count Help String ``` -tiny-count -i SAMPLES -f FEATURES -o OUTPUTPREFIX [-h] - [-sf [SOURCE ...]] [-tf [TYPE ...]] [-nh T/F] [-dc] [-a] +tiny-count -pf PATHS -o OUTPUTPREFIX [-h] [-sf [SOURCE ...]] + [-tf [TYPE ...]] [-nh T/F] [-dc] [-sv {Cython,HTSeq}] [-a] [-p] [-d] This submodule assigns feature counts for SAM alignments using a Feature Sheet @@ -132,10 +132,8 @@ prior run, we recommend that you instead run `tiny recount` within that run's directory. Required arguments: - -i SAMPLES, --samples-csv SAMPLES - your Samples Sheet - -f FEATURES, --features-csv FEATURES - your Features Sheet + -pf PATHS, --paths-file PATHS + your Paths File -o OUTPUTPREFIX, --out-prefix OUTPUTPREFIX output prefix to use for file names diff --git a/doc/tiny-count.md b/doc/tiny-count.md index c5e447a6..8b575a56 100644 --- a/doc/tiny-count.md +++ b/doc/tiny-count.md @@ -111,23 +111,15 @@ Examples: >**Tip:** you may specify U and T bases in your rules. Uracil bases will be converted to thymine when your Features Sheet is loaded. N bases are also allowed. 
-### Misc -| features.csv columns: | Alias by... | Feature Source | -|-----------------------|-------------|----------------| - -You may specify an **Alias by...** which is a GFF column 9 attribute key you wish to represent each feature. The intention of this column is to provide a human-friendly name for each feature. The value associated with each feature's **Alias by...** attribute will be shown in the `Feature Name` column of the Feature Counts output table. For example, if one of your rules specifies an alias of `sequence_name` and gene1's `sequence_name` attribute is "abc123", then gene1's `Feature Name` column in the Feature Counts table will read "abc123". - -The **Feature Source** field of a rule is tied only to the **Alias by...**; rules are _not_ partitioned on a GFF file basis, and features parsed from these GFF files are similarly not partitioned as they all go into the same lookup table regardless of source. For each rule, aliases are built on a per-GFF file basis; that is, **Alias by...** values will only be gathered from their corresponding **Feature Source**. Additionally, each GFF file is parsed only once regardless of the number of times it occurs in the Features Sheet. - ## Count Normalization Small RNA reads passing selection will receive a normalized count increment. By default, read counts are normalized twice before being assigned to a feature. The second normalization step can be disabled in `run_config.yml` if desired. Counts for each small RNA sequence are divided: 1. By the number of loci it aligns to in the genome. 2. By the number of _selected_ features for each of its alignments. 
## The Details -You may encounter the following cases when you have more than one unique GFF file listed in your **Feature Source**s: +You may encounter the following cases when you have more than one unique GFF file listed in your Paths File: - If a feature is defined in one GFF file, then again in another but with differing attributes, rule and alias matches will be merged for the feature -- If a feature is defined in one GFF file, then again but under a different **Alias by...**, then both aliases are retained and treated as a list. All aliases will be present in the `Feature Name` column of the Feature Counts output table. They will be comma separated. +- If a feature is defined in one GFF file, then again but with a different `alias` parameter, then all unique aliases are retained for the feature and listed in its `Feature Name` column. Discontinuous features and feature filtering support: - Discontinuous features are supported (as defined by the `Parent` attribute key, or by a shared `ID`/`gene_id` attribute value). Rule and alias matches of descendents are merged with the root parent's. 
diff --git a/tests/unit_test_helpers.py b/tests/unit_test_helpers.py index a6c7bd7e..5b424bca 100644 --- a/tests/unit_test_helpers.py +++ b/tests/unit_test_helpers.py @@ -14,7 +14,7 @@ from typing import List -from tiny.rna.configuration import CSVReader +from tiny.rna.configuration import CSVReader, PathsFile rules_template = [{'Identity': ("Name", "N/A"), 'Strand': "both", @@ -47,6 +47,22 @@ def csv_factory(type: str, rows: List[dict], header=()): return csv_string.getvalue() +paths_template_file = os.path.abspath('../tiny/templates/paths.yml') + + +def make_paths_file(in_pipeline=False, prefs=None): + """IMPORTANT: relative file paths are evaluated relative to /tiny/templates/""" + + paths_file = paths_template_file + config = PathsFile(paths_file, in_pipeline) + if prefs is None: return config + + for key, val in prefs.items(): + config[key] = val + + return config + + def get_dir_tree(root_path: str) -> dict: """Returns a nested dictionary representation of a given directory tree. diff --git a/tests/unit_tests_configuration.py b/tests/unit_tests_configuration.py index e06eb114..5b9b63e2 100644 --- a/tests/unit_tests_configuration.py +++ b/tests/unit_tests_configuration.py @@ -1,11 +1,12 @@ import contextlib import io import os +import random import unittest from unittest.mock import patch, mock_open, call -from tiny.rna.configuration import Configuration, SamplesSheet -from unit_test_helpers import csv_factory +from tiny.rna.configuration import Configuration, SamplesSheet, PathsFile +from unit_test_helpers import csv_factory, paths_template_file, make_paths_file class BowtieIndexesTest(unittest.TestCase): @@ -15,11 +16,6 @@ def setUpClass(self): self.run_config = self.root_cfg_dir + "/run_config_template.yml" self.paths = self.root_cfg_dir + "/paths.yml" - self.default_prefix = os.path.join( - self.root_cfg_dir, - Configuration(self.run_config)['run_directory'], - "bowtie-build/ram1" - ) self.maxDiff = 1522 """============ Helper functions ============""" @@ 
-31,11 +27,21 @@ def config_with(self, prefs): return config def bt_idx_files_from_prefix(self, prefix): + if not os.path.isabs(prefix): + prefix = os.path.normpath(os.path.join(self.root_cfg_dir, prefix)) + return [ {'path': f"{prefix}.{subext}.ebwt", 'class': 'File'} for subext in ['1', '2', '3', '4', 'rev.1', 'rev.2'] ] + def get_default_prefix(self, config): + return os.path.join( + self.root_cfg_dir, + config['run_directory'], + "bowtie-build/ram1" + ) + """================ Tests ==================""" """Does get_ebwt_prefix() produce the expected prefix path?""" @@ -43,7 +49,7 @@ def bt_idx_files_from_prefix(self, prefix): def test_get_ebwt_prefix(self): config = Configuration(self.run_config) actual_prefix = config.get_ebwt_prefix() - expected_prefix = self.default_prefix + expected_prefix = self.get_default_prefix(config) self.assertEqual(actual_prefix, expected_prefix) @@ -60,7 +66,8 @@ def test_get_ebwt_prefix_no_genome(self): def test_get_bt_index_files_prebuilt_indexes(self): config = self.config_with({'run_bowtie_build': False}) - prefix = config.paths['ebwt'] = os.path.abspath("./testdata/counter/validation/ebwt/ram1") + prefix = os.path.abspath("./testdata/counter/validation/ebwt/ram1") + config.paths['ebwt'] = prefix expected = self.bt_idx_files_from_prefix(prefix) self.assertListEqual(config.get_bt_index_files(), expected) @@ -69,7 +76,8 @@ def test_get_bt_index_files_prebuilt_indexes(self): def test_get_bt_index_files_unbuilt_indexes_with_genome(self): config = self.config_with({'run_bowtie_build': True}) - prefix = config.paths['ebwt'] = "mock_prefix" + config.paths['ebwt'] = "mock_prefix" + prefix = config.paths['ebwt'] expected = self.bt_idx_files_from_prefix(prefix) self.assertListEqual(config.get_bt_index_files(), expected) @@ -78,10 +86,11 @@ def test_get_bt_index_files_unbuilt_indexes_with_genome(self): def test_get_bt_index_files_missing_indexes_without_genome(self): config = self.config_with({'run_bowtie_build': False, 
'reference_genome_files': None}) - prefix = config.paths['ebwt'] = "missing" + prefix = "missing" + config.paths['ebwt'] = prefix errmsg = '\n'.join([ "The following Bowtie index file couldn't be found:", - "\t" + f"{prefix}.1.ebwt", + "\t" + f"{self.root_cfg_dir}/{prefix}.1.ebwt", "\nPlease either correct your ebwt prefix or add reference genomes in the Paths File." ]) @@ -94,13 +103,14 @@ def test_get_bt_index_files_missing_indexes_without_genome(self): def test_get_bt_index_files_missing_indexes_with_genome(self): config = self.config_with({'run_bowtie_build': False}) - bad_prefix = config.paths['ebwt'] = "missing" - genome_prefix = self.default_prefix + bad_prefix = "missing" + config.paths['ebwt'] = bad_prefix + genome_prefix = self.get_default_prefix(config) expected_files = self.bt_idx_files_from_prefix(genome_prefix) expected_error = '\n'.join([ "The following Bowtie index file couldn't be found:", - "\t" + f"{bad_prefix}.1.ebwt", + "\t" + f"{self.root_cfg_dir}/{bad_prefix}.1.ebwt", "\nIndexes will be built from your reference genome files during this run.", "" ]) @@ -200,5 +210,128 @@ def test_validate_fastq_filepath(self): SamplesSheet('mock_filename') +class PathsFileTest(unittest.TestCase): + + @classmethod + def setUpClass(self): + self.template_dir = os.path.dirname(paths_template_file) + + """Does PathsFile automatically resolve paths when queried?""" + + def test_getitem_single(self): + config = make_paths_file() + config['mock_parameter'] = "./some/../file" + self.assertEqual(config['mock_parameter'], os.path.join(self.template_dir, "file")) + + """Does PathsFile automatically resolve lists of paths when queried? 
What if some list items are + strings, others are mappings with a "path" key, and others are None/empty?""" + + def test_getitem_group(self): + config = make_paths_file() + config.groups = ('mock_parameter',) + + mapping_1 = {'path': "./some/../file", "other_key": "irrelevant"} + mapping_2 = {'path': "../templates/another_file"} + path_string = "../../START_HERE/reference_data/ram1.gff3" + + config['mock_parameter'] = [mapping_1, mapping_2, path_string, None, ''] + + # Notice: only 'path' is modified in mappings and empty entries are returned unmodified + expected = [ + dict(mapping_1, path=os.path.join(self.template_dir, "file")), + dict(mapping_2, path=os.path.join(self.template_dir, "another_file")), + os.path.normpath(os.path.join(self.template_dir, path_string)), + None, + '' + ] + + self.assertListEqual(config['mock_parameter'], expected) + + """Does PathsFile check for required parameters?""" + + def test_validate_required_parameters(self): + config = make_paths_file() + for key in PathsFile.required: + oldval = config[key] + + with self.assertRaisesRegex(AssertionError, r".*(parameters are required).*"): + config[key] = '' + config.validate_paths() + + with self.assertRaisesRegex(AssertionError, r".*(parameters are required).*"): + config[key] = None + config.validate_paths() + + with self.assertRaisesRegex(AssertionError, r".*(parameters are required).*"): + del config.config[key] + config.validate_paths() + + config[key] = oldval + + """Does PathsFile check that at least one GFF file has been provided?""" + + def test_validate_gff_files(self): + config = make_paths_file() + + with self.assertRaisesRegex(AssertionError, r".*(At least one GFF).*"): + config['gff_files'] = [{'path': "", 'alias': []}] + config.validate_paths() + + with self.assertRaisesRegex(AssertionError, r".*(At least one GFF).*"): + config['gff_files'] = [{'irrelevant': "value"}] + config.validate_paths() + + """Does PathsFile check for missing files for single entry parameters?""" + + 
def test_validate_missing_file_single(self): + config = make_paths_file() + key = random.choice(PathsFile.single) + config[key] = '/dev/null/file_dne' + + with self.assertRaisesRegex(AssertionError, f".*(file provided for {key}).*"): + config.validate_paths() + + """Does PathsFile check for missing files under list-type parameters?""" + + def test_validate_missing_file_group(self): + config = make_paths_file() + bad_path = "/dev/null/file_dne" + config.append_to('gff_files', {'path': bad_path, 'alias': []}) + + # Config now has one good entry and one bad entry; make sure the bad entry was specifically reported + with self.assertRaisesRegex(AssertionError, f".*(file provided under gff_files).*\n\t{bad_path}"): + config.validate_paths() + + """Does PathsFile detect a backward compatibility issue?""" + + def test_backward_incompatibility(self): + config = make_paths_file() + del config.config['gff_files'] + + with self.assertRaisesRegex(AssertionError, r".*(check the release notes).*"): + config.check_backward_compatibility() + + """Does PathsFile map all paths to the working directory when in_pipeline=True?""" + + def test_pipeline_mapping(self): + # There are entries in the template Paths File that will cause FileNotFound + # when in_pipeline=True (they're not in the template directory). Since + # file existence isn't relevant for this test, we patch that out. 
+ with patch('tiny.rna.configuration.os.path.isfile', return_value=True): + config = make_paths_file(in_pipeline=True) + + config['mock_mapping'] = {'path': "/dev/null/file_dne", 'other': "irrelevant"} + config['mapping_no_path'] = {'nopath': True} + config['mock_path'] = "/a/very/long/path.gz" + config['empty_path'] = "" + config['none_path'] = None + + self.assertDictEqual(config['mock_mapping'], {'path': "file_dne", 'other': "irrelevant"}) + self.assertDictEqual(config['mapping_no_path'], {'nopath': True}) + self.assertEqual(config['mock_path'], "path.gz") + self.assertEqual(config['empty_path'], "") + self.assertEqual(config['none_path'], None) + + if __name__ == '__main__': unittest.main() diff --git a/tests/unit_tests_counter.py b/tests/unit_tests_counter.py index 56a26990..349f9b5a 100644 --- a/tests/unit_tests_counter.py +++ b/tests/unit_tests_counter.py @@ -32,14 +32,12 @@ def setUpClass(self): self.csv_feat_row_dict = { 'Key': "Class", 'Value': "CSR", - 'Name': "Alias", 'Class': "", 'Hierarchy': "1", 'Strand': "antisense", "nt5end": '"C,G,U"', # Needs to be double-quoted due to commas 'Length': "all", - 'Overlap': "Partial", - 'Source': "test_file.gff3" + 'Overlap': "Partial" } # Represents the parsed Features Sheet row above @@ -183,13 +181,9 @@ def test_load_config_single_cmd(self): with patch('tiny.rna.configuration.open', mock_open(read_data=csv)): dummy_file = '/dev/null' - ruleset, gff_files = counter.load_config(dummy_file, is_pipeline=False) + ruleset = counter.load_config(dummy_file, is_pipeline=False) expected_ruleset = self.parsed_feat_rule - expected_gff_file = from_here(dummy_file, row['Source']) - expected_gff_ret = defaultdict(list, zip([expected_gff_file], [[row['Name']]])) - - self.assertEqual(gff_files, expected_gff_ret) self.assertEqual(ruleset, expected_ruleset) """Does load_config correctly parse a single-entry features.csv config file for pipeline invocation?""" @@ -201,13 +195,9 @@ def test_load_config_single_pipeline(self): with 
patch('tiny.rna.configuration.open', mock_open(read_data=csv)): dummy_file = '/dev/null' - ruleset, gff_files = counter.load_config(dummy_file, is_pipeline=True) + ruleset = counter.load_config(dummy_file, is_pipeline=True) expected_ruleset = self.parsed_feat_rule - expected_gff_file = os.path.basename(row['Source']) - expected_gff_ret = defaultdict(list, zip([expected_gff_file], [[row['Name']]])) - - self.assertEqual(gff_files, expected_gff_ret) self.assertEqual(ruleset, expected_ruleset) """Does load_config correctly handle duplicate rules? Want: no duplicate rules and no duplicate Name Attributes.""" @@ -219,13 +209,9 @@ def test_load_config_duplicate_rules(self): with patch('tiny.rna.configuration.open', mock_open(read_data=csv)): dummy_filename = '/dev/null' - ruleset, gff_files = counter.load_config(dummy_filename, False) + ruleset = counter.load_config(dummy_filename, False) expected_ruleset = self.parsed_feat_rule - expected_gff_file = from_here(dummy_filename, row['Source']) - expected_gff_ret = defaultdict(list, zip([expected_gff_file], [[row['Name']]])) - - self.assertEqual(gff_files, expected_gff_ret) self.assertEqual(ruleset, expected_ruleset) """Does load_config convert uracil to thymine for proper matching with cDNA sequences?""" @@ -237,26 +223,44 @@ def test_load_config_rna_to_cDNA(self): with patch('tiny.rna.configuration.open', mock_open(read_data=csv)): dummy_file = '/dev/null' - ruleset, _ = counter.load_config(dummy_file, False) + ruleset = counter.load_config(dummy_file, False) self.assertEqual(ruleset[0]['nt5end'], 'T') - """Does load_config properly screen for "ID" Name Attributes?""" + """Does load_gff_files properly screen for "ID" alias attributes?""" - def test_load_config_id_name_attr(self): - row = self.csv_feat_row_dict.copy() - row['Name'] = 'ID' - csv = self.csv("features.csv", [row]) + def test_load_gff_files_id_alias_attr(self): + config = helpers.make_paths_file() + config['gff_files'] = [{'path': "/irrelevant", 'alias': 
['ID']}] - with patch('tiny.rna.configuration.open', mock_open(read_data=csv)): - dummy_file = '/dev/null' - _, gff_files = counter.load_config(dummy_file, False) + # Patch GFFValidator so that it isn't called (not relevant) + with patch('tiny.rna.counter.counter.GFFValidator'): + gff_files = counter.load_gff_files(config, [], {}) + + expected = {"/irrelevant": []} + self.assertDictEqual(gff_files, expected) + + """Does load_gff_files merge aliases if the same GFF file is listed more than once? + Are duplicates also removed and original order preserved?""" - # Expect {file: [empty Name Attribute list]} - from_dummy = from_here(dummy_file, row['Source']) - expected = defaultdict(list, zip([from_dummy], [[]])) - self.assertEqual(gff_files, expected) + def test_load_gff_files_merge_alias_attr(self): + config = helpers.make_paths_file() + config['gff_files'] = [ + {'path': "/same/path", 'alias': ['attr1', 'attr2']}, + {'path': "/same/path", 'alias': ['attr1', 'attrN']}, + {'path': "/different/path", 'alias': ['attr3']} + ] + + # Patch GFFValidator so that it isn't called (not relevant) + with patch('tiny.rna.counter.counter.GFFValidator'): + gff_files = counter.load_gff_files(config, [], {}) + + expected = { + "/same/path": ['attr1', 'attr2', 'attrN'], + "/different/path": ['attr3'] + } + self.assertDictEqual(gff_files, expected) if __name__ == '__main__': unittest.main() diff --git a/tiny/cwl/tools/tiny-count.cwl b/tiny/cwl/tools/tiny-count.cwl index 419aa730..5bdf5a17 100644 --- a/tiny/cwl/tools/tiny-count.cwl +++ b/tiny/cwl/tools/tiny-count.cwl @@ -6,6 +6,8 @@ class: CommandLineTool requirements: - class: InitialWorkDirRequirement listing: [ + $(inputs.samples_csv), + $(inputs.features_csv), $(inputs.gff_files), $(inputs.aligned_seqs), $(inputs.fastp_logs), @@ -16,15 +18,10 @@ baseCommand: tiny-count stdout: console_output.log inputs: - samples_csv: - type: File - inputBinding: - prefix: -i - - features_csv: + paths_file: type: File inputBinding: - prefix: -f + 
prefix: -pf out_prefix: type: string @@ -75,6 +72,12 @@ inputs: # The following optional inputs are for staging InitialWorkingDir files for pipeline execution + samples_csv: + type: File + + features_csv: + type: File + # Specifies the GFF files defined in features.csv gff_files: type: File[]? diff --git a/tiny/cwl/workflows/tinyrna_wf.cwl b/tiny/cwl/workflows/tinyrna_wf.cwl index 85fd388f..dffdc995 100644 --- a/tiny/cwl/workflows/tinyrna_wf.cwl +++ b/tiny/cwl/workflows/tinyrna_wf.cwl @@ -76,6 +76,7 @@ inputs: shared_memory: boolean? # counter inputs + paths_file: File samples_csv: File features_csv: File gff_files: File[]? @@ -204,6 +205,7 @@ steps: counter: run: ../tools/tiny-count.cwl in: + paths_file: paths_file samples_csv: samples_csv features_csv: features_csv aligned_seqs: bowtie/sam_out diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index 394f97a3..997770b7 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -10,7 +10,7 @@ from pkg_resources import resource_filename from collections import Counter, OrderedDict from datetime import datetime -from typing import Union, Any +from typing import Union, Any, Optional from glob import glob from tiny.rna.counter.validation import GFFValidator @@ -95,15 +95,9 @@ def prefix(path: str) -> str: return os.path.splitext(path)[0] @staticmethod - def joinpath(path1: str, path2: str) -> str: - """Combines two relative paths intelligently""" - path1, path2 = (os.path.expanduser(p) for p in [path1, path2]) - if os.path.isabs(path2): return path2 - return os.path.normpath(os.path.join(path1, path2)) - - @staticmethod - def cwl_file(file: str, verify=True) -> dict: + def cwl_file(file: Union[str,dict], verify=True) -> dict: """Returns a minimal File object as defined by CWL""" + if isinstance(file, dict): file = file['path'] if '~' in file: file = os.path.expanduser(file) if verify and not os.path.exists(file): raise(FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file)) @@ 
-114,10 +108,35 @@ def cwl_dir(dir: str) -> dict: """Returns a minimal Directory object as defined by CWL""" return {'class': 'Directory', 'path': dir} - def from_here(self, destination: str, origin: str = None): - """Calculates paths relative to the input config file""" - origin = self.dir if origin is None else origin - return self.joinpath(origin, destination) + @staticmethod + def joinpath(path1: str, path2: str) -> str: + """Combines two relative paths intelligently""" + path1 = os.path.expanduser(path1) if path1 is not None else "" + path2 = os.path.expanduser(path2) if path2 is not None else "" + if os.path.isabs(path2): return path2 + return os.path.normpath(os.path.join(path1, path2)) + + def from_here(self, destination: Union[str,dict,None], origin: Union[str,dict,None] = None) -> Union[str,dict,None]: + """Calculates paths relative to the input config file. An empty destination returns an empty path. + If destination is an absolute path, then destination is returned without any further joining. + Args: + destination: the path to evaluate relative to the config file. Can be a string or CWL file dictionary. 
+ origin: the starting point (default: the config file's directory) + """ + + if isinstance(origin, dict): origin = origin.get('path') + if origin is None: origin = self.dir + + if ( + isinstance(destination, dict) and + isinstance(destination.get("path"), (str, bytes)) and + len(destination['path']) + ): + return dict(destination, path=self.joinpath(origin, destination["path"])) + elif isinstance(destination, (str, bytes)) and bool(destination): + return self.joinpath(origin, destination) + else: + return destination def create_run_directory(self) -> str: """Create the destination directory for pipeline outputs""" @@ -168,42 +187,31 @@ def __init__(self, config_file: str, validate_inputs=False): super().__init__(config_file) self.paths = self.load_paths_config() - self.process_paths_sheet() + self.assimilate_paths_file() self.setup_pipeline() - self.setup_per_file() + self.setup_file_groups() self.setup_ebwt_idx() self.process_samples_sheet() self.process_features_sheet() if validate_inputs: self.validate_inputs() def load_paths_config(self): - """Constructs a sub-configuration object containing workflow file preferences""" - path_sheet = self.from_here(self['paths_config']) - return ConfigBase(path_sheet) - - def process_paths_sheet(self): - """Loads the paths of all related config files and workflow inputs""" - - def to_cwl_file_class(input_file_path): - path_to_input = self.paths.from_here(input_file_path) - return self.cwl_file(path_to_input) - - for absorb_key in ['ebwt', 'plot_style_sheet', 'adapter_fasta', 'tmp_directory']: - self[absorb_key] = self.paths[absorb_key] - self['run_directory'] = self.paths.from_here(self.paths['run_directory']) - - # Configurations that need to be converted from string to a CWL File object - self['samples_csv'] = to_cwl_file_class(self.paths['samples_csv']) - self['features_csv'] = to_cwl_file_class(self.paths['features_csv']) - self['reference_genome_files'] = [ - to_cwl_file_class(genome) - for genome in 
self.paths['reference_genome_files'] - if genome is not None - ] + """Constructs a sub-configuration object containing workflow file preferences + self['paths_config'] is the user-facing file path (just the path string) + self['paths_file'] is a CWL file object used as a workflow input.""" + paths_file_path = self.from_here(self['paths_config']) + return PathsFile(paths_file_path) + + def assimilate_paths_file(self): + for key in [*PathsFile.single, *PathsFile.groups]: + self[key] = self.paths.as_cwl_file_obj(key) + for key in PathsFile.prefix: + self[key] = self.paths[key] + self['paths_file'] = self.cwl_file(self.paths.inf) def process_samples_sheet(self): - samples_sheet_path = self.paths.from_here(self['samples_csv']['path']) + samples_sheet_path = self.paths['samples_csv'] samples_sheet = SamplesSheet(samples_sheet_path) self['sample_basenames'] = samples_sheet.sample_basenames @@ -214,20 +222,11 @@ def process_samples_sheet(self): self['fastp_report_titles'] = [f"{g}_rep_{r}" for g, r in samples_sheet.groups_reps] def process_features_sheet(self): - features_sheet = self.paths.from_here(self['features_csv']['path']) - features_sheet_dir = os.path.dirname(features_sheet) + # Configuration doesn't currently do anything with Features Sheet contents + return - csv_reader = CSVReader(features_sheet, "Features Sheet") - for row in csv_reader.rows(): - gff_file = self.from_here(row['Source'], origin=features_sheet_dir) - try: - self.append_if_absent('gff_files', self.cwl_file(gff_file)) - except FileNotFoundError: - row_num = csv_reader.row_num - sys.exit("The GFF file on line %d of your Features Sheet was not found:\n%s" % (row_num, gff_file)) - - def setup_per_file(self): - """Per-library settings lists to be populated by entries from samples_csv""" + def setup_file_groups(self): + """Configuration keys that represent lists of files""" self.set_default_dict({per_file_setting_key: [] for per_file_setting_key in ['in_fq', 'sample_basenames', 'gff_files', 
'fastp_report_titles'] @@ -243,10 +242,8 @@ def setup_pipeline(self): self['run_name'] = self.get('run_name', default=default_run_name) + "_" + self.dt # Create prefixed Run Directory name - run_dir_resolved = self.paths.from_here(self.get('run_directory', default='run_directory')) - run_dir_parent = os.path.dirname(run_dir_resolved) - run_dir_withdt = self['run_name'] + '_' + os.path.basename(run_dir_resolved) - self['run_directory'] = self.joinpath(run_dir_parent, run_dir_withdt) + run_dir_parent, run_dir = os.path.split(self['run_directory'].rstrip(os.sep)) + self['run_directory'] = self.joinpath(run_dir_parent, self['run_name'] + "_" + run_dir) self.templates = resource_filename('tiny', 'templates/') @@ -263,8 +260,6 @@ def setup_ebwt_idx(self): # Set the prefix to the run directory outputs. This is necessary # because workflow requires bt_index_files to be a populated list. self.paths['ebwt'] = self.get_ebwt_prefix() - else: - self.paths['ebwt'] = self.paths.from_here(self.paths['ebwt']) # verify_bowtie_build_outputs() will check if these end up being long indexes self['bt_index_files'] = self.get_bt_index_files() @@ -279,9 +274,9 @@ def get_ebwt_prefix(self): genome_files = self['reference_genome_files'] if not genome_files: - raise ValueError("If your Paths Sheet doesn't have a value for \"ebtw:\", then bowtie indexes " + raise ValueError("If your Paths File doesn't have a value for \"ebwt:\", then bowtie indexes " "will be built, but you'll need to provide your reference genome files under " - '"reference_genome_files:" (also in your Paths Sheet)') + '"reference_genome_files:" (also in your Paths File)') genome_basename = os.path.basename(genome_files[0]['path']) return self.prefix(os.path.join( # prefix path: @@ -378,6 +373,111 @@ def main(): config_object.write_processed_config(f"processed_{file_basename}") +class PathsFile(ConfigBase): + """A configuration class for managing and validating Paths Files. 
+ Relative paths are automatically resolved on lookup and list types are enforced. + While this is convenient, developers should be aware of the following caveats: + - Lookups that return list values do not return the original object; don't + append to them. Instead, use the append_to() helper function. + - Chained assignments can produce unexpected results. + """ + + required = ('samples_csv', 'features_csv', 'gff_files') + single = ('samples_csv', 'features_csv', 'plot_style_sheet', 'adapter_fasta') + groups = ('reference_genome_files', 'gff_files') + prefix = ('ebwt', 'run_directory', 'tmp_directory') + + def __init__(self, file: str, in_pipeline=False): + super().__init__(file) + self.in_pipeline = in_pipeline + self.map_path = self.from_pipeline if in_pipeline else self.from_here + self.check_backward_compatibility() + self.validate_paths() + + @staticmethod + def from_pipeline(value): + """When tiny-count runs as a pipeline step, all file inputs are + sourced from the working directory regardless of original path.""" + + if isinstance(value, dict) and value.get("path") is not None: + return dict(value, path=os.path.basename(value['path'])) + elif isinstance(value, (str, bytes)): + return os.path.basename(value) + else: + return value + + def __getitem__(self, key: str): + """Automatically performs path resolution for both single and group parameters. 
+ Note that only keys listed in self.groups are guaranteed to be returned as lists.""" + + value = self.config.get(key) + if key in self.groups: + if value is None: return [] + return [self.map_path(p) for p in value] + else: + return self.map_path(value) + + def as_cwl_file_obj(self, key: str): + """Returns the specified parameter with file paths converted to CWL file objects.""" + + val = self[key] + + if not val: + return val + elif key in self.single: + return self.cwl_file(val) + elif key in self.groups: + return [self.cwl_file(sub) for sub in val if sub] + elif key in self.prefix: + raise ValueError(f"The parameter {key} isn't meant to be a CWL file object.") + else: + raise ValueError(f'Unrecognized parameter: "{key}"') + + def validate_paths(self): + assert all(self[req] for req in self.required), \ + "The following parameters are required in {selfname}: {params}" \ + .format(selfname=self.basename, params=', '.join(self.required)) + + assert any(gff.get('path') for gff in self['gff_files']), \ + "At least one GFF file path must be specified under gff_files in {selfname}" \ + .format(selfname=self.basename) + + # Some entries in Paths File are omitted from tiny-count's working directory during + # pipeline runs. There is no advantage to checking file existence here vs. 
in load_* + if self.in_pipeline: return + + for key in self.single: + resolved_path = self[key] + if not resolved_path: continue + assert os.path.isfile(resolved_path), \ + "The file provided for {key} in {selfname} could not be found:\n\t{file}" \ + .format(key=key, selfname=self.basename, file=resolved_path) + + for key in self.groups: + for entry in self[key]: + if isinstance(entry, dict): entry = entry['path'] + assert os.path.isfile(entry), \ + "The following file provided under {key} in {selfname} could not be found:\n\t{file}" \ + .format(key=key, selfname=self.basename, file=entry) + + def check_backward_compatibility(self): + assert 'gff_files' in self, \ + "The gff_files parameter was not found in your Paths File. This likely means " \ + "that you are using a Paths File from an earlier version of tinyRNA. Please " \ + "check the release notes and update your configuration files." + + # Override + def append_to(self, key: str, val: Any): + """Overrides method in the base class. 
This is necessary due to automatic + path resolution in __getitem__ which returns a *temporary* list value, and + items appended to the temporary list would otherwise be lost.""" + + assert key in self.groups, "Tried appending to a non-list type parameter" + target = self.config.get(key, []) + target.append(val) + return target + + class SamplesSheet: def __init__(self, file): self.csv = CSVReader(file, "Samples Sheet") @@ -483,14 +583,12 @@ class CSVReader(csv.DictReader): "Features Sheet": OrderedDict({ "Select for...": "Key", "with value...": "Value", - "Alias by...": "Name", "Classify as...": "Class", "Hierarchy": "Hierarchy", "Strand": "Strand", "5' End Nucleotide": "nt5end", "Length": "Length", "Overlap": "Overlap", - "Feature Source": "Source" }), "Samples Sheet": OrderedDict({ "Input FASTQ Files": "File", @@ -524,11 +622,16 @@ def rows(self): yield row def validate_csv_header(self, header: OrderedDict): + # The expected header values doc_reference = self.tinyrna_sheet_fields[self.doctype] expected = {key.lower() for key in doc_reference.keys()} - read_vals = {val.lower() for val in header.values() if val is not None} + + # The header values that were read + read_vals = {val.lower() for key, val in header.items() if None not in (key, val)} + read_vals.update(val.lower() for val in header.get(None, ())) # Extra headers self.check_backward_compatibility(read_vals) + # Find differences between actual and expected headers unknown = {col_name for col_name in read_vals if col_name not in expected} missing = expected - read_vals diff --git a/tiny/rna/counter/counter.py b/tiny/rna/counter/counter.py index da90e2b7..032f80eb 100644 --- a/tiny/rna/counter/counter.py +++ b/tiny/rna/counter/counter.py @@ -17,7 +17,7 @@ from tiny.rna.counter.features import Features, FeatureCounter from tiny.rna.counter.statistics import MergedStatsManager from tiny.rna.util import report_execution_time, from_here, ReadOnlyDict -from tiny.rna.configuration import CSVReader +from 
tiny.rna.configuration import CSVReader, PathsFile # Global variables for multiprocessing counter: FeatureCounter @@ -31,10 +31,8 @@ def get_args(): optional_args = arg_parser.add_argument_group("Optional arguments") # Required arguments - required_args.add_argument('-i', '--samples-csv', metavar='SAMPLES', required=True, - help='your Samples Sheet') - required_args.add_argument('-f', '--features-csv', metavar='FEATURES', required=True, - help='your Features Sheet') + required_args.add_argument('-pf', '--paths-file', metavar='PATHS', required=True, + help='your Paths File') required_args.add_argument('-o', '--out-prefix', metavar='OUTPUTPREFIX', required=True, help='output prefix to use for file names') @@ -125,22 +123,20 @@ def get_library_filename(csv_row_file: str, samples_csv: str) -> str: return inputs -def load_config(features_csv: str, is_pipeline: bool) -> Tuple[List[dict], Dict[str, list]]: - """Parses the Features Sheet to provide inputs to FeatureSelector and build_reference_tables +def load_config(features_csv: str, is_pipeline: bool) -> List[dict]: + """Parses the Features Sheet to provide inputs to FeatureSelector and ReferenceTables Args: features_csv: a csv file which defines feature sources and selection rules - is_pipeline: helps locate GFF files defined in the Features Sheet. If true, - GFF files are assumed to reside in the working directory. + is_pipeline: not currently used; helps properly locate input files Returns: rules: a list of dictionaries, each representing a parsed row from input. Note that these are just rule definitions which FeatureSelector will further digest to produce its rules table. 
- gff_files: a dict of GFF files and associated Name Attribute preferences """ - rules, gff_files = list(), defaultdict(list) + rules = list() for row in CSVReader(features_csv, "Features Sheet").rows(): rule = {col: row[col] for col in ["Class", "Hierarchy", "Strand", "nt5end", "Length", "Overlap"]} @@ -149,27 +145,57 @@ def load_config(features_csv: str, is_pipeline: bool) -> Tuple[List[dict], Dict[ rule['Hierarchy'] = int(rule['Hierarchy']) # Convert hierarchy to number rule['Overlap'] = rule['Overlap'].lower() # Built later in ReferenceTables - gff = os.path.basename(row['Source']) if is_pipeline else from_here(features_csv, row['Source']) - - # Duplicate Name Attributes and rule entries are not allowed - if row['Name'] not in ["ID", *gff_files[gff]]: gff_files[gff].append(row['Name']) + # Duplicate rule entries are not allowed if rule not in rules: rules.append(rule) - return rules, gff_files + return rules + +def load_gff_files(paths: PathsFile, libraries: List[dict], args: ReadOnlyDict) -> Dict[str, list]: + """Retrieves the appropriate file path and alias attributes for each GFF, + then validates them with GFFValidator + + Args: + paths: a loaded PathsFile; paths['gff_files'] supplies each GFF + entry as a dict with keys `path` and `alias` + libraries: a list of library files, each as a dict with a 'File' key + args: command line arguments passed to GFFValidator for source/type filters + + Returns: + gff_files: a dict mapping each GFF file path to its list of alias attribute keys + """ -def validate_inputs(gffs, libraries, prefs): if prefs.get('is_pipeline'): return libraries = [lib['File'] for lib in libraries] GFFValidator(gffs, prefs, alignments=libraries).validate() + gff_files = defaultdict(list) + screen_alias = ["id"] + + # Build dictionary of unique files and allowed aliases + for gff in paths['gff_files']: + gff_files[gff['path']].extend( + alias for alias in gff.get('alias', ()) + if alias.lower() not in screen_alias + ) + + # Remove duplicate aliases (per file), keep original order + 
for file, alias in gff_files.items(): + gff_files[file] = sorted(set(alias), key=alias.index) + + # Prepare supporting file inputs for GFF validation + sam_files = [lib['File'] for lib in libraries] + + # Todo: update GFFValidator so that genomes and ebwt can be safely passed here + # ref_genomes = [map_path(fasta) for fasta in paths['reference_genome_files']] + # ebwt_prefix = map_path(paths['ebwt']) + + GFFValidator(gff_files, args, alignments=sam_files).validate() + return gff_files @report_execution_time("Counting and merging") -def map_and_reduce(libraries, prefs): +def map_and_reduce(libraries, paths, prefs): """Assigns one worker process per library and merges the statistics they report""" # MergedStatsManager handles final output files, regardless of multiprocessing status - summary = MergedStatsManager(Features, prefs) + summary = MergedStatsManager(Features, paths['features_csv'], prefs) # Use a multiprocessing pool if multiple sam files were provided if len(libraries) > 1: @@ -190,21 +216,21 @@ def map_and_reduce(libraries, prefs): def main(): # Get command line arguments. 
args = get_args() + is_pipeline = args['is_pipeline'] try: - # Determine SAM inputs and their associated library names - libraries = load_samples(args['samples_csv'], args['is_pipeline']) - - # Load selection rules and feature sources from the Features Sheet - selection_rules, gff_file_set = load_config(args['features_csv'], args['is_pipeline']) - validate_inputs(gff_file_set, libraries, args) + # Load and validate config files and input files + paths = PathsFile(args['paths_file'], is_pipeline) + libraries = load_samples(paths['samples_csv'], is_pipeline) + selection = load_config(paths['features_csv'], is_pipeline) + gff_files = load_gff_files(paths, libraries, args) # global for multiprocessing global counter - counter = FeatureCounter(gff_file_set, selection_rules, **args) + counter = FeatureCounter(gff_files, selection, **args) # Assign and count features using multiprocessing and merge results - merged_counts = map_and_reduce(libraries, args) + merged_counts = map_and_reduce(libraries, paths, args) # Write final outputs merged_counts.write_report_files() diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index 24c73cbf..f0d7793a 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -146,11 +146,11 @@ class MergedStatsManager: chrom_misses = Counter() warnings = [] - def __init__(self, Features_obj, prefs): + def __init__(self, Features_obj, features_csv, prefs): MergedStat.prefix = prefs.get('out_prefix', '') self.merged_stats = [ - FeatureCounts(Features_obj), RuleCounts(prefs['features_csv']), + FeatureCounts(Features_obj), RuleCounts(features_csv), AlignmentStats(), SummaryStats(), NtLenMatrices() ] diff --git a/tiny/rna/counter/validation.py b/tiny/rna/counter/validation.py index 40485e25..d44b10f7 100644 --- a/tiny/rna/counter/validation.py +++ b/tiny/rna/counter/validation.py @@ -1,8 +1,10 @@ import functools import subprocess import sys +import os from collections import Counter, defaultdict +from 
typing import List, Dict from tiny.rna.util import sorted_natural from tiny.rna.counter.hts_parsing import parse_gff, ReferenceTables @@ -155,17 +157,19 @@ def validate_chroms(self, ebwt=None, genomes=None, alignments=None): # First search bowtie indexes if they are available if ebwt is not None: try: - chroms, shared = self.chroms_shared_with_ebwt(ebwt) - self.generate_chrom_report(chroms, shared) + shared, chroms = self.chroms_shared_with_ebwt(ebwt) + self.generate_chrom_report(shared, chroms) return except Exception: pass # Fallback to other input options # Next search the genome fasta(s) if available if genomes is not None: - chroms, shared = self.chroms_shared_with_genomes(genomes) - self.generate_chrom_report(chroms, shared) - return + shared, chroms = self.chroms_shared_with_genomes(genomes) + if chroms: + self.generate_chrom_report(shared, chroms) + return + # Continue search if chroms weren't gathered from genomes # Preferred inputs aren't available; continue testing with heuristic options if alignments is not None: @@ -178,11 +182,17 @@ def validate_chroms_heuristic(self, alignments): self.generate_chrom_heuristics_report(suspect_files) def chroms_shared_with_ebwt(self, index_prefix): - """Returns the set intersection between parsed GFF chromosomes and those in the bowtie index""" + """Determines the complete set of chromosomes in the bowtie index, then finds + their set intersection with the chromosomes parsed from GFF files. 
+ Returns: + a tuple of (shared chromosomes, index chromosomes)""" - summary = subprocess.check_output(['bowtie-inspect', index_prefix, '-s']).decode('latin1').splitlines() - ebwt_chroms = set() + summary = subprocess.check_output( + ['bowtie-inspect', index_prefix, '-s'], + stderr=subprocess.DEVNULL + ).decode('latin1').splitlines() + ebwt_chroms = set() for line in summary: if line.startswith("Sequence-"): try: @@ -194,10 +204,15 @@ def chroms_shared_with_ebwt(self, index_prefix): return shared, ebwt_chroms def chroms_shared_with_genomes(self, genome_fastas): - """Returns the set intersection between parsed GFF chromosomes and those in the reference genome""" + """Determines the complete set of chromosomes defined in each genome file, + then finds their set intersection with the chromosomes parsed from GFF files. + Returns: + a tuple of (shared chromosomes, genome chromosomes)""" genome_chroms = set() for fasta in genome_fastas: + if not os.path.isfile(fasta): + continue with open(fasta, 'rb') as f: for line in f: if line[0] == ord('>'): @@ -206,9 +221,13 @@ def chroms_shared_with_genomes(self, genome_fastas): shared = genome_chroms & self.chrom_set return shared, genome_chroms - def alignment_chroms_mismatch_heuristic(self, sam_files, subset_size=50000): + def alignment_chroms_mismatch_heuristic(self, sam_files: List[str], subset_size=50000) -> Dict[str, set]: """Since alignment files can be very large, we only check that there's at least one shared - chromosome identifier and only the first subset_size lines are read from each file.""" + chromosome identifier and only the first subset_size lines are read from each file. The + returned dictionary contains only the SAM files whose sampled chromosome set failed to + intersect with the chromosomes parsed from GFF files. 
+        Returns:
+            a dictionary of {SAM filename: SAM chromosomes sampled}"""
 
         files_wo_overlap = {}
diff --git a/tiny/templates/features.csv b/tiny/templates/features.csv
index b038764c..58300b5a 100755
--- a/tiny/templates/features.csv
+++ b/tiny/templates/features.csv
@@ -1,7 +1,7 @@
-Select for...,with value...,Alias by...,Classify as...,Hierarchy,Strand,5' End Nucleotide,Length,Overlap,Feature Source
-Class,mask,Alias,,1,both,all,all,Partial,../../START_HERE/reference_data/ram1.gff3
-Class,miRNA,Alias,,2,sense,all,16-22,Full,../../START_HERE/reference_data/ram1.gff3
-Class,piRNA,Alias,5pA,2,both,A,24-32,Full,../../START_HERE/reference_data/ram1.gff3
-Class,piRNA,Alias,5pT,2,both,T,24-32,Full,../../START_HERE/reference_data/ram1.gff3
-Class,siRNA,Alias,,2,both,all,15-22,Full,../../START_HERE/reference_data/ram1.gff3
-Class,unk,Alias,,3,both,all,all,Full,../../START_HERE/reference_data/ram1.gff3
\ No newline at end of file
+Select for...,with value...,Classify as...,Hierarchy,Strand,5' End Nucleotide,Length,Overlap
+Class,mask,,1,both,all,all,Partial
+Class,miRNA,,2,sense,all,16-22,Full
+Class,piRNA,5pA,2,both,A,24-32,Full
+Class,piRNA,5pT,2,both,T,24-32,Full
+Class,siRNA,,2,both,all,15-22,Full
+Class,unk,,3,both,all,all,Full
\ No newline at end of file
diff --git a/tiny/templates/paths.yml b/tiny/templates/paths.yml
index c6553f71..8404f51a 100644
--- a/tiny/templates/paths.yml
+++ b/tiny/templates/paths.yml
@@ -14,6 +14,13 @@
 samples_csv: './samples.csv'
 features_csv: './features.csv'
 
+##-- Each entry: 1. the file, 2. (optional) list of attribute keys for feature aliases --##
+gff_files:
+- path: "../../START_HERE/reference_data/ram1.gff3"
+  alias: [ID]
+#- path:
+#  alias: [ ]
+
 ##-- The final output directory for files produced by the pipeline --#
 run_directory: run_directory
 
diff --git a/tiny/templates/run_config_template.yml b/tiny/templates/run_config_template.yml
index 161a5f72..86523462 100644
--- a/tiny/templates/run_config_template.yml
+++ b/tiny/templates/run_config_template.yml
@@ -1,21 +1,21 @@
 ######----------------------------- tinyRNA Configuration -----------------------------######
 #
-# In this file you may specify your configuration preferences for the workflow and
+# In this file you can specify your configuration preferences for the workflow and
 # each workflow step.
 #
 # If you want to use DEFAULT settings for the workflow, all you need to do is provide the path
-# to your Samples Sheet and Features Sheet in your Paths file, then make sure that the
-# 'paths_config' setting below points to your Paths file.
+# to your Samples Sheet and Features Sheet in your Paths File, then make sure that the
+# 'paths_config' setting below points to your Paths File.
 #
 # We suggest that you also:
 # 1. Add a username to identify the person performing runs, if desired for record keeping
-# 2. Add a run directory name in your Paths file. If not provided, "run_directory" is used
+# 2. Add a run directory name in your Paths File. If not provided, "run_directory" is used
 # 3. Add a run name to label your run directory and run-specific summary reports.
 #    If not provided, user_tinyrna will be used.
 #
 # This file will be further processed at run time to generate the appropriate pipeline
 # settings for each workflow step. A copy of this processed configuration will be stored
-# in your run directory (as specified by your Paths configuration file).
+# in your run directory.
 #
 ######-------------------------------------------------------------------------------######
 
@@ -48,7 +48,7 @@ run_native: false
 # paths_config file.
 #
 # We have specified default parameters for small RNA data based on our own "best practices".
-# You may change the parameters here.
+# You can change the parameters here.
 #
 ######-------------------------------------------------------------------------------######
 
@@ -75,7 +75,7 @@ ftabchars: ~
 # pipeline github: https://github.com/MontgomeryLab/tinyrna
 #
 # We have specified default parameters for small RNA data based on our own "best practices".
-# You may change the parameters here.
+# You can change the parameters here.
 #
 ######-------------------------------------------------------------------------------######
 
@@ -135,7 +135,7 @@ compression: 4
 # Trimming takes place prior to counting/collapsing.
 #
 # We have specified default parameters for small RNA data based on our own "best practices".
-# You may change the parameters here.
+# You can change the parameters here.
 #
 ######-------------------------------------------------------------------------------######
 
@@ -157,7 +157,7 @@ compress: False
 # We use bowtie for read alignment to a genome.
 #
 # We have specified default parameters for small RNA data based on our own "best practices".
-# You may change the parameters here.
+# You can change the parameters here.
 #
 ######-------------------------------------------------------------------------------######
 
@@ -263,12 +263,11 @@ dge_drop_zero: False
 ######-------------------------------- PLOTTING OPTIONS -----------------------------######
 #
-# We use a custom Python script for creating all plots. The default base style is called
-# 'smrna-light'. If you wish to use another matplotlib stylesheet you may specify that in
-# the Paths File.
+# We use a custom Python script for creating all plots. If you wish to use another matplotlib
+# stylesheet you can specify that in the Paths File.
 #
 # We have specified default parameters for small RNA data based on our own "best practices".
-# You may change the parameters here.
+# You can change the parameters here.
 #
 ######-------------------------------------------------------------------------------######
 
@@ -303,7 +302,7 @@ plot_unassigned_class: "_UNASSIGNED_"
 ######----------------------------- OUTPUT DIRECTORIES ------------------------------######
 #
 # Outputs for each step are organized into their own subdirectories in your run
-# directory. You may set these folder names here.
+# directory. You can set these folder names here.
 #
 ######-------------------------------------------------------------------------------######
 
@@ -320,15 +319,15 @@ dir_name_plotter: plots
 ######################### AUTOMATICALLY GENERATED CONFIGURATIONS #########################
 #
 # Do not make any changes to the following sections. These options are automatically
-# generated using your Paths file, your Samples and Features sheets, and the above
+# generated using your Paths File, your Samples and Features sheets, and the above
 # settings in this file.
 #
 ###########################################################################################
 
-######--------------------------- DERIVED FROM PATHS SHEET --------------------------######
+######--------------------------- DERIVED FROM PATHS FILE ---------------------------######
 #
-# The following configuration settings are automatically derived from the sample sheet
+# The following configuration settings are automatically derived from the Paths File
 #
 ######-------------------------------------------------------------------------------######
 
@@ -336,6 +335,8 @@ run_directory: ~
 tmp_directory: ~
 features_csv: { }
 samples_csv: { }
+paths_file: { }
+gff_files: [ ]
 run_bowtie_build: false
 reference_genome_files: [ ]
 plot_style_sheet: ~
@@ -343,9 +344,9 @@ adapter_fasta: ~
 ebwt: ~
 
-######-------------------------- DERIVED FROM SAMPLE SHEET --------------------------######
+######------------------------- DERIVED FROM SAMPLES SHEET --------------------------######
 #
-# The following configuration settings are automatically derived from the sample sheet
+# The following configuration settings are automatically derived from the Samples Sheet
 #
 ######-------------------------------------------------------------------------------######
 
@@ -370,10 +371,6 @@ run_deseq: True
 
 ######------------------------- DERIVED FROM FEATURES SHEET -------------------------######
 #
-# The following configuration settings are automatically derived from the sample sheet
+# The following configuration settings are automatically derived from the Features Sheet
 #
-######-------------------------------------------------------------------------------######
-
-###-- Utilized by tiny-count --###
-# a list of only unique GFF files
-gff_files: [ ]
\ No newline at end of file
+######-------------------------------------------------------------------------------######
\ No newline at end of file