From 74a92e947d49c9e0f05c9baf1f780a4dc170303a Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 13 Oct 2022 17:04:03 -0700 Subject: [PATCH 1/9] Simplifying the user's procedure for activating bowtie-build. Run Config's run_bowtie_build is now an automatically configured value that is triggered by an empty ebwt prefix in the Paths Sheet. --- START_HERE/paths.yml | 5 ++--- START_HERE/run_config.yml | 5 +---- tiny/templates/paths.yml | 5 ++--- tiny/templates/run_config_template.yml | 7 ++----- 4 files changed, 7 insertions(+), 15 deletions(-) diff --git a/START_HERE/paths.yml b/START_HERE/paths.yml index c83a7607..cfdd6281 100644 --- a/START_HERE/paths.yml +++ b/START_HERE/paths.yml @@ -23,9 +23,8 @@ tmp_directory: ######-------------------------------- BOWTIE-BUILD ---------------------------------###### # # To build bowtie indexes: -# 1. Your Run Config file must contain run_bowtie_build: true -# 2. Your reference genome file(s) must be listed under reference_genome_files (below) -# 3. ebwt (below) must be an empty string, or '' +# 1. Your reference genome file(s) must be listed under reference_genome_files (below) +# 2. ebwt (below) must be empty (nothing after ":") # # Once your indexes have been built, this config file will be modified such # that ebwt points to their location (prefix) within your Run Directory. This diff --git a/START_HERE/run_config.yml b/START_HERE/run_config.yml index fc48ef7c..e6489931 100644 --- a/START_HERE/run_config.yml +++ b/START_HERE/run_config.yml @@ -28,10 +28,6 @@ paths_config: ./paths.yml ##-- If none provided, the default of user_tinyrna will be used --## run_name: tinyrna -##-- If True: run bowtie-build before analyzing libraries --## -##-- NOTE: this option may be ignored depending on your Paths file. See Paths file. 
--## -run_bowtie_build: True - ##-- Number of threads to use when a step supports multi-threading --## ##-- For best performance, this should be equal to your computer's processor core count --## threads: 4 @@ -334,6 +330,7 @@ run_directory: ~ tmp_directory: ~ features_csv: { } samples_csv: { } +run_bowtie_build: false reference_genome_files: [ ] plot_style_sheet: ~ adapter_fasta: ~ diff --git a/tiny/templates/paths.yml b/tiny/templates/paths.yml index 6d8ed972..c6553f71 100644 --- a/tiny/templates/paths.yml +++ b/tiny/templates/paths.yml @@ -23,9 +23,8 @@ tmp_directory: ######-------------------------------- BOWTIE-BUILD ---------------------------------###### # # To build bowtie indexes: -# 1. Your Run Config file must contain run_bowtie_build: true -# 2. Your reference genome file(s) must be listed under reference_genome_files (below) -# 3. ebwt (below) must be an empty string, or '' +# 1. Your reference genome file(s) must be listed under reference_genome_files (below) +# 2. ebwt (below) must be empty (nothing after ":") # # Once your indexes have been built, this config file will be modified such # that ebwt points to their location (prefix) within your Run Directory. This diff --git a/tiny/templates/run_config_template.yml b/tiny/templates/run_config_template.yml index 71d85e99..86853a02 100644 --- a/tiny/templates/run_config_template.yml +++ b/tiny/templates/run_config_template.yml @@ -22,16 +22,12 @@ user: ~ run_date: ~ run_time: ~ -paths_config: ../../START_HERE/paths.yml +paths_config: paths.yml ##-- The label for final outputs --## ##-- If none provided, the default of user_tinyrna will be used --## run_name: my_first_run -##-- If True: run bowtie-build before analyzing libraries --## -##-- NOTE: this option may be ignored depending on your Paths file. See Paths file. 
--## -run_bowtie_build: True - ##-- Number of threads to use when a step supports multi-threading --## ##-- For best performance, this should be equal to your computer's processor core count --## threads: 4 @@ -334,6 +330,7 @@ run_directory: ~ tmp_directory: ~ features_csv: { } samples_csv: { } +run_bowtie_build: false reference_genome_files: [ ] plot_style_sheet: ~ adapter_fasta: ~ From 129229c91ae8f174aaf3e1daeebda360637eaaa4 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 13 Oct 2022 17:05:13 -0700 Subject: [PATCH 2/9] Updating the CWL to collect long bowtie indexes (*.ebwtl) if bowtie-build produces them --- tiny/cwl/tools/bowtie-build.cwl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tiny/cwl/tools/bowtie-build.cwl b/tiny/cwl/tools/bowtie-build.cwl index 0658c987..7f4bfd09 100644 --- a/tiny/cwl/tools/bowtie-build.cwl +++ b/tiny/cwl/tools/bowtie-build.cwl @@ -83,7 +83,7 @@ outputs: index_files: type: File[] outputBinding: - glob: $(inputs.ebwt_base).*.ebwt + glob: $(inputs.ebwt_base).*.ebwt* console_output: type: stdout \ No newline at end of file From 05856b02e3cb5ba652646d5685bf6e9c25d92fac Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 13 Oct 2022 17:14:15 -0700 Subject: [PATCH 3/9] Configuration.setup_ebwt_idx() has been updated for the new bowtie-build activation procedure. If the user defines ebwt but index files can't be found, and they also provided their reference genome files, then the pipeline will automatically rebuild the indexes and update the Paths Sheet at the end of the end-to-end run. setup_ebwt_idx() has also been significantly refactored and cleaned up. It has been bugging me for a long time and it feels good to see it in better shape. 
--- tiny/rna/configuration.py | 81 ++++++++++++++++++++++++++------------- 1 file changed, 54 insertions(+), 27 deletions(-) diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index dde86823..9e629c10 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -279,38 +279,65 @@ def setup_pipeline(self): self.templates = resource_filename('tiny', 'templates/') def setup_ebwt_idx(self): - """Bowtie index files and prefix""" - - # Determine if bowtie-build should run, and set Bowtie index prefix accordingly - bt_index_prefix = self.paths['ebwt'] - if self['run_bowtie_build'] and not bt_index_prefix: - if not self['reference_genome_files']: - raise ValueError(f"If {self.basename} contains 'run_bowtie_build: True', you " - f"need to provide your reference genome files in {self.paths.basename}") - - # Outputs are saved in {run_directory}/bowtie-build, within which prefix is first genome file's basename - first_genome_file = self.paths.from_here(self['reference_genome_files'][0]['path']) - bt_index_prefix = self.prefix(os.path.join( - self['run_directory'], "bowtie-build", os.path.basename(first_genome_file)) - ) - - self['ebwt'] = self.paths['ebwt'] = bt_index_prefix + """Determines Bowtie index prefix and whether bowtie-build should run""" + + # Empty values for ebwt (''/~/None) trigger bowtie-build + self['run_bowtie_build'] = not bool(self.paths['ebwt']) + + if self['run_bowtie_build']: + # Set the prefix to the run directory outputs. This is necessary + # because workflow requires bt_index_files to be a populated list. 
+ prefix = self.get_ebwt_prefix() else: - # bowtie-build should only run if 'run_bowtie_build' is True AND ebwt (index prefix) is undefined - self['run_bowtie_build'] = False - bt_index_prefix = self.paths.from_here(bt_index_prefix) + prefix = self.paths.from_here(self.paths['ebwt']) - # Bowtie index files - try: - self['bt_index_files'] = [self.cwl_file(bt_index_prefix + postfix, verify=(not self['run_bowtie_build'])) - for postfix in ['.1.ebwt', '.2.ebwt', '.3.ebwt', '.4.ebwt', '.rev.1.ebwt', '.rev.2.ebwt']] - except FileNotFoundError as e: - sys.exit("The following file could not be found from the Bowtie index prefix defined in your Paths File:\n" - "%s" % (e.filename,)) + # verify_bowtie_build_outputs() will check if these end up being long indexes + self['bt_index_files'] = self.get_bt_index_files(prefix) # When CWL copies bt_index_filex for the bowtie.cwl InitialWorkDirRequirement, it does not # preserve the prefix path. What the workflow "sees" is the ebwt files at working dir root - self["ebwt"] = os.path.basename(self["ebwt"]) + self["ebwt"] = os.path.basename(prefix) + + def get_ebwt_prefix(self): + """Determines the output prefix path for bowtie indexes that haven't been built yet. The basename + of the prefix path is simply the basename of the reference genome sans file extension""" + + genome_files = self['reference_genome_files'] + if not genome_files: + raise ValueError("If your Paths Sheet doesn't have a value for \"ebtw:\", then bowtie indexes " + "will be built, but you'll need to provide your reference genome files under " + '"reference_genome_files:" (also in your Paths Sheet)') + + genome_basename = os.path.basename(genome_files[0]['path']) + return self.prefix(os.path.join( # prefix path: + self['run_directory'], self['dir_name_bt_build'], genome_basename + )) + + def get_bt_index_files(self, prefix): + """Builds the list of expected bowtie index files from the ebwt prefix. 
If an index file + doesn't exist then they will be automatically rebuilt from the user's reference genomes. + File existence isn't checked if bowtie-build is already scheduled for this run.""" + + try: + verify_file_paths = not bool(self['run_bowtie_build']) + ext = "ebwt" + + return [ + self.cwl_file(f"{prefix}.{subext}.{ext}", verify=verify_file_paths) + for subext in ['1', '2', '3', '4', 'rev.1', 'rev.2'] + ] + except FileNotFoundError as e: + problem = "The following Bowtie index file couldn't be found:\n\t%s\n\n" % (e.filename,) + rebuild = "Indexes will be built from your reference genome files during this run." + userfix = "Please either correct your ebwt prefix or add reference genomes in the Paths File." + + if self['reference_genome_files']: + print(problem + rebuild, file=sys.stderr) + new_prefix = self.get_ebwt_prefix() + self['run_bowtie_build'] = True + return self.get_bt_index_files(new_prefix) + else: + sys.exit(problem + userfix) def validate_inputs(self): """For now, only GFF files are validated here""" From 875b205a9e4fde1f20653298036a8fdd26407479 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 13 Oct 2022 18:05:07 -0700 Subject: [PATCH 4/9] Adding execute_post_run_tasks() to Configuration. This function calls others that verify bowtie-build outputs were produced, updates index paths if long indexes were produced, then saves updates to the Paths Sheet and Run Config. This addresses a long running problem where end-to-end runs with a bowtie-build step would not save the updated ebwt in Paths Sheet if a downstream step produced an error. Now, the updated ebwt path is written to the Paths File at the end of any run where at least bowtie-build ran successfully. 
--- tiny/entry.py | 6 +----- tiny/rna/configuration.py | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/tiny/entry.py b/tiny/entry.py index feccf61d..6dc6c44c 100644 --- a/tiny/entry.py +++ b/tiny/entry.py @@ -100,11 +100,7 @@ def run(tinyrna_cwl_path: str, config_file: str) -> None: # Use the cwltool CWL runner via command line return_code = run_cwltool_subprocess(config_object, workflow, run_directory) - # If the workflow completed without errors, we want to update - # the Paths Sheet to point to the new bowtie index prefix - if config_object['run_bowtie_build'] and return_code == 0: - paths_sheet_filename = config_object.paths.inf - config_object.paths.write_processed_config(paths_sheet_filename) + config_object.execute_post_run_tasks(return_code) @report_execution_time("Pipeline resume runtime") diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index 9e629c10..bdd9d777 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -11,6 +11,7 @@ from collections import Counter, OrderedDict from datetime import datetime from typing import Union, Any +from glob import glob from tiny.rna.counter.validation import GFFValidator @@ -353,6 +354,28 @@ def validate_inputs(self): genomes=self.paths['reference_genome_files'], alignments=None # Used in tiny-count standalone runs ).validate() + + def execute_post_run_tasks(self, return_code): + if self['run_bowtie_build']: + self.verify_bowtie_build_outputs() + + def verify_bowtie_build_outputs(self): + """Ensures that bowtie indexes were produced before saving the new ebwt prefix to the Paths File. 
+ If large indexes were produced, paths under bt_index_files need to be updated in the processed Run Config""" + + indexes = glob(os.path.join(self['run_directory'], self['dir_name_bt_build'], "*.ebwt*")) + large_indexes = [f for f in indexes if f.endswith(".ebwtl")] + + # Update Paths File + if indexes: + self.paths.write_processed_config(self.paths.inf) + + # Update Run Config + if large_indexes: + for expected in self['bt_index_files']: + expected['path'] += "l" + assert expected['path'] in large_indexes + self.write_processed_config() def save_run_profile(self, config_file_name=None) -> str: """Saves Samples Sheet and processed run config to the Run Directory for record keeping""" From 3154f76f15bbcec13968d9decb67c68ffb02861c Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 13 Oct 2022 18:30:39 -0700 Subject: [PATCH 5/9] Misc bugfixes addressing changes made in this issue --- tiny/rna/configuration.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index bdd9d777..31da0969 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -288,16 +288,16 @@ def setup_ebwt_idx(self): if self['run_bowtie_build']: # Set the prefix to the run directory outputs. This is necessary # because workflow requires bt_index_files to be a populated list. - prefix = self.get_ebwt_prefix() + self.paths['ebwt'] = self.get_ebwt_prefix() else: - prefix = self.paths.from_here(self.paths['ebwt']) + self.paths['ebwt'] = self.paths.from_here(self.paths['ebwt']) # verify_bowtie_build_outputs() will check if these end up being long indexes - self['bt_index_files'] = self.get_bt_index_files(prefix) + self['bt_index_files'] = self.get_bt_index_files() # When CWL copies bt_index_filex for the bowtie.cwl InitialWorkDirRequirement, it does not # preserve the prefix path. 
What the workflow "sees" is the ebwt files at working dir root - self["ebwt"] = os.path.basename(prefix) + self['ebwt'] = os.path.basename(self.paths['ebwt']) def get_ebwt_prefix(self): """Determines the output prefix path for bowtie indexes that haven't been built yet. The basename @@ -314,13 +314,14 @@ def get_ebwt_prefix(self): self['run_directory'], self['dir_name_bt_build'], genome_basename )) - def get_bt_index_files(self, prefix): + def get_bt_index_files(self): """Builds the list of expected bowtie index files from the ebwt prefix. If an index file doesn't exist then they will be automatically rebuilt from the user's reference genomes. File existence isn't checked if bowtie-build is already scheduled for this run.""" try: verify_file_paths = not bool(self['run_bowtie_build']) + prefix = self.paths['ebwt'] ext = "ebwt" return [ @@ -334,9 +335,9 @@ def get_bt_index_files(self, prefix): if self['reference_genome_files']: print(problem + rebuild, file=sys.stderr) - new_prefix = self.get_ebwt_prefix() + self.paths['ebwt'] = self.get_ebwt_prefix() self['run_bowtie_build'] = True - return self.get_bt_index_files(new_prefix) + return self.get_bt_index_files() else: sys.exit(problem + userfix) From 0476e0459f473c380a9f2df987aef25b5c87ab25 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 13 Oct 2022 18:32:24 -0700 Subject: [PATCH 6/9] Misc small improvements to clarity of code and comments, plus a stability fix for cases where ["reference_genome_files"] contains empty list items --- tiny/rna/configuration.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index 31da0969..ea02ac30 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -45,7 +45,7 @@ def __init__(self, config_file: str): def __getitem__(self, key: str) -> Any: return self.get(key) - def __setitem__(self, key: str, val: Union[str, list, dict, bool]) -> Union[str, 
list, dict, bool]: + def __setitem__(self, key: str, val: Union[str, list, dict, bool, None]) -> Union[str, list, dict, bool, None]: return self.set(key, val) def __contains__(self, key: str) -> bool: @@ -169,14 +169,14 @@ def __init__(self, config_file: str, validate_inputs=False): self.paths = self.load_paths_config() self.process_paths_sheet() - + self.setup_pipeline() self.setup_per_file() self.setup_ebwt_idx() self.process_sample_sheet() self.process_feature_sheet() if validate_inputs: self.validate_inputs() - + def load_paths_config(self): """Constructs a sub-configuration object containing workflow file preferences""" path_sheet = self.from_here(self['paths_config']) @@ -194,11 +194,12 @@ def to_cwl_file_class(input_file_path): self['run_directory'] = self.paths.from_here(self.paths['run_directory']) # Configurations that need to be converted from string to a CWL File object - self['samples_csv'] = to_cwl_file_class(self.paths.from_here(self.paths['samples_csv'])) - self['features_csv'] = to_cwl_file_class(self.paths.from_here(self.paths['features_csv'])) + self['samples_csv'] = to_cwl_file_class(self.paths['samples_csv']) + self['features_csv'] = to_cwl_file_class(self.paths['features_csv']) self['reference_genome_files'] = [ - to_cwl_file_class(self.paths.from_here(genome)) + to_cwl_file_class(genome) for genome in self.paths['reference_genome_files'] + if genome is not None ] def process_sample_sheet(self): @@ -280,7 +281,10 @@ def setup_pipeline(self): self.templates = resource_filename('tiny', 'templates/') def setup_ebwt_idx(self): - """Determines Bowtie index prefix and whether bowtie-build should run""" + """Determines Bowtie index prefix and whether bowtie-build should run. 
+ self['ebwt'] is used for the bowtie commandline argument (see note below) + self.paths['ebwt'] is the actual prefix path + """ # Empty values for ebwt (''/~/None) trigger bowtie-build self['run_bowtie_build'] = not bool(self.paths['ebwt']) @@ -295,7 +299,7 @@ def setup_ebwt_idx(self): # verify_bowtie_build_outputs() will check if these end up being long indexes self['bt_index_files'] = self.get_bt_index_files() - # When CWL copies bt_index_filex for the bowtie.cwl InitialWorkDirRequirement, it does not + # When CWL copies bt_index_files for the bowtie.cwl InitialWorkDirRequirement, it does not # preserve the prefix path. What the workflow "sees" is the ebwt files at working dir root self['ebwt'] = os.path.basename(self.paths['ebwt']) From 98462e754c4669997e8246316616cb206ad763dc Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 13 Oct 2022 18:36:03 -0700 Subject: [PATCH 7/9] Adding unit tests for new bowtie index handling procedure --- tests/unit_tests_configuration.py | 139 ++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 tests/unit_tests_configuration.py diff --git a/tests/unit_tests_configuration.py b/tests/unit_tests_configuration.py new file mode 100644 index 00000000..a3d56821 --- /dev/null +++ b/tests/unit_tests_configuration.py @@ -0,0 +1,139 @@ +import contextlib +import io +import os +import unittest +from unittest.mock import patch, mock_open, call + +from tiny.rna.configuration import Configuration + + +class ConfigurationTests(unittest.TestCase): + @classmethod + def setUpClass(self): + self.root_cfg_dir = os.path.abspath("../tiny/templates") + self.run_config = self.root_cfg_dir + "/run_config_template.yml" + self.paths = self.root_cfg_dir + "/paths.yml" + + self.default_prefix = os.path.join( + self.root_cfg_dir, + Configuration(self.run_config)['run_directory'], + "bowtie-build/ram1" + ) + self.maxDiff = 1522 + + """============ Helper functions ============""" + + def config_with(self, 
prefs): + config = Configuration(self.run_config) + for key, val in prefs.items(): + config[key] = val + return config + + def bt_idx_files_from_prefix(self, prefix): + return [ + {'path': f"{prefix}.{subext}.ebwt", 'class': 'File'} + for subext in ['1', '2', '3', '4', 'rev.1', 'rev.2'] + ] + + """================ Tests ==================""" + + """Does get_ebwt_prefix() produce the expected prefix path?""" + + def test_get_ebwt_prefix(self): + config = Configuration(self.run_config) + actual_prefix = config.get_ebwt_prefix() + expected_prefix = self.default_prefix + + self.assertEqual(actual_prefix, expected_prefix) + + """Does get_ebwt_prefix() throw an error if reference genome files aren't provided?""" + + def test_get_ebwt_prefix_no_genome(self): + config = Configuration(self.run_config) + config['reference_genome_files'] = None + + with self.assertRaises(ValueError): + config.get_ebwt_prefix() + + """Does get_bt_index_files() output the paths of indexes that have already been built?""" + + def test_get_bt_index_files_prebuilt_indexes(self): + config = self.config_with({'run_bowtie_build': False}) + prefix = config.paths['ebwt'] = os.path.abspath("./testdata/counter/validation/ebwt/ram1") + expected = self.bt_idx_files_from_prefix(prefix) + self.assertListEqual(config.get_bt_index_files(), expected) + + """Does get_bt_index_files() output the paths of the index files that are expected + to be built from the reference genome?""" + + def test_get_bt_index_files_unbuilt_indexes_with_genome(self): + config = self.config_with({'run_bowtie_build': True}) + prefix = config.paths['ebwt'] = "mock_prefix" + expected = self.bt_idx_files_from_prefix(prefix) + self.assertListEqual(config.get_bt_index_files(), expected) + + """Does get_bt_index_files() produce an error and quit when index files are + missing and a reference genome has not been provided?""" + + def test_get_bt_index_files_missing_indexes_without_genome(self): + config = self.config_with({'run_bowtie_build': 
False, 'reference_genome_files': None}) + prefix = config.paths['ebwt'] = "missing" + errmsg = '\n'.join([ + "The following Bowtie index file couldn't be found:", + "\t" + f"{prefix}.1.ebwt", + "\nPlease either correct your ebwt prefix or add reference genomes in the Paths File." + ]) + + with self.assertRaisesRegex(SystemExit, errmsg): + config.get_bt_index_files() + + """Does get_bt_index_files() produce an error without quitting when index files + are missing but a reference genome was provided, and does it return the list of + index files that will be built from the genome?""" + + def test_get_bt_index_files_missing_indexes_with_genome(self): + config = self.config_with({'run_bowtie_build': False}) + bad_prefix = config.paths['ebwt'] = "missing" + genome_prefix = self.default_prefix + + expected_files = self.bt_idx_files_from_prefix(genome_prefix) + expected_error = '\n'.join([ + "The following Bowtie index file couldn't be found:", + "\t" + f"{bad_prefix}.1.ebwt", + "\nIndexes will be built from your reference genome files during this run.", + "" + ]) + + stderr = io.StringIO() + with contextlib.redirect_stderr(stderr): + actual = config.get_bt_index_files() + + self.assertEqual(stderr.getvalue(), expected_error) + self.assertListEqual(actual, expected_files) + + """Does verify_bowtie_build_outputs() update the paths in ["bt_index_files"] and rewrite + these changes to the processed Run Config if long indexes were produced? 
Does it also + write to the Paths File to update the new ebwt prefix?""" + + def test_verify_bowtie_build_outputs(self): + ebwt_short = ["1.ebwt", "2.ebwt", "3.ebwt"] + ebwt_long = ["1.ebwtl", "2.ebwtl", "3.ebwtl"] + run_conf_ebwt = [Configuration.cwl_file(f, verify=False) for f in ebwt_short] + expected_ebwt = [Configuration.cwl_file(f, verify=False) for f in ebwt_long] + + config = self.config_with({'bt_index_files': run_conf_ebwt}) + + with patch('tiny.rna.configuration.open', mock_open()) as mo, \ + patch('tiny.rna.configuration.glob', return_value=ebwt_long) as g: + config.verify_bowtie_build_outputs() + + expected_writes = [ + call(self.paths, 'w'), + call(os.path.join(self.root_cfg_dir, config['run_directory'], os.path.basename(self.run_config)), 'w') + ] + + self.assertListEqual(config['bt_index_files'], expected_ebwt) + self.assertListEqual(mo.call_args_list, expected_writes) + +if __name__ == '__main__': + unittest.main() From f751747d1ccfec5098215b4bddf8c7d13efec178 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 13 Oct 2022 19:01:00 -0700 Subject: [PATCH 8/9] Documentation updates removing previous bowtie index requirements, and descriptions of the new activation procedure for the bowtie-build step. --- README.md | 3 +-- START_HERE/TUTORIAL.md | 4 +--- doc/Configuration.md | 9 ++++----- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index a418c74c..43336870 100644 --- a/README.md +++ b/README.md @@ -95,9 +95,8 @@ tiny get-template | Reference annotations
[(example)](START_HERE/reference_data/ram1.gff3) | GFF3 / GFF2 / GTF | Column 9 attributes (defined as "tag=value" or "tag "): | | Sequencing data
[(example)](START_HERE/fastq_files) | FASTQ(.gz) | Files must be demultiplexed. | | Reference genome
[(example)](START_HERE/reference_data/ram1.fa) | FASTA | Chromosome identifiers (e.g. Chr1): | -| Bowtie indexes (optional) 1 | ebwt | Must be small indexes (.ebwtl indexes are not supported) | -
1 Bowtie indexes can be created for you. See the [configuration file documentation](doc/Configuration.md#building-bowtie-indexes). + ### Running an End-to-End Analysis In most cases you will use this toolset as an end-to-end pipeline. This will run a full, standard small RNA sequencing data analysis according to your configuration file. Before starting, you will need the following: diff --git a/START_HERE/TUTORIAL.md b/START_HERE/TUTORIAL.md index 6fe3a4ee..7cac23dd 100644 --- a/START_HERE/TUTORIAL.md +++ b/START_HERE/TUTORIAL.md @@ -35,9 +35,7 @@ The output you see on your terminal is from `cwltool`, which coordinates the exe When the analysis is complete you'll notice a new folder has appeared whose name contains the date and time of the run. Inside you'll find subdirectories containing the file and terminal outputs for each step, and the processed Run Config file for auto-documentation of the run. ### Bowtie indexes -Bowtie indexes were built during this run because paths.yml didn't define an `ebwt` prefix. Now, you'll see the `ebwt` points to the freshly built indexes in your run directory. This means that indexes won't be rebuilt during any subsequent runs that use this `paths.yml` file. If you need to rebuild your indexes: -1. Change the value of ebwt to `ebwt: ''` in paths.yml -2. Ensure that your Run Config file contains `run_bowtie_build: True` +Bowtie indexes were built during this run because `paths.yml` didn't define an `ebwt` prefix. Now, you'll see the `ebwt` points to the freshly built indexes in your run directory. This means that indexes won't be rebuilt during any subsequent runs that use this `paths.yml` file. 
If you need to rebuild your indexes, simply delete the value to the right of `ebwt` in `paths.yml`. ## Running Your Data Expected runtime: ~10-60 minutes (expect longer runtimes if a bowtie index must be built) diff --git a/doc/Configuration.md b/doc/Configuration.md index 4dbf5869..b5ec77ec 100644 --- a/doc/Configuration.md +++ b/doc/Configuration.md @@ -95,12 +95,11 @@ When the pipeline starts up, tinyRNA will process the Run Config based on the co If you don't have bowtie indexes already built for your reference genome, tinyRNA can build them for you at the beginning of an end-to-end run and reuse them on subsequent runs with the same Paths File. To build bowtie indexes: -1. Open your Run Config in a text editor and find the `run_bowtie_build` key. Set its value to `true` and save it. -2. Open your Paths File in a text editor and find the `reference_genome_files` key. Add your reference genome file(s) under this key, one per line with a `- ` in front. -3. Find the `ebwt` key and delete its value. -4. Execute an end-to-end pipeline run. +1. Open your Paths File in a text editor and find the `reference_genome_files` key. Add your reference genome file(s) under this key, one per line with a `- ` in front. +2. Find the `ebwt` key and delete its value. +3. Execute an end-to-end pipeline run. -Once your indexes have been built, your Paths File will be modified such that `ebwt` points to their location (prefix) within your Run Directory. This means that indexes will not be unnecessarily rebuilt on subsequent runs as long as the same Paths File is used. If you need them rebuilt, simply repeat steps 3 and 4 above. +Once your indexes have been built, your Paths File will be modified such that `ebwt` points to their location (prefix) within your Run Directory. This means that indexes will not be unnecessarily rebuilt on subsequent runs as long as the same Paths File is used. If you need them rebuilt, simply repeat steps 2 and 3 above. 
## Samples Sheet Details | _Column:_ | Input FASTQ Files | Sample/Group Name | Replicate Number | Control | Normalization | From ad888e2c84bc8bc504910688fec97b7998806a5c Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Mon, 17 Oct 2022 15:35:39 -0700 Subject: [PATCH 9/9] Correcting table formatting after having resolved merge conflicts for the PR --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2b69792b..0681e113 100644 --- a/README.md +++ b/README.md @@ -93,8 +93,8 @@ tiny get-template | Input Type | File Extension | Requirements | |----------------------------------------------------------------------------|-------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | Reference annotations
[(example)](START_HERE/reference_data/ram1.gff3) | GFF3 / GFF2 / GTF | Column 9 attributes (defined as "tag=value" or "tag "): | -| Sequencing data
[(example)](START_HERE/fastq_files) | FASTQ(.gz) | Files must be demultiplexed. | -| Reference genome
[(example)](START_HERE/reference_data/ram1.fa) | FASTA | Chromosome identifiers (e.g. Chr1): | +| Sequencing data
[(example)](START_HERE/fastq_files) | FASTQ(.gz) | Files must be demultiplexed. | +| Reference genome
[(example)](START_HERE/reference_data/ram1.fa) | FASTA | Chromosome identifiers (e.g. Chr1): |