From 1f53130614c1c5dbfefc917939e530f9f1832192 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sat, 15 Oct 2022 16:16:52 -0700 Subject: [PATCH 01/24] Added GFF input handling to process_paths_sheet(). Also includes style and reliability improvements. The routine for determining the final output directory name has been cleaned up in setup_file_groups() Removed "Alias by..." and "Feature Source" from the list of expected Features Sheet columns. --- tiny/rna/configuration.py | 64 ++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index dde86823..6d990d46 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -170,7 +170,7 @@ def __init__(self, config_file: str, validate_inputs=False): self.process_paths_sheet() self.setup_pipeline() - self.setup_per_file() + self.setup_file_groups() self.setup_ebwt_idx() self.process_sample_sheet() self.process_feature_sheet() @@ -182,23 +182,38 @@ def load_paths_config(self): return ConfigBase(path_sheet) def process_paths_sheet(self): - """Loads the paths of all related config files and workflow inputs""" + """Loads file paths and groups from the Paths File and resolves relative paths""" def to_cwl_file_class(input_file_path): path_to_input = self.paths.from_here(input_file_path) return self.cwl_file(path_to_input) - for absorb_key in ['ebwt', 'plot_style_sheet', 'adapter_fasta', 'tmp_directory']: - self[absorb_key] = self.paths[absorb_key] - self['run_directory'] = self.paths.from_here(self.paths['run_directory']) + required = ('samples_csv', 'features_csv', 'gff_files') + single = ('samples_csv', 'features_csv', 'plot_style_sheet', 'adapter_fasta') + groups = ('reference_genome_files', 'gff_files') - # Configurations that need to be converted from string to a CWL File object - self['samples_csv'] = to_cwl_file_class(self.paths.from_here(self.paths['samples_csv'])) - self['features_csv'] 
= to_cwl_file_class(self.paths.from_here(self.paths['features_csv'])) - self['reference_genome_files'] = [ - to_cwl_file_class(self.paths.from_here(genome)) - for genome in self.paths['reference_genome_files'] - ] + # Convert file path strings to CWL File objects where necessary + try: + # Individual path strings + for key in single: + if not self.paths[key] and key not in required: continue + self[key] = to_cwl_file_class(self.paths[key]) + + # Lists of path strings + for key in groups: + self[key] = [ + to_cwl_file_class(file) for file in self.paths[key] + if key in required or bool(file) + ] + except FileNotFoundError as e: + msg = f"The file provided for {key} in your Paths File could not be found" + msg += "\n\t" + e.filename + sys.exit(msg) + + # Path strings that do not need to be converted to CWL File objects + for file_string in ['run_directory', 'tmp_directory', 'ebwt']: + path = self.paths[file_string] + self[file_string] = self.paths.from_here(path) if path else '' def process_sample_sheet(self): sample_sheet = self.paths.from_here(self['samples_csv']['path']) @@ -242,20 +257,11 @@ def check_deseq_compatibility(self, sample_groups): file=sys.stderr) def process_feature_sheet(self): - feature_sheet = self.paths.from_here(self['features_csv']['path']) - feature_sheet_dir = os.path.dirname(feature_sheet) - - csv_reader = CSVReader(feature_sheet, "Features Sheet") - for row in csv_reader.rows(): - gff_file = self.from_here(row['Source'], origin=feature_sheet_dir) - try: - self.append_if_absent('gff_files', self.cwl_file(gff_file)) - except FileNotFoundError: - line = csv_reader.line_num - sys.exit("The GFF file on line %d of your Features Sheet was not found:\n%s" % (line, gff_file)) + # Configuration doesn't currently do anything with Feature Sheet contents + return - def setup_per_file(self): - """Per-library settings lists to be populated by entries from samples_csv""" + def setup_file_groups(self): + """Configuration keys that represent lists of 
files""" self.set_default_dict({per_file_setting_key: [] for per_file_setting_key in ['in_fq', 'sample_basenames', 'gff_files', 'fastp_report_titles'] @@ -271,10 +277,8 @@ def setup_pipeline(self): self['run_name'] = self.get('run_name', default=default_run_name) + "_" + self.dt # Create prefixed Run Directory name - run_dir_resolved = self.paths.from_here(self.get('run_directory', default='run_directory')) - run_dir_parent = os.path.dirname(run_dir_resolved) - run_dir_withdt = self['run_name'] + '_' + os.path.basename(run_dir_resolved) - self['run_directory'] = self.joinpath(run_dir_parent, run_dir_withdt) + run_dir_parent, run_dir = os.path.split(self['run_directory'].rstrip(os.sep)) + self['run_directory'] = self.joinpath(run_dir_parent, self['run_name'] + run_dir) self.templates = resource_filename('tiny', 'templates/') @@ -364,14 +368,12 @@ class CSVReader(csv.DictReader): "Features Sheet": OrderedDict({ "Select for...": "Key", "with value...": "Value", - "Alias by...": "Name", "Tag": "Tag", "Hierarchy": "Hierarchy", "Strand": "Strand", "5' End Nucleotide": "nt5end", "Length": "Length", "Overlap": "Overlap", - "Feature Source": "Source" }), "Samples Sheet": OrderedDict({ "Input FASTQ Files": "File", From 53f1683199a503cd3fabea2b34aaff9c5d9cdb82 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sat, 15 Oct 2022 16:19:53 -0700 Subject: [PATCH 02/24] Commandline arguments for tiny-count have been modified. The required args for Features Sheet and Samples Sheet have been replaced by a required arg for the Paths File. load_config() and load_samples() have been modified to use the Paths File to determine their inputs. Handling of GFFs and aliases have been removed from load_config(). 
This is now handled by a new function: load_annotations() --- tiny/rna/counter/counter.py | 47 ++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/tiny/rna/counter/counter.py b/tiny/rna/counter/counter.py index c7f48f0b..ba5dacf9 100644 --- a/tiny/rna/counter/counter.py +++ b/tiny/rna/counter/counter.py @@ -17,7 +17,7 @@ from tiny.rna.counter.features import Features, FeatureCounter from tiny.rna.counter.statistics import MergedStatsManager from tiny.rna.util import report_execution_time, from_here, ReadOnlyDict -from tiny.rna.configuration import CSVReader +from tiny.rna.configuration import CSVReader, ConfigBase # Global variables for multiprocessing counter: FeatureCounter @@ -31,10 +31,8 @@ def get_args(): optional_args = arg_parser.add_argument_group("Optional arguments") # Required arguments - required_args.add_argument('-i', '--samples-csv', metavar='SAMPLES', required=True, - help='your Samples Sheet') - required_args.add_argument('-f', '--features-csv', metavar='FEATURES', required=True, - help='your Features Sheet') + required_args.add_argument('-p', '--paths-file', metavar='PATHS', required=True, + help='your Paths File') required_args.add_argument('-o', '--out-prefix', metavar='OUTPUTPREFIX', required=True, help='output prefix to use for file names') @@ -71,14 +69,14 @@ def get_args(): return ReadOnlyDict(vars(args)) -def load_samples(samples_csv: str, is_pipeline: bool) -> List[Dict[str, str]]: +def load_samples(paths: 'ConfigBase', is_pipeline: bool) -> List[Dict[str, str]]: """Parses the Samples Sheet to determine library names and alignment files for counting Sample files may have a .fastq(.gz) extension (i.e. when tiny-count is called as part of a pipeline run) or a .sam extension (i.e. when tiny-count is called as a standalone tool). 
Args: - samples_csv: a csv file which defines sample group, replicate, and file location + paths: a loaded Paths File which provides a path to the Samples Sheet is_pipeline: helps locate sample SAM files. If true, files are assumed to reside in the working directory. If false, files are assumed to reside in the same directory as their source FASTQ files with '_aligned_seqs.sam' appended @@ -107,6 +105,7 @@ def get_library_filename(csv_row_file: str, samples_csv: str) -> str: "The following filename contained neither:\n%s" % (csv_row_file,)) return csv_row_file + samples_csv = paths.from_here(paths['samples_csv']) inputs = list() for row in CSVReader(samples_csv, "Samples Sheet").rows(): @@ -125,11 +124,11 @@ def get_library_filename(csv_row_file: str, samples_csv: str) -> str: return inputs -def load_config(features_csv: str, is_pipeline: bool) -> Tuple[List[dict], Dict[str, list]]: - """Parses the Features Sheet to provide inputs to FeatureSelector and build_reference_tables +def load_config(paths: 'ConfigBase', is_pipeline: bool) -> List[dict]: + """Parses the Features Sheet to provide inputs to FeatureSelector and ReferenceTables Args: - features_csv: a csv file which defines feature sources and selection rules + paths: a loaded Paths File which provides a path to the Features Sheet is_pipeline: helps locate GFF files defined in the Features Sheet. If true, GFF files are assumed to reside in the working directory. @@ -137,11 +136,11 @@ def load_config(features_csv: str, is_pipeline: bool) -> Tuple[List[dict], Dict[ rules: a list of dictionaries, each representing a parsed row from input. Note that these are just rule definitions which FeatureSelector will further digest to produce its rules table. 
- gff_files: a dict of GFF files and associated Name Attribute preferences """ - rules, gff_files = list(), defaultdict(list) + rules = list() + features_csv = paths.from_here(paths['features_csv']) for row in CSVReader(features_csv, "Features Sheet").rows(): rule = {col: row[col] for col in ["Tag", "Hierarchy", "Strand", "nt5end", "Length", "Overlap"]} rule['nt5end'] = rule['nt5end'].upper().translate({ord('U'): 'T'}) # Convert RNA base to cDNA base @@ -149,13 +148,21 @@ def load_config(features_csv: str, is_pipeline: bool) -> Tuple[List[dict], Dict[ rule['Hierarchy'] = int(rule['Hierarchy']) # Convert hierarchy to number rule['Overlap'] = rule['Overlap'].lower() # Built later in ReferenceTables - gff = os.path.basename(row['Source']) if is_pipeline else from_here(features_csv, row['Source']) - - # Duplicate Name Attributes and rule entries are not allowed - if row['Name'] not in ["ID", *gff_files[gff]]: gff_files[gff].append(row['Name']) + # Duplicate rule entries are not allowed if rule not in rules: rules.append(rule) - return rules, gff_files + return rules + +def load_annotations(paths, is_pipeline: bool): + gffs = {} + for entry in paths['gff_files']: + if is_pipeline: + filename = os.path.basename(entry['path']) + else: + filename = paths.from_here(entry['path']) + gffs[filename] = entry['alias'] + + return gffs def validate_inputs(gffs, libraries, prefs): @@ -192,11 +199,13 @@ def main(): args = get_args() try: + paths = ConfigBase(args['paths_file']) + # Determine SAM inputs and their associated library names - libraries = load_samples(args['samples_csv'], args['is_pipeline']) + libraries = load_samples(paths, args['is_pipeline']) # Load selection rules and feature sources from the Features Sheet - selection_rules, gff_file_set = load_config(args['features_csv'], args['is_pipeline']) + selection_rules, gff_file_set = load_config(paths, args['is_pipeline']) validate_inputs(gff_file_set, libraries, args) # global for multiprocessing From 
4bfa4453c344d54c9bba484d6c0b5cdd0d9033e5 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sat, 15 Oct 2022 16:23:27 -0700 Subject: [PATCH 03/24] Updated the directions in the comments to include providing GFF files in the Paths File. Also corrected some rare usages of "Paths Sheet" to "Paths File" --- START_HERE/paths.yml | 12 ++++++++++-- START_HERE/run_config.yml | 2 +- tiny/entry.py | 2 +- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/START_HERE/paths.yml b/START_HERE/paths.yml index c83a7607..105ca2f7 100644 --- a/START_HERE/paths.yml +++ b/START_HERE/paths.yml @@ -5,8 +5,9 @@ # # Directions: # 1. Fill out the Samples Sheet with files to process + naming scheme. [samples.csv] -# 2. Fill out the Features Sheet with reference files and selection rules [features.csv] -# 3. Set samples_csv and features_csv to point to these files +# 2. Fill out the Features Sheet with selection rules [features.csv] +# 3. Set samples_csv and features_csv (below) to point to these files +# 4. Add annotation files and per-file alias preferences to gff_files # ######-------------------------------------------------------------------------------###### @@ -14,6 +15,13 @@ samples_csv: ./samples.csv features_csv: ./features.csv +##-- Each entry: 1. the file, 2. 
(optional) list of attribute keys for feature aliases --## +gff_files: + - path: "./reference_data/ram1.gff3" + alias: [Alias] + - path: + alias: [] + ##-- The final output directory for files produced by the pipeline --# run_directory: run_directory diff --git a/START_HERE/run_config.yml b/START_HERE/run_config.yml index fc48ef7c..90bdceec 100644 --- a/START_HERE/run_config.yml +++ b/START_HERE/run_config.yml @@ -324,7 +324,7 @@ dir_name_plotter: plots ########################################################################################### -######--------------------------- DERIVED FROM PATHS SHEET --------------------------###### +######--------------------------- DERIVED FROM PATHS FILE ---------------------------###### # # The following configuration settings are automatically derived from the sample sheet # diff --git a/tiny/entry.py b/tiny/entry.py index feccf61d..0d05489f 100644 --- a/tiny/entry.py +++ b/tiny/entry.py @@ -101,7 +101,7 @@ def run(tinyrna_cwl_path: str, config_file: str) -> None: return_code = run_cwltool_subprocess(config_object, workflow, run_directory) # If the workflow completed without errors, we want to update - # the Paths Sheet to point to the new bowtie index prefix + # the Paths File to point to the new bowtie index prefix if config_object['run_bowtie_build'] and return_code == 0: paths_sheet_filename = config_object.paths.inf config_object.paths.write_processed_config(paths_sheet_filename) From f1799660a7d9a30e1b41e17839504736dc6f7310 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sat, 15 Oct 2022 16:37:22 -0700 Subject: [PATCH 04/24] Progress commit, switching branches --- tiny/rna/counter/counter.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/tiny/rna/counter/counter.py b/tiny/rna/counter/counter.py index ba5dacf9..4f0ef874 100644 --- a/tiny/rna/counter/counter.py +++ b/tiny/rna/counter/counter.py @@ -129,8 +129,7 @@ def load_config(paths: 
'ConfigBase', is_pipeline: bool) -> List[dict]: Args: paths: a loaded Paths File which provides a path to the Features Sheet - is_pipeline: helps locate GFF files defined in the Features Sheet. If true, - GFF files are assumed to reside in the working directory. + is_pipeline: not currently used; helps properly locate input files Returns: rules: a list of dictionaries, each representing a parsed row from input. @@ -153,14 +152,26 @@ def load_config(paths: 'ConfigBase', is_pipeline: bool) -> List[dict]: return rules -def load_annotations(paths, is_pipeline: bool): + +def load_gffs(paths, is_pipeline: bool) -> Dict[str, list]: + """Determines GFF file paths and the alias attributes for each + + Args: + paths: a loaded Paths File which provides a list of GFFs + is_pipeline: helps locate GFF files defined in the Paths File. If true, + GFF files are assumed to reside in the working directory. + + Returns: + gff: a dict of GFF files with lists of alias attribute keys + """ + gffs = {} for entry in paths['gff_files']: if is_pipeline: filename = os.path.basename(entry['path']) else: filename = paths.from_here(entry['path']) - gffs[filename] = entry['alias'] + gffs[filename] = entry.get('alias', default=[]) return gffs @@ -200,17 +211,15 @@ def main(): try: paths = ConfigBase(args['paths_file']) - - # Determine SAM inputs and their associated library names libraries = load_samples(paths, args['is_pipeline']) + selection = load_config(paths, args['is_pipeline']) + gff_files = load_gffs(paths, args['is_pipeline']) - # Load selection rules and feature sources from the Features Sheet - selection_rules, gff_file_set = load_config(paths, args['is_pipeline']) - validate_inputs(gff_file_set, libraries, args) + validate_inputs(gff_files, libraries, args) # global for multiprocessing global counter - counter = FeatureCounter(gff_file_set, selection_rules, **args) + counter = FeatureCounter(gff_files, selection, **args) # Assign and count features using multiprocessing and merge 
results merged_counts = map_and_reduce(libraries, args) From 6467672408a1ea98799d6ae3f461facbef8df293 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sun, 23 Oct 2022 14:15:21 -0700 Subject: [PATCH 05/24] Tiny-count is now able to perform counting using the new Paths File input. I still think it's a good idea to create a standalone PathsFile class that tiny-config and tiny-count can use. --- tiny/rna/counter/counter.py | 13 ++++++++----- tiny/rna/counter/statistics.py | 4 ++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/tiny/rna/counter/counter.py b/tiny/rna/counter/counter.py index 4f0ef874..639e14a5 100644 --- a/tiny/rna/counter/counter.py +++ b/tiny/rna/counter/counter.py @@ -31,7 +31,7 @@ def get_args(): optional_args = arg_parser.add_argument_group("Optional arguments") # Required arguments - required_args.add_argument('-p', '--paths-file', metavar='PATHS', required=True, + required_args.add_argument('-pf', '--paths-file', metavar='PATHS', required=True, help='your Paths File') required_args.add_argument('-o', '--out-prefix', metavar='OUTPUTPREFIX', required=True, help='output prefix to use for file names') @@ -153,7 +153,7 @@ def load_config(paths: 'ConfigBase', is_pipeline: bool) -> List[dict]: return rules -def load_gffs(paths, is_pipeline: bool) -> Dict[str, list]: +def load_gffs(paths: 'ConfigBase', is_pipeline: bool) -> Dict[str, list]: """Determines GFF file paths and the alias attributes for each Args: @@ -165,6 +165,9 @@ def load_gffs(paths, is_pipeline: bool) -> Dict[str, list]: gff: a dict of GFF files with lists of alias attribute keys """ + if paths['gff_files'] is None: + raise ValueError("No GFF files were found in your Paths File") + gffs = {} for entry in paths['gff_files']: if is_pipeline: @@ -183,11 +186,11 @@ def validate_inputs(gffs, libraries, prefs): @report_execution_time("Counting and merging") -def map_and_reduce(libraries, prefs): +def map_and_reduce(libraries, features_csv, prefs): """Assigns 
one worker process per library and merges the statistics they report""" # MergedStatsManager handles final output files, regardless of multiprocessing status - summary = MergedStatsManager(Features, prefs) + summary = MergedStatsManager(Features, features_csv, prefs) # Use a multiprocessing pool if multiple sam files were provided if len(libraries) > 1: @@ -222,7 +225,7 @@ def main(): counter = FeatureCounter(gff_files, selection, **args) # Assign and count features using multiprocessing and merge results - merged_counts = map_and_reduce(libraries, args) + merged_counts = map_and_reduce(libraries, paths.from_here(paths['features_csv']), args) # Write final outputs merged_counts.write_report_files() diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index 8d625564..7f7d5261 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -146,11 +146,11 @@ class MergedStatsManager: chrom_misses = Counter() warnings = [] - def __init__(self, Features_obj, prefs): + def __init__(self, Features_obj, features_csv, prefs): MergedStat.prefix = prefs.get('out_prefix', '') self.merged_stats = [ - FeatureCounts(Features_obj), RuleCounts(prefs['features_csv']), + FeatureCounts(Features_obj), RuleCounts(features_csv), AlignmentStats(), SummaryStats(), NtLenMatrices() ] From 9105779e6bcf75aa3aab9480131b0fc38d360985 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sun, 23 Oct 2022 14:15:44 -0700 Subject: [PATCH 06/24] Updated config files in START_HERE for the new GFF input method --- START_HERE/features.csv | 14 +++++++------- START_HERE/paths.yml | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/START_HERE/features.csv b/START_HERE/features.csv index 959a1e73..b3e1e698 100755 --- a/START_HERE/features.csv +++ b/START_HERE/features.csv @@ -1,7 +1,7 @@ -Select for...,with value...,Alias by...,Tag,Hierarchy,Strand,5' End Nucleotide,Length,Overlap,Feature Source 
-Class,mask,Alias,,1,both,all,all,Partial,./reference_data/ram1.gff3 -Class,miRNA,Alias,,2,sense,all,16-22,Full,./reference_data/ram1.gff3 -Class,piRNA,Alias,5pA,2,both,A,24-32,Full,./reference_data/ram1.gff3 -Class,piRNA,Alias,5pT,2,both,T,24-32,Full,./reference_data/ram1.gff3 -Class,siRNA,Alias,,2,both,all,15-22,Full,./reference_data/ram1.gff3 -Class,unk,Alias,,3,both,all,all,Full,./reference_data/ram1.gff3 \ No newline at end of file +Select for...,with value...,Tag,Hierarchy,Strand,5' End Nucleotide,Length,Overlap +Class,mask,,1,both,all,all,Partial +Class,miRNA,,2,sense,all,16-22,Full +Class,piRNA,5pA,2,both,A,24-32,Full +Class,piRNA,5pT,2,both,T,24-32,Full +Class,siRNA,,2,both,all,15-22,Full +Class,unk,,3,both,all,all,Full \ No newline at end of file diff --git a/START_HERE/paths.yml b/START_HERE/paths.yml index 105ca2f7..8322026f 100644 --- a/START_HERE/paths.yml +++ b/START_HERE/paths.yml @@ -19,8 +19,8 @@ features_csv: ./features.csv gff_files: - path: "./reference_data/ram1.gff3" alias: [Alias] - - path: - alias: [] +# - path: +# alias: [] ##-- The final output directory for files produced by the pipeline --# run_directory: run_directory From c5ecbf5bda25f94ca7c17e175197ba9ba7e25a33 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sun, 23 Oct 2022 14:16:00 -0700 Subject: [PATCH 07/24] Bugfixes and reliability improvements. ConfigBase.from_here() can now handle CWL file objects. Required inputs are checked against None in Configuration.process_paths_sheet(). CSVReader.validate_csv_header() now handles cases where more than the expected number of columns are provided (not sure how I missed that before). 
--- tiny/rna/configuration.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index 6d990d46..76e17311 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -10,7 +10,7 @@ from pkg_resources import resource_filename from collections import Counter, OrderedDict from datetime import datetime -from typing import Union, Any +from typing import Union, Any, Optional from tiny.rna.counter.validation import GFFValidator @@ -113,8 +113,11 @@ def cwl_dir(dir: str) -> dict: """Returns a minimal Directory object as defined by CWL""" return {'class': 'Directory', 'path': dir} - def from_here(self, destination: str, origin: str = None): + def from_here(self, destination: Union[str,dict], origin: Optional[Union[str,dict]] = None): """Calculates paths relative to the input config file""" + if isinstance(destination, dict): destination = destination['path'] + if isinstance(origin, dict): origin = origin['path'] + origin = self.dir if origin is None else origin return self.joinpath(origin, destination) @@ -192,6 +195,8 @@ def to_cwl_file_class(input_file_path): single = ('samples_csv', 'features_csv', 'plot_style_sheet', 'adapter_fasta') groups = ('reference_genome_files', 'gff_files') + assert all(self.paths[req] is not None for req in required) + # Convert file path strings to CWL File objects where necessary try: # Individual path strings @@ -213,7 +218,7 @@ def to_cwl_file_class(input_file_path): # Path strings that do not need to be converted to CWL File objects for file_string in ['run_directory', 'tmp_directory', 'ebwt']: path = self.paths[file_string] - self[file_string] = self.paths.from_here(path) if path else '' + self[file_string] = self.paths.from_here(path) if path else None def process_sample_sheet(self): sample_sheet = self.paths.from_here(self['samples_csv']['path']) @@ -405,10 +410,15 @@ def rows(self): yield row def validate_csv_header(self, header: 
OrderedDict): + # The expected header values doc_reference = self.tinyrna_sheet_fields[self.doctype] expected = {key.lower() for key in doc_reference.keys()} - read_vals = {val.lower() for val in header.values() if val is not None} + # The header values that were read + read_vals = {val.lower() for key, val in header.items() if None not in (key, val)} + read_vals.update(val.lower() for val in header.get(None, ())) # Extra headers + + # Find differences between actual and expected headers unknown = {col_name for col_name in read_vals if col_name not in expected} missing = expected - read_vals From 49330f2f9fe0e93b11dfa7052fd4c92dc6f72aec Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Tue, 25 Oct 2022 16:33:04 -0700 Subject: [PATCH 08/24] Updates to ConfigBase.joinpath() to make it more robust against inputs that are Nonetype. Updates to ConfigBase.from_here() to allow it to handle inputs that are already CWL file classes, and inputs that are Nonetype or empty string. Previously, this function would return the config file's directory if destination="" or None. In this context this isn't very helpful. Now it will return an empty string. 
--- tiny/rna/configuration.py | 49 ++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index cd4ba1af..7e4694a8 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -95,15 +95,9 @@ def prefix(path: str) -> str: return os.path.splitext(path)[0] @staticmethod - def joinpath(path1: str, path2: str) -> str: - """Combines two relative paths intelligently""" - path1, path2 = (os.path.expanduser(p) for p in [path1, path2]) - if os.path.isabs(path2): return path2 - return os.path.normpath(os.path.join(path1, path2)) - - @staticmethod - def cwl_file(file: str, verify=True) -> dict: + def cwl_file(file: Union[str,dict], verify=True) -> dict: """Returns a minimal File object as defined by CWL""" + if isinstance(file, dict): file = file['path'] if '~' in file: file = os.path.expanduser(file) if verify and not os.path.exists(file): raise(FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file)) @@ -114,13 +108,36 @@ def cwl_dir(dir: str) -> dict: """Returns a minimal Directory object as defined by CWL""" return {'class': 'Directory', 'path': dir} - def from_here(self, destination: Union[str,dict], origin: Optional[Union[str,dict]] = None): - """Calculates paths relative to the input config file""" - if isinstance(destination, dict): destination = destination['path'] - if isinstance(origin, dict): origin = origin['path'] + @staticmethod + def joinpath(path1: str, path2: str) -> str: + """Combines two relative paths intelligently""" + path1 = os.path.expanduser(path1) if path1 is not None else "" + path2 = os.path.expanduser(path2) if path2 is not None else "" + if os.path.isabs(path2): return path2 + return os.path.normpath(os.path.join(path1, path2)) + + def from_here(self, destination: Union[str,dict,None], origin: Union[str,dict,None] = None) -> Union[str,dict,None]: + """Calculates paths relative to the input config file. 
An empty destination returns an empty path. + If destination is an absolute path, then destination is returned without any further joining. + Args: + destination: the path to evaluate relative to the config file. Can be a string or CWL file dictionary. + origin: the starting point (default: the config file's directory) + """ - origin = self.dir if origin is None else origin - return self.joinpath(origin, destination) + if isinstance(origin, dict): origin = origin.get('path') + if origin is None: origin = self.dir + + if isinstance(destination, dict): + dest_path = destination.get('path') + if dest_path: + return dict(destination, path=self.joinpath(origin, dest_path)) + else: + return destination + else: + if destination: + return self.joinpath(origin, destination) + else: + return destination def create_run_directory(self) -> str: """Create the destination directory for pipeline outputs""" @@ -287,9 +304,9 @@ def get_ebwt_prefix(self): genome_files = self['reference_genome_files'] if not genome_files: - raise ValueError("If your Paths Sheet doesn't have a value for \"ebtw:\", then bowtie indexes " + raise ValueError("If your Paths File doesn't have a value for \"ebtw:\", then bowtie indexes " "will be built, but you'll need to provide your reference genome files under " - '"reference_genome_files:" (also in your Paths Sheet)') + '"reference_genome_files:" (also in your Paths File)') genome_basename = os.path.basename(genome_files[0]['path']) return self.prefix(os.path.join( # prefix path: From e5956fd90b292c55eb08bdbbffa5dca328012477 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Tue, 25 Oct 2022 16:35:37 -0700 Subject: [PATCH 09/24] A new configuration class has been created: PathsFile. In addition to performing validation of the Paths File, it also includes some convenience functions like automatic "from_here" path resolution on key lookup and a function that converts any of the contained parameter types to a CWL file object. 
--- tiny/rna/configuration.py | 125 ++++++++++++++++++++++++++------------ 1 file changed, 87 insertions(+), 38 deletions(-) diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index 7e4694a8..dab11db6 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -188,7 +188,7 @@ def __init__(self, config_file: str, validate_inputs=False): super().__init__(config_file) self.paths = self.load_paths_config() - self.process_paths_sheet() + self.assimilate_paths_file() self.setup_pipeline() self.setup_file_groups() @@ -199,44 +199,14 @@ def __init__(self, config_file: str, validate_inputs=False): def load_paths_config(self): """Constructs a sub-configuration object containing workflow file preferences""" - path_sheet = self.from_here(self['paths_config']) - return ConfigBase(path_sheet) + paths_file_path = self.from_here(self['paths_config']) + return PathsFile(paths_file_path) - def process_paths_sheet(self): - """Loads file paths and groups from the Paths File and resolves relative paths""" - - def to_cwl_file_class(input_file_path): - path_to_input = self.paths.from_here(input_file_path) - return self.cwl_file(path_to_input) - - required = ('samples_csv', 'features_csv', 'gff_files') - single = ('samples_csv', 'features_csv', 'plot_style_sheet', 'adapter_fasta') - groups = ('reference_genome_files', 'gff_files') - - assert all(self.paths[req] is not None for req in required) - - # Convert file path strings to CWL File objects where necessary - try: - # Individual path strings - for key in single: - if not self.paths[key] and key not in required: continue - self[key] = to_cwl_file_class(self.paths[key]) - - # Lists of path strings - for key in groups: - self[key] = [ - to_cwl_file_class(file) for file in self.paths[key] - if key in required or bool(file) - ] - except FileNotFoundError as e: - msg = f"The file provided for {key} in your Paths File could not be found" - msg += "\n\t" + e.filename - sys.exit(msg) - - # Path strings that do not 
need to be converted to CWL File objects - for file_string in ['run_directory', 'tmp_directory', 'ebwt']: - path = self.paths[file_string] - self[file_string] = self.paths.from_here(path) if path else None + def assimilate_paths_file(self): + for key in [*PathsFile.single, *PathsFile.groups]: + self[key] = self.paths.as_cwl_file_obj(key) + for key in PathsFile.prefix: + self[key] = self.paths[key] def process_samples_sheet(self): samples_sheet_path = self.paths.from_here(self['samples_csv']['path']) @@ -403,6 +373,85 @@ def main(): config_object.write_processed_config(f"processed_{file_basename}") +class PathsFile(ConfigBase): + required = ('samples_csv', 'features_csv', 'gff_files') + single = ('samples_csv', 'features_csv', 'plot_style_sheet', 'adapter_fasta') + groups = ('reference_genome_files', 'gff_files') + prefix = ('ebwt', 'run_directory', 'tmp_directory') + + def __init__(self, file: str): + super().__init__(file) + self.check_backward_compatibility() + self.validate_paths() + + def __getitem__(self, key: str): + """Automatically performs path resolution for both single and group parameters. 
+ Note that only keys listed in self.groups are guaranteed to be returned as lists.""" + + value = self.config.get(key) + if key in self.groups: + if value is None: return [] + return [self.from_here(sub) for sub in value] + else: + return self.from_here(value) + + def as_cwl_file_obj(self, key: str): + """Returns the specified parameter with file paths converted to CWL file objects.""" + + val = self[key] + + if not val: + return val + elif key in self.single: + return self.cwl_file(val) + elif key in self.groups: + return [self.cwl_file(sub) for sub in val if sub] + elif key in self.prefix: + raise ValueError(f"The parameter {key} isn't meant to be a CWL file object.") + else: + raise ValueError(f'Unrecognized parameter: "{key}"') + + def validate_paths(self): + assert all(self[req] for req in self.required), \ + "The following parameters are required in {selfname}: {params}" \ + .format(selfname=self.basename, params=', '.join(self.required)) + + assert any(gff.get('path') for gff in self['gff_files']), \ + "At least one GFF file path must be specified under gff_files in {selfname}" \ + .format(selfname=self.basename) + + for key in self.single: + resolved_path = self[key] + if not resolved_path: continue + assert os.path.isfile(resolved_path), \ + "The file provided for {key} in {selfname} could not be found:\n\t{file}" \ + .format(key=key, selfname=self.basename, file=resolved_path) + + for key in self.groups: + for entry in self[key]: + if isinstance(entry, dict): entry = entry['path'] + assert os.path.isfile(entry), \ + "The following file provided under {key} in {selfname} could not be found:\n\t{file}" \ + .format(key=key, selfname=self.basename, file=entry) + + def check_backward_compatibility(self): + assert 'gff_files' in self, \ + "The gff_files parameter was not found in your Paths File. This likely means " \ + "that you are using a Paths File from an earlier version of tinyRNA. 
Please " \ + "check the release notes and update your configuration files." + + # Override + def append_to(self, key: str, val: Any): + """Overrides method in the base class. This is necessary due to automatic + path resolution in __getitem__ which returns a *temporary* list value, and + items appended to the temporary list would otherwise be lost.""" + + assert key in self.groups, "Tried appending to a non-list type parameter" + target = self.config.get(key, []) + target.append(val) + return target + + class SamplesSheet: def __init__(self, file): self.csv = CSVReader(file, "Samples Sheet") From 0d1a0173aa044d4279ae1f55da7ea748332847ea Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Tue, 25 Oct 2022 16:49:59 -0700 Subject: [PATCH 10/24] Updating the Configuration class to use the new PathsFile class when calling self.paths --- tiny/rna/configuration.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index dab11db6..501cf863 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -209,7 +209,7 @@ def assimilate_paths_file(self): self[key] = self.paths[key] def process_samples_sheet(self): - samples_sheet_path = self.paths.from_here(self['samples_csv']['path']) + samples_sheet_path = self.paths['samples_csv'] samples_sheet = SamplesSheet(samples_sheet_path) self['sample_basenames'] = samples_sheet.sample_basenames @@ -258,8 +258,6 @@ def setup_ebwt_idx(self): # Set the prefix to the run directory outputs. This is necessary # because workflow requires bt_index_files to be a populated list. 
self.paths['ebwt'] = self.get_ebwt_prefix() - else: - self.paths['ebwt'] = self.paths.from_here(self.paths['ebwt']) # verify_bowtie_build_outputs() will check if these end up being long indexes self['bt_index_files'] = self.get_bt_index_files() From ae25379d82316ad954dc417fcc8c3a4a273adce5 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Tue, 25 Oct 2022 17:02:14 -0700 Subject: [PATCH 11/24] The new PathsFile class has been integrated with tiny-count. This has the benefit of validating Paths File contents when tiny-count is run as a standalone step. Eventually GFFValidator will be modified so that we can use genome and ebwt inputs as validation targets during standalone runs too (currently only the SAM files are used). For now, I need to keep things moving with other tasks. GFF loading and validation now take place in the same function. --- tiny/rna/counter/counter.py | 66 +++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 35 deletions(-) diff --git a/tiny/rna/counter/counter.py b/tiny/rna/counter/counter.py index 639e14a5..128ae868 100644 --- a/tiny/rna/counter/counter.py +++ b/tiny/rna/counter/counter.py @@ -17,7 +17,7 @@ from tiny.rna.counter.features import Features, FeatureCounter from tiny.rna.counter.statistics import MergedStatsManager from tiny.rna.util import report_execution_time, from_here, ReadOnlyDict -from tiny.rna.configuration import CSVReader, ConfigBase +from tiny.rna.configuration import CSVReader, PathsFile # Global variables for multiprocessing counter: FeatureCounter @@ -69,14 +69,14 @@ def get_args(): return ReadOnlyDict(vars(args)) -def load_samples(paths: 'ConfigBase', is_pipeline: bool) -> List[Dict[str, str]]: +def load_samples(samples_csv: str, is_pipeline: bool) -> List[Dict[str, str]]: """Parses the Samples Sheet to determine library names and alignment files for counting Sample files may have a .fastq(.gz) extension (i.e. 
when tiny-count is called as part of a pipeline run) or a .sam extension (i.e. when tiny-count is called as a standalone tool). Args: - paths: a loaded Paths File which provides a path to the Samples Sheet + samples_csv: a csv file which defines sample group, replicate, and file location is_pipeline: helps locate sample SAM files. If true, files are assumed to reside in the working directory. If false, files are assumed to reside in the same directory as their source FASTQ files with '_aligned_seqs.sam' appended @@ -105,7 +105,6 @@ def get_library_filename(csv_row_file: str, samples_csv: str) -> str: "The following filename contained neither:\n%s" % (csv_row_file,)) return csv_row_file - samples_csv = paths.from_here(paths['samples_csv']) inputs = list() for row in CSVReader(samples_csv, "Samples Sheet").rows(): @@ -124,11 +123,11 @@ def get_library_filename(csv_row_file: str, samples_csv: str) -> str: return inputs -def load_config(paths: 'ConfigBase', is_pipeline: bool) -> List[dict]: +def load_config(features_csv: str, is_pipeline: bool) -> List[dict]: """Parses the Features Sheet to provide inputs to FeatureSelector and ReferenceTables Args: - paths: a loaded Paths File which provides a path to the Features Sheet + features_csv: a csv file which defines feature sources and selection rules is_pipeline: not currently used; helps properly locate input files Returns: @@ -139,7 +138,6 @@ def load_config(paths: 'ConfigBase', is_pipeline: bool) -> List[dict]: rules = list() - features_csv = paths.from_here(paths['features_csv']) for row in CSVReader(features_csv, "Features Sheet").rows(): rule = {col: row[col] for col in ["Tag", "Hierarchy", "Strand", "nt5end", "Length", "Overlap"]} rule['nt5end'] = rule['nt5end'].upper().translate({ord('U'): 'T'}) # Convert RNA base to cDNA base @@ -153,44 +151,44 @@ def load_config(paths: 'ConfigBase', is_pipeline: bool) -> List[dict]: return rules -def load_gffs(paths: 'ConfigBase', is_pipeline: bool) -> Dict[str, list]: - 
"""Determines GFF file paths and the alias attributes for each +def load_gff_files(paths: PathsFile, libraries: List[dict], args: ReadOnlyDict) -> Dict[str, list]: + """Retrieves the appropriate file path and alias attributes for each GFF, + then validates Args: - paths: a loaded Paths File which provides a list of GFFs - is_pipeline: helps locate GFF files defined in the Paths File. If true, - GFF files are assumed to reside in the working directory. + paths: a loaded PathsFile with + gff_files: a list of gff files, each as a dict with keys `path` and `alias` + is_pipeline: helps locate inputs defined in the Paths File. If true, files + will be sourced from the working directory. Otherwise, original path. Returns: gff: a dict of GFF files with lists of alias attribute keys """ - if paths['gff_files'] is None: - raise ValueError("No GFF files were found in your Paths File") + def map_path(x): + if args['is_pipeline']: return os.path.basename(x) + else: return x - gffs = {} - for entry in paths['gff_files']: - if is_pipeline: - filename = os.path.basename(entry['path']) - else: - filename = paths.from_here(entry['path']) - gffs[filename] = entry.get('alias', default=[]) + # Get GFF definitions from Paths File + gff_files = {map_path(gff['path']): gff.get('alias', []) for gff in paths['gff_files']} - return gffs + # Prepare supporting file inputs for GFF validation + sam_files = [lib['File'] for lib in libraries] + ref_genomes = [map_path(fasta) for fasta in paths['reference_genome_files']] + ebwt_prefix = map_path(paths['ebwt']) + # Todo: update GFFValidator so that genomes and ebwt can be safely passed here + GFFValidator(gff_files, args, alignments=sam_files).validate() -def validate_inputs(gffs, libraries, prefs): - if prefs.get('is_pipeline'): return - libraries = [lib['File'] for lib in libraries] - GFFValidator(gffs, prefs, alignments=libraries).validate() + return gff_files @report_execution_time("Counting and merging") -def map_and_reduce(libraries, 
features_csv, prefs): +def map_and_reduce(libraries, paths, prefs): """Assigns one worker process per library and merges the statistics they report""" # MergedStatsManager handles final output files, regardless of multiprocessing status - summary = MergedStatsManager(Features, features_csv, prefs) + summary = MergedStatsManager(Features, paths['features_csv'], prefs) # Use a multiprocessing pool if multiple sam files were provided if len(libraries) > 1: @@ -213,19 +211,17 @@ def main(): args = get_args() try: - paths = ConfigBase(args['paths_file']) - libraries = load_samples(paths, args['is_pipeline']) - selection = load_config(paths, args['is_pipeline']) - gff_files = load_gffs(paths, args['is_pipeline']) - - validate_inputs(gff_files, libraries, args) + paths = PathsFile(args['paths_file']) + libraries = load_samples(paths['samples_csv'], args['is_pipeline']) + selection = load_config(paths['features_csv'], args['is_pipeline']) + gff_files = load_gff_files(paths, libraries, args) # global for multiprocessing global counter counter = FeatureCounter(gff_files, selection, **args) # Assign and count features using multiprocessing and merge results - merged_counts = map_and_reduce(libraries, paths.from_here(paths['features_csv']), args) + merged_counts = map_and_reduce(libraries, paths, args) # Write final outputs merged_counts.write_report_files() From 39320627f695d07c79faacf518490c344ce658bf Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Tue, 25 Oct 2022 17:08:54 -0700 Subject: [PATCH 12/24] Correcting the tuple unpacking order in some functions. This isn't necessarily a bugfix because the order was reversed twice (hence why it passed testing). chroms_shared_with_ebwt() now silences stderr output from bowtie-inspect if there was an error. chroms_shared_with_genomes() now skips genome files that don't exist, and validate_chroms() will continue searching for "sequence file chromosomes" if no chromosomes were retained from genomes. 
These changes were made in preparation for including ebwt and genomes in tiny-count's call to GFFValidator. --- tiny/rna/counter/validation.py | 41 +++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/tiny/rna/counter/validation.py b/tiny/rna/counter/validation.py index 40485e25..d44b10f7 100644 --- a/tiny/rna/counter/validation.py +++ b/tiny/rna/counter/validation.py @@ -1,8 +1,10 @@ import functools import subprocess import sys +import os from collections import Counter, defaultdict +from typing import List, Dict from tiny.rna.util import sorted_natural from tiny.rna.counter.hts_parsing import parse_gff, ReferenceTables @@ -155,17 +157,19 @@ def validate_chroms(self, ebwt=None, genomes=None, alignments=None): # First search bowtie indexes if they are available if ebwt is not None: try: - chroms, shared = self.chroms_shared_with_ebwt(ebwt) - self.generate_chrom_report(chroms, shared) + shared, chroms = self.chroms_shared_with_ebwt(ebwt) + self.generate_chrom_report(shared, chroms) return except Exception: pass # Fallback to other input options # Next search the genome fasta(s) if available if genomes is not None: - chroms, shared = self.chroms_shared_with_genomes(genomes) - self.generate_chrom_report(chroms, shared) - return + shared, chroms = self.chroms_shared_with_genomes(genomes) + if chroms: + self.generate_chrom_report(shared, chroms) + return + # Continue search if chroms weren't gathered from genomes # Preferred inputs aren't available; continue testing with heuristic options if alignments is not None: @@ -178,11 +182,17 @@ def validate_chroms_heuristic(self, alignments): self.generate_chrom_heuristics_report(suspect_files) def chroms_shared_with_ebwt(self, index_prefix): - """Returns the set intersection between parsed GFF chromosomes and those in the bowtie index""" + """Determines the complete set of chromosomes in the bowtie index, then finds + their set intersection with the chromosomes parsed from GFF 
files. + Returns: + a tuple of (shared chromosomes, index chromosomes)""" - summary = subprocess.check_output(['bowtie-inspect', index_prefix, '-s']).decode('latin1').splitlines() - ebwt_chroms = set() + summary = subprocess.check_output( + ['bowtie-inspect', index_prefix, '-s'], + stderr=subprocess.DEVNULL + ).decode('latin1').splitlines() + ebwt_chroms = set() for line in summary: if line.startswith("Sequence-"): try: @@ -194,10 +204,15 @@ def chroms_shared_with_ebwt(self, index_prefix): return shared, ebwt_chroms def chroms_shared_with_genomes(self, genome_fastas): - """Returns the set intersection between parsed GFF chromosomes and those in the reference genome""" + """Determines the complete set of chromosomes defined in each genome file, + then finds their set intersection with the chromosomes parsed from GFF files. + Returns: + a tuple of (shared chromosomes, genome chromosomes)""" genome_chroms = set() for fasta in genome_fastas: + if not os.path.isfile(fasta): + continue with open(fasta, 'rb') as f: for line in f: if line[0] == ord('>'): @@ -206,9 +221,13 @@ def chroms_shared_with_genomes(self, genome_fastas): shared = genome_chroms & self.chrom_set return shared, genome_chroms - def alignment_chroms_mismatch_heuristic(self, sam_files, subset_size=50000): + def alignment_chroms_mismatch_heuristic(self, sam_files: List[str], subset_size=50000) -> Dict[str, set]: """Since alignment files can be very large, we only check that there's at least one shared - chromosome identifier and only the first subset_size lines are read from each file.""" + chromosome identifier and only the first subset_size lines are read from each file. The + returned dictionary contains only the SAM files whose sampled chromosome set failed to + intersect with the chromosomes parsed from GFF files. 
+ Returns: + a dictionary of {SAM filename: SAM chromosomes sampled}""" files_wo_overlap = {} From cd9b05009ab11d23663e831ae2b9cc0b9690a829 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Tue, 25 Oct 2022 17:09:13 -0700 Subject: [PATCH 13/24] Unit tests have been added for the new PathsFile class --- tests/unit_tests_configuration.py | 119 +++++++++++++++++++++++++++++- 1 file changed, 118 insertions(+), 1 deletion(-) diff --git a/tests/unit_tests_configuration.py b/tests/unit_tests_configuration.py index e06eb114..625912dc 100644 --- a/tests/unit_tests_configuration.py +++ b/tests/unit_tests_configuration.py @@ -1,10 +1,11 @@ import contextlib import io import os +import random import unittest from unittest.mock import patch, mock_open, call -from tiny.rna.configuration import Configuration, SamplesSheet +from tiny.rna.configuration import Configuration, SamplesSheet, PathsFile from unit_test_helpers import csv_factory @@ -200,5 +201,121 @@ def test_validate_fastq_filepath(self): SamplesSheet('mock_filename') +class PathsFileTest(unittest.TestCase): + + """============ Helper functions ============""" + + @classmethod + def setUpClass(self): + self.template_file = os.path.abspath('../tiny/templates/paths.yml') + self.template_dir = os.path.dirname(self.template_file) + + def make_paths_file(self, prefs=None): + """IMPORTANT: relative file paths are evaluated relative to /tiny/templates/""" + + paths_file = self.template_file + config = PathsFile(paths_file) + if prefs is None: return config + + for key, val in prefs.items(): + config[key] = val + + return config + + """Does PathsFile automatically resolve paths when queried?""" + + def test_getitem_single(self): + config = self.make_paths_file() + config['mock_parameter'] = "./some/../file" + self.assertEqual(config['mock_parameter'], os.path.join(self.template_dir, "file")) + + """Does PathsFile automatically resolve lists of paths when queried? 
What if some list items are + strings, others are mappings with a "path" key, and others are None/empty?""" + + def test_getitem_group(self): + config = self.make_paths_file() + config.groups = ('mock_parameter',) + + mapping_1 = {'path': "./some/../file", "other_key": "irrelevant"} + mapping_2 = {'path': "../templates/another_file"} + path_string = "../../START_HERE/reference_data/ram1.gff3" + + config['mock_parameter'] = [mapping_1, mapping_2, path_string, None, ''] + + # Notice: only 'path' is modified in mappings and empty entries are returned unmodified + expected = [ + dict(mapping_1, path=os.path.join(self.template_dir, "file")), + dict(mapping_2, path=os.path.join(self.template_dir, "another_file")), + os.path.normpath(os.path.join(self.template_dir, path_string)), + None, + '' + ] + + self.assertListEqual(config['mock_parameter'], expected) + + """Does PathsFile check for required parameters?""" + + def test_validate_required_parameters(self): + config = self.make_paths_file() + for key in PathsFile.required: + oldval = config[key] + + with self.assertRaisesRegex(AssertionError, r".*(parameters are required).*"): + config[key] = '' + config.validate_paths() + + with self.assertRaisesRegex(AssertionError, r".*(parameters are required).*"): + config[key] = None + config.validate_paths() + + with self.assertRaisesRegex(AssertionError, r".*(parameters are required).*"): + del config.config[key] + config.validate_paths() + + config[key] = oldval + + """Does PathsFile check that at least one GFF file has been provided?""" + + def test_validate_gff_files(self): + config = self.make_paths_file() + + with self.assertRaisesRegex(AssertionError, r".*(At least one GFF).*"): + config['gff_files'] = [{'path': "", 'alias': []}] + config.validate_paths() + + with self.assertRaisesRegex(AssertionError, r".*(At least one GFF).*"): + config['gff_files'] = [{'irrelevant': "value"}] + config.validate_paths() + + """Does PathsFile check for missing files for single entry 
parameters?""" + + def test_validate_missing_file_single(self): + config = self.make_paths_file() + key = random.choice(PathsFile.single) + config[key] = '/dev/null/file_dne' + + with self.assertRaisesRegex(AssertionError, f".*(file provided for {key}).*"): + config.validate_paths() + + """Does PathsFile check for missing files under list-type parameters?""" + + def test_validate_missing_file_group(self): + config = self.make_paths_file() + bad_path = "/dev/null/file_dne" + config.append_to('gff_files', {'path': bad_path, 'alias': []}) + + # Config now has one good entry and one bad entry; make sure the bad entry was specifically reported + with self.assertRaisesRegex(AssertionError, f".*(file provided under gff_files).*\n\t{bad_path}"): + config.validate_paths() + + """Does PathsFile detect a backward compatibility issue?""" + + def test_backward_incompatibility(self): + config = self.make_paths_file() + del config.config['gff_files'] + + with self.assertRaisesRegex(AssertionError, r".*(check the release notes).*"): + config.check_backward_compatibility() + if __name__ == '__main__': unittest.main() From bb0293f5948d7910c705b913385f4c10bdbe89aa Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Tue, 25 Oct 2022 17:10:00 -0700 Subject: [PATCH 14/24] Config files in the templates directory have been updated to reflect the new location for gff_files --- tiny/templates/features.csv | 14 +++++++------- tiny/templates/paths.yml | 5 +++++ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/tiny/templates/features.csv b/tiny/templates/features.csv index 57ec4a8e..b3e1e698 100755 --- a/tiny/templates/features.csv +++ b/tiny/templates/features.csv @@ -1,7 +1,7 @@ -Select for...,with value...,Alias by...,Tag,Hierarchy,Strand,5' End Nucleotide,Length,Overlap,Feature Source -Class,mask,Alias,,1,both,all,all,Partial,../../START_HERE/reference_data/ram1.gff3 -Class,miRNA,Alias,,2,sense,all,16-22,Full,../../START_HERE/reference_data/ram1.gff3 
-Class,piRNA,Alias,5pA,2,both,A,24-32,Full,../../START_HERE/reference_data/ram1.gff3 -Class,piRNA,Alias,5pT,2,both,T,24-32,Full,../../START_HERE/reference_data/ram1.gff3 -Class,siRNA,Alias,,2,both,all,15-22,Full,../../START_HERE/reference_data/ram1.gff3 -Class,unk,Alias,,3,both,all,all,Full,../../START_HERE/reference_data/ram1.gff3 \ No newline at end of file +Select for...,with value...,Tag,Hierarchy,Strand,5' End Nucleotide,Length,Overlap +Class,mask,,1,both,all,all,Partial +Class,miRNA,,2,sense,all,16-22,Full +Class,piRNA,5pA,2,both,A,24-32,Full +Class,piRNA,5pT,2,both,T,24-32,Full +Class,siRNA,,2,both,all,15-22,Full +Class,unk,,3,both,all,all,Full \ No newline at end of file diff --git a/tiny/templates/paths.yml b/tiny/templates/paths.yml index c6553f71..8bb710ec 100644 --- a/tiny/templates/paths.yml +++ b/tiny/templates/paths.yml @@ -14,6 +14,11 @@ samples_csv: './samples.csv' features_csv: './features.csv' +##-- Each entry: 1. the file, 2. (optional) list of attribute keys for feature aliases --## +gff_files: + - path: "../../START_HERE/reference_data/ram1.gff3" + alias: [ID] + ##-- The final output directory for files produced by the pipeline --# run_directory: run_directory From 509f8810ffd21e600e5bce00a30cc927ff86fee7 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Wed, 26 Oct 2022 16:53:13 -0700 Subject: [PATCH 15/24] ConfigBase.from_here() has slightly cleaner logic and is much more resilient. A CWL file object entry is now included for "paths_file" in the Run Config. This means there will be two keys containing the path to the Paths File; one of them is at the top and is a regular string for user friendliness, and the other is in the automatic section for the workflow runner. Replaced a missing "_" from the run_directory name that had been recently removed. 
--- tiny/rna/configuration.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index 501cf863..5cb8d6a1 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -127,17 +127,16 @@ def from_here(self, destination: Union[str,dict,None], origin: Union[str,dict,No if isinstance(origin, dict): origin = origin.get('path') if origin is None: origin = self.dir - if isinstance(destination, dict): - dest_path = destination.get('path') - if dest_path: - return dict(destination, path=self.joinpath(origin, dest_path)) - else: - return destination + if ( + isinstance(destination, dict) and + isinstance(destination.get("path"), (str, bytes)) and + len(destination['path']) + ): + return dict(destination, path=self.joinpath(origin, destination["path"])) + elif isinstance(destination, (str, bytes)) and bool(destination): + return self.joinpath(origin, destination) else: - if destination: - return self.joinpath(origin, destination) - else: - return destination + return destination def create_run_directory(self) -> str: """Create the destination directory for pipeline outputs""" @@ -198,7 +197,9 @@ def __init__(self, config_file: str, validate_inputs=False): if validate_inputs: self.validate_inputs() def load_paths_config(self): - """Constructs a sub-configuration object containing workflow file preferences""" + """Constructs a sub-configuration object containing workflow file preferences + self['paths_config'] is the user-facing file path (just the path string) + self['paths_file'] is a CWL file object used as a workflow input.""" paths_file_path = self.from_here(self['paths_config']) return PathsFile(paths_file_path) @@ -207,6 +208,7 @@ def assimilate_paths_file(self): self[key] = self.paths.as_cwl_file_obj(key) for key in PathsFile.prefix: self[key] = self.paths[key] + self['paths_file'] = self.cwl_file(self.paths.inf) def process_samples_sheet(self): 
samples_sheet_path = self.paths['samples_csv'] @@ -241,7 +243,7 @@ def setup_pipeline(self): # Create prefixed Run Directory name run_dir_parent, run_dir = os.path.split(self['run_directory'].rstrip(os.sep)) - self['run_directory'] = self.joinpath(run_dir_parent, self['run_name'] + run_dir) + self['run_directory'] = self.joinpath(run_dir_parent, self['run_name'] + "_" + run_dir) self.templates = resource_filename('tiny', 'templates/') From b10da8a425ecb6b91ba2672ffe8860f65008da1e Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Wed, 26 Oct 2022 16:55:33 -0700 Subject: [PATCH 16/24] The PathsFile class is now compatible with pipeline-context path resolution --- tiny/rna/configuration.py | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index 5cb8d6a1..f4b29a19 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -374,16 +374,38 @@ def main(): class PathsFile(ConfigBase): + """A configuration class for managing and validating Paths Files. + Relative paths are automatically resolved on lookup and list types are enforced. + While this is convenient, developers should be aware of the following caveats: + - Lookups that return list values do not return the original object; don't + append to them. Instead, use the append_to() helper function. + - Chained assignments can produce unexpected results. 
+ """ + required = ('samples_csv', 'features_csv', 'gff_files') single = ('samples_csv', 'features_csv', 'plot_style_sheet', 'adapter_fasta') groups = ('reference_genome_files', 'gff_files') prefix = ('ebwt', 'run_directory', 'tmp_directory') - def __init__(self, file: str): + def __init__(self, file: str, in_pipeline=False): super().__init__(file) + self.in_pipeline = in_pipeline + self.map_path = self.from_pipeline if in_pipeline else self.from_here self.check_backward_compatibility() self.validate_paths() + @staticmethod + def from_pipeline(value): + """When tiny-count runs as a pipeline step, all file inputs are + sourced from the working directory regardless of original path.""" + + if isinstance(value, dict) and value.get("path") is not None: + return dict(value, path=os.path.basename(value['path'])) + elif isinstance(value, (str, bytes)): + return os.path.basename(value) + else: + return value + def __getitem__(self, key: str): """Automatically performs path resolution for both single and group parameters. Note that only keys listed in self.groups are guaranteed to be returned as lists.""" @@ -391,9 +413,9 @@ def __getitem__(self, key: str): value = self.config.get(key) if key in self.groups: if value is None: return [] - return [self.from_here(sub) for sub in value] + return [self.map_path(p) for p in value] else: - return self.from_here(value) + return self.map_path(value) def as_cwl_file_obj(self, key: str): """Returns the specified parameter with file paths converted to CWL file objects.""" @@ -420,6 +442,10 @@ def validate_paths(self): "At least one GFF file path must be specified under gff_files in {selfname}" \ .format(selfname=self.basename) + # Some entries in Paths File are omitted from tiny-count's working directory during + # pipeline runs. There is no advantage to checking file existence here vs. 
in load_* + if self.in_pipeline: return + for key in self.single: resolved_path = self[key] if not resolved_path: continue From 5c7cd067ca97c7522dbd52d40fbdf1a6e1cbb0c0 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Wed, 26 Oct 2022 17:00:54 -0700 Subject: [PATCH 17/24] Updated load_gff_files() to utilize PathsFile's new pipeline-context path resolution. This commit also reinstates the old guarantees that each GFF is parsed only once, and that "ID" alias values are not shown in the Feature Name column. Tried to make it pretty resilient. If multiple entries exist for the same GFF file with unique alias keys in each entry, the alias keys are merged under the single resulting record for the file --- tiny/rna/counter/counter.py | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/tiny/rna/counter/counter.py b/tiny/rna/counter/counter.py index 128ae868..f66e6c6d 100644 --- a/tiny/rna/counter/counter.py +++ b/tiny/rna/counter/counter.py @@ -157,29 +157,36 @@ def load_gff_files(paths: PathsFile, libraries: List[dict], args: ReadOnlyDict) Args: paths: a loaded PathsFile with + libraries: a list of library files, each as a dict with a 'File' key gff_files: a list of gff files, each as a dict with keys `path` and `alias` - is_pipeline: helps locate inputs defined in the Paths File. If true, files - will be sourced from the working directory. Otherwise, original path. 
+ args: command line arguments passed to GFFValidator for source/type filters Returns: gff: a dict of GFF files with lists of alias attribute keys """ - def map_path(x): - if args['is_pipeline']: return os.path.basename(x) - else: return x + gff_files = defaultdict(list) + screen_alias = ["id"] - # Get GFF definitions from Paths File - gff_files = {map_path(gff['path']): gff.get('alias', []) for gff in paths['gff_files']} + # Build dictionary of unique files and allowed aliases + for gff in paths['gff_files']: + gff_files[gff['path']].extend( + alias for alias in gff.get('alias', ()) + if alias.lower() not in screen_alias + ) + + # Remove duplicate aliases (per file), keep original order + for file, alias in gff_files.items(): + gff_files[file] = sorted(set(alias), key=alias.index) # Prepare supporting file inputs for GFF validation sam_files = [lib['File'] for lib in libraries] - ref_genomes = [map_path(fasta) for fasta in paths['reference_genome_files']] - ebwt_prefix = map_path(paths['ebwt']) # Todo: update GFFValidator so that genomes and ebwt can be safely passed here - GFFValidator(gff_files, args, alignments=sam_files).validate() + # ref_genomes = [map_path(fasta) for fasta in paths['reference_genome_files']] + # ebwt_prefix = map_path(paths['ebwt']) + GFFValidator(gff_files, args, alignments=sam_files).validate() return gff_files @@ -209,11 +216,13 @@ def map_and_reduce(libraries, paths, prefs): def main(): # Get command line arguments. 
args = get_args() + is_pipeline = args['is_pipeline'] try: - paths = PathsFile(args['paths_file']) - libraries = load_samples(paths['samples_csv'], args['is_pipeline']) - selection = load_config(paths['features_csv'], args['is_pipeline']) + # Load and validate config files and input files + paths = PathsFile(args['paths_file'], is_pipeline) + libraries = load_samples(paths['samples_csv'], is_pipeline) + selection = load_config(paths['features_csv'], is_pipeline) gff_files = load_gff_files(paths, libraries, args) # global for multiprocessing From 5dfffddf99c65971d36bddb85974dfad3ed67c94 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Wed, 26 Oct 2022 17:04:22 -0700 Subject: [PATCH 18/24] Updated CWL for new tiny-count inputs --- tiny/cwl/tools/tiny-count.cwl | 17 ++++++++++------- tiny/cwl/workflows/tinyrna_wf.cwl | 2 ++ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/tiny/cwl/tools/tiny-count.cwl b/tiny/cwl/tools/tiny-count.cwl index 419aa730..5bdf5a17 100644 --- a/tiny/cwl/tools/tiny-count.cwl +++ b/tiny/cwl/tools/tiny-count.cwl @@ -6,6 +6,8 @@ class: CommandLineTool requirements: - class: InitialWorkDirRequirement listing: [ + $(inputs.samples_csv), + $(inputs.features_csv), $(inputs.gff_files), $(inputs.aligned_seqs), $(inputs.fastp_logs), @@ -16,15 +18,10 @@ baseCommand: tiny-count stdout: console_output.log inputs: - samples_csv: - type: File - inputBinding: - prefix: -i - - features_csv: + paths_file: type: File inputBinding: - prefix: -f + prefix: -pf out_prefix: type: string @@ -75,6 +72,12 @@ inputs: # The following optional inputs are for staging InitialWorkingDir files for pipeline execution + samples_csv: + type: File + + features_csv: + type: File + # Specifies the GFF files defined in features.csv gff_files: type: File[]? 
diff --git a/tiny/cwl/workflows/tinyrna_wf.cwl b/tiny/cwl/workflows/tinyrna_wf.cwl index 06fb79b1..b46556b6 100644 --- a/tiny/cwl/workflows/tinyrna_wf.cwl +++ b/tiny/cwl/workflows/tinyrna_wf.cwl @@ -76,6 +76,7 @@ inputs: shared_memory: boolean? # counter inputs + paths_file: File samples_csv: File features_csv: File gff_files: File[]? @@ -202,6 +203,7 @@ steps: counter: run: ../tools/tiny-count.cwl in: + paths_file: paths_file samples_csv: samples_csv features_csv: features_csv aligned_seqs: bowtie/sam_out From 5b02f1ea9c9c7136fa3de4266b6e17b7bf47dd16 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Wed, 26 Oct 2022 17:04:52 -0700 Subject: [PATCH 19/24] Cleaning up the Run Config templates --- START_HERE/run_config.yml | 45 ++++++++++++------------ tiny/templates/run_config_template.yml | 47 ++++++++++++-------------- 2 files changed, 43 insertions(+), 49 deletions(-) diff --git a/START_HERE/run_config.yml b/START_HERE/run_config.yml index 9e687e39..3e050c29 100644 --- a/START_HERE/run_config.yml +++ b/START_HERE/run_config.yml @@ -1,21 +1,21 @@ ######----------------------------- tinyRNA Configuration -----------------------------###### # -# In this file you may specify your configuration preferences for the workflow and +# In this file you can specify your configuration preferences for the workflow and # each workflow step. # # If you want to use DEFAULT settings for the workflow, all you need to do is provide the path -# to your Samples Sheet and Features Sheet in your Paths file, then make sure that the -# 'paths_config' setting below points to your Paths file. +# to your Samples Sheet and Features Sheet in your Paths File, then make sure that the +# 'paths_config' setting below points to your Paths File. # # We suggest that you also: # 1. Add a username to identify the person performing runs, if desired for record keeping -# 2. Add a run directory name in your Paths file. If not provided, "run_directory" is used +# 2. 
Add a run directory name in your Paths File. If not provided, "run_directory" is used # 3. Add a run name to label your run directory and run-specific summary reports. # If not provided, user_tinyrna will be used. # # This file will be further processed at run time to generate the appropriate pipeline # settings for each workflow step. A copy of this processed configuration will be stored -# in your run directory (as specified by your Paths configuration file). +# in your run directory. # ######-------------------------------------------------------------------------------###### @@ -48,7 +48,7 @@ run_native: false # paths_config file. # # We have specified default parameters for small RNA data based on our own "best practices". -# You may change the parameters here. +# You can change the parameters here. # ######-------------------------------------------------------------------------------###### @@ -75,7 +75,7 @@ ftabchars: ~ # pipeline github: https://github.com/MontgomeryLab/tinyrna # # We have specified default parameters for small RNA data based on our own "best practices". -# You may change the parameters here. +# You can change the parameters here. # ######-------------------------------------------------------------------------------###### @@ -135,7 +135,7 @@ compression: 4 # Trimming takes place prior to counting/collapsing. # # We have specified default parameters for small RNA data based on our own "best practices". -# You may change the parameters here. +# You can change the parameters here. # ######-------------------------------------------------------------------------------###### @@ -157,7 +157,7 @@ compress: False # We use bowtie for read alignment to a genome. # # We have specified default parameters for small RNA data based on our own "best practices". -# You may change the parameters here. +# You can change the parameters here. 
# ######-------------------------------------------------------------------------------###### @@ -263,12 +263,11 @@ dge_drop_zero: False ######-------------------------------- PLOTTING OPTIONS -----------------------------###### # -# We use a custom Python script for creating all plots. The default base style is called -# 'smrna-light'. If you wish to use another matplotlib stylesheet you may specify that in -# the Paths File. +# We use a custom Python script for creating all plots. If you wish to use another matplotlib +# stylesheet you can specify that in the Paths File. # # We have specified default parameters for small RNA data based on our own "best practices". -# You may change the parameters here. +# You can change the parameters here. # ######-------------------------------------------------------------------------------###### @@ -297,7 +296,7 @@ plot_len_dist_max: ######----------------------------- OUTPUT DIRECTORIES ------------------------------###### # # Outputs for each step are organized into their own subdirectories in your run -# directory. You may set these folder names here. +# directory. You can set these folder names here. # ######-------------------------------------------------------------------------------###### @@ -314,7 +313,7 @@ dir_name_plotter: plots ######################### AUTOMATICALLY GENERATED CONFIGURATIONS ######################### # # Do not make any changes to the following sections. These options are automatically -# generated using your Paths file, your Samples and Features sheets, and the above +# generated using your Paths File, your Samples and Features sheets, and the above # settings in this file. 
# ########################################################################################### @@ -322,7 +321,7 @@ dir_name_plotter: plots ######--------------------------- DERIVED FROM PATHS FILE ---------------------------###### # -# The following configuration settings are automatically derived from the sample sheet +# The following configuration settings are automatically derived from the Paths File # ######-------------------------------------------------------------------------------###### @@ -330,6 +329,8 @@ run_directory: ~ tmp_directory: ~ features_csv: { } samples_csv: { } +paths_file: { } +gff_files: [ ] run_bowtie_build: false reference_genome_files: [ ] plot_style_sheet: ~ @@ -337,9 +338,9 @@ adapter_fasta: ~ ebwt: ~ -######-------------------------- DERIVED FROM SAMPLE SHEET --------------------------###### +######------------------------- DERIVED FROM SAMPLES SHEET --------------------------###### # -# The following configuration settings are automatically derived from the sample sheet +# The following configuration settings are automatically derived from the Samples Sheet # ######-------------------------------------------------------------------------------###### @@ -364,10 +365,6 @@ run_deseq: True ######------------------------- DERIVED FROM FEATURES SHEET -------------------------###### # -# The following configuration settings are automatically derived from the sample sheet +# The following configuration settings are automatically derived from the Features Sheet # -######-------------------------------------------------------------------------------###### - -###-- Utilized by tiny-count --### -# a list of only unique GFF files -gff_files: [ ] \ No newline at end of file +######-------------------------------------------------------------------------------###### \ No newline at end of file diff --git a/tiny/templates/run_config_template.yml b/tiny/templates/run_config_template.yml index 86853a02..01df1708 100644 --- 
a/tiny/templates/run_config_template.yml +++ b/tiny/templates/run_config_template.yml @@ -1,21 +1,21 @@ ######----------------------------- tinyRNA Configuration -----------------------------###### # -# In this file you may specify your configuration preferences for the workflow and +# In this file you can specify your configuration preferences for the workflow and # each workflow step. # # If you want to use DEFAULT settings for the workflow, all you need to do is provide the path -# to your Samples Sheet and Features Sheet in your Paths file, then make sure that the -# 'paths_config' setting below points to your Paths file. +# to your Samples Sheet and Features Sheet in your Paths File, then make sure that the +# 'paths_config' setting below points to your Paths File. # # We suggest that you also: # 1. Add a username to identify the person performing runs, if desired for record keeping -# 2. Add a run directory name in your Paths file. If not provided, "run_directory" is used +# 2. Add a run directory name in your Paths File. If not provided, "run_directory" is used # 3. Add a run name to label your run directory and run-specific summary reports. # If not provided, user_tinyrna will be used. # # This file will be further processed at run time to generate the appropriate pipeline # settings for each workflow step. A copy of this processed configuration will be stored -# in your run directory (as specified by your Paths configuration file). +# in your run directory. # ######-------------------------------------------------------------------------------###### @@ -48,7 +48,7 @@ run_native: false # paths_config file. # # We have specified default parameters for small RNA data based on our own "best practices". -# You may change the parameters here. +# You can change the parameters here. 
# ######-------------------------------------------------------------------------------###### @@ -75,7 +75,7 @@ ftabchars: ~ # pipeline github: https://github.com/MontgomeryLab/tinyrna # # We have specified default parameters for small RNA data based on our own "best practices". -# You may change the parameters here. +# You can change the parameters here. # ######-------------------------------------------------------------------------------###### @@ -135,7 +135,7 @@ compression: 4 # Trimming takes place prior to counting/collapsing. # # We have specified default parameters for small RNA data based on our own "best practices". -# You may change the parameters here. +# You can change the parameters here. # ######-------------------------------------------------------------------------------###### @@ -157,7 +157,7 @@ compress: False # We use bowtie for read alignment to a genome. # # We have specified default parameters for small RNA data based on our own "best practices". -# You may change the parameters here. +# You can change the parameters here. # ######-------------------------------------------------------------------------------###### @@ -263,12 +263,11 @@ dge_drop_zero: False ######-------------------------------- PLOTTING OPTIONS -----------------------------###### # -# We use a custom Python script for creating all plots. The default base style is called -# 'smrna-light'. If you wish to use another matplotlib stylesheet you may specify that in -# the Paths File. +# We use a custom Python script for creating all plots. If you wish to use another matplotlib +# stylesheet you can specify that in the Paths File. # # We have specified default parameters for small RNA data based on our own "best practices". -# You may change the parameters here. +# You can change the parameters here. 
# ######-------------------------------------------------------------------------------###### @@ -297,7 +296,7 @@ plot_len_dist_max: ######----------------------------- OUTPUT DIRECTORIES ------------------------------###### # # Outputs for each step are organized into their own subdirectories in your run -# directory. You may set these folder names here. +# directory. You can set these folder names here. # ######-------------------------------------------------------------------------------###### @@ -314,15 +313,15 @@ dir_name_plotter: plots ######################### AUTOMATICALLY GENERATED CONFIGURATIONS ######################### # # Do not make any changes to the following sections. These options are automatically -# generated using your Paths file, your Samples and Features sheets, and the above +# generated using your Paths File, your Samples and Features sheets, and the above # settings in this file. # ########################################################################################### -######--------------------------- DERIVED FROM PATHS SHEET --------------------------###### +######--------------------------- DERIVED FROM PATHS FILE ---------------------------###### # -# The following configuration settings are automatically derived from the sample sheet +# The following configuration settings are automatically derived from the Paths File # ######-------------------------------------------------------------------------------###### @@ -330,6 +329,8 @@ run_directory: ~ tmp_directory: ~ features_csv: { } samples_csv: { } +paths_file: { } +gff_files: [ ] run_bowtie_build: false reference_genome_files: [ ] plot_style_sheet: ~ @@ -337,9 +338,9 @@ adapter_fasta: ~ ebwt: ~ -######-------------------------- DERIVED FROM SAMPLE SHEET --------------------------###### +######------------------------- DERIVED FROM SAMPLES SHEET --------------------------###### # -# The following configuration settings are automatically derived from the sample sheet +# The 
following configuration settings are automatically derived from the Samples Sheet # ######-------------------------------------------------------------------------------###### @@ -364,10 +365,6 @@ run_deseq: True ######------------------------- DERIVED FROM FEATURES SHEET -------------------------###### # -# The following configuration settings are automatically derived from the sample sheet +# The following configuration settings are automatically derived from the Features Sheet # -######-------------------------------------------------------------------------------###### - -###-- Utilized by tiny-count --### -# a list of only unique GFF files -gff_files: [ ] \ No newline at end of file +######-------------------------------------------------------------------------------###### \ No newline at end of file From e9257ef4a3d99f08a428b3c2296117d3c164094a Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Wed, 26 Oct 2022 18:10:29 -0700 Subject: [PATCH 20/24] Moving paths_template_file and make_paths_file() into unit_test_helpers so that it can be used by unit_tests_counter --- tests/unit_test_helpers.py | 18 ++++++++++++++++- tests/unit_tests_configuration.py | 33 +++++++++---------------------- 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/tests/unit_test_helpers.py b/tests/unit_test_helpers.py index b9538b1b..d71dca04 100644 --- a/tests/unit_test_helpers.py +++ b/tests/unit_test_helpers.py @@ -14,7 +14,7 @@ from typing import List -from tiny.rna.configuration import CSVReader +from tiny.rna.configuration import CSVReader, PathsFile rules_template = [{'Identity': ("Name", "N/A"), 'Strand': "both", @@ -47,6 +47,22 @@ def csv_factory(type: str, rows: List[dict], header=()): return csv_string.getvalue() +paths_template_file = os.path.abspath('../tiny/templates/paths.yml') + + +def make_paths_file(in_pipeline=False, prefs=None): + """IMPORTANT: relative file paths are evaluated relative to /tiny/templates/""" + + paths_file = 
paths_template_file + config = PathsFile(paths_file, in_pipeline) + if prefs is None: return config + + for key, val in prefs.items(): + config[key] = val + + return config + + def get_dir_tree(root_path: str) -> dict: """Returns a nested dictionary representation of a given directory tree. diff --git a/tests/unit_tests_configuration.py b/tests/unit_tests_configuration.py index 625912dc..fe99d846 100644 --- a/tests/unit_tests_configuration.py +++ b/tests/unit_tests_configuration.py @@ -6,7 +6,7 @@ from unittest.mock import patch, mock_open, call from tiny.rna.configuration import Configuration, SamplesSheet, PathsFile -from unit_test_helpers import csv_factory +from unit_test_helpers import csv_factory, paths_template_file, make_paths_file class BowtieIndexesTest(unittest.TestCase): @@ -203,29 +203,14 @@ def test_validate_fastq_filepath(self): class PathsFileTest(unittest.TestCase): - """============ Helper functions ============""" - @classmethod def setUpClass(self): - self.template_file = os.path.abspath('../tiny/templates/paths.yml') - self.template_dir = os.path.dirname(self.template_file) - - def make_paths_file(self, prefs=None): - """IMPORTANT: relative file paths are evaluated relative to /tiny/templates/""" - - paths_file = self.template_file - config = PathsFile(paths_file) - if prefs is None: return config - - for key, val in prefs.items(): - config[key] = val - - return config + self.template_dir = os.path.dirname(paths_template_file) """Does PathsFile automatically resolve paths when queried?""" def test_getitem_single(self): - config = self.make_paths_file() + config = make_paths_file() config['mock_parameter'] = "./some/../file" self.assertEqual(config['mock_parameter'], os.path.join(self.template_dir, "file")) @@ -233,7 +218,7 @@ def test_getitem_single(self): strings, others are mappings with a "path" key, and others are None/empty?""" def test_getitem_group(self): - config = self.make_paths_file() + config = make_paths_file() config.groups = 
('mock_parameter',) mapping_1 = {'path': "./some/../file", "other_key": "irrelevant"} @@ -256,7 +241,7 @@ def test_getitem_group(self): """Does PathsFile check for required parameters?""" def test_validate_required_parameters(self): - config = self.make_paths_file() + config = make_paths_file() for key in PathsFile.required: oldval = config[key] @@ -277,7 +262,7 @@ def test_validate_required_parameters(self): """Does PathsFile check that at least one GFF file has been provided?""" def test_validate_gff_files(self): - config = self.make_paths_file() + config = make_paths_file() with self.assertRaisesRegex(AssertionError, r".*(At least one GFF).*"): config['gff_files'] = [{'path': "", 'alias': []}] @@ -290,7 +275,7 @@ def test_validate_gff_files(self): """Does PathsFile check for missing files for single entry parameters?""" def test_validate_missing_file_single(self): - config = self.make_paths_file() + config = make_paths_file() key = random.choice(PathsFile.single) config[key] = '/dev/null/file_dne' @@ -300,7 +285,7 @@ def test_validate_missing_file_single(self): """Does PathsFile check for missing files under list-type parameters?""" def test_validate_missing_file_group(self): - config = self.make_paths_file() + config = make_paths_file() bad_path = "/dev/null/file_dne" config.append_to('gff_files', {'path': bad_path, 'alias': []}) @@ -311,7 +296,7 @@ def test_validate_missing_file_group(self): """Does PathsFile detect a backward compatibility issue?""" def test_backward_incompatibility(self): - config = self.make_paths_file() + config = make_paths_file() del config.config['gff_files'] with self.assertRaisesRegex(AssertionError, r".*(check the release notes).*"): From 5949395a32a9cee3113a4573246a4ef8ed10c63b Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Wed, 26 Oct 2022 18:14:24 -0700 Subject: [PATCH 21/24] Added new test for pipeline-context path mapping. 
This commit also contains updates for PathsFile's new automatic path resolution, bugfixes for chained assignment using PathsFile (BAD!), and so forms the default prefix from the given test's PathsFile object (previously a new timestamp was created at minute boundaries and causing intermittent test failures) --- tests/unit_tests_configuration.py | 57 ++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 13 deletions(-) diff --git a/tests/unit_tests_configuration.py b/tests/unit_tests_configuration.py index fe99d846..5b9b63e2 100644 --- a/tests/unit_tests_configuration.py +++ b/tests/unit_tests_configuration.py @@ -16,11 +16,6 @@ def setUpClass(self): self.run_config = self.root_cfg_dir + "/run_config_template.yml" self.paths = self.root_cfg_dir + "/paths.yml" - self.default_prefix = os.path.join( - self.root_cfg_dir, - Configuration(self.run_config)['run_directory'], - "bowtie-build/ram1" - ) self.maxDiff = 1522 """============ Helper functions ============""" @@ -32,11 +27,21 @@ def config_with(self, prefs): return config def bt_idx_files_from_prefix(self, prefix): + if not os.path.abspath(prefix): + prefix = os.path.normpath(os.path.join(self.root_cfg_dir, prefix)) + return [ {'path': f"{prefix}.{subext}.ebwt", 'class': 'File'} for subext in ['1', '2', '3', '4', 'rev.1', 'rev.2'] ] + def get_default_prefix(self, config): + return os.path.join( + self.root_cfg_dir, + config['run_directory'], + "bowtie-build/ram1" + ) + """================ Tests ==================""" """Does get_ebwt_prefix() produce the expected prefix path?""" @@ -44,7 +49,7 @@ def bt_idx_files_from_prefix(self, prefix): def test_get_ebwt_prefix(self): config = Configuration(self.run_config) actual_prefix = config.get_ebwt_prefix() - expected_prefix = self.default_prefix + expected_prefix = self.get_default_prefix(config) self.assertEqual(actual_prefix, expected_prefix) @@ -61,7 +66,8 @@ def test_get_ebwt_prefix_no_genome(self): def test_get_bt_index_files_prebuilt_indexes(self): 
config = self.config_with({'run_bowtie_build': False}) - prefix = config.paths['ebwt'] = os.path.abspath("./testdata/counter/validation/ebwt/ram1") + prefix = os.path.abspath("./testdata/counter/validation/ebwt/ram1") + config.paths['ebwt'] = prefix expected = self.bt_idx_files_from_prefix(prefix) self.assertListEqual(config.get_bt_index_files(), expected) @@ -70,7 +76,8 @@ def test_get_bt_index_files_prebuilt_indexes(self): def test_get_bt_index_files_unbuilt_indexes_with_genome(self): config = self.config_with({'run_bowtie_build': True}) - prefix = config.paths['ebwt'] = "mock_prefix" + config.paths['ebwt'] = "mock_prefix" + prefix = config.paths['ebwt'] expected = self.bt_idx_files_from_prefix(prefix) self.assertListEqual(config.get_bt_index_files(), expected) @@ -79,10 +86,11 @@ def test_get_bt_index_files_unbuilt_indexes_with_genome(self): def test_get_bt_index_files_missing_indexes_without_genome(self): config = self.config_with({'run_bowtie_build': False, 'reference_genome_files': None}) - prefix = config.paths['ebwt'] = "missing" + prefix = "missing" + config.paths['ebwt'] = prefix errmsg = '\n'.join([ "The following Bowtie index file couldn't be found:", - "\t" + f"{prefix}.1.ebwt", + "\t" + f"{self.root_cfg_dir}/{prefix}.1.ebwt", "\nPlease either correct your ebwt prefix or add reference genomes in the Paths File." 
]) @@ -95,13 +103,14 @@ def test_get_bt_index_files_missing_indexes_without_genome(self): def test_get_bt_index_files_missing_indexes_with_genome(self): config = self.config_with({'run_bowtie_build': False}) - bad_prefix = config.paths['ebwt'] = "missing" - genome_prefix = self.default_prefix + bad_prefix = "missing" + config.paths['ebwt'] = bad_prefix + genome_prefix = self.get_default_prefix(config) expected_files = self.bt_idx_files_from_prefix(genome_prefix) expected_error = '\n'.join([ "The following Bowtie index file couldn't be found:", - "\t" + f"{bad_prefix}.1.ebwt", + "\t" + f"{self.root_cfg_dir}/{bad_prefix}.1.ebwt", "\nIndexes will be built from your reference genome files during this run.", "" ]) @@ -302,5 +311,27 @@ def test_backward_incompatibility(self): with self.assertRaisesRegex(AssertionError, r".*(check the release notes).*"): config.check_backward_compatibility() + """Does PathsFile map all paths to the working directory when in_pipeline=True?""" + + def test_pipeline_mapping(self): + # There are entries in the template Paths File that will cause FileNotFound + # when in_pipeline=True (they're not in the template directory). Since + # file existence isn't relevant for this test, we patch that out. 
+ with patch('tiny.rna.configuration.os.path.isfile', return_value=True): + config = make_paths_file(in_pipeline=True) + + config['mock_mapping'] = {'path': "/dev/null/file_dne", 'other': "irrelevant"} + config['mapping_no_path'] = {'nopath': True} + config['mock_path'] = "/a/very/long/path.gz" + config['empty_path'] = "" + config['none_path'] = None + + self.assertDictEqual(config['mock_mapping'], {'path': "file_dne", 'other': "irrelevant"}) + self.assertDictEqual(config['mapping_no_path'], {'nopath': True}) + self.assertEqual(config['mock_path'], "path.gz") + self.assertEqual(config['empty_path'], "") + self.assertEqual(config['none_path'], None) + + if __name__ == '__main__': unittest.main() From 80dfe5d5b79e9e78130af93d73825f0e0f82541b Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Wed, 26 Oct 2022 18:15:03 -0700 Subject: [PATCH 22/24] Updated tests for new Features Sheet layout, and added PathsFile-specific tests for loading GFF entries --- tests/unit_tests_counter.py | 66 ++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 31 deletions(-) diff --git a/tests/unit_tests_counter.py b/tests/unit_tests_counter.py index 6d79207f..080ca838 100644 --- a/tests/unit_tests_counter.py +++ b/tests/unit_tests_counter.py @@ -32,14 +32,12 @@ def setUpClass(self): self.csv_feat_row_dict = { 'Key': "Class", 'Value': "CSR", - 'Name': "Alias", 'Tag': "", 'Hierarchy': "1", 'Strand': "antisense", "nt5end": '"C,G,U"', # Needs to be double-quoted due to commas 'Length': "all", - 'Overlap': "Partial", - 'Source': "test_file.gff3" + 'Overlap': "Partial" } # Represents the parsed Features Sheet row above @@ -183,13 +181,9 @@ def test_load_config_single_cmd(self): with patch('tiny.rna.configuration.open', mock_open(read_data=csv)): dummy_file = '/dev/null' - ruleset, gff_files = counter.load_config(dummy_file, is_pipeline=False) + ruleset = counter.load_config(dummy_file, is_pipeline=False) expected_ruleset = self.parsed_feat_rule - 
expected_gff_file = from_here(dummy_file, row['Source']) - expected_gff_ret = defaultdict(list, zip([expected_gff_file], [[row['Name']]])) - - self.assertEqual(gff_files, expected_gff_ret) self.assertEqual(ruleset, expected_ruleset) """Does load_config correctly parse a single-entry features.csv config file for pipeline invocation?""" @@ -201,13 +195,9 @@ def test_load_config_single_pipeline(self): with patch('tiny.rna.configuration.open', mock_open(read_data=csv)): dummy_file = '/dev/null' - ruleset, gff_files = counter.load_config(dummy_file, is_pipeline=True) + ruleset = counter.load_config(dummy_file, is_pipeline=True) expected_ruleset = self.parsed_feat_rule - expected_gff_file = os.path.basename(row['Source']) - expected_gff_ret = defaultdict(list, zip([expected_gff_file], [[row['Name']]])) - - self.assertEqual(gff_files, expected_gff_ret) self.assertEqual(ruleset, expected_ruleset) """Does load_config correctly handle duplicate rules? Want: no duplicate rules and no duplicate Name Attributes.""" @@ -219,13 +209,9 @@ def test_load_config_duplicate_rules(self): with patch('tiny.rna.configuration.open', mock_open(read_data=csv)): dummy_filename = '/dev/null' - ruleset, gff_files = counter.load_config(dummy_filename, False) + ruleset = counter.load_config(dummy_filename, False) expected_ruleset = self.parsed_feat_rule - expected_gff_file = from_here(dummy_filename, row['Source']) - expected_gff_ret = defaultdict(list, zip([expected_gff_file], [[row['Name']]])) - - self.assertEqual(gff_files, expected_gff_ret) self.assertEqual(ruleset, expected_ruleset) """Does load_config convert uracil to thymine for proper matching with cDNA sequences?""" @@ -237,26 +223,44 @@ def test_load_config_rna_to_cDNA(self): with patch('tiny.rna.configuration.open', mock_open(read_data=csv)): dummy_file = '/dev/null' - ruleset, _ = counter.load_config(dummy_file, False) + ruleset = counter.load_config(dummy_file, False) self.assertEqual(ruleset[0]['nt5end'], 'T') - """Does load_config 
properly screen for "ID" Name Attributes?""" + """Does load_gff_files properly screen for "ID" alias attributes?""" - def test_load_config_id_name_attr(self): - row = self.csv_feat_row_dict.copy() - row['Name'] = 'ID' - csv = self.csv("features.csv", [row]) + def test_load_gff_files_id_alias_attr(self): + config = helpers.make_paths_file() + config['gff_files'] = [{'path': "/irrelevant", 'alias': ['ID']}] - with patch('tiny.rna.configuration.open', mock_open(read_data=csv)): - dummy_file = '/dev/null' - _, gff_files = counter.load_config(dummy_file, False) + # Patch GFFValidator so that it isn't called (not relevant) + with patch('tiny.rna.counter.counter.GFFValidator'): + gff_files = counter.load_gff_files(config, [], {}) + + expected = {"/irrelevant": []} + self.assertDictEqual(gff_files, expected) + + """Does load_gff_files merge aliases if the same GFF file is listed more than once? + Are duplicates also removed and original order preserved?""" - # Expect {file: [empty Name Attribute list]} - from_dummy = from_here(dummy_file, row['Source']) - expected = defaultdict(list, zip([from_dummy], [[]])) - self.assertEqual(gff_files, expected) + def test_load_gff_files_merge_alias_attr(self): + config = helpers.make_paths_file() + config['gff_files'] = [ + {'path': "/same/path", 'alias': ['attr1', 'attr2']}, + {'path': "/same/path", 'alias': ['attr1', 'attrN']}, + {'path': "/different/path", 'alias': ['attr3']} + ] + + # Patch GFFValidator so that it isn't called (not relevant) + with patch('tiny.rna.counter.counter.GFFValidator'): + gff_files = counter.load_gff_files(config, [], {}) + + expected = { + "/same/path": ['attr1', 'attr2', 'attrN'], + "/different/path": ['attr3'] + } + self.assertDictEqual(gff_files, expected) if __name__ == '__main__': unittest.main() From a10c4e9a62f7e05ed8c3445cec0d16437dfb4793 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 27 Oct 2022 14:34:56 -0700 Subject: [PATCH 23/24] Minor formatting simplification and 
helpful comment for the gff_files entry --- START_HERE/paths.yml | 8 ++++---- tiny/templates/paths.yml | 6 ++++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/START_HERE/paths.yml b/START_HERE/paths.yml index 7d58eb79..444ac5c6 100644 --- a/START_HERE/paths.yml +++ b/START_HERE/paths.yml @@ -17,10 +17,10 @@ features_csv: ./features.csv ##-- Each entry: 1. the file, 2. (optional) list of attribute keys for feature aliases --## gff_files: - - path: "./reference_data/ram1.gff3" - alias: [Alias] -# - path: -# alias: [] +- path: "./reference_data/ram1.gff3" + alias: [Alias] +#- path: +# alias: [ ] ##-- The final output directory for files produced by the pipeline --# run_directory: run_directory diff --git a/tiny/templates/paths.yml b/tiny/templates/paths.yml index 8bb710ec..8404f51a 100644 --- a/tiny/templates/paths.yml +++ b/tiny/templates/paths.yml @@ -16,8 +16,10 @@ features_csv: './features.csv' ##-- Each entry: 1. the file, 2. (optional) list of attribute keys for feature aliases --## gff_files: - - path: "../../START_HERE/reference_data/ram1.gff3" - alias: [ID] +- path: "../../START_HERE/reference_data/ram1.gff3" + alias: [ID] +#- path: +# alias: [ ] ##-- The final output directory for files produced by the pipeline --# run_directory: run_directory From 64adab618fb4e7237a7907f8cb4e1aff135f350c Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 27 Oct 2022 14:39:20 -0700 Subject: [PATCH 24/24] Documentation updates for the new Paths-File-hosted GFF files and their aliases --- README.md | 8 ++++---- doc/Configuration.md | 31 ++++++++++++++++++++++--------- doc/Parameters.md | 10 ++++------ doc/tiny-count.md | 12 ++---------- 4 files changed, 32 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 0681e113..917dc442 100644 --- a/README.md +++ b/README.md @@ -105,8 +105,8 @@ In most cases you will use this toolset as an end-to-end pipeline. This will run 2. The genome sequence of interest in fasta format. 3. 
Genome coordinates of small RNA features of interest in GFF format. 4. A completed Samples Sheet (`samples.csv`) with paths to the fastq files. -5. A completed Features Sheet (`features.csv`) with paths to the GFF file(s). -6. An updated Paths File (`paths.yml`) with the path to the genome sequence and/or your bowtie index prefix. +5. A completed Features Sheet (`features.csv`) with feature selection rules. +6. An updated Paths File (`paths.yml`) with paths to your GFF files, the genome sequence and/or your bowtie index prefix, as well as the paths to `samples.csv` and `features.csv`. 7. A Run Config file (`run_config.yml`) located in your working directory or the path to the file. The template provided does not need to be updated if you wish to use the default settings. To run an end-to-end analysis, be sure that you're working within the conda tinyrna environment ([instructions above](#usage)) in your terminal and optionally navigate to the location of your Run Config file. Then, simply run the following in your terminal: @@ -174,9 +174,9 @@ A "collapsed" FASTA contains unique reads found in fastp's quality filtered FAST The tiny-count step produces a variety of outputs #### Feature Counts -Custom Python scripts and HTSeq are used to generate a single table of feature counts that includes columns for each library analyzed. A feature's _Feature ID_ and _Feature Class_ are simply the values of its `ID` and `Class` attributes. Features lacking a Class attribute will be assigned class `_UNKNOWN_`. We have also included a _Feature Name_ column which displays aliases of your choice, as specified in the _Alias by..._ column of the Features Sheet. If _Alias by..._ is set to`ID`, the _Feature Name_ column is left empty. +Custom Python scripts and HTSeq are used to generate a single table of feature counts that includes columns for each library analyzed. A feature's _Feature ID_ and _Feature Class_ are simply the values of its `ID` and `Class` attributes. 
Features lacking a Class attribute will be assigned class `_UNKNOWN_`. We have also included a _Feature Name_ column which displays aliases of your choice, as specified in the `alias` key under each GFF listed in your Paths File. If `alias` is set to `ID`, the _Feature Name_ column is left empty. -For example, if your Features Sheet has a rule which specifies _Alias by..._ `sequence_name` and the GFF entry for this feature has the following attributes column: +For example, if your Paths File has a GFF entry which specifies `alias: [sequence_name]`, and the corresponding GFF file has a feature with the following attributes column: ``` ... ID=406904;sequence_name=mir-1,hsa-miR-1;Class=miRNA; ... ``` diff --git a/doc/Configuration.md b/doc/Configuration.md index b5ec77ec..9ad24084 100644 --- a/doc/Configuration.md +++ b/doc/Configuration.md @@ -14,15 +14,15 @@ tiny get-template ## Overview ->**Tip**: Each of the following will allow you to map out paths to your input files for analysis. You can use either relative or absolute paths to do so. **Relative paths will be evaluated relative to the file in which they are defined.** This allows you to flexibly organize and share configurations between projects. +>**Tip**: You can use either relative or absolute paths for your file inputs. **Relative paths will be evaluated relative to the file in which they are defined.** This allows you to flexibly organize and share configurations between projects. #### Run Config -The overall behavior of the pipeline and its steps is determined by the Run Config file (`run_config.yml`). This YAML file can be edited using a simple text editor. Within it you must specify the location of your Paths file (`paths.yml`). All other settings are optional. [More info](#run-config-details). +The overall behavior of the pipeline and its steps is determined by the Run Config file (`run_config.yml`). This YAML file can be edited using a simple text editor. 
Within it you must specify the location of your Paths File (`paths.yml`). All other settings are optional. [More info](#run-config-details). #### Paths File -The locations of pipeline file inputs are defined in the Paths file (`paths.yml`). This YAML file includes paths to your Samples and Features Sheets, in addition to your bowtie index prefix (optional) and the final run directory name. The final run directory will contain all pipeline outputs. The directory name is prepended with the `run_name` and current date and time to keep outputs separate. [More info](#paths-file-details). +The locations of pipeline file inputs are defined in the Paths File (`paths.yml`). This YAML file includes paths to your configuration files, your GFF files, and your bowtie indexes and/or reference genome. [More info](#paths-file-details). #### Samples Sheet @@ -91,6 +91,15 @@ When the pipeline starts up, tinyRNA will process the Run Config based on the co ## Paths File Details +### GFF Files +GFF annotations are required by tinyRNA. For each file, you can provide an `alias`, which is a list of attributes to represent each feature in the Feature Name column of output counts tables. Each entry under the `gff_files` parameter must look something like the following mock example: +```yaml + - path: 'a/path/to/your/file.gff' # 0 spaces before - + alias: [optional, list, of attributes] # 2 spaces before alias + +# ^ Each new GFF path must begin with - +``` + ### Building Bowtie Indexes If you don't have bowtie indexes already built for your reference genome, tinyRNA can build them for you at the beginning of an end-to-end run and reuse them on subsequent runs with the same Paths File. @@ -101,6 +110,12 @@ To build bowtie indexes: Once your indexes have been built, your Paths File will be modified such that `ebwt` points to their location (prefix) within your Run Directory. This means that indexes will not be unnecessarily rebuilt on subsequent runs as long as the same Paths File is used. 
If you need them rebuilt, simply repeat steps 2 and 3 above. +### The Run Directory +The final output directory name has three components: +- The `run_name` defined in your Run Config +- The date and time at pipeline startup +- The `run_directory` basename defined in your Paths File + ## Samples Sheet Details | _Column:_ | Input FASTQ Files | Sample/Group Name | Replicate Number | Control | Normalization | |-----------:|---------------------|-------------------|------------------|---------|---------------| @@ -123,19 +138,17 @@ Supported values are: DESeq2 requires that your experiment design has at least one degree of freedom. If your experiment doesn't include at least one sample group with more than one replicate, tiny-deseq.r will be skipped and DGE related plots will not be produced. ## Features Sheet Details -| _Column:_ | Select for... | with value... | Alias by... | Tag | Hierarchy | Strand | 5' End Nucleotide | Length | Overlap | Feature Source | -|------------|---------------|---------------|-------------|-----|-----------|--------|-------------------|--------|-------------|----------------| -| _Example:_ | Class | miRNA | Name | | 1 | sense | all | all | 5' anchored | ram1.gff3 | +| _Column:_ | Select for... | with value... | Tag | Hierarchy | Strand | 5' End Nucleotide | Length | Overlap | +|------------|---------------|---------------|-----|-----------|--------|-------------------|--------|-------------| +| _Example:_ | Class | miRNA | | 1 | sense | all | all | 5' anchored | The Features Sheet allows you to define selection rules that determine how features are chosen when multiple features are found overlap an alignment locus. Selected features are "assigned" a portion of the reads associated with the alignment. -Rules apply to features parsed from **all** Feature Sources, with the exception of "Alias by..." which only applies to the Feature Source on the same row. 
Selection first takes place against feature attributes (GFF column 9), and is directed by defining the attribute you want to be considered (Select for...) and the acceptable values for that attribute (with value...). +Selection first takes place against the feature attributes defined in your GFF files, and is directed by defining the attribute you want to be considered (Select for...) and the acceptable values for that attribute (with value...). Rules that match features in the first stage of selection will be used in a second stage which evaluates alignment vs. feature interval overlap. These matches are sorted by hierarchy value and passed to the third and final stage of selection which examines characteristics of the alignment itself: strand relative to the feature of interest, 5' end nucleotide, and length. See [tiny-count's documentation](tiny-count.md#feature-selection) for an explanation of each column. ->**Tip**: Don't worry about having duplicate Feature Source entries. Each GFF file is parsed only once. - ## Plot Stylesheet Details Matplotlib uses key-value "rc parameters" to allow for customization of its properties and styles, and one way these parameters can be specified is with a [matplotlibrc file](https://matplotlib.org/3.4.3/tutorials/introductory/customizing.html#a-sample-matplotlibrc-file), which we simply refer to as the Plot Stylesheet. You can obtain a copy of the default stylesheet used by tiny-plot with the command `tiny get-template`. Please keep in mind that tiny-plot overrides these defaults for a few specific elements of certain plots. Feel free to reach out if there is a plot style you wish to override but find you are unable to. 
\ No newline at end of file diff --git a/doc/Parameters.md b/doc/Parameters.md index b8ca2d63..77ccb787 100644 --- a/doc/Parameters.md +++ b/doc/Parameters.md @@ -122,8 +122,8 @@ Diagnostic information will include intermediate alignment files for each librar ### Full tiny-count Help String ``` -tiny-count -i SAMPLES -f FEATURES -o OUTPUTPREFIX [-h] - [-sf [SOURCE ...]] [-tf [TYPE ...]] [-nh T/F] [-dc] [-a] +tiny-count -pf PATHS -o OUTPUTPREFIX [-h] [-sf [SOURCE ...]] + [-tf [TYPE ...]] [-nh T/F] [-dc] [-sv {Cython,HTSeq}] [-a] [-p] [-d] This submodule assigns feature counts for SAM alignments using a Feature Sheet @@ -132,10 +132,8 @@ prior run, we recommend that you instead run `tiny recount` within that run's directory. Required arguments: - -i SAMPLES, --samples-csv SAMPLES - your Samples Sheet - -f FEATURES, --features-csv FEATURES - your Features Sheet + -pf PATHS, --paths-file PATHS + your Paths File -o OUTPUTPREFIX, --out-prefix OUTPUTPREFIX output prefix to use for file names diff --git a/doc/tiny-count.md b/doc/tiny-count.md index ad5e1314..fc08d240 100644 --- a/doc/tiny-count.md +++ b/doc/tiny-count.md @@ -111,23 +111,15 @@ Examples: >**Tip:** you may specify U and T bases in your rules. Uracil bases will be converted to thymine when your Features Sheet is loaded. N bases are also allowed. -### Misc -| features.csv columns: | Alias by... | Feature Source | -|-----------------------|-------------|----------------| - -You may specify an **Alias by...** which is a GFF column 9 attribute key you wish to represent each feature. The intention of this column is to provide a human-friendly name for each feature. The value associated with each feature's **Alias by...** attribute will be shown in the `Feature Name` column of the Feature Counts output table. For example, if one of your rules specifies an alias of `sequence_name` and gene1's `sequence_name` attribute is "abc123", then gene1's `Feature Name` column in the Feature Counts table will read "abc123". 
- -The **Feature Source** field of a rule is tied only to the **Alias by...**; rules are _not_ partitioned on a GFF file basis, and features parsed from these GFF files are similarly not partitioned as they all go into the same lookup table regardless of source. For each rule, aliases are built on a per-GFF file basis; that is, **Alias by...** values will only be gathered from their corresponding **Feature Source**. Additionally, each GFF file is parsed only once regardless of the number of times it occurs in the Features Sheet. - ## Count Normalization Small RNA reads passing selection will receive a normalized count increment. By default, read counts are normalized twice before being assigned to a feature. The second normalization step can be disabled in `run_config.yml` if desired. Counts for each small RNA sequence are divided: 1. By the number of loci it aligns to in the genome. 2. By the number of _selected_ features for each of its alignments. ## The Details -You may encounter the following cases when you have more than one unique GFF file listed in your **Feature Source**s: +You may encounter the following cases when you have more than one unique GFF file listed in your Paths File: - If a feature is defined in one GFF file, then again in another but with differing attributes, rule and alias matches will be merged for the feature -- If a feature is defined in one GFF file, then again but under a different **Alias by...**, then both aliases are retained and treated as a list. All aliases will be present in the `Feature Name` column of the Feature Counts output table. They will be comma separated. +- If a feature is defined in one GFF file, then again but with a different `alias` parameter, then all unique aliases are retained for the feature and listed in its `Feature Name` column. 
Discontinuous features and feature filtering support: - Discontinuous features are supported (as defined by the `Parent` attribute key, or by a shared `ID`/`gene_id` attribute value). Rule and alias matches of descendants are merged with the root parent's.