From 8b1114a9691652e4877bfdb138ea4445e5136b0d Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 27 Jan 2023 14:00:37 -0800 Subject: [PATCH 01/21] Updating PathsFile class to no longer treat gff_files as required, and to skip empty paths under the gff_files key --- tiny/rna/configuration.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index e2cdb2db..88f00a85 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -449,7 +449,7 @@ class PathsFile(ConfigBase): """ # Parameter types - required = ('samples_csv', 'features_csv', 'gff_files') + required = ('samples_csv', 'features_csv') single = ('samples_csv', 'features_csv', 'plot_style_sheet', 'adapter_fasta') groups = ('reference_genome_files', 'gff_files') prefix = ('ebwt', 'run_directory', 'tmp_directory') @@ -508,9 +508,10 @@ def validate_paths(self): "The following parameters are required in {selfname}: {params}" \ .format(selfname=self.basename, params=', '.join(self.required)) - assert any(gff.get('path') for gff in self['gff_files']), \ - "At least one GFF file path must be specified under gff_files in {selfname}" \ - .format(selfname=self.basename) + if not any(gff.get('path') for gff in self['gff_files']): + print("No GFF files were specified in {selfname} so " + "reads will be counted in non-genomic mode." + .format(selfname=self.basename)) # Some entries in Paths File are omitted from tiny-count's working directory during # pipeline runs. There is no advantage to checking file existence here vs. in load_* @@ -526,6 +527,7 @@ def validate_paths(self): for key in self.groups: for entry in self[key]: if isinstance(entry, dict): entry = entry['path'] + if not entry: continue assert os.path.isfile(entry), \ "The following file provided under {key} in {selfname} could not be found:\n\t{file}" \ .format(key=key, selfname=self.basename, file=entry) From 0b3cc407ba0e77c91a190e6d25c947ce0e1cdd0f Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 27 Jan 2023 14:02:41 -0800 Subject: [PATCH 02/21] Minor refactor to match the order of enumerate()'s (index, element) outputs since this code is essentially an enumerate() with an upper limit --- tiny/rna/counter/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tiny/rna/counter/validation.py b/tiny/rna/counter/validation.py index dbc6d5cc..ae8c757c 100644 --- a/tiny/rna/counter/validation.py +++ b/tiny/rna/counter/validation.py @@ -241,7 +241,7 @@ def alignment_chroms_mismatch_heuristic(self, sam_files: List[str], subset_size= for file in sam_files: file_chroms = set() with open(file, 'rb') as f: - for line, i in zip(f, range(subset_size)): + for i, line in zip(range(subset_size), f): if line[0] == ord('@'): continue file_chroms.add(line.split(b'\t')[2].strip().decode()) if i % 10000 == 0 and len(file_chroms & self.chrom_set): break From bd168cca3cb596be1d6a06a0d0fe1b9c6ea0b542 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 27 Jan 2023 14:04:28 -0800 Subject: [PATCH 03/21] Updating tiny-count's load_gff_files() to treat GFF files as optional --- tiny/rna/counter/counter.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/tiny/rna/counter/counter.py b/tiny/rna/counter/counter.py index a20dd3a1..1d4cbe41 100644 --- a/tiny/rna/counter/counter.py +++ b/tiny/rna/counter/counter.py @@ -178,23 +178,21 @@ def load_gff_files(paths: PathsFile, libraries: List[dict], rules: List[dict]) - # Build dictionary of unique files and allowed aliases for gff in paths['gff_files']: - gff_files[gff['path']].extend( - alias for alias in gff.get('alias', ()) - if alias.lower() not in ignore_alias - ) + if gff['path'] is not None: + gff_files[gff['path']].extend( + alias for alias in gff.get('alias', ()) + if alias.lower() not in ignore_alias + ) # Remove duplicate aliases (per file), keep original order for file, alias in gff_files.items(): gff_files[file] = sorted(set(alias), key=alias.index) - # Prepare supporting file inputs for GFF validation - sam_files = [lib['File'] for lib in libraries] + if gff_files: + # Prepare supporting file inputs for GFF validation + sam_files = [lib['File'] for lib in libraries] + GFFValidator(gff_files, rules, alignments=sam_files).validate() - # Todo: update GFFValidator so that genomes and ebwt can be safely passed here - # ref_genomes = [map_path(fasta) for fasta in paths['reference_genome_files']] - # ebwt_prefix = map_path(paths['ebwt']) - - GFFValidator(gff_files, rules, alignments=sam_files).validate() return gff_files @@ -235,7 +233,7 @@ def main(): # global for multiprocessing global counter - counter = FeatureCounter(gff_files, selection, **args) + counter = FeatureCounter(gff_files, libraries, selection, **args) # Assign and count features using multiprocessing and merge results merged_counts = map_and_reduce(libraries, paths, args) From 46ffd44af336f0a846dc7b1cd365f02e217dfb0f Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 27 Jan 2023 14:05:47 -0800 Subject: [PATCH 04/21] Updating SAM_reader to properly store header data when multiple header lines start with the same flag --- tiny/rna/counter/hts_parsing.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index 031f6e7b..39341791 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -38,7 +38,7 @@ def __init__(self, **prefs): self._decollapsed_filename = None self._decollapsed_reads = [] self._header_lines = [] - self._header_dict = {} + self._header_dict = defaultdict(list) def bundle_multi_alignments(self, file: str) -> Iterator[Bundle]: """Bundles multi-alignments (determined by a shared QNAME) and reports the associated read's count""" @@ -142,10 +142,11 @@ def _parse_header_line(self, header_line: str) -> None: fields = header_line[3:].strip().split('\t') if rec_type == "@CO": - self._header_dict[rec_type] = fields[0] + self._header_dict[rec_type].append(fields[0]) else: - self._header_dict[rec_type] = \ + self._header_dict[rec_type].append( {field[:2]: field[3:].strip() for field in fields} + ) def _determine_collapser_type(self, first_aln_line: str) -> None: """Attempts to determine the collapsing utility that was used before producing the @@ -157,7 +158,7 @@ def _determine_collapser_type(self, first_aln_line: str) -> None: elif re.match(_re_fastx, first_aln_line) is not None: self.collapser_type = "fastx" - sort_order = self._header_dict.get('@HD', {}).get('SO', None) + sort_order = self._header_dict.get('@HD', [{}])[0].get('SO', None) if sort_order is None or sort_order != "queryname": raise ValueError("SAM files from fastx collapsed outputs must be sorted by queryname\n" "(and the @HD [...] SO header must be set accordingly).") From e7c6c7e89021c3803b8ef06225ac359e70e5b856 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 27 Jan 2023 14:08:33 -0800 Subject: [PATCH 05/21] Refactoring ReferenceTables to use a base class for functionality that is shared between it and the new class for non-genomic references. The overlap between the two is pretty much the GenomicArray/StepVector and related functions --- tiny/rna/counter/hts_parsing.py | 115 +++++++++++++++++--------------- 1 file changed, 62 insertions(+), 53 deletions(-) diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index 39341791..0769a393 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -6,6 +6,7 @@ from collections import Counter, defaultdict from typing import Tuple, List, Dict, Iterator, Optional, DefaultDict, Set, Union, IO, Callable +from abc import ABC, abstractmethod from urllib.parse import unquote from inspect import stack @@ -409,6 +410,58 @@ def not_implemented(self): raise NotImplementedError(f"CaseInsensitiveAttrs does not support {stack()[1].function}") +class AnnotationParsing(ABC): + def __init__(self, feature_selector, **prefs): + self.stepvector = prefs.get('stepvector', 'HTSeq') + self.selector = feature_selector + self.feats = self._init_genomic_array() + + @abstractmethod + def get(self): pass + + def _init_genomic_array(self): + if self.stepvector == 'Cython': + try: + from tiny.rna.counter.stepvector import StepVector + setattr(HTSeq.StepVector, 'StepVector', StepVector) + return HTSeq.GenomicArray("auto", stranded=False) + except ModuleNotFoundError: + self.stepvector = 'HTSeq' + print("Failed to import Cython StepVector\n" + "Falling back to HTSeq's StepVector", + file=sys.stderr) + + if self.stepvector == 'HTSeq': + return HTSeq.GenomicArrayOfSets("auto", stranded=False) + + def chrom_vector_setdefault(self, chrom): + """Behaves like dict.setdefault() for chrom_vectors. Even though chrom_vectors are + dictionaries themselves, calling setdefault on them will break the GenomicArrayOfSets""" + + if chrom not in self.feats.chrom_vectors: + self.feats.add_chrom(chrom) + + def get_feats_table_size(self) -> int: + """Returns the sum of features across all chromosomes and strands""" + + total_feats = 0 + empty_size = {"Cython": 1, "HTSeq": 3}[self.stepvector] + for chrom in self.feats.chrom_vectors: + for strand in self.feats.chrom_vectors[chrom]: + total_feats += self.feats.chrom_vectors[chrom][strand].array.num_steps() - empty_size + + return total_feats + + @staticmethod + def map_strand(htseq_value: str): + """Maps HTSeq's strand representation (+/-/.) to + tinyRNA's strand representation (True/False/None)""" + + return { + '+': True, + '-': False, + }.get(htseq_value, None) + def parse_gff(file, row_fn: Callable, alias_keys=None): if alias_keys is not None: row_fn = functools.partial(row_fn, alias_keys=alias_keys) @@ -433,8 +486,7 @@ def parse_gff(file, row_fn: Callable, alias_keys=None): TagTable = DefaultDict[str, Set[Tuple[str, str]]] GenomicArray = HTSeq.GenomicArrayOfSets - -class ReferenceTables: +class ReferenceTables(AnnotationParsing): """A GFF parser which builds feature, alias, and class reference tables Discontinuous features are supported, and comma-separated attribute values (GFF3 column 9) @@ -462,17 +514,16 @@ class ReferenceTables: type_filter = [] def __init__(self, gff_files: Dict[str, list], feature_selector, **prefs): + super().__init__(feature_selector, **prefs) self.all_features = prefs.get('all_features', False) - self.stepvector = prefs.get('stepvector', 'HTSeq') - self.selector = feature_selector self.gff_files = gff_files - # ----------------------------------------------------------- Primary Key: - self.feats = self._init_genomic_array() # Root Match ID - self.parents, self.filtered = {}, set() # Original Feature ID - self.intervals = defaultdict(list) # Root Feature ID - self.matches = defaultdict(set) # Root Match ID - self.alias = defaultdict(set) # Root Feature ID - self.tags = defaultdict(set) # Root Feature ID -> Root Match ID + # ----------------------------------------------- Primary Key: + # self.feats # Root Match ID + self.parents, self.filtered = {}, set() # Original Feature ID + self.intervals = defaultdict(list) # Root Feature ID + self.matches = defaultdict(set) # Root Match ID + self.alias = defaultdict(set) # Root Feature ID + self.tags = defaultdict(set) # Root Feature ID -> Root Match ID # Patch the GFF attribute parser to support comma separated attribute value lists setattr(HTSeq.features.GFF_Reader, 'parse_GFF_attribute_string', staticmethod(parse_GFF_attribute_string)) @@ -664,26 +715,6 @@ def _merge_adjacent_subintervals(unmerged_sub_ivs: List[HTSeq.GenomicInterval]) return merged_ivs - def get_feats_table_size(self) -> int: - """Returns the sum of features across all chromosomes and strands""" - - total_feats = 0 - empty_size = {"Cython": 1, "HTSeq": 3}[self.stepvector] - for chrom in self.feats.chrom_vectors: - for strand in self.feats.chrom_vectors[chrom]: - total_feats += self.feats.chrom_vectors[chrom][strand].array.num_steps() - empty_size - - return total_feats - - def map_strand(self, gff_value: str): - """Maps HTSeq's strand representation (+/-/.) to - tinyRNA's strand representation (True/False/None)""" - - return { - '+': True, - '-': False, - }.get(gff_value, None) - def was_matched(self, untagged_id): """Checks if the feature ID previously matched on identity, regardless of whether the matching rule was tagged or untagged.""" @@ -691,13 +722,6 @@ def was_matched(self, untagged_id): # any() will short circuit on first match when provided a generator function return any(tagged_id in self.matches for tagged_id in self.tags.get(untagged_id, ())) - def chrom_vector_setdefault(self, chrom): - """Behaves like dict.setdefault() for chrom_vectors. Even though chrom_vectors are - dictionaries themselves, calling setdefault on them will break the GenomicArrayOfSets""" - - if chrom not in self.feats.chrom_vectors: - self.feats.add_chrom(chrom) - @staticmethod def get_feature_id(row): id_collection = row.attr.get('ID') \ @@ -713,21 +737,6 @@ def get_feature_id(row): return id_collection[0] - def _init_genomic_array(self): - if self.stepvector == 'Cython': - try: - from tiny.rna.counter.stepvector import StepVector - setattr(HTSeq.StepVector, 'StepVector', StepVector) - return HTSeq.GenomicArray("auto", stranded=False) - except ModuleNotFoundError: - self.stepvector = 'HTSeq' - print("Failed to import Cython StepVector\n" - "Falling back to HTSeq's StepVector", - file=sys.stderr) - - if self.stepvector == 'HTSeq': - return HTSeq.GenomicArrayOfSets("auto", stranded=False) - @staticmethod def column_filter_match(row, rule): """Checks if the GFF row passes the inclusive filter(s) From 9841d6e7995131767740472bd83d3b9254289b8e Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 27 Jan 2023 14:15:12 -0800 Subject: [PATCH 06/21] Adding a new class that is essentially the equivalent of ReferenceTables but for non-GFF read counting. It produces a GenomicArray of reference sequences, where there is one "chromosome" for each sequence, and each chromosome contains an entry for the sequence on the + and - strand. Tags and aliases don't apply in this mode, but they are returned nonetheless for compatibility --- tiny/rna/counter/hts_parsing.py | 74 +++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index 0769a393..c89250ae 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -743,3 +743,77 @@ def column_filter_match(row, rule): If both filters are defined then they must both evaluate to true for a match""" return row.source in rule['Filter_s'] and row.type in rule['Filter_t'] + + +class NonGenomicAnnotations(AnnotationParsing): + + def __init__(self, sam_files, feature_selector, **prefs): + super().__init__(feature_selector, **prefs) + self.sam_files = sam_files + self.alias = {} + self.tags = {} + + @report_execution_time("Non-genomic annotations parsing") + def get(self): + ref_seqs = self.get_reference_seq_definitions() + unbuilt_match_tuples = self.get_matches() + + for seq_id, seq_len in ref_seqs.items(): + self.add_reference_seq(seq_id, seq_len, unbuilt_match_tuples) + + # Aliases are irrelevant for non-GFF references + aliases = {seq_id: () for seq_id in self.tags} + return self.feats, aliases, self.tags + + def get_reference_seq_definitions(self) -> Dict[str,int]: + reader = SAM_reader() + for sam in self.sam_files: + with open(sam['File'], 'rb') as f: + reader._read_to_first_aln(f) + + ref_seqs = defaultdict(set) + for sq in reader._header_dict['@SQ']: + seq_id = sq['SN'] + seq_len = int(sq['LN']) + ref_seqs[seq_id].add(seq_len) + + return self.get_seq_len_consensus(ref_seqs) + + @staticmethod + def get_seq_len_consensus(ref_seqs): + consensus = {} + for seq_id, lengths in ref_seqs.items(): + if len(lengths) > 1: + msg = f"Reference sequence identifier {seq_id} " \ + "was given multiple lengths in your SAM files. " \ + "This might indicate that your files were produced " \ + "by alignments that used different indexes." + raise ValueError(msg) + consensus[seq_id] = lengths.pop() + + return consensus + + def get_matches(self): + """Stage 1 selection is skipped in non-genomic counting. + Simply build match_tuples for all rules. These will be used + uniformly in each reference sequence's feature_record_tuple""" + + return [(idx, rule['Hierarchy'], rule['Overlap']) + for idx, rule in sorted( + enumerate(self.selector.rules_table), + key=lambda x: x[1]['Hierarchy'] + )] + + def add_reference_seq(self, seq_id, seq_len, unbuilt_match_tuples): + + # Features are classified in Reference Tables (Stage 1 selection) + # For compatibility, use the seq_id with an empty classifier (index 1) + tagged_id = (seq_id,) + self.tags[seq_id] = {tagged_id} + + for strand in ('+', '-'): + iv = HTSeq.GenomicInterval(seq_id, 0, seq_len, strand) + match_tuples = self.selector.build_interval_selectors(iv, unbuilt_match_tuples.copy()) + tinyrna_strand = self.map_strand(strand) + + self.feats[iv] += (tagged_id, tinyrna_strand, tuple(match_tuples)) \ No newline at end of file From 3f88265b88e0f1f8340d6f619f32573c4b932c8f Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 27 Jan 2023 14:17:22 -0800 Subject: [PATCH 07/21] Updating FeatureCounter to switch between the two reference annotation classes based on the presence of GFF files. Also changing the order in which assign_features and count_reads occur in the class because my preference has changed since it was written --- tiny/rna/counter/features.py | 46 ++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/tiny/rna/counter/features.py b/tiny/rna/counter/features.py index b17c0cdc..73cd540f 100644 --- a/tiny/rna/counter/features.py +++ b/tiny/rna/counter/features.py @@ -4,7 +4,7 @@ from collections import defaultdict from typing import List, Tuple, Set, Dict, Mapping -from tiny.rna.counter.hts_parsing import ReferenceTables, SAM_reader +from tiny.rna.counter.hts_parsing import ReferenceTables, NonGenomicAnnotations, SAM_reader from .statistics import LibraryStats from .matching import * @@ -27,30 +27,20 @@ def __init__(_, features: HTSeq.GenomicArrayOfSets, aliases: dict, classes: dict class FeatureCounter: - def __init__(self, gff_file_set, selection_rules, **prefs): + def __init__(self, gff_file_set, sam_file_set, selection_rules, **prefs): self.stats = LibraryStats(**prefs) self.sam_reader = SAM_reader(**prefs) self.selector = FeatureSelector(selection_rules, self.stats, **prefs) - reference_tables = ReferenceTables(gff_file_set, self.selector, **prefs) - Features(*reference_tables.get()) - self.prefs = prefs + if gff_file_set: + self.mode = "genomic" + annotations = ReferenceTables(gff_file_set, self.selector, **prefs) + else: + self.mode = "non-genomic" + annotations = NonGenomicAnnotations(sam_file_set, self.selector, **prefs) - def assign_features(self, al: dict) -> Tuple[dict, int]: - """Determines features associated with the interval then performs rule-based feature selection""" - - try: - feat_matches = set().union( - *Features.chrom_vectors[al['Chrom']]['.'] # GenomicArrayOfSets -> ChromVector - .array[al['Start']:al['End']] # ChromVector -> StepVector - .get_steps(values_only=True)) # StepVector -> {features} - except KeyError as ke: - self.stats.chrom_misses[ke.args[0]] += 1 - return {}, 0 - - # If features are associated with the alignment interval, perform selection - assignment = self.selector.choose(feat_matches, al) if feat_matches else {} - return assignment, len(feat_matches) + Features(*annotations.get()) + self.prefs = prefs def count_reads(self, library: dict): """Collects statistics on features assigned to each alignment associated with each read""" @@ -71,6 +61,22 @@ def count_reads(self, library: dict): return self.stats + def assign_features(self, al: dict) -> Tuple[dict, int]: + """Determines features associated with the interval then performs rule-based feature selection""" + + try: + feat_matches = set().union( + *Features.chrom_vectors[al['Chrom']]['.'] # GenomicArrayOfSets -> ChromVector + .array[al['Start']:al['End']] # ChromVector -> StepVector + .get_steps(values_only=True)) # StepVector -> {features} + except KeyError as ke: + self.stats.chrom_misses[ke.args[0]] += 1 + return {}, 0 + + # If features are associated with the alignment interval, perform selection + assignment = self.selector.choose(feat_matches, al) if feat_matches else {} + return assignment, len(feat_matches) + class FeatureSelector: """Performs hierarchical selection given a set of candidate features for a locus From f59ef1fc2d56f8602d0e55887650169ef2f2b3ec Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 3 Feb 2023 16:44:04 -0800 Subject: [PATCH 08/21] SAM file headers need some basic validation now that we're relying on them for non-genomic counting. The validating regex pattern was copied from the Aug. '22 version of the SAM v1 specification --- tiny/rna/counter/hts_parsing.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index c89250ae..c9343c32 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -21,7 +21,8 @@ AlignmentDict = Dict[str, Union[str, int, bytes]] Bundle = Tuple[List[AlignmentDict], int] _re_tiny = r"\d+_count=\d+" -_re_fastx = r'seq\d+_x(\d+)' +_re_fastx = r"seq\d+_x(\d+)" +_re_header = re.compile(r"^@(HD|SQ|RG|PG)(\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$|^@CO\t.*") # For Alignment complement = {ord('A'): 'T', ord('T'): 'A', ord('G'): 'C', ord('C'): 'G'} @@ -136,6 +137,7 @@ def _read_to_first_aln(self, file_obj: IO) -> Tuple[bytes, int]: def _parse_header_line(self, header_line: str) -> None: """Parses and saves information from the provided header line""" + self.validate_header_line(header_line) self._header_lines.append(header_line) # Header dict @@ -149,6 +151,13 @@ def _parse_header_line(self, header_line: str) -> None: {field[:2]: field[3:].strip() for field in fields} ) + def validate_header_line(self, line): + if not re.match(_re_header, line): + msg = "Invalid SAM header" + msg += f" in {os.path.basename(self.file)}" if self.file else "" + msg += ':\n\t' + f'"{line}"' + raise ValueError(msg) + def _determine_collapser_type(self, first_aln_line: str) -> None: """Attempts to determine the collapsing utility that was used before producing the input alignment file, then checks basic requirements for the utility's outputs.""" From 15e27952c13757236ed27333b0e827c82bfe8483 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 3 Feb 2023 16:52:30 -0800 Subject: [PATCH 09/21] Restructuring AnnotationParsing classes so that self.selector is assigned during calls to .get(). This would allow us to create groups of GFF files, each with a distinct Stage 1 ruleset, that would be pooled in the same tables. Also changing the approach with validation/parsing of SAM headers for non-genomic counting. Since @SQ headers could be very abundant, I'm reusing the parsing results from validation rather than reparsing. NonGenomicAnnotations therefore is now constructed with a dictionary of {sequence_id: sequence_length} which is produced during successful validation. --- tiny/rna/counter/hts_parsing.py | 63 +++++++++++---------------------- tiny/rna/counter/validation.py | 2 +- 2 files changed, 22 insertions(+), 43 deletions(-) diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index c9343c32..3c5b2ea4 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -420,13 +420,18 @@ def not_implemented(self): class AnnotationParsing(ABC): - def __init__(self, feature_selector, **prefs): + def __init__(self, **prefs): self.stepvector = prefs.get('stepvector', 'HTSeq') - self.selector = feature_selector self.feats = self._init_genomic_array() + # The selector is assigned whenever get() is called. + # While it isn't the current use case, this would allow + # for groups of GFF files to be processed with different + # Stage 1 selection rules and pooled into the same tables + self.selector = None + @abstractmethod - def get(self): pass + def get(self, feature_selector): pass def _init_genomic_array(self): if self.stepvector == 'Cython': @@ -471,6 +476,7 @@ def map_strand(htseq_value: str): '-': False, }.get(htseq_value, None) + def parse_gff(file, row_fn: Callable, alias_keys=None): if alias_keys is not None: row_fn = functools.partial(row_fn, alias_keys=alias_keys) @@ -522,8 +528,8 @@ class ReferenceTables(AnnotationParsing): source_filter = [] type_filter = [] - def __init__(self, gff_files: Dict[str, list], feature_selector, **prefs): - super().__init__(feature_selector, **prefs) + def __init__(self, gff_files: Dict[str, list], **prefs): + super().__init__(**prefs) self.all_features = prefs.get('all_features', False) self.gff_files = gff_files # ----------------------------------------------- Primary Key: @@ -538,9 +544,10 @@ def __init__(self, gff_files: Dict[str, list], feature_selector, **prefs): setattr(HTSeq.features.GFF_Reader, 'parse_GFF_attribute_string', staticmethod(parse_GFF_attribute_string)) @report_execution_time("GFF parsing") - def get(self) -> Tuple[GenomicArray, AliasTable, TagTable]: + def get(self, feature_selector) -> Tuple[GenomicArray, AliasTable, TagTable]: """Initiates GFF parsing and returns complete feature, alias, and tag tables""" + self.selector = feature_selector for file, alias_keys in self.gff_files.items(): parse_gff(file, self.parse_row, alias_keys=alias_keys) @@ -756,52 +763,24 @@ def column_filter_match(row, rule): class NonGenomicAnnotations(AnnotationParsing): - def __init__(self, sam_files, feature_selector, **prefs): - super().__init__(feature_selector, **prefs) - self.sam_files = sam_files + def __init__(self, reference_seqs, **prefs): + super().__init__(**prefs) + self.seqs = reference_seqs self.alias = {} self.tags = {} @report_execution_time("Non-genomic annotations parsing") - def get(self): - ref_seqs = self.get_reference_seq_definitions() - unbuilt_match_tuples = self.get_matches() + def get(self, selector): + self.selector = selector + match_tuples = self.get_matches() - for seq_id, seq_len in ref_seqs.items(): - self.add_reference_seq(seq_id, seq_len, unbuilt_match_tuples) + for seq_id, seq_len in self.seqs.items(): + self.add_reference_seq(seq_id, seq_len, match_tuples) # Aliases are irrelevant for non-GFF references aliases = {seq_id: () for seq_id in self.tags} return self.feats, aliases, self.tags - def get_reference_seq_definitions(self) -> Dict[str,int]: - reader = SAM_reader() - for sam in self.sam_files: - with open(sam['File'], 'rb') as f: - reader._read_to_first_aln(f) - - ref_seqs = defaultdict(set) - for sq in reader._header_dict['@SQ']: - seq_id = sq['SN'] - seq_len = int(sq['LN']) - ref_seqs[seq_id].add(seq_len) - - return self.get_seq_len_consensus(ref_seqs) - - @staticmethod - def get_seq_len_consensus(ref_seqs): - consensus = {} - for seq_id, lengths in ref_seqs.items(): - if len(lengths) > 1: - msg = f"Reference sequence identifier {seq_id} " \ - "was given multiple lengths in your SAM files. " \ - "This might indicate that your files were produced " \ - "by alignments that used different indexes." - raise ValueError(msg) - consensus[seq_id] = lengths.pop() - - return consensus - def get_matches(self): """Stage 1 selection is skipped in non-genomic counting. Simply build match_tuples for all rules. These will be used diff --git a/tiny/rna/counter/validation.py b/tiny/rna/counter/validation.py index ae8c757c..e2825920 100644 --- a/tiny/rna/counter/validation.py +++ b/tiny/rna/counter/validation.py @@ -87,7 +87,7 @@ class GFFValidator: } def __init__(self, gff_files, rules, ebwt=None, genomes=None, alignments=None): - self.ReferenceTables = ReferenceTables(gff_files, None) + self.ReferenceTables = ReferenceTables(gff_files) self.column_filters = self.build_column_filters(rules) self.report = ReportFormatter(self.targets) self.chrom_set = set() From a44743f76201670954503c5632cbcd240a888728 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 3 Feb 2023 17:00:36 -0800 Subject: [PATCH 10/21] Adding SamSqValidator which follows the same design patterns as GFFValidator. @SQ headers are first validated for syntax (error on missing or incomplete @SQ header). Next they are checked for duplicate entries under the same identifier in each file, and for inconsistent length definitions between files. This is intended to catch issues that might arise from standalone use of tiny-count with third party SAM files, which may have been produced by more than one alignment event using different indexes --- tiny/rna/counter/validation.py | 125 ++++++++++++++++++++++++++++++++- 1 file changed, 122 insertions(+), 3 deletions(-) diff --git a/tiny/rna/counter/validation.py b/tiny/rna/counter/validation.py index e2825920..bf83f40a 100644 --- a/tiny/rna/counter/validation.py +++ b/tiny/rna/counter/validation.py @@ -6,9 +6,9 @@ from collections import Counter, defaultdict from typing import List, Dict -from tiny.rna.counter.hts_parsing import parse_gff, ReferenceTables +from tiny.rna.counter.hts_parsing import parse_gff, ReferenceTables, SAM_reader from tiny.rna.counter.features import FeatureSelector -from tiny.rna.util import sorted_natural, gzip_open +from tiny.rna.util import sorted_natural, gzip_open, report_execution_time class ReportFormatter: @@ -286,4 +286,123 @@ def build_column_filters(rules): for selector in ["Filter_s", "Filter_t"] } - return FeatureSelector.build_selectors([selector_defs])[0] \ No newline at end of file + return FeatureSelector.build_selectors([selector_defs])[0] + + +class SamSqValidator: + """Validates @SQ headers for tiny-count's non-genomic counting mode""" + + targets = { + "inter sq": "Sequence identifiers with inconsistent lengths", + "intra sq": "SAM files with repeated sequence identifiers", + "incomplete sq": "SAM files with incomplete @SQ headers", + "missing sq": "SAM files that lack @SQ headers" + } + + def __init__(self, sam_files): + self.report = ReportFormatter(self.targets) + self.sam_files = sam_files + self.reference_seqs = {} + self.sq_headers = {} + + @report_execution_time("Non-genomic annotations validation") + def validate(self): + print("Validating sequence identifiers in SAM files...") + self.read_sq_headers() + self.validate_sq_headers() + self.report.print_report() + if self.report.errors: + sys.exit(1) + + def validate_sq_headers(self): + """Performs validation tests in the required order + It is important to check for and return upon syntax error, otherwise + the remaining tests are likely to raise an exception before the report + has a chance to be printed""" + + # First verify @SQ header syntax + missing = self.get_missing_headers() + incomplete = self.get_incomplete_sq_headers() + if missing or incomplete: + self.generate_header_syntax_report(missing, incomplete) + return + + # Next verify identifier definitions + duplicate = self.get_duplicate_identifiers() + ambiguous = self.get_ambiguous_lengths() + if duplicate or ambiguous: + self.generate_identifier_report(duplicate, ambiguous) + + def get_ambiguous_lengths(self) -> List[str]: + """Returns a list of sequence IDs that have inconsistent length definitions. + Sequences with consistent length definitions are added to self.reference_seqs""" + + seq_lengths = defaultdict(set) + for sam in self.sam_files: + for sq in self.sq_headers[sam]: + seq_id = sq['SN'] + seq_len = int(sq['LN']) + seq_lengths[seq_id].add(seq_len) + + bad_seqs = [] + for seq_id, lengths in seq_lengths.items(): + if len(lengths) == 1: + lengths = lengths.pop() + self.reference_seqs[seq_id] = lengths + else: + bad_seqs.append(seq_id) + + return bad_seqs + + def get_duplicate_identifiers(self) -> Dict[str, List[str]]: + """Returns a dictionary of SAM files that contain duplicate sequence identifiers""" + + bad_files = {} + for file in self.sam_files: + sq = self.sq_headers[file] + id_count = Counter(seq['SN'] for seq in sq) + duplicates = [seq_id for seq_id, count in id_count.items() if count > 1] + if duplicates: bad_files[file] = duplicates + + return bad_files + + def get_incomplete_sq_headers(self) -> List[str]: + """Returns a list of SAM files that have incomplete @SQ headers""" + + return [file for file, sqs in self.sq_headers.items() + if not all("SN" in sq and "LN" in sq for sq in sqs)] + + def get_missing_headers(self) -> List[str]: + """Returns a list of SAM files that lack @SQ headers""" + + return [file for file, sqs in self.sq_headers.items() + if len(sqs) == 0] + + def generate_header_syntax_report(self, missing, incomplete): + report = {} + if missing: + report['missing sq'] = sorted_natural(missing) + if incomplete: + report['incomplete sq'] = sorted_natural(incomplete) + + header = "Every SAM file must have complete @SQ headers with SN and LN\n" \ + "fields when counting in non-genomic mode.\n" + self.report.add_error_section(header, report) + + def generate_identifier_report(self, duplicate, ambiguous): + report = {} + if duplicate: + report['intra sq'] = sorted_natural(duplicate) + if ambiguous: + report['inter sq'] = sorted_natural(ambiguous) + + header = "Sequence identifiers must be unique and have consistent length definitions.\n" + self.report.add_error_section(header, report) + + def read_sq_headers(self): + for file in self.sam_files: + with open(file, 'rb') as f: + reader = SAM_reader() + reader._read_to_first_aln(f) + + self.sq_headers[file] = reader._header_dict.get('@SQ', []) \ No newline at end of file From cbf0ab3fed2457c39928ec8b781aa70ee90dc383 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 3 Feb 2023 17:09:35 -0800 Subject: [PATCH 11/21] Replacing load_gff_files() with load_references(). This is now where the reference parsing object is constructed, and also where references are validated. It just made sense for these to be among the initial operations when running tiny-count. The routine for reorganizing the YAML representation of gff_files is now part of the PathsFile class. I'm happy with how this has cleaned up the code. --- tiny/rna/configuration.py | 71 +++++++++++++++++++++++++++++-------- tiny/rna/counter/counter.py | 51 ++++++++++++-------------- 2 files changed, 79 insertions(+), 43 deletions(-) diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index 88f00a85..61cf919a 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -136,6 +136,22 @@ def from_here(self, destination: Union[str,dict,None], origin: Union[str,dict,No else: return destination + @staticmethod + def is_path_dict(val, empty_ok=False): + return (isinstance(val, dict) and + isinstance(val.get("path"), (str, bytes)) and + (len(val['path']) or empty_ok)) + + @staticmethod + def is_path_str(val, empty_ok=False): + return (isinstance(val, (str, bytes)) and + (len(val) or empty_ok)) + + @classmethod + def is_path(cls, val, empty_ok=False): + return (cls.is_path_dict(val, empty_ok) or + cls.is_path_str(val, empty_ok)) + def setup_step_inputs(self): """For now, only tiny-plot requires additional setup for step inputs This function is called at both startup and resume""" @@ -340,17 +356,15 @@ def get_bt_index_files(self): def validate_inputs(self): """For now, only GFF files are validated here""" - selection_rules = self.process_features_sheet() - gff_files = {gff['path']: [] for gff in self['gff_files']} - ebwt = self.paths['ebwt'] if not self['run_bowtie_build'] else None + gff_files = self.paths.get_gff_config() - GFFValidator( - gff_files, - selection_rules, - ebwt=ebwt, - genomes=self.paths['reference_genome_files'], - alignments=None # Used in tiny-count standalone runs - ).validate() + if gff_files: + GFFValidator( + gff_files, + self.process_features_sheet(), + self.paths['ebwt'] if not self['run_bowtie_build'] else None, + self.paths['reference_genome_files'] + ).validate() def execute_post_run_tasks(self, return_code): if self['run_bowtie_build']: @@ -469,9 +483,9 @@ def from_pipeline(value): """When tiny-count runs as a pipeline step, all file inputs are sourced from the working directory regardless of original path.""" - if isinstance(value, dict) and value.get("path") is not None: + if ConfigBase.is_path_dict(value, empty_ok=True): return dict(value, path=os.path.basename(value['path'])) - elif isinstance(value, (str, bytes)): + elif ConfigBase.is_path_str(value, empty_ok=True): return os.path.basename(value) else: return value @@ -497,7 +511,7 @@ def as_cwl_file_obj(self, key: str): elif key in self.single: return self.cwl_file(val) elif key in self.groups: - return [self.cwl_file(sub) for sub in val if sub] + return [self.cwl_file(sub) for sub in val if self.is_path(sub)] elif key in self.prefix: raise ValueError(f"The parameter {key} isn't meant to be a CWL file object.") else: @@ -526,7 +540,7 @@ def validate_paths(self): for key in self.groups: for entry in self[key]: - if isinstance(entry, dict): entry = entry['path'] + if isinstance(entry, dict): entry = entry.get('path') if not entry: continue assert os.path.isfile(entry), \ "The following file provided under {key} in {selfname} could not be found:\n\t{file}" \ @@ -538,6 +552,35 @@ def check_backward_compatibility(self): "that you are using a Paths File from an earlier version of tinyRNA. Please " \ "check the release notes and update your configuration files." + def get_gff_config(self) -> Dict[str, list]: + """Restructures GFF input info so that it can be more easily handled. + To be clear, the Paths File YAML could be structured to match the desired output, + but the current format was chosen because it's more readable with more forgiving syntax. + + The YAML format is [{"path": "gff_path_1", "alias": [alias1, alias2, ...]}, { ... }, ...] + The output format is {"gff_path_1": [alias1, alias2, ...], ...} + """ + + gff_files = defaultdict(list) + id_filter = lambda alias: alias.lower() != 'id' + + # Build dictionary of files and allowed aliases + for gff in self['gff_files']: + if not self.is_path_dict(gff): continue + path, aliases = gff['path'], gff.get('alias', ()) + gff_files[path].extend(filter(id_filter, aliases)) + + # Remove duplicate aliases per file, keep order + for file, alias in gff_files.items(): + gff_files[file] = sorted(set(alias), key=alias.index) + + if not len(gff_files) and not self.in_pipeline: + print("No GFF files were specified in {selfname} so " + "reads will be counted in non-genomic mode." + .format(selfname=self.basename)) + + return dict(gff_files) + # Override def append_to(self, key: str, val: Any): """Overrides method in the base class. This is necessary due to automatic diff --git a/tiny/rna/counter/counter.py b/tiny/rna/counter/counter.py index 1d4cbe41..6ee45ecc 100644 --- a/tiny/rna/counter/counter.py +++ b/tiny/rna/counter/counter.py @@ -3,17 +3,15 @@ import multiprocessing as mp import traceback import argparse -import shutil import sys import os -from collections import defaultdict -from typing import Tuple, List, Dict -from pkg_resources import resource_filename +from typing import List, Dict -from tiny.rna.counter.validation import GFFValidator +from tiny.rna.counter.validation import GFFValidator, SamSqValidator from tiny.rna.counter.features import Features, FeatureCounter from tiny.rna.counter.statistics import MergedStatsManager +from tiny.rna.counter.hts_parsing import ReferenceTables, NonGenomicAnnotations, AnnotationParsing from tiny.rna.util import report_execution_time, from_here, ReadOnlyDict, get_timestamp, add_transparent_help from tiny.rna.configuration import CSVReader, PathsFile, get_templates @@ -160,40 +158,35 @@ def load_config(features_csv: str, is_pipeline: bool) -> List[dict]: return rules -def load_gff_files(paths: PathsFile, libraries: List[dict], rules: List[dict]) -> Dict[str, list]: - """Retrieves the appropriate file path and alias attributes for each GFF, - then validates +def load_references(paths: PathsFile, libraries: List[dict], rules: List[dict], prefs) -> AnnotationParsing: + """Determines the reference source (GFF or SAM @SQ headers) and constructs the appropriate object Args: - paths: a loaded PathsFile with a populated gff_files parameter + paths: a PathsFile object that optionally contains a `gff_files` configuration libraries: libraries parsed from Samples Sheet, each as a dict with a 'File' key rules: selection rules parsed from Features Sheet + prefs: command line arguments to pass to the AnnotationParsing subclass Returns: - gff: a dict of GFF files with lists of alias attribute keys + references: an AnnotationParsing object, subclassed based on + the presence of GFF files in the Paths File """ - gff_files = defaultdict(list) - ignore_alias = ["id"] - - # Build dictionary of unique files and allowed aliases - for gff in paths['gff_files']: - if gff['path'] is not None: - gff_files[gff['path']].extend( - alias for alias in gff.get('alias', ()) - if alias.lower() not in ignore_alias - ) - - # Remove duplicate aliases (per file), keep original order - for file, alias in gff_files.items(): - gff_files[file] = sorted(set(alias), key=alias.index) + gff_files = paths.get_gff_config() + sam_files = [lib['File'] for lib in libraries] if gff_files: - # Prepare supporting file inputs for GFF validation - sam_files = [lib['File'] for lib in libraries] GFFValidator(gff_files, rules, alignments=sam_files).validate() + references = ReferenceTables(gff_files, **prefs) + else: + sq_validator = SamSqValidator(sam_files) + + # Reuse sq_validator's parsing results to save time + sq_validator.validate() + sequences = sq_validator.reference_seqs + references = NonGenomicAnnotations(sequences, **prefs) - return gff_files + return references @report_execution_time("Counting and merging") @@ -229,11 +222,11 @@ def main(): paths = PathsFile(args['paths_file'], is_pipeline) libraries = load_samples(paths['samples_csv'], is_pipeline) selection = load_config(paths['features_csv'], is_pipeline) - gff_files = load_gff_files(paths, libraries, selection) + reference = load_references(paths, libraries, selection, args) # global for multiprocessing global counter - counter = FeatureCounter(gff_files, libraries, selection, **args) + counter = FeatureCounter(reference, selection, **args) # Assign and count features using multiprocessing and merge results merged_counts = map_and_reduce(libraries, paths, args) From e8f417355207c795c887b6645beccde866fbfd23 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 3 Feb 2023 17:16:44 -0800 Subject: [PATCH 12/21] Updating the FeatureCounter constructor to use the new AnnotationParsing approach --- tiny/rna/counter/features.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tiny/rna/counter/features.py b/tiny/rna/counter/features.py index 73cd540f..d85d02a3 100644 --- a/tiny/rna/counter/features.py +++ b/tiny/rna/counter/features.py @@ -4,7 +4,7 @@ from collections import defaultdict from typing import List, Tuple, Set, Dict, Mapping -from tiny.rna.counter.hts_parsing import ReferenceTables, NonGenomicAnnotations, SAM_reader +from tiny.rna.counter.hts_parsing import SAM_reader, ReferenceTables, NonGenomicAnnotations from .statistics import LibraryStats from .matching import * @@ -27,19 +27,19 @@ def __init__(_, features: HTSeq.GenomicArrayOfSets, aliases: dict, classes: dict class FeatureCounter: - def __init__(self, gff_file_set, sam_file_set, selection_rules, **prefs): + def __init__(self, references, selection_rules, **prefs): self.stats = LibraryStats(**prefs) self.sam_reader = SAM_reader(**prefs) self.selector = FeatureSelector(selection_rules, self.stats, **prefs) - if gff_file_set: + if isinstance(references, ReferenceTables): self.mode = "genomic" - annotations = ReferenceTables(gff_file_set, self.selector, **prefs) - else: + elif isinstance(references, NonGenomicAnnotations): self.mode = "non-genomic" - annotations = NonGenomicAnnotations(sam_file_set, self.selector, **prefs) + else: + raise TypeError("Expected ReferenceTables or NonGenomicAnnotations, got %s" % type(references)) - Features(*annotations.get()) + Features(*references.get(self.selector)) self.prefs = prefs def count_reads(self, library: dict): From 8e15f705db0f44deb9c655ab8907b91711e63b18 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 3 Feb 2023 17:30:09 -0800 Subject: [PATCH 13/21] Misc. cleanup in configuration.py and resume.py: - Moving paths_config/paths_file bookkeeping out of ConfigBase. This is used by Configuration and Resume* classes, but not by PathsFile which also subclasses it. It is also better documented and consolidated in one function to make it clearer - Streamlining path object type checking with is_path_dict(), is_path_str(), and is_path() - The notice about "no GFF files provided" has been moved to get_gff_config() - Comment improvements --- tiny/rna/configuration.py | 54 +++++++++++++++++++-------------------- tiny/rna/resume.py | 8 +++--- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index 61cf919a..08eac45c 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -5,11 +5,10 @@ import sys import csv import os -import re from pkg_resources import resource_filename from collections import Counter, OrderedDict, defaultdict -from typing import Union, Any, Optional, List +from typing import Union, Any, List, Dict from glob import glob from tiny.rna.counter.validation import GFFValidator @@ -57,7 +56,10 @@ def set(self, key: str, val: Union[str, list, dict, bool]) -> Union[str, list, d return val def set_if_not(self, key: str, val: Union[str, list, dict, bool]) -> Any: - """Apply the setting if it has not been previously set""" + """Apply the setting if it has not been previously set. This differs from + dict.setdefault() in that existing keys are overwritten if the associated + value is false in boolean context (e.g. None, False, empty container, etc.)""" + if not self[key]: self[key] = val return val @@ -125,13 +127,9 @@ def from_here(self, destination: Union[str,dict,None], origin: Union[str,dict,No if isinstance(origin, dict): origin = origin.get('path') if origin is None: origin = self.dir - if ( - isinstance(destination, dict) and - isinstance(destination.get("path"), (str, bytes)) and - len(destination['path']) - ): + if self.is_path_dict(destination): return dict(destination, path=self.joinpath(origin, destination["path"])) - elif isinstance(destination, (str, bytes)) and bool(destination): + elif self.is_path_str(destination): return self.joinpath(origin, destination) else: return destination @@ -193,11 +191,6 @@ def write_processed_config(self, filename: str = None) -> str: if filename is None: filename = self.get_outfile_path(self.inf) with open(filename, 'w') as outconf: - if 'paths_config' in self and not os.path.isabs(self['paths_config']): - # Processed config will be written to the Run Directory - # Ensure paths_config is an absolute path so it remains valid - self['paths_config'] = self.from_here(self['paths_config']) - self.yaml.dump(self.config, outconf) return filename @@ -224,7 +217,7 @@ def __init__(self, config_file: str, validate_inputs=False): super().__init__(config_file) self.paths = self.load_paths_config() - self.assimilate_paths_file() + self.absorb_paths_file() self.setup_pipeline() self.setup_file_groups() @@ -235,18 +228,24 @@ def __init__(self, config_file: str, validate_inputs=False): if validate_inputs: self.validate_inputs() def load_paths_config(self): - """Constructs a sub-configuration object containing workflow file preferences - self['paths_config'] is the user-facing file path (just the path string) - self['paths_file'] is a CWL file object used as a workflow input.""" - paths_file_path = self.from_here(self['paths_config']) - return PathsFile(paths_file_path) + """Returns a PathsFile object and updates keys related to the Paths File path""" + + # paths_config: user-specified + # Resolve the absolute path so that it remains valid when + # the processed Run Config is copied to the Run Directory + self['paths_config'] = self.from_here(self['paths_config']) + + # paths_file: automatically generated + # CWL file dictionary is used as a workflow input + self['paths_file'] = self.cwl_file(self['paths_config']) + + return PathsFile(self['paths_config']) - def assimilate_paths_file(self): + def absorb_paths_file(self): for key in [*PathsFile.single, *PathsFile.groups]: self[key] = self.paths.as_cwl_file_obj(key) for key in PathsFile.prefix: self[key] = self.paths[key] - self['paths_file'] = self.cwl_file(self.paths.inf) def process_samples_sheet(self): samples_sheet_path = self.paths['samples_csv'] @@ -460,6 +459,12 @@ class PathsFile(ConfigBase): - Lookups that return list values do not return the original object; don't append to them. Instead, use the append_to() helper function. - Chained assignments can produce unexpected results. + + Args: + file: The path to the Paths File, as a string + in_pipeline: True only when utilized by a step in the workflow, + in which case input files are sourced from the working directory + regardless of the path indicated in the Paths File """ # Parameter types @@ -522,11 +527,6 @@ def validate_paths(self): "The following parameters are required in {selfname}: {params}" \ .format(selfname=self.basename, params=', '.join(self.required)) - if not any(gff.get('path') for gff in self['gff_files']): - print("No GFF files were specified in {selfname} so " - "reads will be counted in non-genomic mode." - .format(selfname=self.basename)) - # Some entries in Paths File are omitted from tiny-count's working directory during # pipeline runs. There is no advantage to checking file existence here vs. in load_* if self.in_pipeline: return diff --git a/tiny/rna/resume.py b/tiny/rna/resume.py index e3cd7840..8adcf083 100644 --- a/tiny/rna/resume.py +++ b/tiny/rna/resume.py @@ -91,9 +91,11 @@ def _create_truncated_workflow(self): wf_steps[self.steps[0]]['in'][param] = new_input['var'] def load_paths_config(self): - """Constructs a sub-configuration object containing workflow file preferences""" - paths_file_path = self.from_here(self['paths_config']) - return PathsFile(paths_file_path) + """Returns a PathsFile object and updates keys related to the Paths File path""" + + self['paths_config'] = self.from_here(self['paths_config']) + self['paths_file'] = self.cwl_file(self['paths_config']) + return PathsFile(self['paths_config']) def assimilate_paths_file(self): """Updates the processed workflow with resume-safe Paths File parameters""" From 70783b2b34b0e28b339c1dec242201e2ddf0345b Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 3 Feb 2023 17:30:54 -0800 Subject: [PATCH 14/21] Misc. cleanup in validation.py: empty results are more clearly indicated when printing reports --- tiny/rna/counter/validation.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tiny/rna/counter/validation.py b/tiny/rna/counter/validation.py index bf83f40a..714d6f99 100644 --- a/tiny/rna/counter/validation.py +++ b/tiny/rna/counter/validation.py @@ -61,14 +61,19 @@ def recursive_indent(self, mapping: dict, indent: str): lines = [] for key, val in mapping.items(): - if not val: continue key_header = f"{indent}{self.key_mapper.get(key, key)}: " if isinstance(val, dict): lines.append(key_header) - lines.extend(self.recursive_indent(val, indent + '\t')) + if len(val): + lines.extend(self.recursive_indent(val, indent + '\t')) + else: + lines.append(indent + "\t" + "{ NONE }") elif isinstance(val, (list, set)): lines.append(key_header) - lines.extend([indent + '\t' + line for line in map(str, val)]) + if len(val): + lines.extend([indent + '\t' + line for line in map(str, val)]) + else: + lines.append(indent + "\t" + "[ NONE ]") else: lines.append(key_header + str(val)) return lines From ddf3bfeba096c3ad69e5fe9ec07c7b1ef7bba42e Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 3 Feb 2023 17:33:57 -0800 Subject: [PATCH 15/21] Updating unit tests --- tests/unit_tests_configuration.py | 58 ++++++++++++++++++++++++++----- tests/unit_tests_counter.py | 34 ------------------ tests/unit_tests_hts_parsing.py | 46 ++++++++++++------------ 3 files changed, 73 insertions(+), 65 deletions(-) diff --git a/tests/unit_tests_configuration.py b/tests/unit_tests_configuration.py index 5eed377e..70c6fd37 100644 --- a/tests/unit_tests_configuration.py +++ b/tests/unit_tests_configuration.py @@ -286,18 +286,29 @@ def test_validate_required_parameters(self): config[key] = oldval - """Does PathsFile check that at least one GFF file has been provided?""" + """Does PathsFile notify the user of the change in counting style + when no GFF files have been provided?""" - def test_validate_gff_files(self): + def test_no_gff_files(self): config = make_paths_file() + stdout = io.StringIO() - with self.assertRaisesRegex(AssertionError, r".*(At least one GFF).*"): - config['gff_files'] = [{'path': "", 'alias': []}] - config.validate_paths() + gff_configs = [ + [{'path': "", 'alias': []}], + [{'irrelevant': "value"}], + None, + [] + ] - with self.assertRaisesRegex(AssertionError, r".*(At least one GFF).*"): - config['gff_files'] = [{'irrelevant': "value"}] - config.validate_paths() + with contextlib.redirect_stdout(stdout): + for gff_config in gff_configs: + config['gff_files'] = gff_config + config.validate_paths() + config_return = config.get_gff_config() + + self.assertRegex(stdout.getvalue(), r".*(No GFF files).*") + self.assertEqual(config_return, {}) + stdout.truncate(0) """Does PathsFile check for missing files for single entry parameters?""" @@ -350,6 +361,37 @@ def test_pipeline_mapping(self): self.assertEqual(config['empty_path'], "") self.assertEqual(config['none_path'], None) + """Does get_gff_config properly screen for "ID" alias attributes?""" + + def test_get_gff_config_id_alias_attr(self): + config = make_paths_file() + config['gff_files'] = [{'path': "/irrelevant", 'alias': ['ID']}] + + gff_files = config.get_gff_config() + expected = {"/irrelevant": []} + + self.assertDictEqual(gff_files, expected) + + """Does get_gff_config merge aliases if the same GFF file is listed more than once? + Are duplicates also removed and original order preserved?""" + + def test_get_gff_config_merge_alias_attr(self): + config = make_paths_file() + config['gff_files'] = [ + {'path': "/same/path", 'alias': ['attr1', 'attr2']}, + {'path': "/same/path", 'alias': ['attr1', 'attrN']}, + {'path': "/different/path", 'alias': ['attr3']} + ] + + gff_files = config.get_gff_config() + + expected = { + "/same/path": ['attr1', 'attr2', 'attrN'], + "/different/path": ['attr3'] + } + + self.assertDictEqual(gff_files, expected) + class ConfigurationTest(unittest.TestCase): diff --git a/tests/unit_tests_counter.py b/tests/unit_tests_counter.py index 0110ab50..20d9107c 100644 --- a/tests/unit_tests_counter.py +++ b/tests/unit_tests_counter.py @@ -231,40 +231,6 @@ def test_load_config_rna_to_cDNA(self): self.assertEqual(ruleset[0]['nt5end'], 'T') - """Does load_gff_files properly screen for "ID" alias attributes?""" - - def test_load_gff_files_id_alias_attr(self): - config = helpers.make_paths_file() - config['gff_files'] = [{'path': "/irrelevant", 'alias': ['ID']}] - - # Patch GFFValidator so that it isn't called (not relevant) - with patch('tiny.rna.counter.counter.GFFValidator'): - gff_files = counter.load_gff_files(config, [], {}) - - expected = {"/irrelevant": []} - self.assertDictEqual(gff_files, expected) - - """Does load_gff_files merge aliases if the same GFF file is listed more than once? - Are duplicates also removed and original order preserved?""" - - def test_load_gff_files_merge_alias_attr(self): - config = helpers.make_paths_file() - config['gff_files'] = [ - {'path': "/same/path", 'alias': ['attr1', 'attr2']}, - {'path': "/same/path", 'alias': ['attr1', 'attrN']}, - {'path': "/different/path", 'alias': ['attr3']} - ] - - # Patch GFFValidator so that it isn't called (not relevant) - with patch('tiny.rna.counter.counter.GFFValidator'): - gff_files = counter.load_gff_files(config, [], {}) - - expected = { - "/same/path": ['attr1', 'attr2', 'attrN'], - "/different/path": ['attr3'] - } - - self.assertDictEqual(gff_files, expected) if __name__ == '__main__': unittest.main() diff --git a/tests/unit_tests_hts_parsing.py b/tests/unit_tests_hts_parsing.py index 9a19d6f8..f5aa9966 100644 --- a/tests/unit_tests_hts_parsing.py +++ b/tests/unit_tests_hts_parsing.py @@ -145,7 +145,7 @@ def test_ref_tables_single_feature(self): iv = HTSeq.GenomicInterval("I", 3746, 3909, "-") kwargs = {'all_features': True} - feats, alias, tags = ReferenceTables(feature_source, feature_selector, **kwargs).get() + feats, alias, tags = ReferenceTables(feature_source, **kwargs).get(feature_selector) steps = list(feats[iv].array[iv.start:iv.end].get_steps(values_only=True)) self.assertEqual((type(feats), type(alias), type(tags)), (HTSeq.GenomicArrayOfSets, dict, defaultdict)) @@ -169,7 +169,7 @@ def test_ref_tables_single_feature_all_features_false(self): iv = HTSeq.GenomicInterval("I", 3746, 3909, "-") kwargs = {'all_features': True} - feats, alias, tags = ReferenceTables(feature_source, feature_selector, **kwargs).get() + feats, alias, tags = ReferenceTables(feature_source, **kwargs).get(feature_selector) steps = list(feats[iv].array[iv.start:iv.end].get_steps(values_only=True)) self.assertEqual((type(feats), type(alias), type(tags)), (HTSeq.GenomicArrayOfSets, dict, defaultdict)) @@ -192,7 +192,7 @@ def test_ref_tables_missing_name_attribute_all_features_false(self): # ReferenceTables.get() skips the steps for recording the feature's alias. # Instead, a different exception is raised due to reference tables being empty with self.assertRaisesRegex(ValueError, expected_err): - ReferenceTables(feature_source, feature_selector, **kwargs).get() + ReferenceTables(feature_source, **kwargs).get(feature_selector) """Does ReferenceTables.get() raise ValueError when a feature lacks an ID attribute?""" @@ -209,7 +209,7 @@ def test_ref_tables_missing_id_attribute(self): with patch('tiny.rna.counter.hts_parsing.HTSeq.utils.open', new=mock_reader): with self.assertRaisesRegex(ValueError, expected_err): - _ = ReferenceTables(feature_source, feature_selector, **kwargs).get() + _ = ReferenceTables(feature_source, **kwargs).get(feature_selector) """Does ReferenceTables.get() properly concatenate aliases if there is more than one alias for a feature?""" """Does ReferenceTables.get() properly concatenate aliases when Name Attribute refers to a list-type alias?""" @@ -221,7 +221,7 @@ def test_ref_tables_alias_multisource_concat(self): # Notice: screening for "ID" name attribute happens earlier in counter.load_config() expected_alias = {"Gene:WBGene00023193": ("additional_class", "Gene:WBGene00023193", "unknown")} - _, alias, _ = ReferenceTables(feature_source, MockFeatureSelector([]), **kwargs).get() + _, alias, _ = ReferenceTables(feature_source, **kwargs).get(MockFeatureSelector([])) self.assertDictEqual(alias, expected_alias) @@ -236,7 +236,7 @@ def test_ref_tables_alias_multisource_concat_all_features_false(self): with self.assertRaisesRegex(ValueError, expected_err): # No aliases saved due to all_features=False and the lack of identity matches - _, alias, _ = ReferenceTables(feature_source, MockFeatureSelector([]), **kwargs).get() + _, alias, _ = ReferenceTables(feature_source, **kwargs).get(MockFeatureSelector([])) """Does ReferenceTables.get() properly concatenate identity match tuples when multiple GFF files define matches for a feature?""" @@ -262,7 +262,7 @@ def test_ref_tables_identity_matches_multisource_concat(self): set() ] - feats, _, _ = ReferenceTables(feature_source, feature_selector, **kwargs).get() + feats, _, _ = ReferenceTables(feature_source, **kwargs).get(feature_selector) actual_matches = list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True)) self.assertListEqual(actual_matches, expected_matches) @@ -274,7 +274,7 @@ def test_ref_tables_discontinuous_aliases(self): feature_source = {f"{resources}/discontinuous.gff3": ["Name"]} mock_selector = self.selector_with_template(helpers.rules_template) - _, alias, _ = ReferenceTables(feature_source, mock_selector, **kwargs).get() + _, alias, _ = ReferenceTables(feature_source, **kwargs).get(mock_selector) # Ancestor depth of 1, distinct aliases self.assertEqual(alias['Parent2'], ('Child2Name', 'Parent2Name')) @@ -294,7 +294,7 @@ def test_ref_tables_discontinuous_no_match_all_features_false(self): "This may be due to a lack of features matching 'Select for...with value...'" with self.assertRaisesRegex(ValueError, expected_err): - ReferenceTables(feature_source, mock_selector, **kwargs).get() + ReferenceTables(feature_source, **kwargs).get(mock_selector) """Does ReferenceTables.get() properly build a GenomicArrayOfSets for discontinuous features where no features match but all_features is True? If the feature didn't match we only want to @@ -311,7 +311,7 @@ def test_ref_tables_discontinuous_features(self): # EVEN if all_features = True. expected = [set()] - feats, _, _ = ReferenceTables(feature_source, feature_selector, **kwargs).get() + feats, _, _ = ReferenceTables(feature_source, **kwargs).get(feature_selector) actual = list(feats.chrom_vectors["I"]["."].array.get_steps(values_only=True)) self.assertListEqual(actual, expected) @@ -331,8 +331,8 @@ def test_ref_tables_discontinuous_intervals(self): sib_2 = HTSeq.GenomicInterval('I', 110, 120, '-') sib_3 = HTSeq.GenomicInterval('I', 139, 150, '-') - RT_instance = ReferenceTables(feature_source, feature_selector, **kwargs) - _ = RT_instance.get() + RT_instance = ReferenceTables(feature_source, **kwargs) + _ = RT_instance.get(feature_selector) # Ancestor depth of 1 self.assertEqual(RT_instance.intervals['GrandParent'], [grandparent_iv, parent_w_p_iv, child_w_gp_iv]) @@ -381,7 +381,7 @@ def test_ref_tables_discontinuous_identity_matches(self): {(Sibling, False, (rule3_sib['139:150'], rule2_sib['139:150']))}, set()] - feats, _, _ = ReferenceTables(feature_source, feature_selector, **rt_kwargs).get() + feats, _, _ = ReferenceTables(feature_source, **rt_kwargs).get(feature_selector) actual_steps = list(feats.chrom_vectors["I"]["."].array.get_steps(values_only=True)) self.assertListEqual(actual_steps, expected) @@ -401,8 +401,8 @@ def test_ref_tables_source_filter(self): exp_filtered = {"GrandParent", "ParentWithGrandparent", "Parent2", "Child1", "Sibling"} exp_parents = {'ParentWithGrandparent': 'GrandParent', 'Child1': 'ParentWithGrandparent', 'Child2': 'Parent2'} - rt = ReferenceTables(feature_source, feature_selector, **kwargs) - feats, alias, _ = rt.get() + rt = ReferenceTables(feature_source, **kwargs) + feats, alias, _ = rt.get(feature_selector) self.assertEqual(alias, exp_alias) self.assertEqual(rt.parents, exp_parents) @@ -426,8 +426,8 @@ def test_ref_tables_type_filter(self): exp_filtered = {"GrandParent", "ParentWithGrandparent", "Parent2", "Child2", "Sibling"} exp_parents = {'ParentWithGrandparent': 'GrandParent', 'Child1': 'ParentWithGrandparent', 'Child2': 'Parent2'} - rt = ReferenceTables(feature_source, feature_selector, **kwargs) - feats, alias, _ = rt.get() + rt = ReferenceTables(feature_source, **kwargs) + feats, alias, _ = rt.get(feature_selector) self.assertEqual(alias, exp_alias) self.assertEqual(rt.intervals, exp_intervals) @@ -443,8 +443,8 @@ def test_ref_tables_both_filter(self): feature_source = {f"{resources}/discontinuous.gff3": ["Name"]} feature_selector = self.selector_with_template([{'Filter_s': "SourceName", 'Filter_t': "gene"}]) - rt = ReferenceTables(feature_source, feature_selector, **kwargs) - feats, alias, _ = rt.get() + rt = ReferenceTables(feature_source, **kwargs) + feats, alias, _ = rt.get(feature_selector) self.assertEqual(rt.filtered, {'Child1', 'Child2'}) self.assertEqual(rt.parents, {'ParentWithGrandparent': 'GrandParent', 'Child1': 'ParentWithGrandparent', 'Child2': 'Parent2'}) @@ -473,7 +473,7 @@ def test_ref_tables_tagged_match_single(self): set() ] - feats, aliases, tags = ReferenceTables(feature_source, feature_selector, **kwargs).get() + feats, aliases, tags = ReferenceTables(feature_source, **kwargs).get(feature_selector) actual_feats = list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True)) self.assertListEqual(actual_feats, expected_feats) @@ -509,7 +509,7 @@ def test_ref_tables_tagged_match_merging(self): set() ] - feats, aliases, tags = ReferenceTables(feature_source, feature_selector).get() + feats, aliases, tags = ReferenceTables(feature_source).get(feature_selector) stepvec = list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True)) self.assertListEqual(stepvec, expected_feats) @@ -762,11 +762,11 @@ def test_gff_megazord(self): files = {gff.format(ftype): [] for gff in self.genomes.values() for ftype in ('gff3', 'gtf')} fs = FeatureSelector(rules, LibraryStats()) - rt = ReferenceTables(files, fs) + rt = ReferenceTables(files) # The test is passed if this command # completes without throwing errors. - rt.get() + rt.get(fs) if __name__ == '__main__': unittest.main() From cb1107696eecc30363df0539b4c2d160787e4454 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 3 Feb 2023 17:34:30 -0800 Subject: [PATCH 16/21] Adding unit tests for SAM @SQ header validation --- tests/unit_tests_validation.py | 116 ++++++++++++++++++++++++++++++++- 1 file changed, 114 insertions(+), 2 deletions(-) diff --git a/tests/unit_tests_validation.py b/tests/unit_tests_validation.py index a692c216..79226116 100644 --- a/tests/unit_tests_validation.py +++ b/tests/unit_tests_validation.py @@ -6,10 +6,10 @@ from glob import glob from unittest.mock import patch, mock_open -from rna.counter.validation import GFFValidator, ReportFormatter +from rna.counter.validation import GFFValidator, ReportFormatter, SamSqValidator -class ValidationTests(unittest.TestCase): +class GFFValidatorTest(unittest.TestCase): """======== Helper functions =========================== """ @@ -295,6 +295,118 @@ def test_chrom_heuristics_runtime(self): self.assertLessEqual(end-start, 2) +class SamSqValidatorTest(unittest.TestCase): + @classmethod + def setUpClass(self): + self.syntax_header = "Every SAM file must have complete @SQ headers with SN and LN\n" \ + "fields when counting in non-genomic mode.\n" + + self.identifier_header = "Sequence identifiers must be unique and have consistent length definitions.\n" + + def make_sam_sq_header(self, chroms): + return '\n'.join( + [f"@SQ\tSN:{chrom[0]}\tLN:{chrom[1]}" + for chrom in chroms] + ) + + def make_parsed_sq_header(self, chroms): + return [{'SN': chrom[0], 'LN': chrom[1]} if chrom[0] and chrom[1] else + {'SN': chrom[0]} if not chrom[1] else + {'LN': chrom[1]} + for chrom in chroms] + + """Does SamSqValidator correctly identify missing @SQ headers?""" + + def test_missing_sq_headers(self): + mock_files = ['sam1', 'sam2'] + mock_sam_headers = { + mock_files[0]: self.make_parsed_sq_header([('chr1', 100), ('chr2', 200)]), + mock_files[1]: [] + } + + expected = '\n'.join([ + self.syntax_header, + "\t" + f"{SamSqValidator.targets['missing sq']}: ", + "\t\t" + mock_files[1] + ]) + + validator = SamSqValidator(mock_files) + validator.sq_headers = mock_sam_headers + validator.validate_sq_headers() + + self.assertListEqual(validator.report.errors, [expected]) + self.assertListEqual(validator.report.warnings, []) + + """Does SamSqValidator correctly identify incomplete @SQ headers?""" + + def test_incomplete_sq_headers(self): + mock_files = ['sam1', 'sam2', 'sam3'] + mock_sam_headers = { + mock_files[0]: self.make_parsed_sq_header([('chr1', 100), ('chr2', 200)]), + mock_files[1]: self.make_parsed_sq_header([('chr3', None), ('chr4', 400)]), + mock_files[2]: self.make_parsed_sq_header([('chr5', 500), (None, 600)]) + } + + expected = '\n'.join([ + self.syntax_header, + "\t" + f"{SamSqValidator.targets['incomplete sq']}: ", + "\t\t" + mock_files[1], + "\t\t" + mock_files[2], + ]) + + validator = SamSqValidator(mock_files) + validator.sq_headers = mock_sam_headers + validator.validate_sq_headers() + + self.assertListEqual(validator.report.errors, [expected]) + self.assertListEqual(validator.report.warnings, []) + + """Does SamSqValidator correctly identify duplicate identifiers?""" + + def test_duplicate_identifiers(self): + mock_files = ['sam1', 'sam2', 'sam3'] + mock_sam_headers = { + mock_files[0]: self.make_parsed_sq_header([('chr1', 100), ('chr2', 200)]), + mock_files[1]: self.make_parsed_sq_header([('chr1', 100), ('chr2', 200)]), + mock_files[2]: self.make_parsed_sq_header([('chr1', 100), ('chr1', 100)]) + } + + expected = '\n'.join([ + self.identifier_header, + "\t" + f"{SamSqValidator.targets['intra sq']}: ", + "\t\t" + mock_files[2], + ]) + + validator = SamSqValidator(mock_files) + validator.sq_headers = mock_sam_headers + validator.validate_sq_headers() + + self.assertListEqual(validator.report.errors, [expected]) + self.assertListEqual(validator.report.warnings, []) + + """Does SamSqValidator correctly identify identifiers with inconsistent length definitions?""" + + def test_inconsistent_identifier_length(self): + mock_files = ['sam1', 'sam2', 'sam3'] + mock_sam_headers = { + mock_files[0]: self.make_parsed_sq_header([('chr1', 100), ('chr2', 200)]), + mock_files[1]: self.make_parsed_sq_header([('chr1', 100), ('chr2', 300)]), + mock_files[2]: self.make_parsed_sq_header([('chr1', 200), ('chr2', 100)]) + } + + expected = '\n'.join([ + self.identifier_header, + "\t" + f"{SamSqValidator.targets['inter sq']}: ", + "\t\t" + 'chr1', + "\t\t" + 'chr2', + ]) + + validator = SamSqValidator(mock_files) + validator.sq_headers = mock_sam_headers + validator.validate_sq_headers() + + self.assertListEqual(validator.report.errors, [expected]) + self.assertListEqual(validator.report.warnings, []) if __name__ == '__main__': unittest.main() From a0df42a0ff36058806bf684c03323efb84990410 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 3 Feb 2023 18:04:20 -0800 Subject: [PATCH 17/21] Bugfix: empty classifier fields are parsed from Features Sheet as an empty string. NonGenomicAnnotations has to emulate this. --- tiny/rna/counter/hts_parsing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index 3c5b2ea4..39d1c508 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -796,7 +796,7 @@ def add_reference_seq(self, seq_id, seq_len, unbuilt_match_tuples): # Features are classified in Reference Tables (Stage 1 selection) # For compatibility, use the seq_id with an empty classifier (index 1) - tagged_id = (seq_id,) + tagged_id = (seq_id, '') self.tags[seq_id] = {tagged_id} for strand in ('+', '-'): From 9a4e00e5c62a34441988eaa5a670acefd8afec79 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 3 Feb 2023 19:19:48 -0800 Subject: [PATCH 18/21] Renaming classes so they follow a better pattern: AnnotationParsing -> ReferenceBase ReferenceTables -> ReferenceFeatures NonGenomicAnnotations -> ReferenceSeqs Also made some corrections to comments: - Uses of ReferenceTables that also apply to ReferenceSeqs have been changed to "reference parsers" for the most part - A few spelling mistakes --- tests/unit_tests_hts_parsing.py | 72 ++++++++++++++++----------------- tests/unit_tests_stepvector.py | 2 +- tiny/rna/counter/counter.py | 16 ++++---- tiny/rna/counter/features.py | 12 +++--- tiny/rna/counter/hts_parsing.py | 15 ++++--- tiny/rna/counter/validation.py | 8 ++-- 6 files changed, 64 insertions(+), 61 deletions(-) diff --git a/tests/unit_tests_hts_parsing.py b/tests/unit_tests_hts_parsing.py index e6c54213..74de20d7 100644 --- a/tests/unit_tests_hts_parsing.py +++ b/tests/unit_tests_hts_parsing.py @@ -130,7 +130,7 @@ def test_gff_attr_list_vals(self): self.assertTrue(all([len(val) == 1 for key, val in attr.items() if key != "Class"])) self.assertEqual(len(attr['Class']), 2) - """Does ReferenceTables.get() return the expected features, aliases, and classes for a single record GFF?""" + """Does ReferenceFeatures.get() return the expected features, aliases, and classes for a single record GFF?""" def test_ref_tables_single_feature(self): feature_source = {self.short_gff_file: ["sequence_name"]} @@ -145,7 +145,7 @@ def test_ref_tables_single_feature(self): iv = HTSeq.GenomicInterval("I", 3746, 3909, "-") kwargs = {'all_features': True} - feats, alias, tags = ReferenceTables(feature_source, **kwargs).get(feature_selector) + feats, alias, tags = ReferenceFeatures(feature_source, **kwargs).get(feature_selector) steps = list(feats[iv].array[iv.start:iv.end].get_steps(values_only=True)) self.assertEqual((type(feats), type(alias), type(tags)), (HTSeq.GenomicArrayOfSets, dict, defaultdict)) @@ -169,7 +169,7 @@ def test_ref_tables_single_feature_all_features_false(self): iv = HTSeq.GenomicInterval("I", 3746, 3909, "-") kwargs = {'all_features': True} - feats, alias, tags = ReferenceTables(feature_source, **kwargs).get(feature_selector) + feats, alias, tags = ReferenceFeatures(feature_source, **kwargs).get(feature_selector) steps = list(feats[iv].array[iv.start:iv.end].get_steps(values_only=True)) self.assertEqual((type(feats), type(alias), type(tags)), (HTSeq.GenomicArrayOfSets, dict, defaultdict)) @@ -189,12 +189,12 @@ def test_ref_tables_missing_name_attribute_all_features_false(self): "This may be due to a lack of features matching 'Select for...with value...'" # Since all_features is False and there are no identity matches, the main loop in - # ReferenceTables.get() skips the steps for recording the feature's alias. + # ReferenceFeatures.get() skips the steps for recording the feature's alias. # Instead, a different exception is raised due to reference tables being empty with self.assertRaisesRegex(ValueError, expected_err): - ReferenceTables(feature_source, **kwargs).get(feature_selector) + ReferenceFeatures(feature_source, **kwargs).get(feature_selector) - """Does ReferenceTables.get() raise ValueError when a feature lacks an ID attribute?""" + """Does ReferenceFeatures.get() raise ValueError when a feature lacks an ID attribute?""" def test_ref_tables_missing_id_attribute(self): feature_source = {self.short_gff_file: ["ID"]} @@ -209,10 +209,10 @@ def test_ref_tables_missing_id_attribute(self): with patch('tiny.rna.counter.hts_parsing.HTSeq.utils.open', new=mock_reader): with self.assertRaisesRegex(ValueError, expected_err): - _ = ReferenceTables(feature_source, **kwargs).get(feature_selector) + _ = ReferenceFeatures(feature_source, **kwargs).get(feature_selector) - """Does ReferenceTables.get() properly concatenate aliases if there is more than one alias for a feature?""" - """Does ReferenceTables.get() properly concatenate aliases when Name Attribute refers to a list-type alias?""" + """Does ReferenceFeatures.get() properly concatenate aliases if there is more than one alias for a feature?""" + """Does ReferenceFeatures.get() properly concatenate aliases when Name Attribute refers to a list-type alias?""" # 2 for 1! def test_ref_tables_alias_multisource_concat(self): @@ -221,7 +221,7 @@ def test_ref_tables_alias_multisource_concat(self): # Notice: screening for "ID" name attribute happens earlier in counter.load_config() expected_alias = {"Gene:WBGene00023193": ("additional_class", "Gene:WBGene00023193", "unknown")} - _, alias, _ = ReferenceTables(feature_source, **kwargs).get(MockFeatureSelector([])) + _, alias, _ = ReferenceFeatures(feature_source, **kwargs).get(MockFeatureSelector([])) self.assertDictEqual(alias, expected_alias) @@ -236,9 +236,9 @@ def test_ref_tables_alias_multisource_concat_all_features_false(self): with self.assertRaisesRegex(ValueError, expected_err): # No aliases saved due to all_features=False and the lack of identity matches - _, alias, _ = ReferenceTables(feature_source, **kwargs).get(MockFeatureSelector([])) + _, alias, _ = ReferenceFeatures(feature_source, **kwargs).get(MockFeatureSelector([])) - """Does ReferenceTables.get() properly concatenate identity match tuples when multiple GFF files define + """Does ReferenceFeatures.get() properly concatenate identity match tuples when multiple GFF files define matches for a feature?""" def test_ref_tables_identity_matches_multisource_concat(self): @@ -262,19 +262,19 @@ def test_ref_tables_identity_matches_multisource_concat(self): set() ] - feats, _, _ = ReferenceTables(feature_source, **kwargs).get(feature_selector) + feats, _, _ = ReferenceFeatures(feature_source, **kwargs).get(feature_selector) actual_matches = list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True)) self.assertListEqual(actual_matches, expected_matches) - """Does ReferenceTables.get() properly handle aliases for discontinuous features?""" + """Does ReferenceFeatures.get() properly handle aliases for discontinuous features?""" def test_ref_tables_discontinuous_aliases(self): kwargs = {'all_features': True} feature_source = {f"{resources}/discontinuous.gff3": ["Name"]} mock_selector = self.selector_with_template(helpers.rules_template) - _, alias, _ = ReferenceTables(feature_source, **kwargs).get(mock_selector) + _, alias, _ = ReferenceFeatures(feature_source, **kwargs).get(mock_selector) # Ancestor depth of 1, distinct aliases self.assertEqual(alias['Parent2'], ('Child2Name', 'Parent2Name')) @@ -294,9 +294,9 @@ def test_ref_tables_discontinuous_no_match_all_features_false(self): "This may be due to a lack of features matching 'Select for...with value...'" with self.assertRaisesRegex(ValueError, expected_err): - ReferenceTables(feature_source, **kwargs).get(mock_selector) + ReferenceFeatures(feature_source, **kwargs).get(mock_selector) - """Does ReferenceTables.get() properly build a GenomicArrayOfSets for discontinuous features + """Does ReferenceFeatures.get() properly build a GenomicArrayOfSets for discontinuous features where no features match but all_features is True? If the feature didn't match we only want to display it in the Feature ID column of counts table with 0 counts. We DON'T want it to pop up as a candidate in Stage 2 selection.""" @@ -311,11 +311,11 @@ def test_ref_tables_discontinuous_features(self): # EVEN if all_features = True. expected = [set()] - feats, _, _ = ReferenceTables(feature_source, **kwargs).get(feature_selector) + feats, _, _ = ReferenceFeatures(feature_source, **kwargs).get(feature_selector) actual = list(feats.chrom_vectors["I"]["."].array.get_steps(values_only=True)) self.assertListEqual(actual, expected) - """Does ReferenceTables.get() properly handle intervals for discontinous features?""" + """Does ReferenceFeatures.get() properly handle intervals for discontinous features?""" def test_ref_tables_discontinuous_intervals(self): kwargs = {'all_features': True} @@ -331,7 +331,7 @@ def test_ref_tables_discontinuous_intervals(self): sib_2 = HTSeq.GenomicInterval('I', 110, 120, '-') sib_3 = HTSeq.GenomicInterval('I', 139, 150, '-') - RT_instance = ReferenceTables(feature_source, **kwargs) + RT_instance = ReferenceFeatures(feature_source, **kwargs) _ = RT_instance.get(feature_selector) # Ancestor depth of 1 @@ -341,7 +341,7 @@ def test_ref_tables_discontinuous_intervals(self): # Siblings self.assertEqual(RT_instance.intervals['Sibling'], [sib_1, sib_2, sib_3]) - """Does ReferenceTables.get() properly merge identity matches of discontinuous features with the root feature? + """Does ReferenceFeatures.get() properly merge identity matches of discontinuous features with the root feature? Identity match tuples now also contain the corresponding rule's IntervalSelector, so extra bookkeeping must be performed for intervals in this test.""" @@ -381,11 +381,11 @@ def test_ref_tables_discontinuous_identity_matches(self): {(Sibling, False, (rule3_sib['139:150'], rule2_sib['139:150']))}, set()] - feats, _, _ = ReferenceTables(feature_source, **rt_kwargs).get(feature_selector) + feats, _, _ = ReferenceFeatures(feature_source, **rt_kwargs).get(feature_selector) actual_steps = list(feats.chrom_vectors["I"]["."].array.get_steps(values_only=True)) self.assertListEqual(actual_steps, expected) - """Does ReferenceTables.get() properly handle source filters for discontinuous features?""" + """Does ReferenceFeatures.get() properly handle source filters for discontinuous features?""" def test_ref_tables_source_filter(self): @@ -401,7 +401,7 @@ def test_ref_tables_source_filter(self): exp_filtered = {"GrandParent", "ParentWithGrandparent", "Parent2", "Child1", "Sibling"} exp_parents = {'ParentWithGrandparent': 'GrandParent', 'Child1': 'ParentWithGrandparent', 'Child2': 'Parent2'} - rt = ReferenceTables(feature_source, **kwargs) + rt = ReferenceFeatures(feature_source, **kwargs) feats, alias, _ = rt.get(feature_selector) self.assertEqual(alias, exp_alias) @@ -410,7 +410,7 @@ def test_ref_tables_source_filter(self): self.assertEqual(list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True)), exp_feats) self.assertDictEqual(rt.intervals, exp_intervals) - """Does ReferenceTables.get() properly handle type filters for discontinuous features?""" + """Does ReferenceFeatures.get() properly handle type filters for discontinuous features?""" def test_ref_tables_type_filter(self): @@ -426,7 +426,7 @@ def test_ref_tables_type_filter(self): exp_filtered = {"GrandParent", "ParentWithGrandparent", "Parent2", "Child2", "Sibling"} exp_parents = {'ParentWithGrandparent': 'GrandParent', 'Child1': 'ParentWithGrandparent', 'Child2': 'Parent2'} - rt = ReferenceTables(feature_source, **kwargs) + rt = ReferenceFeatures(feature_source, **kwargs) feats, alias, _ = rt.get(feature_selector) self.assertEqual(alias, exp_alias) @@ -435,7 +435,7 @@ def test_ref_tables_type_filter(self): self.assertEqual(rt.filtered, exp_filtered) self.assertEqual(list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True)), exp_feats) - """Does ReferenceTables.get() properly handle both source and type filters for discontinuous features?""" + """Does ReferenceFeatures.get() properly handle both source and type filters for discontinuous features?""" def test_ref_tables_both_filter(self): @@ -443,7 +443,7 @@ def test_ref_tables_both_filter(self): feature_source = {f"{resources}/discontinuous.gff3": ["Name"]} feature_selector = self.selector_with_template([{'Filter_s': "SourceName", 'Filter_t': "gene"}]) - rt = ReferenceTables(feature_source, **kwargs) + rt = ReferenceFeatures(feature_source, **kwargs) feats, alias, _ = rt.get(feature_selector) self.assertEqual(rt.filtered, {'Child1', 'Child2'}) @@ -451,7 +451,7 @@ def test_ref_tables_both_filter(self): self.assertEqual(list(alias.keys()), ['GrandParent', 'Parent2', 'Sibling']) self.assertEqual(len(list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True))), 9) - """Does ReferenceTables.get() maintain correct records for a single feature matching tagged rules?""" + """Does ReferenceFeatures.get() maintain correct records for a single feature matching tagged rules?""" def test_ref_tables_tagged_match_single(self): kwargs = {'all_features': False} @@ -473,14 +473,14 @@ def test_ref_tables_tagged_match_single(self): set() ] - feats, aliases, tags = ReferenceTables(feature_source, **kwargs).get(feature_selector) + feats, aliases, tags = ReferenceFeatures(feature_source, **kwargs).get(feature_selector) actual_feats = list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True)) self.assertListEqual(actual_feats, expected_feats) self.assertDictEqual(aliases, expected_aliases) self.assertDictEqual(tags, expected_tags) - """Does ReferenceTables.get() correctly merge records for discontinuous features matching multiple tagged rules?""" + """Does ReferenceFeatures.get() correctly merge records for discontinuous features matching multiple tagged rules?""" def test_ref_tables_tagged_match_merging(self): feature_source = {f"{resources}/discontinuous.gff3": ['Name']} @@ -509,7 +509,7 @@ def test_ref_tables_tagged_match_merging(self): set() ] - feats, aliases, tags = ReferenceTables(feature_source).get(feature_selector) + feats, aliases, tags = ReferenceFeatures(feature_source).get(feature_selector) stepvec = list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True)) self.assertListEqual(stepvec, expected_feats) @@ -717,7 +717,7 @@ def test_CaseInsensitiveAttrs_contains_ident_wildcard(self): @unittest.skip("Long-running test, execute manually") class GenomeParsingTests(unittest.TestCase): - """Runs full-scale, unmodified GFF3/GTF genomes for select species through the ReferenceTables class""" + """Runs full-scale, unmodified GFF3/GTF genomes for select species through the ReferenceFeatures class""" @classmethod def setUpClass(self): @@ -753,17 +753,17 @@ def setUpClass(self): for chunk in r.iter_content(chunk_size=16384): f.write(chunk) - """Does ReferenceTables.get() process all genomes without throwing any errors?""" + """Does ReferenceFeatures.get() process all genomes without throwing any errors?""" def test_gff_megazord(self): print("Running GFF Megazord test. This will take a long time...") - # Single rule with all wildcard selectors, but only Identity is actually relevant within ReferenceTables + # Single rule with all wildcard selectors, but only Identity is actually relevant within ReferenceFeatures rules = [{'Identity': ('', ''), 'Class': '', 'Filter_s': '', 'Filter_t': '', 'Hierarchy': 0, 'Overlap': 'partial', 'Strand': '', 'nt5end': '', 'Length': ''}] files = {gff.format(ftype): [] for gff in self.genomes.values() for ftype in ('gff3', 'gtf')} fs = FeatureSelector(rules, LibraryStats()) - rt = ReferenceTables(files) + rt = ReferenceFeatures(files) # The test is passed if this command # completes without throwing errors. diff --git a/tests/unit_tests_stepvector.py b/tests/unit_tests_stepvector.py index c72facb5..0487868f 100644 --- a/tests/unit_tests_stepvector.py +++ b/tests/unit_tests_stepvector.py @@ -10,7 +10,7 @@ class StepVectorTests(unittest.TestCase): def test_genomicarray_with_cython_stepvec(self): # Patch the StepVector reference in the HTSeq module and use a GenomicArray - # instead of a GenomicArrayOfSets, just as we do in ReferenceTables + # instead of a GenomicArrayOfSets, just as we do in reference parsers setattr(HTSeq.StepVector, 'StepVector', StepVector) gas = HTSeq.GenomicArray('auto', stranded=False) diff --git a/tiny/rna/counter/counter.py b/tiny/rna/counter/counter.py index 6ee45ecc..6f402842 100644 --- a/tiny/rna/counter/counter.py +++ b/tiny/rna/counter/counter.py @@ -11,7 +11,7 @@ from tiny.rna.counter.validation import GFFValidator, SamSqValidator from tiny.rna.counter.features import Features, FeatureCounter from tiny.rna.counter.statistics import MergedStatsManager -from tiny.rna.counter.hts_parsing import ReferenceTables, NonGenomicAnnotations, AnnotationParsing +from tiny.rna.counter.hts_parsing import ReferenceFeatures, ReferenceSeqs, ReferenceBase from tiny.rna.util import report_execution_time, from_here, ReadOnlyDict, get_timestamp, add_transparent_help from tiny.rna.configuration import CSVReader, PathsFile, get_templates @@ -132,7 +132,7 @@ def get_library_filename(csv_row_file: str, samples_csv: str) -> str: def load_config(features_csv: str, is_pipeline: bool) -> List[dict]: - """Parses the Features Sheet to provide inputs to FeatureSelector and ReferenceTables + """Parses the Features Sheet to provide inputs to FeatureSelector and reference parsers Args: features_csv: a csv file which defines feature sources and selection rules @@ -150,7 +150,7 @@ def load_config(features_csv: str, is_pipeline: bool) -> List[dict]: rule['nt5end'] = rule['nt5end'].upper().translate({ord('U'): 'T'}) # Convert RNA base to cDNA base rule['Identity'] = (rule.pop('Key'), rule.pop('Value')) # Create identity tuple rule['Hierarchy'] = int(rule['Hierarchy']) # Convert hierarchy to number - rule['Overlap'] = rule['Overlap'].lower() # Built later in ReferenceTables + rule['Overlap'] = rule['Overlap'].lower() # Built later in reference parsers # Duplicate rule entries are not allowed if rule not in rules: rules.append(rule) @@ -158,17 +158,17 @@ def load_config(features_csv: str, is_pipeline: bool) -> List[dict]: return rules -def load_references(paths: PathsFile, libraries: List[dict], rules: List[dict], prefs) -> AnnotationParsing: +def load_references(paths: PathsFile, libraries: List[dict], rules: List[dict], prefs) -> ReferenceBase: """Determines the reference source (GFF or SAM @SQ headers) and constructs the appropriate object Args: paths: a PathsFile object that optionally contains a `gff_files` configuration libraries: libraries parsed from Samples Sheet, each as a dict with a 'File' key rules: selection rules parsed from Features Sheet - prefs: command line arguments to pass to the AnnotationParsing subclass + prefs: command line arguments to pass to the ReferenceBase subclass Returns: - references: an AnnotationParsing object, subclassed based on + references: a ReferenceBase object, subclassed based on the presence of GFF files in the Paths File """ @@ -177,14 +177,14 @@ def load_references(paths: PathsFile, libraries: List[dict], rules: List[dict], if gff_files: GFFValidator(gff_files, rules, alignments=sam_files).validate() - references = ReferenceTables(gff_files, **prefs) + references = ReferenceFeatures(gff_files, **prefs) else: sq_validator = SamSqValidator(sam_files) # Reuse sq_validator's parsing results to save time sq_validator.validate() sequences = sq_validator.reference_seqs - references = NonGenomicAnnotations(sequences, **prefs) + references = ReferenceSeqs(sequences, **prefs) return references diff --git a/tiny/rna/counter/features.py b/tiny/rna/counter/features.py index d85d02a3..42b0d1d4 100644 --- a/tiny/rna/counter/features.py +++ b/tiny/rna/counter/features.py @@ -4,7 +4,7 @@ from collections import defaultdict from typing import List, Tuple, Set, Dict, Mapping -from tiny.rna.counter.hts_parsing import SAM_reader, ReferenceTables, NonGenomicAnnotations +from tiny.rna.counter.hts_parsing import SAM_reader, ReferenceFeatures, ReferenceSeqs from .statistics import LibraryStats from .matching import * @@ -32,12 +32,12 @@ def __init__(self, references, selection_rules, **prefs): self.sam_reader = SAM_reader(**prefs) self.selector = FeatureSelector(selection_rules, self.stats, **prefs) - if isinstance(references, ReferenceTables): + if isinstance(references, ReferenceFeatures): self.mode = "genomic" - elif isinstance(references, NonGenomicAnnotations): + elif isinstance(references, ReferenceSeqs): self.mode = "non-genomic" else: - raise TypeError("Expected ReferenceTables or NonGenomicAnnotations, got %s" % type(references)) + raise TypeError("Expected ReferenceFeatures or ReferenceSeqs, got %s" % type(references)) Features(*references.get(self.selector)) self.prefs = prefs @@ -84,7 +84,7 @@ class FeatureSelector: Two sources of data serve as targets for selection: feature attributes (sourced from input GFF files), and sequence attributes (sourced from input SAM files). All candidate features are assumed to have matched at least one Identity selector, - as determined by hts_parsing.ReferenceTables.get_matches_and_classes() + as determined by hts_parsing.ReferenceFeatures.get_matches_and_classes() The first round of selection was performed during GFF parsing. @@ -205,7 +205,7 @@ def build_interval_selectors(iv: 'HTSeq.GenomicInterval', match_tuples: List[unb Unlike build_selectors() and build_inverted_identities(), this function is not called at construction time. Instead, it is called when finalizing - match-tuples in ReferenceTables. This is because interval selectors are + match-tuples in reference parsers. This is because interval selectors are created for each feature (requiring start/stop/strand to be known) for each of the feature's identity matches (each match-tuple). diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index 39d1c508..797f7f69 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -236,7 +236,7 @@ def infer_strandedness(sam_file: str, intervals: dict) -> str: Args: sam_file: the path of the SAM file to evaluate - intervals: the intervals instance attribute of ReferenceTables, populated by .get() + intervals: the intervals instance attribute of ReferenceFeatures, populated by .get() """ unstranded = HTSeq.GenomicArrayOfSets("auto", stranded=False) @@ -419,7 +419,9 @@ def not_implemented(self): raise NotImplementedError(f"CaseInsensitiveAttrs does not support {stack()[1].function}") -class AnnotationParsing(ABC): +class ReferenceBase(ABC): + """The base class for reference parsers""" + def __init__(self, **prefs): self.stepvector = prefs.get('stepvector', 'HTSeq') self.feats = self._init_genomic_array() @@ -501,7 +503,8 @@ def parse_gff(file, row_fn: Callable, alias_keys=None): TagTable = DefaultDict[str, Set[Tuple[str, str]]] GenomicArray = HTSeq.GenomicArrayOfSets -class ReferenceTables(AnnotationParsing): + +class ReferenceFeatures(ReferenceBase): """A GFF parser which builds feature, alias, and class reference tables Discontinuous features are supported, and comma-separated attribute values (GFF3 column 9) @@ -514,7 +517,7 @@ class ReferenceTables(AnnotationParsing): Children of the root ancestor are otherwise not stored in the reference tables. Match-tuples are created for each Features Sheet rule which matches a feature's attributes. - They are structured as (rank, rule, overlap). "Rank" is the heirarchy value of the matching + They are structured as (rank, rule, overlap). "Rank" is the hierarchy value of the matching rule, "rule" is the index of that rule in FeatureSelector's rules table, and "overlap" is the IntervalSelector per the rule's overlap requirements. @@ -582,7 +585,7 @@ def get_root_feature(self, lineage: list) -> str: """Returns the highest feature ID in the ancestral tree which passed stage 1 selection. The original feature ID is returned if there are no valid ancestors.""" - # Descend tree until the descendent is found in the matches table + # Descend tree until the descendant is found in the matches table # This is because ancestor feature(s) may have been filtered for ancestor in lineage[::-1]: if self.was_matched(ancestor): @@ -761,7 +764,7 @@ def column_filter_match(row, rule): return row.source in rule['Filter_s'] and row.type in rule['Filter_t'] -class NonGenomicAnnotations(AnnotationParsing): +class ReferenceSeqs(ReferenceBase): def __init__(self, reference_seqs, **prefs): super().__init__(**prefs) diff --git a/tiny/rna/counter/validation.py b/tiny/rna/counter/validation.py index 714d6f99..349fb615 100644 --- a/tiny/rna/counter/validation.py +++ b/tiny/rna/counter/validation.py @@ -6,7 +6,7 @@ from collections import Counter, defaultdict from typing import List, Dict -from tiny.rna.counter.hts_parsing import parse_gff, ReferenceTables, SAM_reader +from tiny.rna.counter.hts_parsing import parse_gff, ReferenceFeatures, SAM_reader from tiny.rna.counter.features import FeatureSelector from tiny.rna.util import sorted_natural, gzip_open, report_execution_time @@ -92,7 +92,7 @@ class GFFValidator: } def __init__(self, gff_files, rules, ebwt=None, genomes=None, alignments=None): - self.ReferenceTables = ReferenceTables(gff_files) + self.ReferenceFeatures = ReferenceFeatures(gff_files) self.column_filters = self.build_column_filters(rules) self.report = ReportFormatter(self.targets) self.chrom_set = set() @@ -120,13 +120,13 @@ def parse_and_validate_gffs(self, file_set): def validate_gff_row(self, row, issues, file): # Skip definitions of whole chromosomes and obey source/type filters if row.type.lower() == "chromosome": return - if not self.ReferenceTables.column_filter_match(row, self.column_filters): return + if not self.ReferenceFeatures.column_filter_match(row, self.column_filters): return if row.iv.strand not in ('+', '-'): issues['strand'][file] += 1 try: - self.ReferenceTables.get_feature_id(row) + self.ReferenceFeatures.get_feature_id(row) except: issues['ID attribute'][file] += 1 From 3247ea63814dd29aeae16bc49d7d0cedfbf339ad Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 3 Feb 2023 19:23:06 -0800 Subject: [PATCH 19/21] This commit only changes the order in which methods are defined in ReferenceFeatures. No code changes and no functional changes. The previous order was mostly arbitrary and it has been bothering me for a while. The new order mostly prioritizes core functions/concepts and the rest is roughly by calling order. This should make it a little easier to browse. --- tiny/rna/counter/hts_parsing.py | 122 ++++++++++++++++---------------- 1 file changed, 61 insertions(+), 61 deletions(-) diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index 797f7f69..b8e4ba73 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -581,31 +581,37 @@ def parse_row(self, row, alias_keys=None): # Add alias to root ancestor if it is unique self.add_alias(root_id, alias_keys, row.attr) - def get_root_feature(self, lineage: list) -> str: - """Returns the highest feature ID in the ancestral tree which passed stage 1 selection. - The original feature ID is returned if there are no valid ancestors.""" + @staticmethod + def get_feature_id(row): + id_collection = row.attr.get('ID') \ + or row.attr.get('gene_id') \ + or row.attr.get('Parent') - # Descend tree until the descendant is found in the matches table - # This is because ancestor feature(s) may have been filtered - for ancestor in lineage[::-1]: - if self.was_matched(ancestor): - return ancestor - else: - return lineage[0] # Default: the original feature_id + if id_collection is None: + raise ValueError(f"Feature {row.name} does not have an ID attribute.") + if len(id_collection) == 0: + raise ValueError("A feature's ID attribute cannot be empty. This value is required.") + if len(id_collection) > 1: + return ','.join(id_collection) - def get_feature_ancestors(self, feature_id: str, row_attrs: CaseInsensitiveAttrs): - if "Parent" not in row_attrs: - return [feature_id] + return id_collection[0] - parent_id = self.get_row_parent(feature_id, row_attrs) - lineage = [feature_id, parent_id] + def get_matches(self, row: HTSeq.GenomicFeature) -> DefaultDict: + """Performs Stage 1 selection and returns match tuples under their associated classifier""" - # Climb ancestral tree until the root parent is found - while parent_id in self.parents: - parent_id = self.parents[parent_id] - lineage.append(parent_id) + identity_matches = defaultdict(set) + for ident, rule_indexes in self.selector.inv_ident.items(): + if row.attr.contains_ident(ident): + for index in rule_indexes: + rule = self.selector.rules_table[index] + if self.column_filter_match(row, rule): + # Unclassified matches are pooled under '' empty string + identity_matches[rule['Class']].add( + (index, rule['Hierarchy'], rule['Overlap']) + ) - return lineage + # -> identity_matches: {classifier: {(rule, rank, overlap), ...}, ...} + return identity_matches def add_feature(self, feature_id: str, row, matches: defaultdict) -> str: """Adds the feature to the intervals table under its root ID, and to the matches table @@ -630,29 +636,38 @@ def add_feature(self, feature_id: str, row, matches: defaultdict) -> str: return root_id - def add_alias(self, root_id: str, alias_keys: List[str], row_attrs: CaseInsensitiveAttrs) -> None: - """Add feature's aliases to the root ancestor's alias set""" + def get_feature_ancestors(self, feature_id: str, row_attrs: CaseInsensitiveAttrs): + if "Parent" not in row_attrs: + return [feature_id] - for alias_key in alias_keys: - for row_val in row_attrs.get(alias_key, ()): - self.alias[root_id].add(row_val) + parent_id = self.get_row_parent(feature_id, row_attrs) + lineage = [feature_id, parent_id] - def get_matches(self, row: HTSeq.GenomicFeature) -> DefaultDict: - """Performs Stage 1 selection and returns match tuples under their associated classifier""" + # Climb ancestral tree until the root parent is found + while parent_id in self.parents: + parent_id = self.parents[parent_id] + lineage.append(parent_id) - identity_matches = defaultdict(set) - for ident, rule_indexes in self.selector.inv_ident.items(): - if row.attr.contains_ident(ident): - for index in rule_indexes: - rule = self.selector.rules_table[index] - if self.column_filter_match(row, rule): - # Unclassified matches are pooled under '' empty string - identity_matches[rule['Class']].add( - (index, rule['Hierarchy'], rule['Overlap']) - ) + return lineage - # -> identity_matches: {class: (rule, rank, overlap), ...} - return identity_matches + def get_root_feature(self, lineage: list) -> str: + """Returns the highest feature ID in the ancestral tree which passed stage 1 selection. + The original feature ID is returned if there are no valid ancestors.""" + + # Descend tree until the descendant is found in the matches table + # This is because ancestor feature(s) may have been filtered + for ancestor in lineage[::-1]: + if self.was_matched(ancestor): + return ancestor + else: + return lineage[0] # Default: the original feature_id + + def was_matched(self, untagged_id): + """Checks if the feature ID previously matched on identity, regardless of whether + the matching rule was tagged or untagged.""" + + # any() will short circuit on first match when provided a generator function + return any(tagged_id in self.matches for tagged_id in self.tags.get(untagged_id, ())) def get_row_parent(self, feature_id: str, row_attrs: CaseInsensitiveAttrs) -> str: """Get the current feature's parent while cooperating with filtered features""" @@ -687,6 +702,13 @@ def exclude_row(self, row): self.parents[feature_id] = self.get_row_parent(feature_id, row.attr) self.chrom_vector_setdefault(row.iv.chrom) + def add_alias(self, root_id: str, alias_keys: List[str], row_attrs: CaseInsensitiveAttrs) -> None: + """Add feature's aliases to the root ancestor's alias set""" + + for alias_key in alias_keys: + for row_val in row_attrs.get(alias_key, ()): + self.alias[root_id].add(row_val) + def _finalize_aliases(self): self.alias = {feat: tuple(sorted(aliases, key=str.lower)) for feat, aliases in self.alias.items()} @@ -734,28 +756,6 @@ def _merge_adjacent_subintervals(unmerged_sub_ivs: List[HTSeq.GenomicInterval]) return merged_ivs - def was_matched(self, untagged_id): - """Checks if the feature ID previously matched on identity, regardless of whether - the matching rule was tagged or untagged.""" - - # any() will short circuit on first match when provided a generator function - return any(tagged_id in self.matches for tagged_id in self.tags.get(untagged_id, ())) - - @staticmethod - def get_feature_id(row): - id_collection = row.attr.get('ID') \ - or row.attr.get('gene_id') \ - or row.attr.get('Parent') - - if id_collection is None: - raise ValueError(f"Feature {row.name} does not have an ID attribute.") - if len(id_collection) == 0: - raise ValueError("A feature's ID attribute cannot be empty. This value is required.") - if len(id_collection) > 1: - return ','.join(id_collection) - - return id_collection[0] - @staticmethod def column_filter_match(row, rule): """Checks if the GFF row passes the inclusive filter(s) From ff79c98ef4496a60434cd27ec2c707388f829ca9 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Fri, 3 Feb 2023 19:23:35 -0800 Subject: [PATCH 20/21] Updating comments in the Paths File to indicate that gff_files is optional --- START_HERE/paths.yml | 2 +- tests/testdata/config_files/paths.yml | 2 +- tiny/templates/paths.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/START_HERE/paths.yml b/START_HERE/paths.yml index 1bda5099..782f2fb3 100644 --- a/START_HERE/paths.yml +++ b/START_HERE/paths.yml @@ -7,7 +7,7 @@ # 1. Fill out the Samples Sheet with files to process + naming scheme. [samples.csv] # 2. Fill out the Features Sheet with selection rules [features.csv] # 3. Set samples_csv and features_csv (below) to point to these files -# 4. Add annotation files and per-file alias preferences to gff_files +# 4. Add annotation files and per-file alias preferences to gff_files (optional) # ######-------------------------------------------------------------------------------###### diff --git a/tests/testdata/config_files/paths.yml b/tests/testdata/config_files/paths.yml index 125e6a76..92cc0da4 100644 --- a/tests/testdata/config_files/paths.yml +++ b/tests/testdata/config_files/paths.yml @@ -7,7 +7,7 @@ # 1. Fill out the Samples Sheet with files to process + naming scheme. [samples.csv] # 2. Fill out the Features Sheet with selection rules [features.csv] # 3. Set samples_csv and features_csv (below) to point to these files -# 4. Add annotation files and per-file alias preferences to gff_files +# 4. Add annotation files and per-file alias preferences to gff_files (optional) # ######-------------------------------------------------------------------------------###### diff --git a/tiny/templates/paths.yml b/tiny/templates/paths.yml index 3f7a332d..2b70aba2 100644 --- a/tiny/templates/paths.yml +++ b/tiny/templates/paths.yml @@ -7,7 +7,7 @@ # 1. Fill out the Samples Sheet with files to process + naming scheme. [samples.csv] # 2. Fill out the Features Sheet with selection rules [features.csv] # 3. Set samples_csv and features_csv (below) to point to these files -# 4. Add annotation files and per-file alias preferences to gff_files +# 4. Add annotation files and per-file alias preferences to gff_files (optional) # ######-------------------------------------------------------------------------------###### From ba9762872eb3904ce9c37eafa8183a2583f2e412 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Mon, 6 Feb 2023 16:08:25 -0800 Subject: [PATCH 21/21] Adding a timeout to the sequence-based counting notice that is issued when users don't have GFF files listed. Also correcting instances of "non-genomic" to "sequence-based" --- tests/unit_tests_validation.py | 2 +- tiny/rna/configuration.py | 15 ++++++++++++--- tiny/rna/counter/features.py | 4 ++-- tiny/rna/counter/hts_parsing.py | 3 +-- tiny/rna/counter/validation.py | 5 ++--- 5 files changed, 18 insertions(+), 11 deletions(-) diff --git a/tests/unit_tests_validation.py b/tests/unit_tests_validation.py index 79226116..7df9cac5 100644 --- a/tests/unit_tests_validation.py +++ b/tests/unit_tests_validation.py @@ -299,7 +299,7 @@ class SamSqValidatorTest(unittest.TestCase): @classmethod def setUpClass(self): self.syntax_header = "Every SAM file must have complete @SQ headers with SN and LN\n" \ - "fields when counting in non-genomic mode.\n" + "fields when performing sequence-based counting.\n" self.identifier_header = "Sequence identifiers must be unique and have consistent length definitions.\n" diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index 5d8f73c4..e7f6a2d8 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -2,6 +2,7 @@ import argparse import shutil import errno +import time import sys import csv import os @@ -581,12 +582,20 @@ def get_gff_config(self) -> Dict[str, list]: gff_files[file] = sorted(set(alias), key=alias.index) if not len(gff_files) and not self.in_pipeline: - print("No GFF files were specified in {selfname} so " - "reads will be counted in non-genomic mode." - .format(selfname=self.basename)) + self.print_sequence_counting_notice() return dict(gff_files) + def print_sequence_counting_notice(self): + print("No GFF files were specified in {selfname}.\n" + "Reads will be quantified by sequence " + "rather than by feature." + .format(selfname=self.basename)) + + for s in reversed(range(6)): + print(f"Proceeding in {s}s", end='\r') + time.sleep(1) + # Override def append_to(self, key: str, val: Any): """Overrides method in the base class. This is necessary due to automatic diff --git a/tiny/rna/counter/features.py b/tiny/rna/counter/features.py index 42b0d1d4..14aad110 100644 --- a/tiny/rna/counter/features.py +++ b/tiny/rna/counter/features.py @@ -33,9 +33,9 @@ def __init__(self, references, selection_rules, **prefs): self.selector = FeatureSelector(selection_rules, self.stats, **prefs) if isinstance(references, ReferenceFeatures): - self.mode = "genomic" + self.mode = "by feature" elif isinstance(references, ReferenceSeqs): - self.mode = "non-genomic" + self.mode = "by sequence" else: raise TypeError("Expected ReferenceFeatures or ReferenceSeqs, got %s" % type(references)) diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index b8e4ba73..fdd6b6c5 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -772,7 +772,6 @@ def __init__(self, reference_seqs, **prefs): self.alias = {} self.tags = {} - @report_execution_time("Non-genomic annotations parsing") def get(self, selector): self.selector = selector match_tuples = self.get_matches() @@ -785,7 +784,7 @@ def get(self, selector): return self.feats, aliases, self.tags def get_matches(self): - """Stage 1 selection is skipped in non-genomic counting. + """Stage 1 selection is skipped in sequence-based counting. Simply build match_tuples for all rules. These will be used uniformly in each reference sequence's feature_record_tuple""" diff --git a/tiny/rna/counter/validation.py b/tiny/rna/counter/validation.py index 349fb615..1605b62b 100644 --- a/tiny/rna/counter/validation.py +++ b/tiny/rna/counter/validation.py @@ -295,7 +295,7 @@ def build_column_filters(rules): class SamSqValidator: - """Validates @SQ headers for tiny-count's non-genomic counting mode""" + """Validates @SQ headers for tiny-count's sequence-based counting mode""" targets = { "inter sq": "Sequence identifiers with inconsistent lengths", @@ -310,7 +310,6 @@ def __init__(self, sam_files): self.reference_seqs = {} self.sq_headers = {} - @report_execution_time("Non-genomic annotations validation") def validate(self): print("Validating sequence identifiers in SAM files...") self.read_sq_headers() @@ -391,7 +390,7 @@ def generate_header_syntax_report(self, missing, incomplete): report['incomplete sq'] = sorted_natural(incomplete) header = "Every SAM file must have complete @SQ headers with SN and LN\n" \ - "fields when counting in non-genomic mode.\n" + "fields when performing sequence-based counting.\n" self.report.add_error_section(header, report) def generate_identifier_report(self, duplicate, ambiguous):