From 146fa384f3bc83b156f6599504bd84e05564aa23 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sat, 28 Jan 2023 15:20:16 -0800 Subject: [PATCH 01/10] Drafting support for overlap selector parameters. Right now the parameter order isn't switched for 3' anchored, and honestly I'm having second thoughts on whether it makes sense to break the pattern for that selector --- tiny/rna/counter/features.py | 23 +++++++------- tiny/rna/counter/matching.py | 58 +++++++++++++++++++++++++----------- 2 files changed, 54 insertions(+), 27 deletions(-) diff --git a/tiny/rna/counter/features.py b/tiny/rna/counter/features.py index b17c0cdc..8925f381 100644 --- a/tiny/rna/counter/features.py +++ b/tiny/rna/counter/features.py @@ -211,20 +211,23 @@ def build_interval_selectors(iv: 'HTSeq.GenomicInterval', match_tuples: List[unb built_selectors = {} selector_factory = { - 'full': lambda: IntervalFullMatch(iv), - 'exact': lambda: IntervalExactMatch(iv), - 'partial': lambda: IntervalPartialMatch(iv), - 'anchored': lambda: IntervalAnchorMatch(iv), - "5' anchored": lambda: Interval5pMatch(iv) if iv.strand in ('+', '-') else IntervalAnchorMatch(iv), - "3' anchored": lambda: Interval3pMatch(iv) if iv.strand in ('+', '-') else IntervalAnchorMatch(iv), + 'full': lambda: IntervalFullMatch(*args), + 'exact': lambda: IntervalExactMatch(*args), + 'partial': lambda: IntervalPartialMatch(*args), + 'anchored': lambda: IntervalAnchorMatch(*args), + "5' anchored": lambda: Interval5pMatch(*args) if iv.strand in ('+', '-') else IntervalAnchorMatch(*args), + "3' anchored": lambda: Interval3pMatch(*args) if iv.strand in ('+', '-') else IntervalAnchorMatch(*args), } - for i in range(len(match_tuples)): + for i, match in enumerate(match_tuples): try: - match = match_tuples[i] + # Split into keyword and params + defn = re.split(r'\s+', match[2], 2) + selector, args = defn[0], (iv, *defn[1:]) + # Cache instances to prevent duplicates for the same match type on the same iv - selector = built_selectors.setdefault(match[2], selector_factory[match[2]]()) - match_tuples[i] = (match[0], match[1], selector) + selector_obj = built_selectors.setdefault(selector, selector_factory[selector]()) + match_tuples[i] = (match[0], match[1], selector_obj) except KeyError: raise ValueError(f'Invalid overlap selector: "{match_tuples[i][2]}"') diff --git a/tiny/rna/counter/matching.py b/tiny/rna/counter/matching.py index 6946e684..7a011ae3 100644 --- a/tiny/rna/counter/matching.py +++ b/tiny/rna/counter/matching.py @@ -118,7 +118,7 @@ def validate_definition(defn: str): class IntervalSelector: - __slots__ = ("start", "end") + __slots__ = ("start", "end", "shift") """IntervalSelector classes use __slots__ rather than class attributes to reduce memory footprint. As a result, each instance requires only 128 bytes @@ -127,10 +127,14 @@ class IntervalSelector: one IntervalSelector is created for each unique match definition, for each interval associated with each retained feature.""" - def __init__(self, iv: HTSeq.GenomicInterval): - """Descendents only need to know the start and end coordinates of the target feature""" - self.start = iv.start - self.end = iv.end + def __init__(self, iv: HTSeq.GenomicInterval, shift:str = None): + """Descendants only need to know the start and end coordinates of the target feature""" + if shift: + self.validate_shift_params(shift) + self.start, self.end, self.shift = self.parse_shift_parameters(shift, iv) + else: + self.start, self.end = iv.start, iv.end + self.shift = None def __hash__(self): """Descendents must be hashable in order to be stored in a GenomicArrayOfSets""" @@ -141,6 +145,26 @@ def __eq__(self, other): self.start == other.start and \ self.end == other.end + @staticmethod + def parse_shift_parameters(defn, iv): + split = defn.strip().split(',', 2) + shift = shift_5, shift_3 = int(split[0]), int(split[1]) + + if iv.strand == '+': + start, end = iv.start + shift_5, iv.end + shift_3 + elif iv.strand == '-': + start, end = iv.start + shift_3, iv.end + shift_5 + else: # iv.strand == '.': + start, end = iv.start, iv.end + shift = None + + return start, end, shift + + @staticmethod + def validate_shift_params(defn): + assert re.match(r'^-?\d+\s*,\s*-?\d+$', defn.strip()) is not None, \ + f'Invalid overlap shift parameters: "{defn}"' + class IntervalPartialMatch(IntervalSelector): """This is a no-op filter @@ -151,8 +175,8 @@ class IntervalPartialMatch(IntervalSelector): __contains__ = Wildcard.__contains__ - def __init__(self, iv: HTSeq.GenomicInterval): - super().__init__(iv) + def __init__(self, iv: HTSeq.GenomicInterval, shift=None): + super().__init__(iv, shift) def __repr__(self): return f"" @@ -160,8 +184,8 @@ def __repr__(self): class IntervalFullMatch(IntervalSelector): - def __init__(self, iv: HTSeq.GenomicInterval): - super().__init__(iv) + def __init__(self, iv: HTSeq.GenomicInterval, shift=None): + super().__init__(iv, shift) def __contains__(self, alignment): return self.start <= alignment['Start'] and alignment['End'] <= self.end @@ -172,8 +196,8 @@ def __repr__(self): class IntervalExactMatch(IntervalSelector): - def __init__(self, iv: HTSeq.GenomicInterval): - super().__init__(iv) + def __init__(self, iv: HTSeq.GenomicInterval, shift=None): + super().__init__(iv, shift) def __contains__(self, alignment): return self.start == alignment['Start'] and alignment['End'] == self.end @@ -185,8 +209,8 @@ def __repr__(self): class IntervalAnchorMatch(IntervalSelector): """Evaluates whether an alignment's start matches the feature's start, and vice versa for end.""" - def __init__(self, iv: HTSeq.GenomicInterval): - super().__init__(iv) + def __init__(self, iv: HTSeq.GenomicInterval, shift=None): + super().__init__(iv, shift) assert iv.strand not in ('+', '-') def __contains__(self, alignment: dict): @@ -213,8 +237,8 @@ def __repr__(self): class Interval5pMatch(IntervalSelector): """Evaluates whether an alignment's 5' end is anchored to the corresponding terminus of the feature""" - def __init__(self, iv: HTSeq.GenomicInterval): - super().__init__(iv) + def __init__(self, iv: HTSeq.GenomicInterval, shift=None): + super().__init__(iv, shift) def __contains__(self, alignment: dict): """The following diagram demonstrates 5' anchored matching semantics. @@ -250,8 +274,8 @@ def __repr__(self): class Interval3pMatch(IntervalSelector): """Evaluates whether an alignment's 5' end is anchored to the corresponding terminus of the feature""" - def __init__(self, iv: HTSeq.GenomicInterval): - super().__init__(iv) + def __init__(self, iv: HTSeq.GenomicInterval, shift=None): + super().__init__(iv, shift) def __contains__(self, alignment): """The following diagram demonstrates 3' anchored matching semantics. From 5cad5d16b54de34b1c6dc445de15094c1aaa31bd Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Mon, 6 Feb 2023 09:45:21 -0800 Subject: [PATCH 02/10] Changing the approach for determining the insertion interval for shifted overlap selectors. Rather than handling the shift param entirely within the IntervalSelector base class, I instead provide a static method for returning a shifted HTSeq.GenomicInterval. This has two advantages: - Maintains IntervalSelectors' small memory footprint since this approach doesn't require any new attributes for the class. These objects can be very numerous depending on the GFF and ruleset - Keeps the code clean for adding matches to the correct intervals in ReferenceTables. Match tuples need to be inserted in the GenomicArray under the same interval that the overlap selector is expecting. Due to the above, cached selectors in build_interval_selectors() have to be added under an updated key that includes the shifted interval as well. Additionally, illegal shift operations (interval goes to zero length or inverted) are handled. Match tuples holding these overlap selectors are simply dropped. --- tiny/rna/counter/features.py | 37 ++++++++++------ tiny/rna/counter/matching.py | 83 +++++++++++++++++++++++------------- 2 files changed, 78 insertions(+), 42 deletions(-) diff --git a/tiny/rna/counter/features.py b/tiny/rna/counter/features.py index 8925f381..d05c2e4c 100644 --- a/tiny/rna/counter/features.py +++ b/tiny/rna/counter/features.py @@ -209,29 +209,40 @@ def build_interval_selectors(iv: 'HTSeq.GenomicInterval', match_tuples: List[unb matches. Each tuple index 2 defines and is replaced by the selector. """ - built_selectors = {} + cache = {} selector_factory = { - 'full': lambda: IntervalFullMatch(*args), - 'exact': lambda: IntervalExactMatch(*args), - 'partial': lambda: IntervalPartialMatch(*args), - 'anchored': lambda: IntervalAnchorMatch(*args), - "5' anchored": lambda: Interval5pMatch(*args) if iv.strand in ('+', '-') else IntervalAnchorMatch(*args), - "3' anchored": lambda: Interval3pMatch(*args) if iv.strand in ('+', '-') else IntervalAnchorMatch(*args), + 'full': lambda x: IntervalFullMatch(x), + 'exact': lambda x: IntervalExactMatch(x), + 'partial': lambda x: IntervalPartialMatch(x), + 'anchored': lambda x: IntervalAnchorMatch(x), + "5'anchored": lambda x: Interval5pMatch(x) if iv.strand in ('+', '-') else IntervalAnchorMatch(x), + "3'anchored": lambda x: Interval3pMatch(x) if iv.strand in ('+', '-') else IntervalAnchorMatch(x), } + matches_by_interval = defaultdict(list) for i, match in enumerate(match_tuples): try: - # Split into keyword and params - defn = re.split(r'\s+', match[2], 2) - selector, args = defn[0], (iv, *defn[1:]) + # Split optional shift parameters from definition + defn = re.split(r'\s*,\s*', match[2], 1) + selector, shift = defn[0], defn[1:] + selector = selector.replace(' ', '') + + # Shift the interval before constructing if shift parameters were provided + match_iv = IntervalSelector.get_shifted_interval(shift[0], iv) if shift else iv + cache_key = (selector, match_iv) # Cache instances to prevent duplicates for the same match type on the same iv - selector_obj = built_selectors.setdefault(selector, selector_factory[selector]()) - match_tuples[i] = (match[0], match[1], selector_obj) + selector_obj = cache.setdefault(cache_key, selector_factory[selector](match_iv)) + + built_match_tuple = (match[0], match[1], selector_obj) + matches_by_interval[match_iv].append(built_match_tuple) except KeyError: raise ValueError(f'Invalid overlap selector: "{match_tuples[i][2]}"') + except IllegalShiftError: + # Drop offending match tuple + pass - return match_tuples + return matches_by_interval @staticmethod def build_inverted_identities(rules_table) -> Dict[Tuple[str, str], List[int]]: diff --git a/tiny/rna/counter/matching.py b/tiny/rna/counter/matching.py index 7a011ae3..a90b07ca 100644 --- a/tiny/rna/counter/matching.py +++ b/tiny/rna/counter/matching.py @@ -117,8 +117,19 @@ def validate_definition(defn: str): f'Invalid length selector: "{defn}"' +# Used in IntervalSelector +class IllegalShiftError(Exception): + def __init__(self, iv, shift, subtype): + self.subtype = subtype + self.shift = shift + self.iv = iv + + self.args = (f"The interval {iv} cannot be shifted by {shift} " + f"(results in {self.subtype} interval)",) + + class IntervalSelector: - __slots__ = ("start", "end", "shift") + __slots__ = ("start", "end") """IntervalSelector classes use __slots__ rather than class attributes to reduce memory footprint. As a result, each instance requires only 128 bytes @@ -127,17 +138,13 @@ class IntervalSelector: one IntervalSelector is created for each unique match definition, for each interval associated with each retained feature.""" - def __init__(self, iv: HTSeq.GenomicInterval, shift:str = None): + def __init__(self, iv: HTSeq.GenomicInterval): """Descendants only need to know the start and end coordinates of the target feature""" - if shift: - self.validate_shift_params(shift) - self.start, self.end, self.shift = self.parse_shift_parameters(shift, iv) - else: - self.start, self.end = iv.start, iv.end - self.shift = None + self.start = iv.start + self.end = iv.end def __hash__(self): - """Descendents must be hashable in order to be stored in a GenomicArrayOfSets""" + """Descendants must be hashable in order to be stored in a GenomicArrayOfSets""" return self.start ^ self.end def __eq__(self, other): @@ -145,20 +152,38 @@ def __eq__(self, other): self.start == other.start and \ self.end == other.end - @staticmethod - def parse_shift_parameters(defn, iv): - split = defn.strip().split(',', 2) + @classmethod + def get_shifted_interval(cls, shift_defn: str, iv: HTSeq.GenomicInterval): + """Shifts the interval's 5' and 3' ends according to the shift definition + + Positive values expand and negative values contract the interval on + the specified end. Both values must be specified but zero can be + provided if no change is desired. + + Args: + shift_defn: A string of two signed numbers, `M` and `N`, comma separated. + `M` shifts the interval's 5' end and `N` shifts the interval's 3' end. + iv: The interval to shift""" + + cls.validate_shift_params(shift_defn) + split = shift_defn.split(',', 1) shift = shift_5, shift_3 = int(split[0]), int(split[1]) if iv.strand == '+': - start, end = iv.start + shift_5, iv.end + shift_3 + start, end = iv.start - shift_5, iv.end + shift_3 elif iv.strand == '-': - start, end = iv.start + shift_3, iv.end + shift_5 + start, end = iv.start - shift_3, iv.end + shift_5 else: # iv.strand == '.': - start, end = iv.start, iv.end - shift = None + shift_x = max(shift) + start, end = iv.start - shift_x, iv.end + shift_x + shift = (shift_x,) * 2 + + if start == end - 1: # open interval + raise IllegalShiftError(iv, shift, "null") + if start > end: + raise IllegalShiftError(iv, shift, "inverted") - return start, end, shift + return HTSeq.GenomicInterval(iv.chrom, start, end, iv.strand) @staticmethod def validate_shift_params(defn): @@ -175,8 +200,8 @@ class IntervalPartialMatch(IntervalSelector): __contains__ = Wildcard.__contains__ - def __init__(self, iv: HTSeq.GenomicInterval, shift=None): - super().__init__(iv, shift) + def __init__(self, iv: HTSeq.GenomicInterval): + super().__init__(iv) def __repr__(self): return f"" @@ -184,8 +209,8 @@ def __repr__(self): class IntervalFullMatch(IntervalSelector): - def __init__(self, iv: HTSeq.GenomicInterval, shift=None): - super().__init__(iv, shift) + def __init__(self, iv: HTSeq.GenomicInterval): + super().__init__(iv) def __contains__(self, alignment): return self.start <= alignment['Start'] and alignment['End'] <= self.end @@ -196,8 +221,8 @@ def __repr__(self): class IntervalExactMatch(IntervalSelector): - def __init__(self, iv: HTSeq.GenomicInterval, shift=None): - super().__init__(iv, shift) + def __init__(self, iv: HTSeq.GenomicInterval): + super().__init__(iv) def __contains__(self, alignment): return self.start == alignment['Start'] and alignment['End'] == self.end @@ -209,8 +234,8 @@ def __repr__(self): class IntervalAnchorMatch(IntervalSelector): """Evaluates whether an alignment's start matches the feature's start, and vice versa for end.""" - def __init__(self, iv: HTSeq.GenomicInterval, shift=None): - super().__init__(iv, shift) + def __init__(self, iv: HTSeq.GenomicInterval): + super().__init__(iv) assert iv.strand not in ('+', '-') def __contains__(self, alignment: dict): @@ -237,8 +262,8 @@ def __repr__(self): class Interval5pMatch(IntervalSelector): """Evaluates whether an alignment's 5' end is anchored to the corresponding terminus of the feature""" - def __init__(self, iv: HTSeq.GenomicInterval, shift=None): - super().__init__(iv, shift) + def __init__(self, iv: HTSeq.GenomicInterval): + super().__init__(iv) def __contains__(self, alignment: dict): """The following diagram demonstrates 5' anchored matching semantics. @@ -274,8 +299,8 @@ def __repr__(self): class Interval3pMatch(IntervalSelector): """Evaluates whether an alignment's 5' end is anchored to the corresponding terminus of the feature""" - def __init__(self, iv: HTSeq.GenomicInterval, shift=None): - super().__init__(iv, shift) + def __init__(self, iv: HTSeq.GenomicInterval): + super().__init__(iv) def __contains__(self, alignment): """The following diagram demonstrates 3' anchored matching semantics. From 8825f84c3d28c517aa14325eaab834cba607a6a7 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Mon, 6 Feb 2023 09:48:37 -0800 Subject: [PATCH 03/10] Refactoring _finalize_features() because shifted intervals have increased its complexity. It's a little abstract but hopefully this keeps the code approachable. Also improved explanatory docstrings for these methods. --- tiny/rna/counter/hts_parsing.py | 38 +++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index 031f6e7b..16c418b2 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -620,7 +620,17 @@ def _finalize_aliases(self): self.alias = {feat: tuple(sorted(aliases, key=str.lower)) for feat, aliases in self.alias.items()} def _finalize_features(self): - """Assigns matches to their corresponding intervals by populating GenomicArray with match tuples""" + """Assigns matches to their corresponding intervals by populating GenomicArray with match tuples + + Each feature that is handled here has matched a rule, and its matches might be classified + into subsets depending on the rules it matched. However, the feature's interval + is the same regardless of the match classification. This interval might be + discontinuous, so we start by merging any of its sub-intervals that are + adjacent to reduce loop count in Stage 2 and 3 selection. Each group of + classified matches is then added to the GenomicArray under this shared + interval (but the interval is not necessarily shared by matches that + have a shift parameter in their overlap selector...) + """ for root_id, unmerged_sub_ivs in self.intervals.items(): merged_sub_ivs = self._merge_adjacent_subintervals(unmerged_sub_ivs) @@ -635,13 +645,29 @@ def _finalize_features(self): self.chrom_vector_setdefault(merged_sub_ivs[0].chrom) continue + tagged_matches = self.matches[tagged_id] + self._add_subinterval_matches(tagged_id, merged_sub_ivs, tagged_matches) + + def _add_subinterval_matches(self, tagged_id: tuple, sub_ivs: list, matches: set): + """Adds the classified group of matches to the GenomicArray under each of the + feature's sub-intervals + + These sub-intervals might be further subset if the matches define a shift + parameter. The shift operation has to be applied to each of the feature's + sub-intervals before the given match can be added to the GenomicArray. The + shifted interval must match the interval that the overlap selector expects. + """ + + for sub_iv in sub_ivs: + # Build interval selectors for this + matches_by_shifted_iv = self.selector.build_interval_selectors(sub_iv, matches) + strand = self.map_strand(sub_iv.strand) + + for shifted_iv, matches in matches_by_shifted_iv.items(): # Sort match tuples by rank for more efficient feature selection - sorted_matches = sorted(self.matches[tagged_id], key=lambda x: x[1]) + sorted_matches = sorted(matches, key=lambda x: x[1]) + self.feats[shifted_iv] += (tagged_id, strand, tuple(sorted_matches)) - for sub_iv in merged_sub_ivs: - finalized_match_tuples = self.selector.build_interval_selectors(sub_iv, sorted_matches.copy()) - strand = self.map_strand(sub_iv.strand) - self.feats[sub_iv] += (tagged_id, strand, tuple(finalized_match_tuples)) @staticmethod def _merge_adjacent_subintervals(unmerged_sub_ivs: List[HTSeq.GenomicInterval]) -> list: From daa12b6394006cf1f9d3594f97dbf4d1fd378a64 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Mon, 6 Feb 2023 09:48:37 -0800 Subject: [PATCH 04/10] Correcting the semantics of overlap parameters to instead shift negative values in the 5' direction and positive values in the 3' direction regardless of strand --- tiny/rna/counter/features.py | 20 ++++++++++------- tiny/rna/counter/hts_parsing.py | 38 +++++++++++++++++++++++++++------ tiny/rna/counter/matching.py | 14 ++++++------ 3 files changed, 52 insertions(+), 20 deletions(-) diff --git a/tiny/rna/counter/features.py b/tiny/rna/counter/features.py index d05c2e4c..fe81e4a7 100644 --- a/tiny/rna/counter/features.py +++ b/tiny/rna/counter/features.py @@ -195,18 +195,23 @@ def build_selectors(rules_table) -> List[dict]: @staticmethod def build_interval_selectors(iv: 'HTSeq.GenomicInterval', match_tuples: List[unbuilt_match_tuple]): - """Builds partial/full/exact/3' anchored/5' anchored interval selectors + """Builds partial/full/exact/5'anchored/3'anchored interval selectors Unlike build_selectors() and build_inverted_identities(), this function is not called at construction time. Instead, it is called when finalizing - match-tuples in ReferenceTables. This is because interval selectors are - created for each feature (requiring start/stop/strand to be known) for - each of the feature's identity matches (each match-tuple). + match-tuples in ReferenceTables. This is because the intervals of features + passing Stage 1 selection, and the specific rules they matched, must be known. + + Index 2 of each match tuple is from the Overlap column of the Features Sheet. + It defines the desired selector and, optionally, a shift parameter for shifting + the 5' and/or 3' ends of the interval. Its syntax is: + selector,M,N + M = signed shift value for 5' end + N = signed shift value for 3' end Args: iv: The interval of the feature from which each selector is built - match_tuples: A list of tuples representing the feature's identity - matches. Each tuple index 2 defines and is replaced by the selector. + match_tuples: A list of tuples representing the feature's Stage 1 matches """ cache = {} @@ -239,8 +244,7 @@ def build_interval_selectors(iv: 'HTSeq.GenomicInterval', match_tuples: List[unb except KeyError: raise ValueError(f'Invalid overlap selector: "{match_tuples[i][2]}"') except IllegalShiftError: - # Drop offending match tuple - pass + pass # Drop offending match tuple return matches_by_interval diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index 031f6e7b..16c418b2 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -620,7 +620,17 @@ def _finalize_aliases(self): self.alias = {feat: tuple(sorted(aliases, key=str.lower)) for feat, aliases in self.alias.items()} def _finalize_features(self): - """Assigns matches to their corresponding intervals by populating GenomicArray with match tuples""" + """Assigns matches to their corresponding intervals by populating GenomicArray with match tuples + + Each feature that is handled here has matched a rule, and its matches might be classified + into subsets depending on the rules it matched. However, the feature's interval + is the same regardless of the match classification. This interval might be + discontinuous, so we start by merging any of its sub-intervals that are + adjacent to reduce loop count in Stage 2 and 3 selection. Each group of + classified matches is then added to the GenomicArray under this shared + interval (but the interval is not necessarily shared by matches that + have a shift parameter in their overlap selector...) + """ for root_id, unmerged_sub_ivs in self.intervals.items(): merged_sub_ivs = self._merge_adjacent_subintervals(unmerged_sub_ivs) @@ -635,13 +645,29 @@ def _finalize_features(self): self.chrom_vector_setdefault(merged_sub_ivs[0].chrom) continue + tagged_matches = self.matches[tagged_id] + self._add_subinterval_matches(tagged_id, merged_sub_ivs, tagged_matches) + + def _add_subinterval_matches(self, tagged_id: tuple, sub_ivs: list, matches: set): + """Adds the classified group of matches to the GenomicArray under each of the + feature's sub-intervals + + These sub-intervals might be further subset if the matches define a shift + parameter. The shift operation has to be applied to each of the feature's + sub-intervals before the given match can be added to the GenomicArray. The + shifted interval must match the interval that the overlap selector expects. + """ + + for sub_iv in sub_ivs: + # Build interval selectors for this + matches_by_shifted_iv = self.selector.build_interval_selectors(sub_iv, matches) + strand = self.map_strand(sub_iv.strand) + + for shifted_iv, matches in matches_by_shifted_iv.items(): # Sort match tuples by rank for more efficient feature selection - sorted_matches = sorted(self.matches[tagged_id], key=lambda x: x[1]) + sorted_matches = sorted(matches, key=lambda x: x[1]) + self.feats[shifted_iv] += (tagged_id, strand, tuple(sorted_matches)) - for sub_iv in merged_sub_ivs: - finalized_match_tuples = self.selector.build_interval_selectors(sub_iv, sorted_matches.copy()) - strand = self.map_strand(sub_iv.strand) - self.feats[sub_iv] += (tagged_id, strand, tuple(finalized_match_tuples)) @staticmethod def _merge_adjacent_subintervals(unmerged_sub_ivs: List[HTSeq.GenomicInterval]) -> list: diff --git a/tiny/rna/counter/matching.py b/tiny/rna/counter/matching.py index a90b07ca..13c11edc 100644 --- a/tiny/rna/counter/matching.py +++ b/tiny/rna/counter/matching.py @@ -156,23 +156,25 @@ def __eq__(self, other): def get_shifted_interval(cls, shift_defn: str, iv: HTSeq.GenomicInterval): """Shifts the interval's 5' and 3' ends according to the shift definition - Positive values expand and negative values contract the interval on - the specified end. Both values must be specified but zero can be + Positive values shift the interval in the 3' direction and negative values + shift in the 5' direction. Both values must be specified but zero can be provided if no change is desired. Args: shift_defn: A string of two signed numbers, `M` and `N`, comma separated. - `M` shifts the interval's 5' end and `N` shifts the interval's 3' end. - iv: The interval to shift""" + `M` shifts the interval at the 5' end and `N` shifts the interval + at the 3' end. + iv: The interval to shift + """ cls.validate_shift_params(shift_defn) split = shift_defn.split(',', 1) shift = shift_5, shift_3 = int(split[0]), int(split[1]) if iv.strand == '+': - start, end = iv.start - shift_5, iv.end + shift_3 + start, end = iv.start + shift_5, iv.end + shift_3 elif iv.strand == '-': - start, end = iv.start - shift_3, iv.end + shift_5 + start, end = iv.start - shift_3, iv.end - shift_5 else: # iv.strand == '.': shift_x = max(shift) start, end = iv.start - shift_x, iv.end + shift_x From 49b26d5a4470050a773ee4b1b69a1af6ff159156 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 9 Feb 2023 17:06:44 -0800 Subject: [PATCH 05/10] Updating unit tests for compatibility with the new return format of build_interval_selectors() --- tests/unit_tests_features.py | 6 +++--- tests/unit_tests_matching.py | 15 +++++++++------ 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/tests/unit_tests_features.py b/tests/unit_tests_features.py index 0b1d3ae3..8f8d25cb 100644 --- a/tests/unit_tests_features.py +++ b/tests/unit_tests_features.py @@ -23,12 +23,12 @@ def setUpClass(self): def make_feature_for_interval_test(self, iv_rule, feat_id, chrom, strand: str, start, stop): feat_iv = HTSeq.GenomicInterval(chrom, start, stop, strand) - rules = [ - dict(deepcopy(rules_template[0]), Overlap=iv_rule, Identity=('N/A', 'N/A'), nt5end='all', Length='all')] + rules = [dict(deepcopy(rules_template[0]), Overlap=iv_rule, Identity=('*', '*'), nt5end='*', Length='*')] fs = FeatureSelector(rules, LibraryStats(normalize_by_hits=True)) # Feature with specified coordinates, matching rule 0 with hierarchy 0 and the appropriate selector for iv_rule - match_tuple = tuple(fs.build_interval_selectors(feat_iv, [(0, 0, iv_rule)])) + selectors = fs.build_interval_selectors(feat_iv, [(0, 0, iv_rule)]) + match_tuple = (selectors[feat_iv][0],) feat = {(feat_id, strand_to_bool(strand), match_tuple)} return feat, fs diff --git a/tests/unit_tests_matching.py b/tests/unit_tests_matching.py index 0860ddd7..f0564c27 100644 --- a/tests/unit_tests_matching.py +++ b/tests/unit_tests_matching.py @@ -24,15 +24,18 @@ def test_feature_selector_interval_build(self): ('n/a', 'n/a', 'full'), ('n/a', 'n/a', 'exact'), ('n/a', 'n/a', "5' anchored"), - ('n/a', 'n/a', "3' anchored")] + ('n/a', 'n/a', "3' anchored"), + ('n/a', 'n/a', "5'anchored"), # spaces should be optional + ('n/a', 'n/a', "3'anchored")] result = fs.build_interval_selectors(iv, match_tuples) - self.assertIsInstance(result[0][2], IntervalPartialMatch) - self.assertIsInstance(result[1][2], IntervalFullMatch) - self.assertIsInstance(result[2][2], IntervalExactMatch) - self.assertIsInstance(result[3][2], Interval5pMatch) - self.assertIsInstance(result[4][2], Interval3pMatch) + self.assertEqual(len(result), 1) + self.assertIsInstance(result[iv][0][2], IntervalPartialMatch) + self.assertIsInstance(result[iv][1][2], IntervalFullMatch) + self.assertIsInstance(result[iv][2][2], IntervalExactMatch) + self.assertIsInstance(result[iv][3][2], Interval5pMatch) + self.assertIsInstance(result[iv][4][2], Interval3pMatch) """Are interval selectors hashable and properly compare for equality? This is important for storing feature records in GenomicArraysOfSets""" From 4447a1b5ec39bf731f986cfed8960c30ea815495 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 9 Feb 2023 17:08:39 -0800 Subject: [PATCH 06/10] Resolving a variable name conflict that I created while refactoring --- tiny/rna/counter/hts_parsing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index b6a6c008..ea4f4b8a 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -756,9 +756,9 @@ def _add_subinterval_matches(self, tagged_id: tuple, sub_ivs: list, matches: set matches_by_shifted_iv = self.selector.build_interval_selectors(sub_iv, matches) strand = self.map_strand(sub_iv.strand) - for shifted_iv, matches in matches_by_shifted_iv.items(): + for shifted_iv, built_matches in matches_by_shifted_iv.items(): # Sort match tuples by rank for more efficient feature selection - sorted_matches = sorted(matches, key=lambda x: x[1]) + sorted_matches = sorted(built_matches, key=lambda x: x[1]) self.feats[shifted_iv] += (tagged_id, strand, tuple(sorted_matches)) From 2398a45a2453de96d4b92263be599e9205299c6b Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 9 Feb 2023 17:14:01 -0800 Subject: [PATCH 07/10] Updating ReferenceSeqs to use the shifted return format of build_interval_selectors, and to presort match tuples as ReferenceFeatures does for faster Stage 2 selection --- tiny/rna/counter/hts_parsing.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index ea4f4b8a..71fa5f6a 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -820,7 +820,7 @@ def get_matches(self): key=lambda x: x[1]['Hierarchy'] )] - def add_reference_seq(self, seq_id, seq_len, unbuilt_match_tuples): + def add_reference_seq(self, seq_id, seq_len, matches): # Features are classified in Reference Tables (Stage 1 selection) # For compatibility, use the seq_id with an empty classifier (index 1) @@ -829,7 +829,10 @@ def add_reference_seq(self, seq_id, seq_len, unbuilt_match_tuples): for strand in ('+', '-'): iv = HTSeq.GenomicInterval(seq_id, 0, seq_len, strand) - match_tuples = self.selector.build_interval_selectors(iv, unbuilt_match_tuples.copy()) - tinyrna_strand = self.map_strand(strand) + matches_by_shifted_iv = self.selector.build_interval_selectors(iv, matches) + strand = self.map_strand(strand) - self.feats[iv] += (tagged_id, tinyrna_strand, tuple(match_tuples)) \ No newline at end of file + for shifted_iv, built_matches in matches_by_shifted_iv.items(): + # Sort match tuples by rank for more efficient feature selection + sorted_matches = sorted(built_matches, key=lambda x: x[1]) + self.feats[shifted_iv] += (tagged_id, strand, tuple(sorted_matches)) \ No newline at end of file From bfe6348e1ab1d3f702720128b19a0ef9a3ec7616 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 9 Feb 2023 17:17:23 -0800 Subject: [PATCH 08/10] Adding another IllegalShiftError case to get_shifted_interval(). Intervals with negative start positions aren't supported by ChromVectors in HTSeq's GenomicArrays. Also correcting the definition of a null interval (oops) --- tiny/rna/counter/matching.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tiny/rna/counter/matching.py b/tiny/rna/counter/matching.py index 13c11edc..fdafbca5 100644 --- a/tiny/rna/counter/matching.py +++ b/tiny/rna/counter/matching.py @@ -163,7 +163,7 @@ def get_shifted_interval(cls, shift_defn: str, iv: HTSeq.GenomicInterval): Args: shift_defn: A string of two signed numbers, `M` and `N`, comma separated. `M` shifts the interval at the 5' end and `N` shifts the interval - at the 3' end. + at the 3' end. iv: The interval to shift """ @@ -180,10 +180,12 @@ def get_shifted_interval(cls, shift_defn: str, iv: HTSeq.GenomicInterval): start, end = iv.start - shift_x, iv.end + shift_x shift = (shift_x,) * 2 - if start == end - 1: # open interval + if start == end: raise IllegalShiftError(iv, shift, "null") if start > end: raise IllegalShiftError(iv, shift, "inverted") + if start < 0: + raise IllegalShiftError(iv, shift, "negative start") return HTSeq.GenomicInterval(iv.chrom, start, end, iv.strand) From 78b77722c433a13bc3bd17e586a1a1e9aacfd4a8 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 9 Feb 2023 18:16:14 -0800 Subject: [PATCH 09/10] Patching a PathsFile test for code that uses a countdown timer. This skips the countdown for each test case since we don't really need to wait while running tests --- tests/unit_tests_configuration.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/unit_tests_configuration.py b/tests/unit_tests_configuration.py index 70c6fd37..54ae4ce1 100644 --- a/tests/unit_tests_configuration.py +++ b/tests/unit_tests_configuration.py @@ -301,14 +301,18 @@ def test_no_gff_files(self): ] with contextlib.redirect_stdout(stdout): - for gff_config in gff_configs: - config['gff_files'] = gff_config - config.validate_paths() - config_return = config.get_gff_config() - - self.assertRegex(stdout.getvalue(), r".*(No GFF files).*") - self.assertEqual(config_return, {}) - stdout.truncate(0) + # Skip countdown timer for each config + with patch('tiny.rna.configuration.time.sleep') as countdown: + for gff_config in gff_configs: + config['gff_files'] = gff_config + config.validate_paths() + config_return = config.get_gff_config() + + self.assertRegex(stdout.getvalue(), r".*(No GFF files).*") + self.assertEqual(config_return, {}) + stdout.truncate(0) + + countdown.assert_has_calls([call(1)] * 6 * len(gff_configs)) """Does PathsFile check for missing files for single entry parameters?""" From d4b56bf91f8b14c2c2d1aac3d091a7c18764b6d5 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Thu, 9 Feb 2023 18:19:57 -0800 Subject: [PATCH 10/10] Adding unit tests for overlap selector shifting --- tests/unit_tests_features.py | 52 ++++++++++++++++++++++++++++++++++++ tests/unit_tests_matching.py | 28 +++++++++++++++++++ 2 files changed, 80 insertions(+) diff --git a/tests/unit_tests_features.py b/tests/unit_tests_features.py index 8f8d25cb..65e776f2 100644 --- a/tests/unit_tests_features.py +++ b/tests/unit_tests_features.py @@ -94,6 +94,15 @@ def test_HTSeq_antisense_iv_slice(self): self.assertEqual(matches_with_cooridnates[2][0], HTSeq.GenomicInterval("I", 15, 20, '-')) self.assertEqual(matches_with_cooridnates[2][0], HTSeq.GenomicInterval("I", 15, 20, '-')) + """Do GenomicArraysOfSets support negative start/end positions? No, they don't.""" + + def test_HTSeq_GenomicArrayOfSets_negative_indexing(self): + gas = HTSeq.GenomicArrayOfSets("auto", stranded=True) + ivn = HTSeq.GenomicInterval("I", -10, 10, '+') + + with self.assertRaises(IndexError): + gas[ivn] += "Negative" + """Does assign_features correctly handle alignments with zero feature matches?""" def test_assign_features_no_match(self): @@ -383,6 +392,49 @@ def test_wildcard_identities(self): self.assertDictEqual(actual, expected) + """Does FeatureSelector build both shifted and unshifted selectors and group them by resulting interval?""" + + def test_build_interval_selectors_grouping(self): + fs = FeatureSelector(deepcopy(rules_template), LibraryStats()) + iv = HTSeq.GenomicInterval('I', 10, 20, '+') + + match_tuples = [('n/a', 'n/a', 'partial'), + ('n/a', 'n/a', 'full'), + ('n/a', 'n/a', 'exact'), + ('n/a', 'n/a', "5' anchored"), + ('n/a', 'n/a', "3' anchored"), + # iv_shifted_1 Shift values: + ('n/a', 'n/a', 'partial, -5, 5'), # 5': -5 3': 5 + ('n/a', 'n/a', 'full, -5, 5'), # 5': -5 3': 5 + # iv_shifted_2 + ('n/a', 'n/a', 'exact, -10, 10'), # 5': -10 3': 10 + # iv_shifted_3 + ('n/a', 'n/a', "5' anchored, -1, -1"), # 5': -1 3': -1 + ('n/a', 'n/a', "3' anchored, -1, -1")] # 5': -1 3': -1 + + iv_shifted_1 = HTSeq.GenomicInterval('I', 5, 25, '+') + iv_shifted_2 = HTSeq.GenomicInterval('I', 0, 30, '+') + iv_shifted_3 = HTSeq.GenomicInterval('I', 9, 19, '+') + + result = fs.build_interval_selectors(iv, match_tuples) + + # Correct number of groups + self.assertEqual(len(result), 4) + + # Correct number of selectors per group + self.assertEqual(len(result[iv]), 5) + self.assertEqual(len(result[iv_shifted_1]), 2) + self.assertEqual(len(result[iv_shifted_2]), 1) + self.assertEqual(len(result[iv_shifted_3]), 2) + + # Agreement between group interval and selector's interval + for iv_s, tuples in result.items(): + group_iv = (iv_s.start, iv_s.end) + for match in tuples: + selector_iv = (match[2].start, match[2].end) + self.assertEqual(group_iv, selector_iv) + + if __name__ == '__main__': unittest.main() diff --git a/tests/unit_tests_matching.py b/tests/unit_tests_matching.py index f0564c27..d979e688 100644 --- a/tests/unit_tests_matching.py +++ b/tests/unit_tests_matching.py @@ -111,6 +111,34 @@ def test_NumericalMatch_rule_validation(self): with self.assertRaisesRegex(AssertionError, errmsg): NumericalMatch(defn) + """Does IntervalSelector properly validate the shift parameter?""" + + def test_IntervalSelector_shift_validation(self): + good_defs = ["1,1", "2, 2", " -3 ,-3", "0,0 ", " -0 , -0 "] + bad_defs = ["1,", ",2", "3", "4 4", " ", ",", ", "] + + for defn in good_defs: + IntervalSelector.validate_shift_params(defn) + + for defn in bad_defs: + errmsg = f'Invalid overlap shift parameters: "{defn}"' + with self.assertRaisesRegex(AssertionError, errmsg): + IntervalSelector.validate_shift_params(defn) + + """Does IntervalSelector properly raise IllegalShiftErrors?""" + + def test_IntervalSelector_illegal_shift(self): + iv = HTSeq.GenomicInterval('n/a', 0, 5, '+') + + with self.assertRaisesRegex(IllegalShiftError, "null interval"): + IntervalSelector.get_shifted_interval("0,-5", iv) + + with self.assertRaisesRegex(IllegalShiftError, "inverted interval"): + IntervalSelector.get_shifted_interval("10, 0", iv) + + with self.assertRaisesRegex(IllegalShiftError, "negative start interval"): + IntervalSelector.get_shifted_interval("-1, 0", iv) + if __name__ == '__main__': unittest.main()