diff --git a/START_HERE/features.csv b/START_HERE/features.csv index 012e1130..9b053b11 100644 --- a/START_HERE/features.csv +++ b/START_HERE/features.csv @@ -1,7 +1,7 @@ -Select for...,with value...,Classify as...,Source Filter,Type Filter,Hierarchy,Strand,5' End Nucleotide,Length,Overlap -Class,mask,masked,,,1,both,all,all,Partial -Class,miRNA,miRNA,,,2,sense,all,16-24,5' anchored -Class,piRNA,piRNA-5'A,,,2,both,A,24-32,Nested -Class,piRNA,piRNA-5'T,,,2,both,T,24-32,Nested -Class,siRNA,siRNA,,,2,both,all,15-22,Nested -Class,unk,unknown,,,3,both,all,all,Nested +Select for...,with value...,Classify as...,Source Filter,Type Filter,Hierarchy,Overlap,Mismatches,Strand,5' End Nucleotide,Length +Class,mask,masked,,,1,Partial,,both,all,all +Class,miRNA,miRNA,,,2,5' anchored,,sense,all,16-24 +Class,piRNA,piRNA-5'A,,,2,Nested,,both,A,24-32 +Class,piRNA,piRNA-5'T,,,2,Nested,,both,T,24-32 +Class,siRNA,siRNA,,,2,Nested,,both,all,15-22 +Class,unk,unknown,,,3,Nested,,both,all,all \ No newline at end of file diff --git a/doc/Configuration.md b/doc/Configuration.md index fc5ce86f..44026262 100644 --- a/doc/Configuration.md +++ b/doc/Configuration.md @@ -157,6 +157,7 @@ Selectors in the Features Sheet can be specified as a single value, a list of co | `Type Filter` | ✓ | ✓ | ✓ | | | `Hierarchy` | | ✓ | | | | `Overlap` | ✓ | ✓ | | | +| `Mismatches` | ✓ | ✓ | ✓ | ✓ | | `Strand` | ✓ | ✓ | | | | `5' nt` | ✓ | ✓ | ✓ | | | `Length` | ✓ | ✓ | ✓ | ✓ | diff --git a/doc/tiny-count.md b/doc/tiny-count.md index 96438d47..93af9b64 100644 --- a/doc/tiny-count.md +++ b/doc/tiny-count.md @@ -86,6 +86,12 @@ selector, M, N #### Unstranded Features If these features match rules with `5' anchored` and `3' anchored` overlap selectors, they will be downgraded to `anchored` selectors. Alignments overlapping these features are evaluated for shared start and/or end coordinates, but 5' and 3' ends are not distinguished. 
+### Mismatches +The Mismatches column allows you to place constraints on the edit distance, or the number of mismatches and indels, from the alignment to the reference. The Mismatch definition is explicit, i.e., a value of 3 means exactly 3, not 3 or less. Definitions support ranges (e.g., 0-3), lists (e.g., 1, 3), wildcards, and single values. + +#### Edit Distance Determination +An alignment's edit distance is determined from its NM tag. If the first alignment in a SAM file doesn't have an NM tag, then the edit distance is calculated from the CIGAR string for all subsequent alignments in the file. If the first alignment has an NM tag then any subsequent alignments missing the tag will have a default edit distance of 0. + ### Hierarchy Each rule must be assigned a hierarchy value. This value is used to sort Stage 2 matches so that matches with smaller hierarchy values take precedence in Stage 3. - Each feature can have multiple hierarchy values if it matched more than one rule during Stage 1 selection diff --git a/setup.py b/setup.py index dd1d966d..72089139 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,7 @@ #!/usr/bin/env python import os import sys +import pysam import setuptools from setuptools.command.install import install @@ -40,9 +41,10 @@ def get_cython_extension_defs(): error out if there are build issues, and therefore must be used as optional imports.""" pyx_files = [ - # (file path, optional) - ('tiny/rna/counter/stepvector/_stepvector.pyx', True), - ('tests/cython_tests/stepvector/test_cython.pyx', True) + # (file path, optional, include) + ('tiny/rna/counter/stepvector/_stepvector.pyx', True, []), + ('tests/cython_tests/stepvector/test_cython.pyx', True, []), + ('tiny/rna/counter/parsing/alignments.pyx', False, pysam.get_include()) ] cxx_extension_args = { @@ -61,9 +63,10 @@ def get_cython_extension_defs(): return [setuptools.Extension( pyx_filename.replace('./', '').replace('/', '.').rstrip('.pyx'), sources=[pyx_filename], + include_dirs=include, 
optional=optional, **cxx_extension_args) - for pyx_filename, optional in pyx_files] + for pyx_filename, optional, include in pyx_files] def get_macos_sdk_path(): @@ -120,6 +123,7 @@ def get_macos_sdk_path(): ext_modules=cythonize( get_cython_extension_defs(), compiler_directives={'language_level': '3'}, + include_path=pysam.get_include(), gdb_debug=False ), scripts=scripts, diff --git a/tests/testdata/config_files/features.csv b/tests/testdata/config_files/features.csv index 8a6e12d1..1ad42073 100755 --- a/tests/testdata/config_files/features.csv +++ b/tests/testdata/config_files/features.csv @@ -1,7 +1,7 @@ -Select for...,with value...,Classify as...,Source Filter,Type Filter,Hierarchy,Strand,5' End Nucleotide,Length,Overlap -Class,mask,,,,1,both,all,all,Partial -Class,miRNA,,,,2,sense,all,16-22,Nested -Class,piRNA,5pA,,,2,both,A,24-32,Nested -Class,piRNA,5pT,,,2,both,T,24-32,Nested -Class,siRNA,,,,2,both,all,15-22,Nested -Class,unk,,,,3,both,all,all,Nested \ No newline at end of file +Select for...,with value...,Classify as...,Source Filter,Type Filter,Hierarchy,Overlap,Mismatches,Strand,5' End Nucleotide,Length +Class,mask,,,,1,Partial,0,both,all,all +Class,miRNA,,,,2,Nested,0,sense,all,16-22 +Class,piRNA,5pA,,,2,Nested,0,both,A,24-32 +Class,piRNA,5pT,,,2,Nested,0,both,T,24-32 +Class,siRNA,,,,2,Nested,0,both,all,15-22 +Class,unk,,,,3,Nested,0,both,all,all \ No newline at end of file diff --git a/tests/testdata/counter/validation/sam/sq_headers.sam b/tests/testdata/counter/validation/sam/sq_headers.sam new file mode 100644 index 00000000..390d94cf --- /dev/null +++ b/tests/testdata/counter/validation/sam/sq_headers.sam @@ -0,0 +1,4 @@ +@HD VN:1.0 SO:unsorted +@SQ SN:I LN:123 +@SQ SN:II LN:456 +@SQ SN:III \ No newline at end of file diff --git a/tests/unit_test_helpers.py b/tests/unit_test_helpers.py index 3bbc6579..0b6aaf45 100644 --- a/tests/unit_test_helpers.py +++ b/tests/unit_test_helpers.py @@ -22,9 +22,10 @@ 'Filter_t': "", 'Strand': "both", 'Hierarchy': 0, 
+ 'Overlap': "partial", + 'Mismatch': "", 'nt5end': "all", - 'Length': "all", # A string is expected by FeatureSelector due to support for lists and ranges - 'Overlap': "partial"}] + 'Length': "all",}] # A string is expected by FeatureSelector due to support for lists and ranges def csv_factory(type: str, rows: List[dict], header=()): @@ -130,7 +131,7 @@ def get_dir_checksum_tree(root_path: str) -> dict: return dir_tree -def make_parsed_sam_record(Name="0_count=1", Seq="CAAGACAGAGCTTCACCGTTC", Chrom='I', Start=15064570, Strand:bool = True): +def make_parsed_sam_record(Name="0_count=1", Seq="CAAGACAGAGCTTCACCGTTC", Chrom='I', Start=15064570, Strand=True, NM=0): return { "Name": Name, "Length": len(Seq), @@ -139,7 +140,8 @@ def make_parsed_sam_record(Name="0_count=1", Seq="CAAGACAGAGCTTCACCGTTC", Chrom= "Chrom": Chrom, "Start": Start, "End": Start + len(Seq), - "Strand": Strand + "Strand": Strand, + "NM": NM } @@ -186,7 +188,7 @@ def strand_to_bool(strand): # For performing reverse complement -complement = bytes.maketrans(b'ACGTacgt', b'TGCAtgca') +complement = str.maketrans('ACGTacgt', 'TGCAtgca') class ShellCapture: diff --git a/tests/unit_tests_counter.py b/tests/unit_tests_counter.py index 20d9107c..e32a0fa6 100644 --- a/tests/unit_tests_counter.py +++ b/tests/unit_tests_counter.py @@ -36,10 +36,11 @@ def setUpClass(self): 'Filter_s': "", 'Filter_t': "", 'Hierarchy': "1", + 'Overlap': "Partial", + 'Mismatch': "", 'Strand': "antisense", "nt5end": '"C,G,U"', # Needs to be double-quoted due to commas 'Length': "all", - 'Overlap': "Partial" } # Represents the parsed Features Sheet row above @@ -51,10 +52,11 @@ def setUpClass(self): 'Filter_s': _row['Filter_s'], 'Filter_t': _row['Filter_t'], 'Hierarchy': int(_row['Hierarchy']), + 'Overlap': _row['Overlap'].lower(), + 'Mismatch': _row['Mismatch'], 'Strand': _row['Strand'], 'nt5end': _row["nt5end"].upper().translate({ord('U'): 'T'}), 'Length': _row['Length'], - 'Overlap': _row['Overlap'].lower() }] # Represents an 
unparsed Samples Sheet row diff --git a/tests/unit_tests_features.py b/tests/unit_tests_features.py index 55c41410..236be93e 100644 --- a/tests/unit_tests_features.py +++ b/tests/unit_tests_features.py @@ -27,7 +27,7 @@ def make_feature_for_interval_test(self, iv_rule, feat_id, chrom, strand: str, s fs = FeatureSelector(rules) # Feature with specified coordinates, matching rule 0 with hierarchy 0 and the appropriate selector for iv_rule - selectors = fs.build_interval_selectors(feat_iv, [(0, 0, iv_rule)]) + selectors = fs.build_interval_selectors(feat_iv, [(0, 0, iv_rule, Wildcard())]) match_tuple = (selectors[feat_iv][0],) feat = {(feat_id, strand_to_bool(strand), match_tuple)} @@ -398,19 +398,19 @@ def test_build_interval_selectors_grouping(self): fs = FeatureSelector(deepcopy(rules_template)) iv = HTSeq.GenomicInterval('I', 10, 20, '+') - match_tuples = [('n/a', 'n/a', 'partial'), - ('n/a', 'n/a', 'nested'), - ('n/a', 'n/a', 'exact'), - ('n/a', 'n/a', "5' anchored"), - ('n/a', 'n/a', "3' anchored"), + match_tuples = [('n/a', 'n/a', 'partial', 'n/a'), + ('n/a', 'n/a', 'nested', 'n/a'), + ('n/a', 'n/a', 'exact', 'n/a'), + ('n/a', 'n/a', "5' anchored", 'n/a'), + ('n/a', 'n/a', "3' anchored", 'n/a'), # iv_shifted_1 Shift values: - ('n/a', 'n/a', 'partial, -5, 5'), # 5': -5 3': 5 - ('n/a', 'n/a', 'nested, -5, 5'), # 5': -5 3': 5 + ('n/a', 'n/a', 'partial, -5, 5', 'n/a'), # 5': -5 3': 5 + ('n/a', 'n/a', 'nested, -5, 5', 'n/a'), # 5': -5 3': 5 # iv_shifted_2 - ('n/a', 'n/a', 'exact, -10, 10'), # 5': -10 3': 10 + ('n/a', 'n/a', 'exact, -10, 10', 'n/a'), # 5': -10 3': 10 # iv_shifted_3 - ('n/a', 'n/a', "5' anchored, -1, -1"), # 5': -1 3': -1 - ('n/a', 'n/a', "3' anchored, -1, -1")] # 5': -1 3': -1 + ('n/a', 'n/a', "5' anchored, -1, -1", 'n/a'), # 5': -1 3': -1 + ('n/a', 'n/a', "3' anchored, -1, -1", 'n/a')] # 5': -1 3': -1 iv_shifted_1 = HTSeq.GenomicInterval('I', 5, 25, '+') iv_shifted_2 = HTSeq.GenomicInterval('I', 0, 30, '+') diff --git 
a/tests/unit_tests_hts_parsing.py b/tests/unit_tests_hts_parsing.py index 587fdee0..4af12415 100644 --- a/tests/unit_tests_hts_parsing.py +++ b/tests/unit_tests_hts_parsing.py @@ -15,6 +15,7 @@ import unit_test_helpers as helpers resources = "./testdata/counter" +wc = Wildcard() # used in many tests as a 'n/a' value # To run all test suites if __name__ == '__main__': @@ -44,8 +45,8 @@ def test_sam_reader(self): self.assertEqual(sam_record['Start'], 15064569) self.assertEqual(sam_record['End'], 15064590) self.assertEqual(sam_record['Strand'], False) - self.assertEqual(sam_record['Name'], b"0_count=5") - self.assertEqual(sam_record['Seq'], b"CAAGACAGAGCTTCACCGTTC") + self.assertEqual(sam_record['Name'], "0_count=5") + self.assertEqual(sam_record['Seq'], "CAAGACAGAGCTTCACCGTTC") self.assertEqual(sam_record['Length'], 21) self.assertEqual(sam_record['nt5end'], 'G') @@ -68,13 +69,13 @@ def test_sam_parser_comparison(self): self.assertEqual(our['Chrom'], their.iv.chrom) self.assertEqual(our['Start'], their.iv.start) self.assertEqual(our['End'], their.iv.end) - self.assertEqual(our['Name'].decode(), their.read.name) + self.assertEqual(our['Name'], their.read.name) self.assertEqual(our['nt5end'], chr(their.read.seq[0])) # See note above self.assertEqual(our['Strand'], helpers.strand_to_bool(their.iv.strand)) if our['Strand'] is False: # See note above - self.assertEqual(our['Seq'][::-1].translate(helpers.complement), their.read.seq) + self.assertEqual(our['Seq'][::-1].translate(helpers.complement), their.read.seq.decode()) else: - self.assertEqual(our['Seq'], their.read.seq) + self.assertEqual(our['Seq'], their.read.seq.decode()) """Does SAM_reader._get_decollapsed_filename() create an appropriate filename?""" @@ -86,38 +87,60 @@ def test_SAM_reader_get_decollapsed_filename(self): self.assertEqual(sam_out, "sam_file_decollapsed.sam") - """Does SAM_reader._read_to_first_aln() correctly identify header lines and write them to the decollapsed file?""" + """Does 
SAM_reader._new_bundle report the correct read count for different collapser types?""" - def test_SAM_reader_read_thru_header(self): + def test_SAM_reader_new_bundle(self): + qnames = ["0_count=3", "seq0_x5", "non-collapsed"] + counts = [3, 5, 1] + + reader = SAM_reader() + reader._header_dict = {'HD': {'SO': 'queryname'}} + + for qname, expected in zip(qnames, counts): + reader._determine_collapser_type(qname) + _, read_count = reader._new_bundle({'Name': qname}) + self.assertEqual(read_count, expected) + + """Does SAM_reader._gather_metadata() correctly identify metadata and write the decollapsed file header?""" + + def test_SAM_reader_gather_metadata(self): reader = SAM_reader(decollapse=True) reader._decollapsed_filename = "mock_outfile_name.sam" - with open(self.short_sam_file, 'rb') as sam_in: - with patch('builtins.open', mock_open()) as sam_out: - line = reader._read_to_first_aln(sam_in) + sam_in = pysam.AlignmentFile(self.short_sam_file) + with patch('builtins.open', mock_open()) as sam_out: + reader._gather_metadata(sam_in) expected_writelines = [ call('mock_outfile_name.sam', 'w'), call().__enter__(), - call().writelines(["@SQ SN:I LN:21\n"]), + call().write("@SQ\tSN:I\tLN:21\n"), call().__exit__(None, None, None) ] sam_out.assert_has_calls(expected_writelines) - self.assertTrue(len(reader._header_lines) == 1) + self.assertEqual(reader.collapser_type, 'tiny-collapse') + self.assertDictEqual(reader._header_dict, {'SQ': [{'SN': "I", 'LN': 21}]}) + self.assertEqual(reader.references, ('I',)) + self.assertTrue(reader.has_nm) """Does SAM_reader._write_decollapsed_sam() write the correct number of duplicates to the decollapsed file?""" def test_SAM_reader_write_decollapsed_sam(self): + header = pysam.AlignmentHeader() + alignment = pysam.AlignedSegment(header) + alignment.query_name = "0_count=5" + reader = SAM_reader(decollapse=True) reader.collapser_type = "tiny-collapse" - reader._decollapsed_reads = [(b"0_count=5", b"mock line from SAM file")] + 
reader._collapser_token = "=" + reader._decollapsed_reads = [alignment] reader._decollapsed_filename = "mock_outfile_name.sam" expected_writelines = [ - call('mock_outfile_name.sam', 'ab'), + call('mock_outfile_name.sam', 'a'), call().__enter__(), - call().writelines([b"mock line from SAM file"] * 5), + call().write('\n'.join([alignment.to_string()] * 5)), call().__exit__(None, None, None) ] @@ -130,37 +153,28 @@ def test_SAM_reader_write_decollapsed_sam(self): """Does SAM_reader._parse_alignments() save lines and write them to the decollapsed file when appropriate?""" def test_SAM_reader_parse_alignments_decollapse(self): - with patch.object(SAM_reader, "_write_decollapsed_sam") as write_fn, \ - patch('tiny.rna.counter.hts_parsing.open', new_callable=mock_open) as mopen: + with patch.object(SAM_reader, "_write_decollapsed_sam") as write_fn: + # Set up SAM_reader class reader = SAM_reader(decollapse=True) - reader._decollapsed_reads = [0] * 99999 # At 100,001, buffer will be written - reader.file = self.short_sam_file # File with single alignment - - with open(self.short_sam_file, 'rb') as sam_in: - self.exhaust_iterator(reader._parse_alignments(sam_in)) - write_fn.assert_not_called() - - # Rewind and add one more alignment to push it over threshold - sam_in.seek(0) - self.exhaust_iterator(reader._parse_alignments(sam_in)) - write_fn.assert_called_once() - - """Does SAM_reader report a single read count for non-collapsed SAM records?""" - - def test_SAM_reader_single_readcount_non_collapsed_SAM(self): - # Read non-collapsed.sam but duplicate its single record twice - with open(f"{resources}/non-collapsed.sam", 'rb') as f: - sam_lines = f.readlines() - sam_lines.extend([sam_lines[1]] * 2) - mock_file = mock_open(read_data=b''.join(sam_lines)) - - with patch('tiny.rna.counter.hts_parsing.open', new=mock_file): - reader = SAM_reader() - bundle, read_count = next(reader.bundle_multi_alignments('mock_file')) - - self.assertEqual(bundle[0]['Name'], 
b'NON_COLLAPSED_QNAME') - self.assertEqual(len(bundle), 3) - self.assertEqual(read_count, 1) + reader.collapser_type = "tiny-collapse" + reader._decollapsed_reads = buffer = [0] * 99999 # At 100,001, expect buffer to be written + reader.file = self.short_sam_file # File with single alignment + + # Set up AlignmentIter class + sam_in = pysam.AlignmentFile(reader.file) + callback = reader._write_decollapsed_sam + first_aln_offset = sam_in.tell() + has_nm = True + aln_iter = AlignmentIter(sam_in, has_nm, callback, buffer) + + # Add 100,000th alignment to the buffer + self.exhaust_iterator(aln_iter) + write_fn.assert_not_called() + + # Rewind and add one more alignment to push it over threshold + sam_in.seek(first_aln_offset) + self.exhaust_iterator(aln_iter) + write_fn.assert_called_once() """Are decollapsed outputs skipped when non-collapsed SAM files are supplied?""" @@ -256,7 +270,7 @@ def test_ref_tables_single_feature(self): steps = list(feats[iv].array[iv.start:iv.end].get_steps(values_only=True)) self.assertEqual((type(feats), type(alias), type(tags)), (HTSeq.GenomicArrayOfSets, dict, defaultdict)) - self.assertEqual(steps, [{(("Gene:WBGene00023193", 'tag'), False, ((1, 2, IntervalPartialMatch(iv)),))}]) + self.assertEqual(steps, [{(("Gene:WBGene00023193", 'tag'), False, ((1, 2, IntervalPartialMatch(iv), wc),))}]) self.assertEqual(alias, {"Gene:WBGene00023193": ('Y74C9A.6',)}) self.assertDictEqual(tags, {"Gene:WBGene00023193": {('Gene:WBGene00023193', 'tag')}}) @@ -280,7 +294,7 @@ def test_ref_tables_single_feature_all_features_false(self): steps = list(feats[iv].array[iv.start:iv.end].get_steps(values_only=True)) self.assertEqual((type(feats), type(alias), type(tags)), (HTSeq.GenomicArrayOfSets, dict, defaultdict)) - self.assertEqual(steps, [{(("Gene:WBGene00023193", 'tag'), False, ((1, 2, IntervalPartialMatch(iv)),))}]) + self.assertEqual(steps, [{(("Gene:WBGene00023193", 'tag'), False, ((1, 2, IntervalPartialMatch(iv), wc),))}]) self.assertDictEqual(alias, 
{"Gene:WBGene00023193": ('Y74C9A.6',)}) self.assertDictEqual(tags, {"Gene:WBGene00023193": {('Gene:WBGene00023193', 'tag')}}) @@ -365,7 +379,7 @@ def test_ref_tables_identity_matches_multisource_concat(self): expected_matches = [ set(), - {(('Gene:WBGene00023193', ''), False, ((0, 1, ivm), (1, 2, ivm), (2, 3, ivm)))}, + {(('Gene:WBGene00023193', ''), False, ((0, 1, ivm, wc), (1, 2, ivm, wc), (2, 3, ivm, wc)))}, set() ] @@ -466,10 +480,10 @@ def test_ref_tables_discontinuous_identity_matches(self): sib_ivs = [HTSeq.GenomicInterval('I', 99, 110, '-'), HTSeq.GenomicInterval('I', 110, 120, '-'), HTSeq.GenomicInterval('I', 139, 150, '-')] - rule1_gp = {f"{iv.start}:{iv.end}": ((0, 2, IntervalPartialMatch(iv)),) for iv in gp_ivs} - rule1_p2 = {f"{iv.start}:{iv.end}": ((0, 2, IntervalPartialMatch(iv)),) for iv in p2_ivs} - rule2_sib = {f"{iv.start}:{iv.end}": (1, 3, IntervalPartialMatch(iv)) for iv in sib_ivs} - rule3_sib = {f"{iv.start}:{iv.end}": (2, 0, IntervalPartialMatch(iv)) for iv in sib_ivs} + rule1_gp = {f"{iv.start}:{iv.end}": ((0, 2, IntervalPartialMatch(iv), wc),) for iv in gp_ivs} + rule1_p2 = {f"{iv.start}:{iv.end}": ((0, 2, IntervalPartialMatch(iv), wc),) for iv in p2_ivs} + rule2_sib = {f"{iv.start}:{iv.end}": (1, 3, IntervalPartialMatch(iv), wc) for iv in sib_ivs} + rule3_sib = {f"{iv.start}:{iv.end}": (2, 0, IntervalPartialMatch(iv), wc) for iv in sib_ivs} # For tables that store features in tagged form GrandParent, Parent2, Sibling = ('GrandParent',''), ('Parent2',''), ('Sibling','') @@ -502,7 +516,7 @@ def test_ref_tables_source_filter(self): child2_iv = HTSeq.GenomicInterval('I', 39, 50, '-') exp_alias = {'Child2': ('Child2Name',)} - exp_feats = [set(), {(('Child2', ''), False, ((0, 0, IntervalPartialMatch(child2_iv)),))}, set()] + exp_feats = [set(), {(('Child2', ''), False, ((0, 0, IntervalPartialMatch(child2_iv), wc),))}, set()] exp_intervals = {'Child2': [child2_iv]} exp_classes = {'Child2': ('NA',)} exp_filtered = {"GrandParent", 
"ParentWithGrandparent", "Parent2", "Child1", "Sibling"} @@ -527,7 +541,7 @@ def test_ref_tables_type_filter(self): child1_iv = HTSeq.GenomicInterval('I', 29, 40, '-') exp_alias = {'Child1': ('SharedName',)} - exp_feats = [set(), {(('Child1', ''), False, ((0, 0, IntervalPartialMatch(child1_iv)),))}, set()] + exp_feats = [set(), {(('Child1', ''), False, ((0, 0, IntervalPartialMatch(child1_iv), wc),))}, set()] exp_intervals = {'Child1': [child1_iv]} exp_classes = {'Child1': ('NA',)} exp_filtered = {"GrandParent", "ParentWithGrandparent", "Parent2", "Child2", "Sibling"} @@ -574,8 +588,8 @@ def test_ref_tables_tagged_match_single(self): iv = IntervalPartialMatch(HTSeq.GenomicInterval('n/a', 3746, 3909)) expected_feats = [ set(), { - ((feat_id, 'tagged_match'), False, ((0, 1, iv),)), - ((feat_id, ''), False, ((1, 2, iv),)) + ((feat_id, 'tagged_match'), False, ((0, 1, iv, wc),)), + ((feat_id, ''), False, ((1, 2, iv, wc),)) }, set() ] @@ -606,12 +620,12 @@ def test_ref_tables_tagged_match_merging(self): Child2_iv = IntervalPartialMatch(HTSeq.GenomicInterval('n/a', 39, 50)) expected_feats = [ set(), { - (('Parent2', 'shared'), False, ((0, 1, Parent2_iv), (1, 2, Parent2_iv))), - (('Parent2', ''), False, ((2, 3, Parent2_iv),)), + (('Parent2', 'shared'), False, ((0, 1, Parent2_iv, wc), (1, 2, Parent2_iv, wc))), + (('Parent2', ''), False, ((2, 3, Parent2_iv, wc),)), }, set(), { - (('Parent2', 'shared'), False, ((0, 1, Child2_iv), (1, 2, Child2_iv))), - (('Parent2', ''), False, ((2, 3, Child2_iv),)) + (('Parent2', 'shared'), False, ((0, 1, Child2_iv, wc), (1, 2, Child2_iv, wc))), + (('Parent2', ''), False, ((2, 3, Child2_iv, wc),)) }, set() ] @@ -651,7 +665,7 @@ def get_steps(rs, chrom): def test_add_reference_seq_single(self): seq_id = "seq" seq_len = 10 - matches = {'': [(0, 0, "partial")]} + matches = {'': [(0, 0, "partial", wc)]} rs = self.ReferenceSeqs_add(seq_id, seq_len, matches) actual = self.get_steps(rs, seq_id) @@ -660,7 +674,7 @@ def 
test_add_reference_seq_single(self): # and the overlap selector should have the same interval. # For these selectors, same interval on both strands. iv = HTSeq.GenomicInterval(seq_id, 0, seq_len) - match_tuple = ((0, 0, IntervalPartialMatch(iv)),) + match_tuple = ((0, 0, IntervalPartialMatch(iv), wc),) expected = { (0, 10): { @@ -677,7 +691,7 @@ def test_add_reference_seq_single(self): def test_add_reference_seq_shared_classifier(self): seq_id = "seq" seq_len = 10 - matches = {'shared': [(0, 0, "partial"), (1, 1, "nested")]} + matches = {'shared': [(0, 0, "partial", wc), (1, 1, "nested", wc)]} rs = self.ReferenceSeqs_add(seq_id, seq_len, matches) actual = self.get_steps(rs, seq_id) @@ -686,7 +700,7 @@ def test_add_reference_seq_shared_classifier(self): # and the overlap selector should have the same interval. # For these selectors, same interval on both strands. iv = HTSeq.GenomicInterval(seq_id, 0, seq_len) - match_tuples = (0, 0, IntervalPartialMatch(iv)), (1, 1, IntervalNestedMatch(iv)) + match_tuples = (0, 0, IntervalPartialMatch(iv), wc), (1, 1, IntervalNestedMatch(iv), wc) expected = { (0, 10): { @@ -704,7 +718,7 @@ def test_add_reference_seq_shared_classifier(self): def test_add_reference_seq_shared_iv(self): seq_id = "seq" seq_len = 10 - matches = {'exact': [(0, 0, "exact, 2, -2")], 'nested': [(1, 1, "nested, 2, -2")]} + matches = {'exact': [(0, 0, "exact, 2, -2", wc)], 'nested': [(1, 1, "nested, 2, -2", wc)]} rs = self.ReferenceSeqs_add(seq_id, seq_len, matches) actual = self.get_steps(rs, seq_id) @@ -713,8 +727,8 @@ def test_add_reference_seq_shared_iv(self): # and the overlap selector should have the same interval. # For these selectors, same interval on both strands. 
iv = HTSeq.GenomicInterval(seq_id, 2, seq_len - 2) - match_exact = ((0, 0, IntervalExactMatch(iv)),) - match_nested = ((1, 1, IntervalNestedMatch(iv)),) + match_exact = ((0, 0, IntervalExactMatch(iv), wc),) + match_nested = ((1, 1, IntervalNestedMatch(iv), wc),) expected = { (0, 2): set(), @@ -737,8 +751,8 @@ def test_add_reference_seq_complex(self): seq_id = "seq" seq_len = 10 matches = { - "class1": [(0, 0, "nested, 1, -1"), (0, 0, "exact, 5, 0")], - "class2": [(0, 0, "5' anchored, 5, 0")] + "class1": [(0, 0, "nested, 1, -1", wc), (0, 0, "exact, 5, 0", wc)], + "class2": [(0, 0, "5' anchored, 5, 0", wc)] } rs = self.ReferenceSeqs_add(seq_id, seq_len, matches) @@ -746,19 +760,19 @@ def test_add_reference_seq_complex(self): # Since nested shift is symmetric, iv is same on both strands iv_n = HTSeq.GenomicInterval(seq_id, 1, seq_len-1) - match_nested = ((0, 0, IntervalNestedMatch(iv_n)),) + match_nested = ((0, 0, IntervalNestedMatch(iv_n), wc),) # Since exact and anchored shift is asymmetric and by the same # amount, iv differs per strand but is shared by both selectors. 
# If they both had the same classifier then these match tuples # would share the same record tuple on both strands iv_e5_s = HTSeq.GenomicInterval(seq_id, 5, seq_len, '+') - match_exact_sense = ((0, 0, IntervalExactMatch(iv_e5_s)),) - match_5anch_sense = ((0, 0, Interval5pMatch(iv_e5_s)),) + match_exact_sense = ((0, 0, IntervalExactMatch(iv_e5_s), wc),) + match_5anch_sense = ((0, 0, Interval5pMatch(iv_e5_s), wc),) iv_e5_a = HTSeq.GenomicInterval(seq_id, 0, seq_len-5, '-') - match_exact_antis = ((0, 0, IntervalExactMatch(iv_e5_a)),) - match_5anch_antis = ((0, 0, Interval5pMatch(iv_e5_a)),) + match_exact_antis = ((0, 0, IntervalExactMatch(iv_e5_a), wc),) + match_5anch_antis = ((0, 0, Interval5pMatch(iv_e5_a), wc),) expected = { (0, 1): { diff --git a/tests/unit_tests_matching.py b/tests/unit_tests_matching.py index 25f618dd..aa335349 100644 --- a/tests/unit_tests_matching.py +++ b/tests/unit_tests_matching.py @@ -20,15 +20,15 @@ def test_feature_selector_interval_build(self): iv = HTSeq.GenomicInterval('I', 0, 10, '+') # Match tuples formed during GFF parsing - match_tuples = [('n/a', 'n/a', 'partial'), - ('n/a', 'n/a', 'nested'), - ('n/a', 'n/a', 'exact'), - ('n/a', 'n/a', 'anchored'), - ('n/a', 'n/a', "5' anchored"), - ('n/a', 'n/a', "3' anchored"), - ('n/a', 'n/a', "5'anchored"), # spaces should be optional - ('n/a', 'n/a', "3'anchored")] - match_tuples += [('n/a', 'n/a', kwd) for kwd in Wildcard.kwds] + match_tuples = [('n/a', 'n/a', 'partial', Wildcard()), + ('n/a', 'n/a', 'nested', Wildcard()), + ('n/a', 'n/a', 'exact', Wildcard()), + ('n/a', 'n/a', 'anchored', Wildcard()), + ('n/a', 'n/a', "5' anchored", Wildcard()), + ('n/a', 'n/a', "3' anchored", Wildcard()), + ('n/a', 'n/a', "5'anchored", Wildcard()), # spaces should be optional + ('n/a', 'n/a', "3'anchored", Wildcard())] + match_tuples += [('n/a', 'n/a', kwd, Wildcard()) for kwd in Wildcard.kwds] result = fs.build_interval_selectors(iv, match_tuples) diff --git a/tests/unit_tests_validation.py 
b/tests/unit_tests_validation.py index 7df9cac5..32a0845f 100644 --- a/tests/unit_tests_validation.py +++ b/tests/unit_tests_validation.py @@ -6,7 +6,7 @@ from glob import glob from unittest.mock import patch, mock_open -from rna.counter.validation import GFFValidator, ReportFormatter, SamSqValidator +from tiny.rna.counter.validation import GFFValidator, ReportFormatter, SamSqValidator class GFFValidatorTest(unittest.TestCase): @@ -315,6 +315,23 @@ def make_parsed_sq_header(self, chroms): {'LN': chrom[1]} for chrom in chroms] + """Does read_sq_headers() return the expected data?""" + + def test_read_sq_headers(self): + sam_file = 'testdata/counter/validation/sam/sq_headers.sam' + expected = { + sam_file: [ + {'SN': 'I', 'LN': 123}, + {'SN': 'II', 'LN': 456}, + {'SN': 'III'} + ], + } + + validator = SamSqValidator([sam_file]) + validator.read_sq_headers() + + self.assertDictEqual(validator.sq_headers, expected) + """Does SamSqValidator correctly identify missing @SQ headers?""" def test_missing_sq_headers(self): diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index 2f0308da..20c49984 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -771,6 +771,7 @@ class CSVReader(csv.DictReader): "5' End Nucleotide": "nt5end", "Length": "Length", "Overlap": "Overlap", + "Mismatches": "Mismatch" }), "Samples Sheet": OrderedDict({ "FASTQ/SAM Files": "File", @@ -858,9 +859,13 @@ def validate_csv_header(self, header: OrderedDict): self.fieldnames = tuple(doc_ref_lowercase[key] for key in header_lowercase.values()) def check_backward_compatibility(self, header_vals): + """Catch column differences from old versions so a helpful error can be provided""" + + header_vals_lc = {val.lower() for val in header_vals} + compat_errors = [] if self.doctype == "Features Sheet": - if len(header_vals & {'alias by...', 'feature source'}): + if len(header_vals_lc & {'alias by...', 'feature source'}): compat_errors.append('\n'.join([ "It looks like you're using 
a Features Sheet from an earlier version of", "tinyRNA. Feature aliases and GFF files are now defined in the Paths File.", @@ -869,7 +874,7 @@ def check_backward_compatibility(self, header_vals): "your Features Sheet to avoid this error." ])) - if len(header_vals & {'source filter', 'type filter'}) != 2: + if len(header_vals_lc & {'source filter', 'type filter'}) != 2: compat_errors.append('\n'.join([ "It looks like you're using a Features Sheet from an earlier version of", "tinyRNA. Source and type filters are now defined in the Features Sheet.", @@ -878,7 +883,7 @@ def check_backward_compatibility(self, header_vals): '"Source Filter" and "Type Filter" to your Features Sheet to avoid this error.' ])) - if 'tag' in header_vals: + if 'tag' in header_vals_lc: compat_errors.append('\n'.join([ "It looks like you're using a Features Sheet from a version of tinyRNA", 'that offered "tagged counting". The "Tag" header has been repurposed as a feature', @@ -888,6 +893,14 @@ def check_backward_compatibility(self, header_vals): '"Classify as..." to avoid this error.' ])) + if 'mismatches' not in header_vals_lc: + compat_errors.append('\n'.join([ + "It looks like you're using a Features Sheet from an earlier version of", + 'tinyRNA. An additional column, "Mismatches", is now expected. Please review', + "the Stage 2 section in tiny-count's documentation for more info, then add", + "the new column to your Features Sheet to avoid this error." 
+ ])) + if compat_errors: raise ValueError('\n\n'.join(compat_errors)) diff --git a/tiny/rna/counter/features.py b/tiny/rna/counter/features.py index 5eb40d16..512292bd 100644 --- a/tiny/rna/counter/features.py +++ b/tiny/rna/counter/features.py @@ -2,17 +2,19 @@ import sys from collections import defaultdict -from typing import List, Tuple, Set, Dict, Mapping +from typing import List, Tuple, Set, Dict, Mapping, Union -from tiny.rna.counter.hts_parsing import SAM_reader, ReferenceFeatures, ReferenceSeqs +from tiny.rna.counter.hts_parsing import ReferenceFeatures, ReferenceSeqs, SAM_reader from .statistics import LibraryStats from .matching import * from ..util import append_to_exception # Type aliases for human readability -match_tuple = Tuple[int, int, IntervalSelector] # (rank, rule, IntervalSelector) -unbuilt_match_tuple = Tuple[int, int, str] # (rank, rule, interval selector keyword) -feature_record_tuple = Tuple[str, str, Tuple[match_tuple]] # (feature ID, strand, match tuple) +interval_selector = Union[IntervalSelector, Wildcard] +mismatch_selector = Union[NumericalMatch, Wildcard] +match_tuple = Tuple[int, int, interval_selector, mismatch_selector] # (rank, rule, IntervalSelector, NumericalMatch) +unbuilt_match_tuple = Tuple[int, int, str, mismatch_selector] # (rank, rule, overlap, NumericalMatch) +feature_record_tuple = Tuple[str, str, Tuple[match_tuple]] # (feature ID, strand, match tuple) class Features(metaclass=Singleton): @@ -89,14 +91,12 @@ class FeatureSelector: The first round of selection was performed during GFF parsing. - The second round is performed against the hierarchy values and - IntervalSelectors in each candidate's match-tuples. + The second round is performed against the overlap and mismatch selectors + which are stored in each candidate's match tuples. These matches are then + sorted by hierarchy value. 
If more than one candidate remains, a final round of selection is performed - against sequence attributes: strand, 5' end nucleotide, and length. Rules for - 5' end nucleotides support lists (e.g. C,G,U) and wildcards (e.g. "all"). - Rules for length support lists, wildcards, and ranges (e.g. 20-27) which - may be intermixed in the same rule. + against sequence attributes: strand, 5' end nucleotide, and length. """ rules_table: List[dict] @@ -122,7 +122,7 @@ def choose(cls, candidates: Set[feature_record_tuple], alignment: dict) -> Mappi ( featureID, strand, ( match_tuple, ... ) ) Each match_tuple represents a rule which matched the feature on identity: - ( rule, rank, IntervalSelector ) + ( rule, rank, IntervalSelector, NumericalSelector ) Args: candidates: a list of tuples, each representing features associated with @@ -136,8 +136,8 @@ def choose(cls, candidates: Set[feature_record_tuple], alignment: dict) -> Mappi # Stage 2 hits = [(rank, rule, feat, strand) for feat, strand, matches in candidates - for rule, rank, iv_match in matches - if alignment in iv_match] + for rule, rank, iv_match, nm_match in matches + if alignment in iv_match and alignment['NM'] in nm_match] if not hits: return {} hits.sort(key=lambda x: x[0]) @@ -178,6 +178,7 @@ def build_selectors(rules_table) -> List[dict]: "Strand": StrandMatch, "nt5end": NtMatch, "Length": NumericalMatch, + "Mismatch": NumericalMatch, "Identity": lambda x:x } @@ -261,7 +262,7 @@ def build_interval_selectors(self, iv: 'HTSeq.GenomicInterval', match_tuples: Li continue # Replace the match tuple's definition with selector - built_match_tuple = (match[0], match[1], selector_obj) + built_match_tuple = (match[0], match[1], selector_obj, match[3]) matches_by_interval[match_iv].append(built_match_tuple) return matches_by_interval diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index 3b28f265..469e184f 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -1,6 
+1,7 @@ import functools import os.path import HTSeq +import pysam import sys import re @@ -10,6 +11,7 @@ from urllib.parse import unquote from inspect import stack +from tiny.rna.counter.parsing import AlignmentIter from tiny.rna.counter.matching import Wildcard from tiny.rna.util import report_execution_time, make_filename, ReportFormatter, append_to_exception @@ -18,14 +20,10 @@ _re_attr_empty = re.compile(r"^\s*$") # For SAM_reader -AlignmentDict = Dict[str, Union[str, int, bytes]] +AlignmentDict = Dict[str, Union[str, int]] Bundle = Tuple[List[AlignmentDict], int] _re_tiny = r"\d+_count=\d+" _re_fastx = r"seq\d+_x(\d+)" -_re_header = re.compile(r"^@(HD|SQ|RG|PG)(\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$|^@CO\t.*") - -# For Alignment -complement = {ord('A'): 'T', ord('T'): 'A', ord('G'): 'C', ord('C'): 'G'} class SAM_reader: @@ -35,28 +33,36 @@ def __init__(self, **prefs): self.decollapse = prefs.get("decollapse", False) self.out_prefix = prefs.get("out_prefix", None) self.collapser_type = None + self.references = [] + self.has_nm = None self.file = None + self._collapser_token = None + self._header_dict = {} + self._header = None + self._iter = None + + self._decollapsed_callback = self._write_decollapsed_sam if self.decollapse else None self._decollapsed_filename = None self._decollapsed_reads = [] - self._header_lines = [] - self._header_dict = defaultdict(list) def bundle_multi_alignments(self, file: str) -> Iterator[Bundle]: """Bundles multi-alignments (determined by a shared QNAME) and reports the associated read's count""" + pysam_reader = pysam.AlignmentFile(file) self.file = file - with open(file, 'rb') as f: - aln_iter = iter(self._parse_alignments(f)) - bundle, read_count = self._new_bundle(next(aln_iter)) - - for aln in aln_iter: - if aln['Name'] != bundle[0]['Name']: - yield bundle, read_count - bundle, read_count = self._new_bundle(aln) - else: - bundle.append(aln) - yield bundle, read_count + + self._gather_metadata(pysam_reader) + self._iter = 
AlignmentIter(pysam_reader, self.has_nm, self._decollapsed_callback, self._decollapsed_reads) + bundle, read_count = self._new_bundle(next(self._iter)) + + for aln in self._iter: + if aln['Name'] != bundle[0]['Name']: + yield bundle, read_count + bundle, read_count = self._new_bundle(aln) + else: + bundle.append(aln) + yield bundle, read_count if self.decollapse and len(self._decollapsed_reads): self._write_decollapsed_sam() @@ -65,111 +71,45 @@ def _new_bundle(self, aln: AlignmentDict) -> Bundle: """Wraps the provided alignment in a list and reports the read's count""" if self.collapser_type is not None: - token = self.collapser_token - count = int(aln['Name'].split(token)[-1]) + token = self._collapser_token + count = int(aln['Name'].rsplit(token, 1)[1]) else: count = 1 return [aln], count - def _parse_alignments(self, file_obj: IO) -> Iterator[AlignmentDict]: - """Parses and yields individual SAM alignments from the open file_obj""" + def _gather_metadata(self, reader: pysam.AlignmentFile) -> None: + """Saves header information, examines the first alignment to determine which collapser utility was used (if any), and whether the alignment has a query sequence and NM tag. 
+ The input file's header is written to the decollapsed output file if necessary.""" - line, line_no = self._read_to_first_aln(file_obj) - decollapse_sam = self.decollapse + header = reader.header + first_aln = next(reader.head(1)) - try: - while line: - line_no += 1 - cols = line.split(b'\t') - - if decollapse_sam: - self._decollapsed_reads.append((cols[0], line)) - if len(self._decollapsed_reads) > 100000: - self._write_decollapsed_sam() - - line = file_obj.readline() # Next line - start = int(cols[3]) - 1 - seq = cols[9] - length = len(seq) - - # Note: we assume sRNA sequencing data is NOT reversely stranded - if (int(cols[1]) & 16): - strand = False # - - try: - nt5 = complement[seq[-1]] - except KeyError: - nt5 = chr(seq[-1]) - else: - strand = True # + - nt5 = chr(seq[0]) - - yield { - "Name": cols[0], - "Length": length, - "Seq": seq, - "nt5end": nt5, - "Chrom": cols[2].decode(), - "Start": start, - "End": start + length, - "Strand": strand - } - except Exception as e: - # Append to error message while preserving exception provenance and traceback - msg = f"Error occurred on line {line_no} of {self.file}" - append_to_exception(e, msg) - raise - - def _read_to_first_aln(self, file_obj: IO) -> Tuple[bytes, int]: - """Advance file_obj past the SAM header and return the first alignment unparsed""" - - lineno = 1 - line = file_obj.readline() - while line[0] == ord('@'): - self._parse_header_line(line.decode('utf-8')) - line = file_obj.readline() - lineno += 1 - - self._determine_collapser_type(line.decode()) - if self.decollapse: self._write_header_for_decollapsed_sam() - - return line, lineno - - def _parse_header_line(self, header_line: str) -> None: - """Parses and saves information from the provided header line""" - - self.validate_header_line(header_line) - self._header_lines.append(header_line) - - # Header dict - rec_type = header_line[:3] - fields = header_line[3:].strip().split('\t') - - if rec_type == "@CO": - 
self._header_dict[rec_type].append(fields[0]) - else: - self._header_dict[rec_type].append( - {field[:2]: field[3:].strip() for field in fields} - ) - - def validate_header_line(self, line): - if not re.match(_re_header, line): - msg = "Invalid SAM header" - msg += f" in {os.path.basename(self.file)}" if self.file else "" - msg += ':\n\t' + f'"{line}"' - raise ValueError(msg) - - def _determine_collapser_type(self, first_aln_line: str) -> None: + if first_aln.query_sequence is None: + raise ValueError("SAM alignments must include the read sequence.") + + self._header = header + self._header_dict = header.to_dict() # performs validation + self._determine_collapser_type(first_aln.query_name) + self.has_nm = first_aln.has_tag("NM") + self.references = header.references + + if self.decollapse: + self._write_header_for_decollapsed_sam(str(header)) + + def _determine_collapser_type(self, qname: str) -> None: """Attempts to determine the collapsing utility that was used before producing the input alignment file, then checks basic requirements for the utility's outputs.""" - if re.match(_re_tiny, first_aln_line) is not None: + if re.match(_re_tiny, qname) is not None: self.collapser_type = "tiny-collapse" + self._collapser_token = "=" - elif re.match(_re_fastx, first_aln_line) is not None: + elif re.match(_re_fastx, qname) is not None: self.collapser_type = "fastx" + self._collapser_token = "_x" - sort_order = self._header_dict.get('@HD', [{}])[0].get('SO', None) + sort_order = self._header_dict.get('HD', {}).get('SO', None) if sort_order is None or sort_order != "queryname": raise ValueError("SAM files from fastx collapsed outputs must be sorted by queryname\n" "(and the @HD [...] 
SO header must be set accordingly).") @@ -189,12 +129,12 @@ def _get_decollapsed_filename(self) -> str: self._decollapsed_filename = make_filename([basename, "decollapsed"], ext='.sam') return self._decollapsed_filename - def _write_header_for_decollapsed_sam(self) -> None: - """Writes the input file's header lines to the decollapsed output file""" + def _write_header_for_decollapsed_sam(self, header_str) -> None: + """Writes the provided header to the decollapsed output file""" assert self.collapser_type is not None with open(self._get_decollapsed_filename(), 'w') as f: - f.writelines(self._header_lines) + f.write(header_str) def _write_decollapsed_sam(self) -> None: """Expands the collapsed alignments in the _decollapsed_reads buffer @@ -202,31 +142,19 @@ def _write_decollapsed_sam(self) -> None: assert self.collapser_type is not None aln_out, prev_name, seq_count = [], None, 0 - for name, line in self._decollapsed_reads: + for aln in self._decollapsed_reads: # Parse count just once per multi-alignment + name = aln.query_name if name != prev_name: - seq_count = int(name.split(self.collapser_token)[-1]) + seq_count = int(name.rsplit(self._collapser_token, 1)[1]) - aln_out.extend([line] * seq_count) + aln_out.extend([aln.to_string()] * seq_count) prev_name = name - with open(self._get_decollapsed_filename(), 'ab') as sam_o: - sam_o.writelines(aln_out) + with open(self._get_decollapsed_filename(), 'a') as sam_o: + sam_o.write('\n'.join(aln_out)) self._decollapsed_reads.clear() - @property - def collapser_token(self) -> bytes: - """Returns the split token to be used for determining read count from the QNAME field""" - - if self._collapser_token is None: - self._collapser_token = { - "tiny-collapse": b"=", - "fastx": b"_x", - "BioSeqZip": b":" - }[self.collapser_type] - - return self._collapser_token - def infer_strandedness(sam_file: str, intervals: dict) -> str: """Infers strandedness from a sample SAM file and intervals from a parsed GFF file @@ -534,9 +462,9 @@ 
class ReferenceFeatures(ReferenceBase): Children of the root ancestor are otherwise not stored in the reference tables. Match-tuples are created for each Features Sheet rule which matches a feature's attributes. - They are structured as (rank, rule, overlap). "Rank" is the hierarchy value of the matching - rule, "rule" is the index of that rule in FeatureSelector's rules table, and "overlap" is the - IntervalSelector per the rule's overlap requirements. + They are structured as (rule, rank, overlap, mismatches). "Rule" is the index of the matching + rule in FeatureSelector's rules table, "rank" is its hierarchy value, "overlap" is the + IntervalSelector per the rule's overlap requirements, and "mismatches" is its Mismatch selector. Source and type filters allow the user to define acceptable values for columns 2 and 3 of the GFF, respectively. These filters are inclusive (only rows with matching values are parsed), @@ -625,10 +553,10 @@ def get_matches(self, row: HTSeq.GenomicFeature) -> DefaultDict: if self.column_filter_match(row, rule): # Unclassified matches are pooled under '' empty string identity_matches[rule['Class']].add( - (index, rule['Hierarchy'], rule['Overlap']) + (index, rule['Hierarchy'], rule['Overlap'], rule['Mismatch']) ) - # -> identity_matches: {classifier: {(rule, rank, overlap), ...}, ...} + # -> identity_matches: {classifier: {(rule, rank, overlap, mismatch), ...}, ...} return identity_matches def add_feature(self, feature_id: str, row, matches: defaultdict) -> str: @@ -845,7 +773,7 @@ def get_matches(self) -> Dict[str, List[Tuple[int, int, str]]]: matches_by_classifier = defaultdict(list) for idx, rule in enumerate(self.selector.rules_table): - match_tuple = (idx, rule['Hierarchy'], rule['Overlap']) + match_tuple = (idx, rule['Hierarchy'], rule['Overlap'], rule['Mismatch']) matches_by_classifier[rule['Class']].append(match_tuple) return matches_by_classifier diff --git a/tiny/rna/counter/parsing/__init__.py b/tiny/rna/counter/parsing/__init__.py new file mode 100644
index 00000000..9c7015c7 --- /dev/null +++ b/tiny/rna/counter/parsing/__init__.py @@ -0,0 +1 @@ +from .alignments import AlignmentIter \ No newline at end of file diff --git a/tiny/rna/counter/parsing/alignments.pxd b/tiny/rna/counter/parsing/alignments.pxd new file mode 100644 index 00000000..e69de29b diff --git a/tiny/rna/counter/parsing/alignments.pyx b/tiny/rna/counter/parsing/alignments.pyx new file mode 100644 index 00000000..0cb7ce4b --- /dev/null +++ b/tiny/rna/counter/parsing/alignments.pyx @@ -0,0 +1,98 @@ +# cython: language_level = 3 +# cython: profile=False + +cimport cython +from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment +from pysam.libcalignedsegment cimport pysam_bam_get_qname +from pysam.libchtslib cimport bam_aux_get, bam_aux2i, BAM_FUNMAP, BAM_FREVERSE + +from typing import Callable + +cdef tuple cigar_mismatch = (1, 2, 8) + +cdef class AlignmentIter: + + cdef AlignmentFile reader + cdef tuple references + cdef object dc_callback + cdef list dc_queue + cdef bint decollapse + cdef bint has_nm + + def __init__(self, pysam_reader: AlignmentFile, has_nm: bool, dc_callback: Callable, dc_queue: list): + self.reader = pysam_reader + self.references = pysam_reader.header.references + self.decollapse = dc_callback is not None + self.dc_callback = dc_callback + self.dc_queue = dc_queue + self.has_nm = has_nm + + def __iter__(self): + return self + + def __next__(self): + return self._next_alignment() + + cdef dict _next_alignment(self): + cdef: + AlignedSegment aln = next(self.reader) # Equivalent Python API calls: + int flag = aln._delegate.core.flag # aln.flag + str seq = aln.query_sequence + int start = aln._delegate.core.pos # aln.reference_start + int length = aln._delegate.core.l_qseq # aln.query_length + bint strand = (flag & BAM_FREVERSE) == 0 # aln.is_forward + str nt5 = AlignmentIter._get_nt5(seq, strand) + str name = pysam_bam_get_qname(aln._delegate).decode() # aln.query_name + + if (flag & BAM_FUNMAP) != 0: # 
aln.is_unmapped + return self._next_alignment() + + if self.decollapse: + self._append_to_dc_queue(aln) + + if self.has_nm: + mismatches = AlignmentIter._get_mismatches_from_nm(aln) # aln.get_tag("NM") + else: + mismatches = AlignmentIter._get_mismatches_from_cigar(aln) + + return { + "Name": name, + "Length": length, + "Seq": seq, + "nt5end": nt5, + "Chrom": self.references[aln._delegate.core.tid], # aln.reference_name + "Start": start, + "End": start + length, + "Strand": strand, + "NM": mismatches + } + + @staticmethod + cdef _get_mismatches_from_nm(AlignedSegment aln): + nm = bam_aux_get(aln._delegate, b"NM") + if nm == NULL: return 0 # Assume missing tag means NM:i:0 + return bam_aux2i(nm) + + @staticmethod + cdef _get_mismatches_from_cigar(AlignedSegment aln): + # Calculate mismatches using the CIGAR string's I, D, and X operations + return sum(op_len for op, op_len in aln.cigartuples if op in cigar_mismatch) + + @staticmethod + cdef str _get_nt5(str seq, bint strand): + cdef str nt + + if strand: + return seq[0] + else: + nt = seq[-1] + if nt == "A": return "T" + elif nt == "T": return "A" + elif nt == "G": return "C" + elif nt == "C": return "G" + else: return nt + + cdef _append_to_dc_queue(self, AlignedSegment aln): + self.dc_queue.append(aln) + if len(self.dc_queue) > 100_000: + self.dc_callback() \ No newline at end of file diff --git a/tiny/rna/counter/validation.py b/tiny/rna/counter/validation.py index 43fd2af5..a55b488e 100644 --- a/tiny/rna/counter/validation.py +++ b/tiny/rna/counter/validation.py @@ -1,5 +1,6 @@ import functools import subprocess +import pysam import sys import os @@ -341,8 +342,5 @@ def generate_identifier_report(self, duplicate, ambiguous): def read_sq_headers(self): for file in self.sam_files: - with open(file, 'rb') as f: - reader = SAM_reader() - reader._read_to_first_aln(f) - - self.sq_headers[file] = reader._header_dict.get('@SQ', []) \ No newline at end of file + pr = pysam.AlignmentFile(file, check_sq=False) + 
self.sq_headers[file] = pr.header.to_dict().get('SQ', []) diff --git a/tiny/templates/features.csv b/tiny/templates/features.csv index 8ced3c8d..067f78dd 100755 --- a/tiny/templates/features.csv +++ b/tiny/templates/features.csv @@ -1,2 +1,2 @@ -Select for...,with value...,Classify as...,Source Filter,Type Filter,Hierarchy,Strand,5' End Nucleotide,Length,Overlap +Select for...,with value...,Classify as...,Source Filter,Type Filter,Hierarchy,Overlap,Mismatches,Strand,5' End Nucleotide,Length ,,,,,,,,, \ No newline at end of file