diff --git a/START_HERE/paths.yml b/START_HERE/paths.yml index 1bda5099..782f2fb3 100644 --- a/START_HERE/paths.yml +++ b/START_HERE/paths.yml @@ -7,7 +7,7 @@ # 1. Fill out the Samples Sheet with files to process + naming scheme. [samples.csv] # 2. Fill out the Features Sheet with selection rules [features.csv] # 3. Set samples_csv and features_csv (below) to point to these files -# 4. Add annotation files and per-file alias preferences to gff_files +# 4. Add annotation files and per-file alias preferences to gff_files (optional) # ######-------------------------------------------------------------------------------###### diff --git a/tests/testdata/config_files/paths.yml b/tests/testdata/config_files/paths.yml index 125e6a76..92cc0da4 100644 --- a/tests/testdata/config_files/paths.yml +++ b/tests/testdata/config_files/paths.yml @@ -7,7 +7,7 @@ # 1. Fill out the Samples Sheet with files to process + naming scheme. [samples.csv] # 2. Fill out the Features Sheet with selection rules [features.csv] # 3. Set samples_csv and features_csv (below) to point to these files -# 4. Add annotation files and per-file alias preferences to gff_files +# 4. Add annotation files and per-file alias preferences to gff_files (optional) # ######-------------------------------------------------------------------------------###### diff --git a/tests/unit_tests_configuration.py b/tests/unit_tests_configuration.py index 5eed377e..70c6fd37 100644 --- a/tests/unit_tests_configuration.py +++ b/tests/unit_tests_configuration.py @@ -286,18 +286,29 @@ def test_validate_required_parameters(self): config[key] = oldval - """Does PathsFile check that at least one GFF file has been provided?""" + """Does PathsFile notify the user of the change in counting style + when no GFF files have been provided?""" - def test_validate_gff_files(self): + def test_no_gff_files(self): config = make_paths_file() + stdout = io.StringIO() - with self.assertRaisesRegex(AssertionError, r".*(At least one GFF).*"): - config['gff_files'] = [{'path': "", 'alias': []}] - config.validate_paths() + gff_configs = [ + [{'path': "", 'alias': []}], + [{'irrelevant': "value"}], + None, + [] + ] - with self.assertRaisesRegex(AssertionError, r".*(At least one GFF).*"): - config['gff_files'] = [{'irrelevant': "value"}] - config.validate_paths() + with contextlib.redirect_stdout(stdout): + for gff_config in gff_configs: + config['gff_files'] = gff_config + config.validate_paths() + config_return = config.get_gff_config() + + self.assertRegex(stdout.getvalue(), r".*(No GFF files).*") + self.assertEqual(config_return, {}) + stdout.truncate(0) """Does PathsFile check for missing files for single entry parameters?""" @@ -350,6 +361,37 @@ def test_pipeline_mapping(self): self.assertEqual(config['empty_path'], "") self.assertEqual(config['none_path'], None) + """Does get_gff_config properly screen for "ID" alias attributes?""" + + def test_get_gff_config_id_alias_attr(self): + config = make_paths_file() + config['gff_files'] = [{'path': "/irrelevant", 'alias': ['ID']}] + + gff_files = config.get_gff_config() + expected = {"/irrelevant": []} + + self.assertDictEqual(gff_files, expected) + + """Does get_gff_config merge aliases if the same GFF file is listed more than once? + Are duplicates also removed and original order preserved?""" + + def test_get_gff_config_merge_alias_attr(self): + config = make_paths_file() + config['gff_files'] = [ + {'path': "/same/path", 'alias': ['attr1', 'attr2']}, + {'path': "/same/path", 'alias': ['attr1', 'attrN']}, + {'path': "/different/path", 'alias': ['attr3']} + ] + + gff_files = config.get_gff_config() + + expected = { + "/same/path": ['attr1', 'attr2', 'attrN'], + "/different/path": ['attr3'] + } + + self.assertDictEqual(gff_files, expected) + class ConfigurationTest(unittest.TestCase): diff --git a/tests/unit_tests_counter.py b/tests/unit_tests_counter.py index 0110ab50..20d9107c 100644 --- a/tests/unit_tests_counter.py +++ b/tests/unit_tests_counter.py @@ -231,40 +231,6 @@ def test_load_config_rna_to_cDNA(self): self.assertEqual(ruleset[0]['nt5end'], 'T') - """Does load_gff_files properly screen for "ID" alias attributes?""" - - def test_load_gff_files_id_alias_attr(self): - config = helpers.make_paths_file() - config['gff_files'] = [{'path': "/irrelevant", 'alias': ['ID']}] - - # Patch GFFValidator so that it isn't called (not relevant) - with patch('tiny.rna.counter.counter.GFFValidator'): - gff_files = counter.load_gff_files(config, [], {}) - - expected = {"/irrelevant": []} - self.assertDictEqual(gff_files, expected) - - """Does load_gff_files merge aliases if the same GFF file is listed more than once? - Are duplicates also removed and original order preserved?""" - - def test_load_gff_files_merge_alias_attr(self): - config = helpers.make_paths_file() - config['gff_files'] = [ - {'path': "/same/path", 'alias': ['attr1', 'attr2']}, - {'path': "/same/path", 'alias': ['attr1', 'attrN']}, - {'path': "/different/path", 'alias': ['attr3']} - ] - - # Patch GFFValidator so that it isn't called (not relevant) - with patch('tiny.rna.counter.counter.GFFValidator'): - gff_files = counter.load_gff_files(config, [], {}) - - expected = { - "/same/path": ['attr1', 'attr2', 'attrN'], - "/different/path": ['attr3'] - } - - self.assertDictEqual(gff_files, expected) if __name__ == '__main__': unittest.main() diff --git a/tests/unit_tests_hts_parsing.py b/tests/unit_tests_hts_parsing.py index 20d49839..74de20d7 100644 --- a/tests/unit_tests_hts_parsing.py +++ b/tests/unit_tests_hts_parsing.py @@ -130,7 +130,7 @@ def test_gff_attr_list_vals(self): self.assertTrue(all([len(val) == 1 for key, val in attr.items() if key != "Class"])) self.assertEqual(len(attr['Class']), 2) - """Does ReferenceTables.get() return the expected features, aliases, and classes for a single record GFF?""" + """Does ReferenceFeatures.get() return the expected features, aliases, and classes for a single record GFF?""" def test_ref_tables_single_feature(self): feature_source = {self.short_gff_file: ["sequence_name"]} @@ -145,7 +145,7 @@ def test_ref_tables_single_feature(self): iv = HTSeq.GenomicInterval("I", 3746, 3909, "-") kwargs = {'all_features': True} - feats, alias, tags = ReferenceTables(feature_source, feature_selector, **kwargs).get() + feats, alias, tags = ReferenceFeatures(feature_source, **kwargs).get(feature_selector) steps = list(feats[iv].array[iv.start:iv.end].get_steps(values_only=True)) self.assertEqual((type(feats), type(alias), type(tags)), (HTSeq.GenomicArrayOfSets, dict, defaultdict)) @@ -169,7 +169,7 @@ def test_ref_tables_single_feature_all_features_false(self): iv = HTSeq.GenomicInterval("I", 3746, 3909, "-") kwargs = {'all_features': True} - feats, alias, tags = ReferenceTables(feature_source, feature_selector, **kwargs).get() + feats, alias, tags = ReferenceFeatures(feature_source, **kwargs).get(feature_selector) steps = list(feats[iv].array[iv.start:iv.end].get_steps(values_only=True)) self.assertEqual((type(feats), type(alias), type(tags)), (HTSeq.GenomicArrayOfSets, dict, defaultdict)) @@ -189,12 +189,12 @@ def test_ref_tables_missing_name_attribute_all_features_false(self): "This may be due to a lack of features matching 'Select for...with value...'" # Since all_features is False and there are no identity matches, the main loop in - # ReferenceTables.get() skips the steps for recording the feature's alias. + # ReferenceFeatures.get() skips the steps for recording the feature's alias. # Instead, a different exception is raised due to reference tables being empty with self.assertRaisesRegex(ValueError, expected_err): - ReferenceTables(feature_source, feature_selector, **kwargs).get() + ReferenceFeatures(feature_source, **kwargs).get(feature_selector) - """Does ReferenceTables.get() raise ValueError when a feature lacks an ID attribute?""" + """Does ReferenceFeatures.get() raise ValueError when a feature lacks an ID attribute?""" def test_ref_tables_missing_id_attribute(self): feature_source = {self.short_gff_file: ["ID"]} @@ -209,10 +209,10 @@ def test_ref_tables_missing_id_attribute(self): with patch('tiny.rna.counter.hts_parsing.HTSeq.utils.open', new=mock_reader): with self.assertRaisesRegex(ValueError, expected_err): - _ = ReferenceTables(feature_source, feature_selector, **kwargs).get() + _ = ReferenceFeatures(feature_source, **kwargs).get(feature_selector) - """Does ReferenceTables.get() properly concatenate aliases if there is more than one alias for a feature?""" - """Does ReferenceTables.get() properly concatenate aliases when Name Attribute refers to a list-type alias?""" + """Does ReferenceFeatures.get() properly concatenate aliases if there is more than one alias for a feature?""" + """Does ReferenceFeatures.get() properly concatenate aliases when Name Attribute refers to a list-type alias?""" # 2 for 1! def test_ref_tables_alias_multisource_concat(self): @@ -221,7 +221,7 @@ def test_ref_tables_alias_multisource_concat(self): # Notice: screening for "ID" name attribute happens earlier in counter.load_config() expected_alias = {"Gene:WBGene00023193": ("additional_class", "Gene:WBGene00023193", "unknown")} - _, alias, _ = ReferenceTables(feature_source, MockFeatureSelector([]), **kwargs).get() + _, alias, _ = ReferenceFeatures(feature_source, **kwargs).get(MockFeatureSelector([])) self.assertDictEqual(alias, expected_alias) @@ -236,9 +236,9 @@ def test_ref_tables_alias_multisource_concat_all_features_false(self): with self.assertRaisesRegex(ValueError, expected_err): # No aliases saved due to all_features=False and the lack of identity matches - _, alias, _ = ReferenceTables(feature_source, MockFeatureSelector([]), **kwargs).get() + _, alias, _ = ReferenceFeatures(feature_source, **kwargs).get(MockFeatureSelector([])) - """Does ReferenceTables.get() properly concatenate identity match tuples when multiple GFF files define + """Does ReferenceFeatures.get() properly concatenate identity match tuples when multiple GFF files define matches for a feature?""" def test_ref_tables_identity_matches_multisource_concat(self): @@ -262,19 +262,19 @@ def test_ref_tables_identity_matches_multisource_concat(self): set() ] - feats, _, _ = ReferenceTables(feature_source, feature_selector, **kwargs).get() + feats, _, _ = ReferenceFeatures(feature_source, **kwargs).get(feature_selector) actual_matches = list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True)) self.assertListEqual(actual_matches, expected_matches) - """Does ReferenceTables.get() properly handle aliases for discontinuous features?""" + """Does ReferenceFeatures.get() properly handle aliases for discontinuous features?""" def test_ref_tables_discontinuous_aliases(self): kwargs = {'all_features': True} feature_source = {f"{resources}/discontinuous.gff3": ["Name"]} mock_selector = self.selector_with_template(helpers.rules_template) - _, alias, _ = ReferenceTables(feature_source, mock_selector, **kwargs).get() + _, alias, _ = ReferenceFeatures(feature_source, **kwargs).get(mock_selector) # Ancestor depth of 1, distinct aliases self.assertEqual(alias['Parent2'], ('Child2Name', 'Parent2Name')) @@ -294,9 +294,9 @@ def test_ref_tables_discontinuous_no_match_all_features_false(self): "This may be due to a lack of features matching 'Select for...with value...'" with self.assertRaisesRegex(ValueError, expected_err): - ReferenceTables(feature_source, mock_selector, **kwargs).get() + ReferenceFeatures(feature_source, **kwargs).get(mock_selector) - """Does ReferenceTables.get() properly build a GenomicArrayOfSets for discontinuous features + """Does ReferenceFeatures.get() properly build a GenomicArrayOfSets for discontinuous features where no features match but all_features is True? If the feature didn't match we only want to display it in the Feature ID column of counts table with 0 counts. We DON'T want it to pop up as a candidate in Stage 2 selection.""" @@ -311,11 +311,11 @@ def test_ref_tables_discontinuous_features(self): # EVEN if all_features = True. expected = [set()] - feats, _, _ = ReferenceTables(feature_source, feature_selector, **kwargs).get() + feats, _, _ = ReferenceFeatures(feature_source, **kwargs).get(feature_selector) actual = list(feats.chrom_vectors["I"]["."].array.get_steps(values_only=True)) self.assertListEqual(actual, expected) - """Does ReferenceTables.get() properly handle intervals for discontinous features?""" + """Does ReferenceFeatures.get() properly handle intervals for discontinous features?""" def test_ref_tables_discontinuous_intervals(self): kwargs = {'all_features': True} @@ -331,8 +331,8 @@ def test_ref_tables_discontinuous_intervals(self): sib_2 = HTSeq.GenomicInterval('I', 110, 120, '-') sib_3 = HTSeq.GenomicInterval('I', 139, 150, '-') - RT_instance = ReferenceTables(feature_source, feature_selector, **kwargs) - _ = RT_instance.get() + RT_instance = ReferenceFeatures(feature_source, **kwargs) + _ = RT_instance.get(feature_selector) # Ancestor depth of 1 self.assertEqual(RT_instance.intervals['GrandParent'], [grandparent_iv, parent_w_p_iv, child_w_gp_iv]) @@ -341,7 +341,7 @@ def test_ref_tables_discontinuous_intervals(self): # Siblings self.assertEqual(RT_instance.intervals['Sibling'], [sib_1, sib_2, sib_3]) - """Does ReferenceTables.get() properly merge identity matches of discontinuous features with the root feature? + """Does ReferenceFeatures.get() properly merge identity matches of discontinuous features with the root feature? Identity match tuples now also contain the corresponding rule's IntervalSelector, so extra bookkeeping must be performed for intervals in this test.""" @@ -381,11 +381,11 @@ def test_ref_tables_discontinuous_identity_matches(self): {(Sibling, False, (rule3_sib['139:150'], rule2_sib['139:150']))}, set()] - feats, _, _ = ReferenceTables(feature_source, feature_selector, **rt_kwargs).get() + feats, _, _ = ReferenceFeatures(feature_source, **rt_kwargs).get(feature_selector) actual_steps = list(feats.chrom_vectors["I"]["."].array.get_steps(values_only=True)) self.assertListEqual(actual_steps, expected) - """Does ReferenceTables.get() properly handle source filters for discontinuous features?""" + """Does ReferenceFeatures.get() properly handle source filters for discontinuous features?""" def test_ref_tables_source_filter(self): @@ -401,8 +401,8 @@ def test_ref_tables_source_filter(self): exp_filtered = {"GrandParent", "ParentWithGrandparent", "Parent2", "Child1", "Sibling"} exp_parents = {'ParentWithGrandparent': 'GrandParent', 'Child1': 'ParentWithGrandparent', 'Child2': 'Parent2'} - rt = ReferenceTables(feature_source, feature_selector, **kwargs) - feats, alias, _ = rt.get() + rt = ReferenceFeatures(feature_source, **kwargs) + feats, alias, _ = rt.get(feature_selector) self.assertEqual(alias, exp_alias) self.assertEqual(rt.parents, exp_parents) @@ -410,7 +410,7 @@ def test_ref_tables_source_filter(self): self.assertEqual(list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True)), exp_feats) self.assertDictEqual(rt.intervals, exp_intervals) - """Does ReferenceTables.get() properly handle type filters for discontinuous features?""" + """Does ReferenceFeatures.get() properly handle type filters for discontinuous features?""" def test_ref_tables_type_filter(self): @@ -426,8 +426,8 @@ def test_ref_tables_type_filter(self): exp_filtered = {"GrandParent", "ParentWithGrandparent", "Parent2", "Child2", "Sibling"} exp_parents = {'ParentWithGrandparent': 'GrandParent', 'Child1': 'ParentWithGrandparent', 'Child2': 'Parent2'} - rt = ReferenceTables(feature_source, feature_selector, **kwargs) - feats, alias, _ = rt.get() + rt = ReferenceFeatures(feature_source, **kwargs) + feats, alias, _ = rt.get(feature_selector) self.assertEqual(alias, exp_alias) self.assertEqual(rt.intervals, exp_intervals) @@ -435,7 +435,7 @@ def test_ref_tables_type_filter(self): self.assertEqual(rt.filtered, exp_filtered) self.assertEqual(list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True)), exp_feats) - """Does ReferenceTables.get() properly handle both source and type filters for discontinuous features?""" + """Does ReferenceFeatures.get() properly handle both source and type filters for discontinuous features?""" def test_ref_tables_both_filter(self): @@ -443,15 +443,15 @@ def test_ref_tables_both_filter(self): feature_source = {f"{resources}/discontinuous.gff3": ["Name"]} feature_selector = self.selector_with_template([{'Filter_s': "SourceName", 'Filter_t': "gene"}]) - rt = ReferenceTables(feature_source, feature_selector, **kwargs) - feats, alias, _ = rt.get() + rt = ReferenceFeatures(feature_source, **kwargs) + feats, alias, _ = rt.get(feature_selector) self.assertEqual(rt.filtered, {'Child1', 'Child2'}) self.assertEqual(rt.parents, {'ParentWithGrandparent': 'GrandParent', 'Child1': 'ParentWithGrandparent', 'Child2': 'Parent2'}) self.assertEqual(list(alias.keys()), ['GrandParent', 'Parent2', 'Sibling']) self.assertEqual(len(list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True))), 9) - """Does ReferenceTables.get() maintain correct records for a single feature matching tagged rules?""" + """Does ReferenceFeatures.get() maintain correct records for a single feature matching tagged rules?""" def test_ref_tables_tagged_match_single(self): kwargs = {'all_features': False} @@ -473,14 +473,14 @@ def test_ref_tables_tagged_match_single(self): set() ] - feats, aliases, tags = ReferenceTables(feature_source, feature_selector, **kwargs).get() + feats, aliases, tags = ReferenceFeatures(feature_source, **kwargs).get(feature_selector) actual_feats = list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True)) self.assertListEqual(actual_feats, expected_feats) self.assertDictEqual(aliases, expected_aliases) self.assertDictEqual(tags, expected_tags) - """Does ReferenceTables.get() correctly merge records for discontinuous features matching multiple tagged rules?""" + """Does ReferenceFeatures.get() correctly merge records for discontinuous features matching multiple tagged rules?""" def test_ref_tables_tagged_match_merging(self): feature_source = {f"{resources}/discontinuous.gff3": ['Name']} @@ -509,7 +509,7 @@ def test_ref_tables_tagged_match_merging(self): set() ] - feats, aliases, tags = ReferenceTables(feature_source, feature_selector).get() + feats, aliases, tags = ReferenceFeatures(feature_source).get(feature_selector) stepvec = list(feats.chrom_vectors['I']['.'].array.get_steps(values_only=True)) self.assertListEqual(stepvec, expected_feats) @@ -717,7 +717,7 @@ def test_CaseInsensitiveAttrs_contains_ident_wildcard(self): @unittest.skip("Long-running test, execute manually") class GenomeParsingTests(unittest.TestCase): - """Runs full-scale, unmodified GFF3/GTF genomes for select species through the ReferenceTables class""" + """Runs full-scale, unmodified GFF3/GTF genomes for select species through the ReferenceFeatures class""" @classmethod def setUpClass(self): @@ -753,21 +753,21 @@ def setUpClass(self): for chunk in r.iter_content(chunk_size=16384): f.write(chunk) - """Does ReferenceTables.get() process all genomes without throwing any errors?""" + """Does ReferenceFeatures.get() process all genomes without throwing any errors?""" def test_gff_megazord(self): print("Running GFF Megazord test. This will take a long time...") - # Single rule with all wildcard selectors, but only Identity is actually relevant within ReferenceTables + # Single rule with all wildcard selectors, but only Identity is actually relevant within ReferenceFeatures rules = [{'Identity': ('', ''), 'Class': '', 'Filter_s': '', 'Filter_t': '', 'Hierarchy': 0, 'Overlap': 'partial', 'Strand': '', 'nt5end': '', 'Length': ''}] files = {gff.format(ftype): [] for gff in self.genomes.values() for ftype in ('gff3', 'gtf')} fs = FeatureSelector(rules, LibraryStats()) - rt = ReferenceTables(files, fs) + rt = ReferenceFeatures(files) # The test is passed if this command # completes without throwing errors. - rt.get() + rt.get(fs) if __name__ == '__main__': unittest.main() diff --git a/tests/unit_tests_stepvector.py b/tests/unit_tests_stepvector.py index c72facb5..0487868f 100644 --- a/tests/unit_tests_stepvector.py +++ b/tests/unit_tests_stepvector.py @@ -10,7 +10,7 @@ class StepVectorTests(unittest.TestCase): def test_genomicarray_with_cython_stepvec(self): # Patch the StepVector reference in the HTSeq module and use a GenomicArray - # instead of a GenomicArrayOfSets, just as we do in ReferenceTables + # instead of a GenomicArrayOfSets, just as we do in reference parsers setattr(HTSeq.StepVector, 'StepVector', StepVector) gas = HTSeq.GenomicArray('auto', stranded=False) diff --git a/tests/unit_tests_validation.py b/tests/unit_tests_validation.py index a692c216..7df9cac5 100644 --- a/tests/unit_tests_validation.py +++ b/tests/unit_tests_validation.py @@ -6,10 +6,10 @@ from glob import glob from unittest.mock import patch, mock_open -from rna.counter.validation import GFFValidator, ReportFormatter +from rna.counter.validation import GFFValidator, ReportFormatter, SamSqValidator -class ValidationTests(unittest.TestCase): +class GFFValidatorTest(unittest.TestCase): """======== Helper functions =========================== """ @@ -295,6 +295,118 @@ def test_chrom_heuristics_runtime(self): self.assertLessEqual(end-start, 2) +class SamSqValidatorTest(unittest.TestCase): + @classmethod + def setUpClass(self): + self.syntax_header = "Every SAM file must have complete @SQ headers with SN and LN\n" \ + "fields when performing sequence-based counting.\n" + + self.identifier_header = "Sequence identifiers must be unique and have consistent length definitions.\n" + + def make_sam_sq_header(self, chroms): + return '\n'.join( + [f"@SQ\tSN:{chrom[0]}\tLN:{chrom[1]}" + for chrom in chroms] + ) + + def make_parsed_sq_header(self, chroms): + return [{'SN': chrom[0], 'LN': chrom[1]} if chrom[0] and chrom[1] else + {'SN': chrom[0]} if not chrom[1] else + {'LN': chrom[1]} + for chrom in chroms] + + """Does SamSqValidator correctly identify missing @SQ headers?""" + + def test_missing_sq_headers(self): + mock_files = ['sam1', 'sam2'] + mock_sam_headers = { + mock_files[0]: self.make_parsed_sq_header([('chr1', 100), ('chr2', 200)]), + mock_files[1]: [] + } + + expected = '\n'.join([ + self.syntax_header, + "\t" + f"{SamSqValidator.targets['missing sq']}: ", + "\t\t" + mock_files[1] + ]) + + validator = SamSqValidator(mock_files) + validator.sq_headers = mock_sam_headers + validator.validate_sq_headers() + + self.assertListEqual(validator.report.errors, [expected]) + self.assertListEqual(validator.report.warnings, []) + + """Does SamSqValidator correctly identify incomplete @SQ headers?""" + + def test_incomplete_sq_headers(self): + mock_files = ['sam1', 'sam2', 'sam3'] + mock_sam_headers = { + mock_files[0]: self.make_parsed_sq_header([('chr1', 100), ('chr2', 200)]), + mock_files[1]: self.make_parsed_sq_header([('chr3', None), ('chr4', 400)]), + mock_files[2]: self.make_parsed_sq_header([('chr5', 500), (None, 600)]) + } + + expected = '\n'.join([ + self.syntax_header, + "\t" + f"{SamSqValidator.targets['incomplete sq']}: ", + "\t\t" + mock_files[1], + "\t\t" + mock_files[2], + ]) + + validator = SamSqValidator(mock_files) + validator.sq_headers = mock_sam_headers + validator.validate_sq_headers() + + self.assertListEqual(validator.report.errors, [expected]) + self.assertListEqual(validator.report.warnings, []) + + """Does SamSqValidator correctly identify duplicate identifiers?""" + + def test_duplicate_identifiers(self): + mock_files = ['sam1', 'sam2', 'sam3'] + mock_sam_headers = { + mock_files[0]: self.make_parsed_sq_header([('chr1', 100), ('chr2', 200)]), + mock_files[1]: self.make_parsed_sq_header([('chr1', 100), ('chr2', 200)]), + mock_files[2]: self.make_parsed_sq_header([('chr1', 100), ('chr1', 100)]) + } + + expected = '\n'.join([ + self.identifier_header, + "\t" + f"{SamSqValidator.targets['intra sq']}: ", + "\t\t" + mock_files[2], + ]) + + validator = SamSqValidator(mock_files) + validator.sq_headers = mock_sam_headers + validator.validate_sq_headers() + + self.assertListEqual(validator.report.errors, [expected]) + self.assertListEqual(validator.report.warnings, []) + + """Does SamSqValidator correctly identify identifiers with inconsistent length definitions?""" + + def test_inconsistent_identifier_length(self): + mock_files = ['sam1', 'sam2', 'sam3'] + mock_sam_headers = { + mock_files[0]: self.make_parsed_sq_header([('chr1', 100), ('chr2', 200)]), + mock_files[1]: self.make_parsed_sq_header([('chr1', 100), ('chr2', 300)]), + mock_files[2]: self.make_parsed_sq_header([('chr1', 200), ('chr2', 100)]) + } + + expected = '\n'.join([ + self.identifier_header, + "\t" + f"{SamSqValidator.targets['inter sq']}: ", + "\t\t" + 'chr1', + "\t\t" + 'chr2', + ]) + + validator = SamSqValidator(mock_files) + validator.sq_headers = mock_sam_headers + validator.validate_sq_headers() + + self.assertListEqual(validator.report.errors, [expected]) + self.assertListEqual(validator.report.warnings, []) if __name__ == '__main__': unittest.main() diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index a0ada948..e7f6a2d8 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -2,14 +2,14 @@ import argparse import shutil import errno +import time import sys import csv import os -import re from pkg_resources import resource_filename from collections import Counter, OrderedDict, defaultdict -from typing import Union, Any, Optional, List +from typing import Union, Any, List, Dict from glob import glob from tiny.rna.compatibility import RunConfigCompatibility @@ -63,7 +63,10 @@ def set(self, key: str, val: Union[str, list, dict, bool]) -> Union[str, list, d return val def set_if_not(self, key: str, val: Union[str, list, dict, bool]) -> Any: - """Apply the setting if it has not been previously set""" + """Apply the setting if it has not been previously set. This differs from + dict.setdefault() in that existing keys are overwritten if the associated + value is false in boolean context (e.g. None, False, empty container, etc.)""" + if not self[key]: self[key] = val return val @@ -131,17 +134,29 @@ def from_here(self, destination: Union[str,dict,None], origin: Union[str,dict,No if isinstance(origin, dict): origin = origin.get('path') if origin is None: origin = self.dir - if ( - isinstance(destination, dict) and - isinstance(destination.get("path"), (str, bytes)) and - len(destination['path']) - ): + if self.is_path_dict(destination): return dict(destination, path=self.joinpath(origin, destination["path"])) - elif isinstance(destination, (str, bytes)) and bool(destination): + elif self.is_path_str(destination): return self.joinpath(origin, destination) else: return destination + @staticmethod + def is_path_dict(val, empty_ok=False): + return (isinstance(val, dict) and + isinstance(val.get("path"), (str, bytes)) and + (len(val['path']) or empty_ok)) + + @staticmethod + def is_path_str(val, empty_ok=False): + return (isinstance(val, (str, bytes)) and + (len(val) or empty_ok)) + + @classmethod + def is_path(cls, val, empty_ok=False): + return (cls.is_path_dict(val, empty_ok) or + cls.is_path_str(val, empty_ok)) + def setup_step_inputs(self): """For now, only tiny-plot requires additional setup for step inputs This function is called at both startup and resume""" @@ -183,11 +198,6 @@ def write_processed_config(self, filename: str = None) -> str: if filename is None: filename = self.get_outfile_path(self.inf) with open(filename, 'w') as outconf: - if 'paths_config' in self and not os.path.isabs(self['paths_config']): - # Processed config will be written to the Run Directory - # Ensure paths_config is an absolute path so it remains valid - self['paths_config'] = self.from_here(self['paths_config']) - self.yaml.dump(self.config, outconf) return filename @@ -214,7 +224,7 @@ def __init__(self, config_file: str, validate_inputs=False): super().__init__(config_file, RunConfigCompatibility) self.paths = self.load_paths_config() - self.assimilate_paths_file() + self.absorb_paths_file() self.setup_pipeline() self.setup_file_groups() @@ -225,18 +235,24 @@ def __init__(self, config_file: str, validate_inputs=False): if validate_inputs: self.validate_inputs() def load_paths_config(self): - """Constructs a sub-configuration object containing workflow file preferences - self['paths_config'] is the user-facing file path (just the path string) - self['paths_file'] is a CWL file object used as a workflow input.""" - paths_file_path = self.from_here(self['paths_config']) - return PathsFile(paths_file_path) + """Returns a PathsFile object and updates keys related to the Paths File path""" + + # paths_config: user-specified + # Resolve the absolute path so that it remains valid when + # the processed Run Config is copied to the Run Directory + self['paths_config'] = self.from_here(self['paths_config']) + + # paths_file: automatically generated + # CWL file dictionary is used as a workflow input + self['paths_file'] = self.cwl_file(self['paths_config']) - def assimilate_paths_file(self): + return PathsFile(self['paths_config']) + + def absorb_paths_file(self): for key in [*PathsFile.single, *PathsFile.groups]: self[key] = self.paths.as_cwl_file_obj(key) for key in PathsFile.prefix: self[key] = self.paths[key] - self['paths_file'] = self.cwl_file(self.paths.inf) def process_samples_sheet(self): samples_sheet_path = self.paths['samples_csv'] @@ -346,17 +362,15 @@ def get_bt_index_files(self): def validate_inputs(self): """For now, only GFF files are validated here""" - selection_rules = self.process_features_sheet() - gff_files = {gff['path']: [] for gff in self['gff_files']} - ebwt = self.paths['ebwt'] if not self['run_bowtie_build'] else None + gff_files = self.paths.get_gff_config() - GFFValidator( - gff_files, - selection_rules, - ebwt=ebwt, - genomes=self.paths['reference_genome_files'], - alignments=None # Used in tiny-count standalone runs - ).validate() + if gff_files: + GFFValidator( + gff_files, + self.process_features_sheet(), + self.paths['ebwt'] if not self['run_bowtie_build'] else None, + self.paths['reference_genome_files'] + ).validate() def execute_post_run_tasks(self, return_code): if self['run_bowtie_build']: @@ -452,10 +466,16 @@ class PathsFile(ConfigBase): - Lookups that return list values do not return the original object; don't append to them. Instead, use the append_to() helper function. - Chained assignments can produce unexpected results. + + Args: + file: The path to the Paths File, as a string + in_pipeline: True only when utilized by a step in the workflow, + in which case input files are sourced from the working directory + regardless of the path indicated in the Paths File """ # Parameter types - required = ('samples_csv', 'features_csv', 'gff_files') + required = ('samples_csv', 'features_csv') single = ('samples_csv', 'features_csv', 'plot_style_sheet', 'adapter_fasta') groups = ('reference_genome_files', 'gff_files') prefix = ('ebwt', 'run_directory', 'tmp_directory') @@ -475,9 +495,9 @@ def from_pipeline(value): """When tiny-count runs as a pipeline step, all file inputs are sourced from the working directory regardless of original path.""" - if isinstance(value, dict) and value.get("path") is not None: + if ConfigBase.is_path_dict(value, empty_ok=True): return dict(value, path=os.path.basename(value['path'])) - elif isinstance(value, (str, bytes)): + elif ConfigBase.is_path_str(value, empty_ok=True): return os.path.basename(value) else: return value @@ -503,7 +523,7 @@ def as_cwl_file_obj(self, key: str): elif key in self.single: return self.cwl_file(val) elif key in self.groups: - return [self.cwl_file(sub) for sub in val if sub] + return [self.cwl_file(sub) for sub in val if self.is_path(sub)] elif key in self.prefix: raise ValueError(f"The parameter {key} isn't meant to be a CWL file object.") else: @@ -514,10 +534,6 @@ def validate_paths(self): "The following parameters are required in {selfname}: {params}" \ .format(selfname=self.basename, params=', '.join(self.required)) - assert any(gff.get('path') for gff in self['gff_files']), \ - "At least one GFF file path must be specified under gff_files in {selfname}" \ - .format(selfname=self.basename) - # Some entries in Paths File are omitted from tiny-count's working directory during # pipeline runs. There is no advantage to checking file existence here vs. in load_* if self.in_pipeline: return @@ -531,7 +547,8 @@ def validate_paths(self): for key in self.groups: for entry in self[key]: - if isinstance(entry, dict): entry = entry['path'] + if isinstance(entry, dict): entry = entry.get('path') + if not entry: continue assert os.path.isfile(entry), \ "The following file provided under {key} in {selfname} could not be found:\n\t{file}" \ .format(key=key, selfname=self.basename, file=entry) @@ -542,6 +559,43 @@ def check_backward_compatibility(self): "that you are using a Paths File from an earlier version of tinyRNA. Please " \ "check the release notes and update your configuration files." + def get_gff_config(self) -> Dict[str, list]: + """Restructures GFF input info so that it can be more easily handled. + To be clear, the Paths File YAML could be structured to match the desired output, + but the current format was chosen because it's more readable with more forgiving syntax. + + The YAML format is [{"path": "gff_path_1", "alias": [alias1, alias2, ...]}, { ... }, ...] + The output format is {"gff_path_1": [alias1, alias2, ...], ...} + """ + + gff_files = defaultdict(list) + id_filter = lambda alias: alias.lower() != 'id' + + # Build dictionary of files and allowed aliases + for gff in self['gff_files']: + if not self.is_path_dict(gff): continue + path, aliases = gff['path'], gff.get('alias', ()) + gff_files[path].extend(filter(id_filter, aliases)) + + # Remove duplicate aliases per file, keep order + for file, alias in gff_files.items(): + gff_files[file] = sorted(set(alias), key=alias.index) + + if not len(gff_files) and not self.in_pipeline: + self.print_sequence_counting_notice() + + return dict(gff_files) + + def print_sequence_counting_notice(self): + print("No GFF files were specified in {selfname}.\n" + "Reads will be quantified by sequence " + "rather than by feature." + .format(selfname=self.basename)) + + for s in reversed(range(6)): + print(f"Proceeding in {s}s", end='\r') + time.sleep(1) + # Override def append_to(self, key: str, val: Any): """Overrides method in the base class. This is necessary due to automatic diff --git a/tiny/rna/counter/counter.py b/tiny/rna/counter/counter.py index a20dd3a1..6f402842 100644 --- a/tiny/rna/counter/counter.py +++ b/tiny/rna/counter/counter.py @@ -3,17 +3,15 @@ import multiprocessing as mp import traceback import argparse -import shutil import sys import os -from collections import defaultdict -from typing import Tuple, List, Dict -from pkg_resources import resource_filename +from typing import List, Dict -from tiny.rna.counter.validation import GFFValidator +from tiny.rna.counter.validation import GFFValidator, SamSqValidator from tiny.rna.counter.features import Features, FeatureCounter from tiny.rna.counter.statistics import MergedStatsManager +from tiny.rna.counter.hts_parsing import ReferenceFeatures, ReferenceSeqs, ReferenceBase from tiny.rna.util import report_execution_time, from_here, ReadOnlyDict, get_timestamp, add_transparent_help from tiny.rna.configuration import CSVReader, PathsFile, get_templates @@ -134,7 +132,7 @@ def get_library_filename(csv_row_file: str, samples_csv: str) -> str: def load_config(features_csv: str, is_pipeline: bool) -> List[dict]: - """Parses the Features Sheet to provide inputs to FeatureSelector and ReferenceTables + """Parses the Features Sheet to provide inputs to FeatureSelector and reference parsers Args: features_csv: a csv file which defines feature sources and selection rules @@ -152,7 +150,7 @@ def load_config(features_csv: str, is_pipeline: bool) -> List[dict]: rule['nt5end'] = rule['nt5end'].upper().translate({ord('U'): 'T'}) # Convert RNA base to cDNA base rule['Identity'] = (rule.pop('Key'), rule.pop('Value')) # Create identity tuple rule['Hierarchy'] = int(rule['Hierarchy']) # Convert hierarchy to number - rule['Overlap'] = rule['Overlap'].lower() # Built later in ReferenceTables + rule['Overlap'] = rule['Overlap'].lower() # Built later in reference parsers # Duplicate rule entries are not allowed if rule not in rules: rules.append(rule) @@ -160,42 +158,35 @@ def load_config(features_csv: str, is_pipeline: bool) -> List[dict]: return rules -def load_gff_files(paths: PathsFile, libraries: List[dict], rules: List[dict]) -> Dict[str, list]: - """Retrieves the appropriate file path and alias attributes for each GFF, - then validates +def load_references(paths: PathsFile, libraries: List[dict], rules: List[dict], prefs) -> ReferenceBase: + """Determines the reference source (GFF or SAM @SQ headers) and constructs the appropriate object Args: - paths: a loaded PathsFile with a populated gff_files parameter + paths: a PathsFile object that optionally contains a `gff_files` configuration libraries: libraries parsed from Samples Sheet, each as a dict with a 'File' key rules: selection rules parsed from Features Sheet + prefs: command line arguments to pass to the ReferenceBase subclass Returns: - gff: a dict of GFF files with lists of alias attribute keys + references: a ReferenceBase object, subclassed based on + the presence of GFF files in the Paths File """ - gff_files = defaultdict(list) - ignore_alias = ["id"] - - # Build dictionary of unique files and allowed aliases - for gff in paths['gff_files']: - gff_files[gff['path']].extend( - alias for alias in gff.get('alias', ()) - if alias.lower() not in ignore_alias - ) - - # Remove duplicate aliases (per file), keep original order - for file, alias in gff_files.items(): - gff_files[file] = sorted(set(alias), key=alias.index) - - # Prepare supporting file inputs for GFF validation + gff_files = paths.get_gff_config() sam_files = [lib['File'] for lib in libraries] - # Todo: update GFFValidator so that genomes and ebwt can be safely passed here - # ref_genomes = [map_path(fasta) for fasta in paths['reference_genome_files']] - # ebwt_prefix = map_path(paths['ebwt']) + if gff_files: + GFFValidator(gff_files, rules, alignments=sam_files).validate() + references = ReferenceFeatures(gff_files, **prefs) + else: + sq_validator = SamSqValidator(sam_files) + + # Reuse sq_validator's parsing results to save time + sq_validator.validate() + sequences = sq_validator.reference_seqs + references = ReferenceSeqs(sequences, **prefs) - GFFValidator(gff_files, rules, alignments=sam_files).validate() - return gff_files + return references @report_execution_time("Counting and merging") @@ -231,11 +222,11 @@ def main(): paths = PathsFile(args['paths_file'], is_pipeline) libraries = load_samples(paths['samples_csv'], is_pipeline) selection = load_config(paths['features_csv'], is_pipeline) - gff_files = load_gff_files(paths, libraries, selection) + reference = load_references(paths, libraries, selection, args) # global for multiprocessing global counter - counter = FeatureCounter(gff_files, selection, **args) + counter = FeatureCounter(reference, selection, **args) # Assign and count features using multiprocessing and merge results merged_counts = map_and_reduce(libraries, paths, args) diff --git a/tiny/rna/counter/features.py b/tiny/rna/counter/features.py index b17c0cdc..14aad110 100644 --- a/tiny/rna/counter/features.py +++ b/tiny/rna/counter/features.py @@ -4,7 +4,7 @@ from collections import defaultdict from typing import List, Tuple, Set, Dict, Mapping -from tiny.rna.counter.hts_parsing import ReferenceTables, SAM_reader +from tiny.rna.counter.hts_parsing import SAM_reader, ReferenceFeatures, ReferenceSeqs from .statistics import LibraryStats from .matching import * @@ -27,30 +27,20 @@ def __init__(_, features: HTSeq.GenomicArrayOfSets, aliases: dict, classes: dict class FeatureCounter: - def __init__(self, gff_file_set, selection_rules, **prefs): + def __init__(self, references, selection_rules, **prefs): self.stats = LibraryStats(**prefs) self.sam_reader = SAM_reader(**prefs) self.selector = FeatureSelector(selection_rules, self.stats, **prefs) - reference_tables = ReferenceTables(gff_file_set, self.selector, **prefs) - Features(*reference_tables.get()) - self.prefs = prefs + if isinstance(references, ReferenceFeatures): + self.mode = "by feature" + elif isinstance(references, ReferenceSeqs): + self.mode = "by sequence" + else: + raise TypeError("Expected ReferenceFeatures or ReferenceSeqs, got %s" % type(references)) - def assign_features(self, al: dict) -> Tuple[dict, int]: - """Determines features associated with the interval then performs rule-based feature selection""" - - try: - feat_matches = set().union( - *Features.chrom_vectors[al['Chrom']]['.'] # GenomicArrayOfSets -> ChromVector - .array[al['Start']:al['End']] # ChromVector -> StepVector - .get_steps(values_only=True)) # StepVector -> {features} - except KeyError as ke: - self.stats.chrom_misses[ke.args[0]] += 1 - return {}, 0 - - # If features are associated with the alignment interval, perform selection - assignment = self.selector.choose(feat_matches, al) if feat_matches else {} - return assignment, len(feat_matches) + Features(*references.get(self.selector)) + self.prefs = prefs def count_reads(self, library: dict): """Collects statistics on features assigned to each alignment associated with each read""" @@ -71,6 +61,22 @@ def count_reads(self, library: dict): return self.stats + def assign_features(self, al: dict) -> Tuple[dict, int]: + """Determines features associated with the interval then performs rule-based feature selection""" + + try: + feat_matches = set().union( + *Features.chrom_vectors[al['Chrom']]['.'] # GenomicArrayOfSets -> ChromVector + .array[al['Start']:al['End']] # ChromVector -> StepVector + .get_steps(values_only=True)) # StepVector -> {features} + except KeyError as ke: + self.stats.chrom_misses[ke.args[0]] += 1 + return {}, 0 + + # If features are associated with the alignment interval, perform selection + assignment = self.selector.choose(feat_matches, al) if feat_matches else {} + return assignment, len(feat_matches) + class FeatureSelector: """Performs hierarchical selection given a set of candidate features for a locus @@ -78,7 +84,7 @@ class FeatureSelector: Two sources of data serve as targets for selection: feature attributes (sourced from input GFF files), and sequence attributes (sourced from input SAM files). All candidate features are assumed to have matched at least one Identity selector, - as determined by hts_parsing.ReferenceTables.get_matches_and_classes() + as determined by hts_parsing.ReferenceFeatures.get_matches_and_classes() The first round of selection was performed during GFF parsing. @@ -199,7 +205,7 @@ def build_interval_selectors(iv: 'HTSeq.GenomicInterval', match_tuples: List[unb Unlike build_selectors() and build_inverted_identities(), this function is not called at construction time. Instead, it is called when finalizing - match-tuples in ReferenceTables. This is because interval selectors are + match-tuples in reference parsers. This is because interval selectors are created for each feature (requiring start/stop/strand to be known) for each of the feature's identity matches (each match-tuple). diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index 031f6e7b..fdd6b6c5 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -6,6 +6,7 @@ from collections import Counter, defaultdict from typing import Tuple, List, Dict, Iterator, Optional, DefaultDict, Set, Union, IO, Callable +from abc import ABC, abstractmethod from urllib.parse import unquote from inspect import stack @@ -20,7 +21,8 @@ AlignmentDict = Dict[str, Union[str, int, bytes]] Bundle = Tuple[List[AlignmentDict], int] _re_tiny = r"\d+_count=\d+" -_re_fastx = r'seq\d+_x(\d+)' +_re_fastx = r"seq\d+_x(\d+)" +_re_header = re.compile(r"^@(HD|SQ|RG|PG)(\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$|^@CO\t.*") # For Alignment complement = {ord('A'): 'T', ord('T'): 'A', ord('G'): 'C', ord('C'): 'G'} @@ -38,7 +40,7 @@ def __init__(self, **prefs): self._decollapsed_filename = None self._decollapsed_reads = [] self._header_lines = [] - self._header_dict = {} + self._header_dict = defaultdict(list) def bundle_multi_alignments(self, file: str) -> Iterator[Bundle]: """Bundles multi-alignments (determined by a shared QNAME) and reports the associated read's count""" @@ -135,6 +137,7 @@ def _read_to_first_aln(self, file_obj: IO) -> Tuple[bytes, int]: def _parse_header_line(self, header_line: str) -> None: """Parses and saves information from the provided header line""" + self.validate_header_line(header_line) self._header_lines.append(header_line) # Header dict @@ -142,10 +145,18 @@ def _parse_header_line(self, header_line: str) -> None: fields = header_line[3:].strip().split('\t') if rec_type == "@CO": - self._header_dict[rec_type] = fields[0] + self._header_dict[rec_type].append(fields[0]) else: - self._header_dict[rec_type] = \ + self._header_dict[rec_type].append( {field[:2]: field[3:].strip() for field in fields} + ) + + def validate_header_line(self, line): + if not re.match(_re_header, line): + msg = "Invalid SAM header" + msg += f" in {os.path.basename(self.file)}" if self.file else "" + msg += ':\n\t' + f'"{line}"' + raise ValueError(msg) def _determine_collapser_type(self, first_aln_line: str) -> None: """Attempts to determine the collapsing utility that was used before producing the @@ -157,7 +168,7 @@ def _determine_collapser_type(self, first_aln_line: str) -> None: elif re.match(_re_fastx, first_aln_line) is not None: self.collapser_type = "fastx" - sort_order = self._header_dict.get('@HD', {}).get('SO', None) + sort_order = self._header_dict.get('@HD', [{}])[0].get('SO', None) if sort_order is None or sort_order != "queryname": raise ValueError("SAM files from fastx collapsed outputs must be sorted by queryname\n" "(and the @HD [...] SO header must be set accordingly).") @@ -225,7 +236,7 @@ def infer_strandedness(sam_file: str, intervals: dict) -> str: Args: sam_file: the path of the SAM file to evaluate - intervals: the intervals instance attribute of ReferenceTables, populated by .get() + intervals: the intervals instance attribute of ReferenceFeatures, populated by .get() """ unstranded = HTSeq.GenomicArrayOfSets("auto", stranded=False) @@ -408,6 +419,66 @@ def not_implemented(self): raise NotImplementedError(f"CaseInsensitiveAttrs does not support {stack()[1].function}") +class ReferenceBase(ABC): + """The base class for reference parsers""" + + def __init__(self, **prefs): + self.stepvector = prefs.get('stepvector', 'HTSeq') + self.feats = self._init_genomic_array() + + # The selector is assigned whenever get() is called. + # While it isn't the current use case, this would allow + # for groups of GFF files to be processed with different + # Stage 1 selection rules and pooled into the same tables + self.selector = None + + @abstractmethod + def get(self, feature_selector): pass + + def _init_genomic_array(self): + if self.stepvector == 'Cython': + try: + from tiny.rna.counter.stepvector import StepVector + setattr(HTSeq.StepVector, 'StepVector', StepVector) + return HTSeq.GenomicArray("auto", stranded=False) + except ModuleNotFoundError: + self.stepvector = 'HTSeq' + print("Failed to import Cython StepVector\n" + "Falling back to HTSeq's StepVector", + file=sys.stderr) + + if self.stepvector == 'HTSeq': + return HTSeq.GenomicArrayOfSets("auto", stranded=False) + + def chrom_vector_setdefault(self, chrom): + """Behaves like dict.setdefault() for chrom_vectors. Even though chrom_vectors are + dictionaries themselves, calling setdefault on them will break the GenomicArrayOfSets""" + + if chrom not in self.feats.chrom_vectors: + self.feats.add_chrom(chrom) + + def get_feats_table_size(self) -> int: + """Returns the sum of features across all chromosomes and strands""" + + total_feats = 0 + empty_size = {"Cython": 1, "HTSeq": 3}[self.stepvector] + for chrom in self.feats.chrom_vectors: + for strand in self.feats.chrom_vectors[chrom]: + total_feats += self.feats.chrom_vectors[chrom][strand].array.num_steps() - empty_size + + return total_feats + + @staticmethod + def map_strand(htseq_value: str): + """Maps HTSeq's strand representation (+/-/.) to + tinyRNA's strand representation (True/False/None)""" + + return { + '+': True, + '-': False, + }.get(htseq_value, None) + + def parse_gff(file, row_fn: Callable, alias_keys=None): if alias_keys is not None: row_fn = functools.partial(row_fn, alias_keys=alias_keys) @@ -433,7 +504,7 @@ def parse_gff(file, row_fn: Callable, alias_keys=None): GenomicArray = HTSeq.GenomicArrayOfSets -class ReferenceTables: +class ReferenceFeatures(ReferenceBase): """A GFF parser which builds feature, alias, and class reference tables Discontinuous features are supported, and comma-separated attribute values (GFF3 column 9) @@ -446,7 +517,7 @@ class ReferenceTables: Children of the root ancestor are otherwise not stored in the reference tables. Match-tuples are created for each Features Sheet rule which matches a feature's attributes. - They are structured as (rank, rule, overlap). "Rank" is the heirarchy value of the matching + They are structured as (rank, rule, overlap). "Rank" is the hierarchy value of the matching rule, "rule" is the index of that rule in FeatureSelector's rules table, and "overlap" is the IntervalSelector per the rule's overlap requirements. @@ -460,26 +531,26 @@ class ReferenceTables: source_filter = [] type_filter = [] - def __init__(self, gff_files: Dict[str, list], feature_selector, **prefs): + def __init__(self, gff_files: Dict[str, list], **prefs): + super().__init__(**prefs) self.all_features = prefs.get('all_features', False) - self.stepvector = prefs.get('stepvector', 'HTSeq') - self.selector = feature_selector self.gff_files = gff_files - # ----------------------------------------------------------- Primary Key: - self.feats = self._init_genomic_array() # Root Match ID - self.parents, self.filtered = {}, set() # Original Feature ID - self.intervals = defaultdict(list) # Root Feature ID - self.matches = defaultdict(set) # Root Match ID - self.alias = defaultdict(set) # Root Feature ID - self.tags = defaultdict(set) # Root Feature ID -> Root Match ID + # ----------------------------------------------- Primary Key: + # self.feats # Root Match ID + self.parents, self.filtered = {}, set() # Original Feature ID + self.intervals = defaultdict(list) # Root Feature ID + self.matches = defaultdict(set) # Root Match ID + self.alias = defaultdict(set) # Root Feature ID + self.tags = defaultdict(set) # Root Feature ID -> Root Match ID # Patch the GFF attribute parser to support comma separated attribute value lists setattr(HTSeq.features.GFF_Reader, 'parse_GFF_attribute_string', staticmethod(parse_GFF_attribute_string)) @report_execution_time("GFF parsing") - def get(self) -> Tuple[GenomicArray, AliasTable, TagTable]: + def get(self, feature_selector) -> Tuple[GenomicArray, AliasTable, TagTable]: """Initiates GFF parsing and returns complete feature, alias, and tag tables""" + self.selector = feature_selector for file, alias_keys in self.gff_files.items(): parse_gff(file, self.parse_row, alias_keys=alias_keys) @@ -510,31 +581,37 @@ def parse_row(self, row, alias_keys=None): # Add alias to root ancestor if it is unique self.add_alias(root_id, alias_keys, row.attr) - def get_root_feature(self, lineage: list) -> str: - """Returns the highest feature ID in the ancestral tree which passed stage 1 selection. - The original feature ID is returned if there are no valid ancestors.""" + @staticmethod + def get_feature_id(row): + id_collection = row.attr.get('ID') \ + or row.attr.get('gene_id') \ + or row.attr.get('Parent') - # Descend tree until the descendent is found in the matches table - # This is because ancestor feature(s) may have been filtered - for ancestor in lineage[::-1]: - if self.was_matched(ancestor): - return ancestor - else: - return lineage[0] # Default: the original feature_id + if id_collection is None: + raise ValueError(f"Feature {row.name} does not have an ID attribute.") + if len(id_collection) == 0: + raise ValueError("A feature's ID attribute cannot be empty. This value is required.") + if len(id_collection) > 1: + return ','.join(id_collection) - def get_feature_ancestors(self, feature_id: str, row_attrs: CaseInsensitiveAttrs): - if "Parent" not in row_attrs: - return [feature_id] + return id_collection[0] - parent_id = self.get_row_parent(feature_id, row_attrs) - lineage = [feature_id, parent_id] + def get_matches(self, row: HTSeq.GenomicFeature) -> DefaultDict: + """Performs Stage 1 selection and returns match tuples under their associated classifier""" - # Climb ancestral tree until the root parent is found - while parent_id in self.parents: - parent_id = self.parents[parent_id] - lineage.append(parent_id) + identity_matches = defaultdict(set) + for ident, rule_indexes in self.selector.inv_ident.items(): + if row.attr.contains_ident(ident): + for index in rule_indexes: + rule = self.selector.rules_table[index] + if self.column_filter_match(row, rule): + # Unclassified matches are pooled under '' empty string + identity_matches[rule['Class']].add( + (index, rule['Hierarchy'], rule['Overlap']) + ) - return lineage + # -> identity_matches: {classifier: {(rule, rank, overlap), ...}, ...} + return identity_matches def add_feature(self, feature_id: str, row, matches: defaultdict) -> str: """Adds the feature to the intervals table under its root ID, and to the matches table @@ -559,29 +636,38 @@ def add_feature(self, feature_id: str, row, matches: defaultdict) -> str: return root_id - def add_alias(self, root_id: str, alias_keys: List[str], row_attrs: CaseInsensitiveAttrs) -> None: - """Add feature's aliases to the root ancestor's alias set""" + def get_feature_ancestors(self, feature_id: str, row_attrs: CaseInsensitiveAttrs): + if "Parent" not in row_attrs: + return [feature_id] - for alias_key in alias_keys: - for row_val in row_attrs.get(alias_key, ()): - self.alias[root_id].add(row_val) + parent_id = self.get_row_parent(feature_id, row_attrs) + lineage = [feature_id, parent_id] - def get_matches(self, row: HTSeq.GenomicFeature) -> DefaultDict: - """Performs Stage 1 selection and returns match tuples under their associated classifier""" + # Climb ancestral tree until the root parent is found + while parent_id in self.parents: + parent_id = self.parents[parent_id] + lineage.append(parent_id) - identity_matches = defaultdict(set) - for ident, rule_indexes in self.selector.inv_ident.items(): - if row.attr.contains_ident(ident): - for index in rule_indexes: - rule = self.selector.rules_table[index] - if self.column_filter_match(row, rule): - # Unclassified matches are pooled under '' empty string - identity_matches[rule['Class']].add( - (index, rule['Hierarchy'], rule['Overlap']) - ) + return lineage - # -> identity_matches: {class: (rule, rank, overlap), ...} - return identity_matches + def get_root_feature(self, lineage: list) -> str: + """Returns the highest feature ID in the ancestral tree which passed stage 1 selection. + The original feature ID is returned if there are no valid ancestors.""" + + # Descend tree until the descendant is found in the matches table + # This is because ancestor feature(s) may have been filtered + for ancestor in lineage[::-1]: + if self.was_matched(ancestor): + return ancestor + else: + return lineage[0] # Default: the original feature_id + + def was_matched(self, untagged_id): + """Checks if the feature ID previously matched on identity, regardless of whether + the matching rule was tagged or untagged.""" + + # any() will short circuit on first match when provided a generator function + return any(tagged_id in self.matches for tagged_id in self.tags.get(untagged_id, ())) def get_row_parent(self, feature_id: str, row_attrs: CaseInsensitiveAttrs) -> str: """Get the current feature's parent while cooperating with filtered features""" @@ -616,6 +702,13 @@ def exclude_row(self, row): self.parents[feature_id] = self.get_row_parent(feature_id, row.attr) self.chrom_vector_setdefault(row.iv.chrom) + def add_alias(self, root_id: str, alias_keys: List[str], row_attrs: CaseInsensitiveAttrs) -> None: + """Add feature's aliases to the root ancestor's alias set""" + + for alias_key in alias_keys: + for row_val in row_attrs.get(alias_key, ()): + self.alias[root_id].add(row_val) + def _finalize_aliases(self): self.alias = {feat: tuple(sorted(aliases, key=str.lower)) for feat, aliases in self.alias.items()} @@ -663,73 +756,54 @@ def _merge_adjacent_subintervals(unmerged_sub_ivs: List[HTSeq.GenomicInterval]) return merged_ivs - def get_feats_table_size(self) -> int: - """Returns the sum of features across all chromosomes and strands""" - - total_feats = 0 - empty_size = {"Cython": 1, "HTSeq": 3}[self.stepvector] - for chrom in self.feats.chrom_vectors: - for strand in self.feats.chrom_vectors[chrom]: - total_feats += self.feats.chrom_vectors[chrom][strand].array.num_steps() - empty_size - - return total_feats + @staticmethod + def column_filter_match(row, rule): + """Checks if the GFF row passes the inclusive filter(s) + If both filters are defined then they must both evaluate to true for a match""" - def map_strand(self, gff_value: str): - """Maps HTSeq's strand representation (+/-/.) to - tinyRNA's strand representation (True/False/None)""" + return row.source in rule['Filter_s'] and row.type in rule['Filter_t'] - return { - '+': True, - '-': False, - }.get(gff_value, None) - def was_matched(self, untagged_id): - """Checks if the feature ID previously matched on identity, regardless of whether - the matching rule was tagged or untagged.""" +class ReferenceSeqs(ReferenceBase): - # any() will short circuit on first match when provided a generator function - return any(tagged_id in self.matches for tagged_id in self.tags.get(untagged_id, ())) + def __init__(self, reference_seqs, **prefs): + super().__init__(**prefs) + self.seqs = reference_seqs + self.alias = {} + self.tags = {} - def chrom_vector_setdefault(self, chrom): - """Behaves like dict.setdefault() for chrom_vectors. Even though chrom_vectors are - dictionaries themselves, calling setdefault on them will break the GenomicArrayOfSets""" + def get(self, selector): + self.selector = selector + match_tuples = self.get_matches() - if chrom not in self.feats.chrom_vectors: - self.feats.add_chrom(chrom) + for seq_id, seq_len in self.seqs.items(): + self.add_reference_seq(seq_id, seq_len, match_tuples) - @staticmethod - def get_feature_id(row): - id_collection = row.attr.get('ID') \ - or row.attr.get('gene_id') \ - or row.attr.get('Parent') + # Aliases are irrelevant for non-GFF references + aliases = {seq_id: () for seq_id in self.tags} + return self.feats, aliases, self.tags - if id_collection is None: - raise ValueError(f"Feature {row.name} does not have an ID attribute.") - if len(id_collection) == 0: - raise ValueError("A feature's ID attribute cannot be empty. This value is required.") - if len(id_collection) > 1: - return ','.join(id_collection) + def get_matches(self): + """Stage 1 selection is skipped in sequence-based counting. + Simply build match_tuples for all rules. These will be used + uniformly in each reference sequence's feature_record_tuple""" - return id_collection[0] + return [(idx, rule['Hierarchy'], rule['Overlap']) + for idx, rule in sorted( + enumerate(self.selector.rules_table), + key=lambda x: x[1]['Hierarchy'] + )] - def _init_genomic_array(self): - if self.stepvector == 'Cython': - try: - from tiny.rna.counter.stepvector import StepVector - setattr(HTSeq.StepVector, 'StepVector', StepVector) - return HTSeq.GenomicArray("auto", stranded=False) - except ModuleNotFoundError: - self.stepvector = 'HTSeq' - print("Failed to import Cython StepVector\n" - "Falling back to HTSeq's StepVector", - file=sys.stderr) + def add_reference_seq(self, seq_id, seq_len, unbuilt_match_tuples): - if self.stepvector == 'HTSeq': - return HTSeq.GenomicArrayOfSets("auto", stranded=False) + # Features are classified in Reference Tables (Stage 1 selection) + # For compatibility, use the seq_id with an empty classifier (index 1) + tagged_id = (seq_id, '') + self.tags[seq_id] = {tagged_id} - @staticmethod - def column_filter_match(row, rule): - """Checks if the GFF row passes the inclusive filter(s) - If both filters are defined then they must both evaluate to true for a match""" + for strand in ('+', '-'): + iv = HTSeq.GenomicInterval(seq_id, 0, seq_len, strand) + match_tuples = self.selector.build_interval_selectors(iv, unbuilt_match_tuples.copy()) + tinyrna_strand = self.map_strand(strand) - return row.source in rule['Filter_s'] and row.type in rule['Filter_t'] + self.feats[iv] += (tagged_id, tinyrna_strand, tuple(match_tuples)) \ No newline at end of file diff --git a/tiny/rna/counter/validation.py b/tiny/rna/counter/validation.py index dbc6d5cc..1605b62b 100644 --- a/tiny/rna/counter/validation.py +++ b/tiny/rna/counter/validation.py @@ -6,9 +6,9 @@ from collections import Counter, defaultdict from typing import List, Dict -from tiny.rna.counter.hts_parsing import parse_gff, ReferenceTables +from tiny.rna.counter.hts_parsing import parse_gff, ReferenceFeatures, SAM_reader from tiny.rna.counter.features import FeatureSelector -from tiny.rna.util import sorted_natural, gzip_open +from tiny.rna.util import sorted_natural, gzip_open, report_execution_time class ReportFormatter: @@ -61,14 +61,19 @@ def recursive_indent(self, mapping: dict, indent: str): lines = [] for key, val in mapping.items(): - if not val: continue key_header = f"{indent}{self.key_mapper.get(key, key)}: " if isinstance(val, dict): lines.append(key_header) - lines.extend(self.recursive_indent(val, indent + '\t')) + if len(val): + lines.extend(self.recursive_indent(val, indent + '\t')) + else: + lines.append(indent + "\t" + "{ NONE }") elif isinstance(val, (list, set)): lines.append(key_header) - lines.extend([indent + '\t' + line for line in map(str, val)]) + if len(val): + lines.extend([indent + '\t' + line for line in map(str, val)]) + else: + lines.append(indent + "\t" + "[ NONE ]") else: lines.append(key_header + str(val)) return lines @@ -87,7 +92,7 @@ class GFFValidator: } def __init__(self, gff_files, rules, ebwt=None, genomes=None, alignments=None): - self.ReferenceTables = ReferenceTables(gff_files, None) + self.ReferenceFeatures = ReferenceFeatures(gff_files) self.column_filters = self.build_column_filters(rules) self.report = ReportFormatter(self.targets) self.chrom_set = set() @@ -115,13 +120,13 @@ def parse_and_validate_gffs(self, file_set): def validate_gff_row(self, row, issues, file): # Skip definitions of whole chromosomes and obey source/type filters if row.type.lower() == "chromosome": return - if not self.ReferenceTables.column_filter_match(row, self.column_filters): return + if not self.ReferenceFeatures.column_filter_match(row, self.column_filters): return if row.iv.strand not in ('+', '-'): issues['strand'][file] += 1 try: - self.ReferenceTables.get_feature_id(row) + self.ReferenceFeatures.get_feature_id(row) except: issues['ID attribute'][file] += 1 @@ -241,7 +246,7 @@ def alignment_chroms_mismatch_heuristic(self, sam_files: List[str], subset_size= for file in sam_files: file_chroms = set() with open(file, 'rb') as f: - for line, i in zip(f, range(subset_size)): + for i, line in zip(range(subset_size), f): if line[0] == ord('@'): continue file_chroms.add(line.split(b'\t')[2].strip().decode()) if i % 10000 == 0 and len(file_chroms & self.chrom_set): break @@ -286,4 +291,122 @@ def build_column_filters(rules): for selector in ["Filter_s", "Filter_t"] } - return FeatureSelector.build_selectors([selector_defs])[0] \ No newline at end of file + return FeatureSelector.build_selectors([selector_defs])[0] + + +class SamSqValidator: + """Validates @SQ headers for tiny-count's sequence-based counting mode""" + + targets = { + "inter sq": "Sequence identifiers with inconsistent lengths", + "intra sq": "SAM files with repeated sequence identifiers", + "incomplete sq": "SAM files with incomplete @SQ headers", + "missing sq": "SAM files that lack @SQ headers" + } + + def __init__(self, sam_files): + self.report = ReportFormatter(self.targets) + self.sam_files = sam_files + self.reference_seqs = {} + self.sq_headers = {} + + def validate(self): + print("Validating sequence identifiers in SAM files...") + self.read_sq_headers() + self.validate_sq_headers() + self.report.print_report() + if self.report.errors: + sys.exit(1) + + def validate_sq_headers(self): + """Performs validation tests in the required order + It is important to check for and return upon syntax error, otherwise + the remaining tests are likely to raise an exception before the report + has a chance to be printed""" + + # First verify @SQ header syntax + missing = self.get_missing_headers() + incomplete = self.get_incomplete_sq_headers() + if missing or incomplete: + self.generate_header_syntax_report(missing, incomplete) + return + + # Next verify identifier definitions + duplicate = self.get_duplicate_identifiers() + ambiguous = self.get_ambiguous_lengths() + if duplicate or ambiguous: + self.generate_identifier_report(duplicate, ambiguous) + + def get_ambiguous_lengths(self) -> List[str]: + """Returns a list of sequence IDs that have inconsistent length definitions. + Sequences with consistent length definitions are added to self.reference_seqs""" + + seq_lengths = defaultdict(set) + for sam in self.sam_files: + for sq in self.sq_headers[sam]: + seq_id = sq['SN'] + seq_len = int(sq['LN']) + seq_lengths[seq_id].add(seq_len) + + bad_seqs = [] + for seq_id, lengths in seq_lengths.items(): + if len(lengths) == 1: + lengths = lengths.pop() + self.reference_seqs[seq_id] = lengths + else: + bad_seqs.append(seq_id) + + return bad_seqs + + def get_duplicate_identifiers(self) -> Dict[str, List[str]]: + """Returns a dictionary of SAM files that contain duplicate sequence identifiers""" + + bad_files = {} + for file in self.sam_files: + sq = self.sq_headers[file] + id_count = Counter(seq['SN'] for seq in sq) + duplicates = [seq_id for seq_id, count in id_count.items() if count > 1] + if duplicates: bad_files[file] = duplicates + + return bad_files + + def get_incomplete_sq_headers(self) -> List[str]: + """Returns a list of SAM files that have incomplete @SQ headers""" + + return [file for file, sqs in self.sq_headers.items() + if not all("SN" in sq and "LN" in sq for sq in sqs)] + + def get_missing_headers(self) -> List[str]: + """Returns a list of SAM files that lack @SQ headers""" + + return [file for file, sqs in self.sq_headers.items() + if len(sqs) == 0] + + def generate_header_syntax_report(self, missing, incomplete): + report = {} + if missing: + report['missing sq'] = sorted_natural(missing) + if incomplete: + report['incomplete sq'] = sorted_natural(incomplete) + + header = "Every SAM file must have complete @SQ headers with SN and LN\n" \ + "fields when performing sequence-based counting.\n" + self.report.add_error_section(header, report) + + def generate_identifier_report(self, duplicate, ambiguous): + report = {} + if duplicate: + report['intra sq'] = sorted_natural(duplicate) + if ambiguous: + report['inter sq'] = sorted_natural(ambiguous) + + header = "Sequence identifiers must be unique and have consistent length definitions.\n" + self.report.add_error_section(header, report) + + def read_sq_headers(self): + for file in self.sam_files: + with open(file, 'rb') as f: + reader = SAM_reader() + reader._read_to_first_aln(f) + + self.sq_headers[file] = reader._header_dict.get('@SQ', []) \ No newline at end of file diff --git a/tiny/rna/resume.py b/tiny/rna/resume.py index 88c3a10f..24858947 100644 --- a/tiny/rna/resume.py +++ b/tiny/rna/resume.py @@ -92,9 +92,11 @@ def _create_truncated_workflow(self): wf_steps[self.steps[0]]['in'][param] = new_input['var'] def load_paths_config(self): - """Constructs a sub-configuration object containing workflow file preferences""" - paths_file_path = self.from_here(self['paths_config']) - return PathsFile(paths_file_path) + """Returns a PathsFile object and updates keys related to the Paths File path""" + + self['paths_config'] = self.from_here(self['paths_config']) + self['paths_file'] = self.cwl_file(self['paths_config']) + return PathsFile(self['paths_config']) def assimilate_paths_file(self): """Updates the processed workflow with resume-safe Paths File parameters""" diff --git a/tiny/templates/paths.yml b/tiny/templates/paths.yml index 3f7a332d..2b70aba2 100644 --- a/tiny/templates/paths.yml +++ b/tiny/templates/paths.yml @@ -7,7 +7,7 @@ # 1. Fill out the Samples Sheet with files to process + naming scheme. [samples.csv] # 2. Fill out the Features Sheet with selection rules [features.csv] # 3. Set samples_csv and features_csv (below) to point to these files -# 4. Add annotation files and per-file alias preferences to gff_files +# 4. Add annotation files and per-file alias preferences to gff_files (optional) # ######-------------------------------------------------------------------------------######