MontgomeryLab · taimontgomery · Oct 16, 2022 · Oct 7, 2022 · Oct 7, 2022 · Oct 7, 2022
diff --git a/README.md b/README.md
@@ -90,12 +90,12 @@ tiny get-template
 
 ### Requirements for User-Provided Input Files
 
-| Input Type                                                                 | File Extension    | Requirements                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
-|----------------------------------------------------------------------------|-------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| Reference annotations<br/>[(example)](START_HERE/reference_data/ram1.gff3) | GFF3 / GFF2 / GTF | Column 9 attributes (defined as "tag=value" or "tag "):<ul><li>Each feature must have an `ID` or `gene_id` tag.</li><li>Feature classes can be defined with the `Class` tag. If undefined, the default value \__UNKNOWN_\_ will be used.</li><li>Discontinuous features must be defined with the `Parent` tag whose value is the logical parent's `ID`, or by sharing the same `ID`.</li><li>Attribute values containing commas must represent lists.</li><li>All features must be stranded.</li><li>See the example link (left) for col. 9 formatting.</li></ul> |
-| Sequencing data<br/>[(example)](START_HERE/fastq_files)                    | FASTQ(.gz)        | Files must be demultiplexed.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
-| Reference genome<br/>[(example)](START_HERE/reference_data/ram1.fa)        | FASTA             | Chromosome identifiers (e.g. Chr1): <ul><li>Must match your reference annotation file chromosome identifiers</li><li>Are case sensitive</li></ul>                                                                                                                                                                                                                                                                                                                                                                                                                 |
-| Bowtie indexes (optional) <sup>1</sup>                                     | ebwt              | Must be small indexes (.ebwtl indexes are not supported)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
+| Input Type                                                                 | File Extension    | Requirements                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
+|----------------------------------------------------------------------------|-------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| Reference annotations<br/>[(example)](START_HERE/reference_data/ram1.gff3) | GFF3 / GFF2 / GTF | Column 9 attributes (defined as "tag=value" or "tag "):<ul><li>Each feature must have an `ID` or `gene_id`  or `Parent` tag (referred to as `ID` henceforth).</li><li>Feature classes can be defined with the `Class` tag. If undefined, the default value \__UNKNOWN_\_ will be used.</li><li>Discontinuous features must be defined with the `Parent` tag whose value is the logical parent's `ID`, or by sharing the same `ID`.</li><li>Attribute values containing commas must represent lists.</li><li>`Parent` tags with multiple values are not yet supported.</li><li>See the example link (left) for col. 9 formatting.</li></ul> |
+| Sequencing data<br/>[(example)](START_HERE/fastq_files)                    | FASTQ(.gz)        | Files must be demultiplexed.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
+| Reference genome<br/>[(example)](START_HERE/reference_data/ram1.fa)        | FASTA             | Chromosome identifiers (e.g. Chr1): <ul><li>Must match your reference annotation file chromosome identifiers</li><li>Are case sensitive</li></ul>                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
+| Bowtie indexes (optional) <sup>1</sup>                                     | ebwt              | Must be small indexes (.ebwtl indexes are not supported)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
 
 <br/><sup>1</sup> Bowtie indexes can be created for you. See the [configuration file documentation](doc/Configuration.md#building-bowtie-indexes).
 

diff --git a/START_HERE/run_config.yml b/START_HERE/run_config.yml
@@ -247,10 +247,6 @@ counter_type_filter: []
 ##-- Select the StepVector implementation that is used. Options: HTSeq or Cython --##
 counter_stepvector: 'Cython'
 
-##-- If False: a feature with multiple values in its ID attribute is treated as an error --##
-##-- If True: multiple ID values are allowed, but only the first value is used --##
-counter_allow_multi_id: True
-
 ##-- If True: produce diagnostic logs to indicate what was eliminated and why --##
 counter_diags: False
 

diff --git a/doc/Configuration.md b/doc/Configuration.md
@@ -132,7 +132,7 @@ The Features Sheet allows you to define selection rules that determine how featu
 
 Rules apply to features parsed from **all** Feature Sources, with the exception of "Alias by..." which only applies to the Feature Source on the same row. Selection first takes place against feature attributes (GFF column 9), and is directed by defining the attribute you want to be considered (Select for...) and the acceptable values for that attribute (with value...). 
 
-Rules that match features in the first stage of selection will be used in a second stage which performs elimination by hierarchy and interval overlap characteristics. Remaining candidates pass to the third and final stage of selection which examines characteristics of the alignment itself: strand relative to the feature of interest, 5' end nucleotide, and length. 
+Rules that match features in the first stage of selection will be used in a second stage which evaluates alignment vs. feature interval overlap. These matches are sorted by hierarchy value and passed to the third and final stage of selection which examines characteristics of the alignment itself: strand relative to the feature of interest, 5' end nucleotide, and length. 
 
 See [tiny-count's documentation](tiny-count.md#feature-selection) for an explanation of each column.
 

diff --git a/doc/tiny-count.md b/doc/tiny-count.md
@@ -46,16 +46,24 @@ You can optionally specify a tag for each rule. Feature assignments resulting fr
 | _features.csv columns:_ | Hierarchy | Overlap |
 |-------------------------|-----------|---------|
 
-This stage of selection is concerned with the interval overlap between alignments and features. **Overlap is determined in a strandless fashion.** See the [Strand](#strand) section in Stage 3 for refinement of selections by strand.
+Features overlapping a read alignment are selected based on their overlap characteristics. These matches are then sorted by hierarchy value before proceeding to Stage 3.
 
 ### Overlap
-This column allows you to specify which read alignments should be assigned based on how their start and end points overlap with candidate features. Candidates for each matched rule can be selected using the following options:
+This column allows you to specify the extent of overlap required for candidate feature selection:
 - `partial`: alignment overlaps feature by at least one base
 - `full`: alignment does not extend beyond either terminus of the feature
 - `exact`: alignment termini are equal to the feature's
+- `anchored`: alignment's start and/or end is equal to the feature's
 - `5' anchored`: alignment's 5' end is equal to the corresponding terminus of the feature
 - `3' anchored`: alignment's 3' end is equal to the corresponding terminus of the feature
 
+In order to be a candidate, a feature must match a rule in Stage 1, reside on the same chromosome as the alignment, and must overlap the alignment by at least 1 nucleotide.
+
+#### Strandedness and the Overlap Selector
+A feature does not have to be on the same strand as the alignment in order to be a candidate. See the [Strand](#strand) section in Stage 3 for selection by strand. Unstranded features will have `5' anchored` and `3' anchored` overlap selectors downgraded to `anchored` selectors. Alignments overlapping these features are evaluated for shared start and/or end coordinates, but 5'/3' ends are not distinguished.
+
+#### Selector Demonstration
+
 The following diagrams demonstrate the strand semantics of these interval selectors. The first two options show separate illustrations for features on each strand for emphasis. All matches shown in the remaining three options apply to features on either strand.
 ![3'_anchored_5'_anchored](../images/3'_anchored_5'_anchored_interval.png)
 ![Full_Exact_Partial](../images/full_exact_partial_interval.png)
@@ -79,10 +87,14 @@ You can use larger hierarchy values to exclude features that are not of interest
 The final stage of selection is concerned with the small RNA attributes of each alignment locus. Candidates are evaluated in order of hierarchy value where smaller values take precedence. Once a match has been found, reads are excluded from remaining candidates with larger hierarchy values.
 
 ### Strand
+This selector defines requirements for the alignment's strand relative to the feature's strand. Here, sense and antisense don't refer to the feature's or alignment's strand alone, but rather whether the alignment is sense/antisense to the feature.
 - `sense`: the alignment strand must match the feature's strand for a match
 - `antisense`: the alignment strand must not match the feature's strand for a match
 - `both`: strand is not evaluated
 
+#### Unstranded Features
+These features will match all strand selectors regardless of the alignment's strand.
+
 
 ### 5' End Nucleotide and Length
 | Parameter  | Single | List | Range | Wildcard |

diff --git a/tests/testdata/counter/validation/ebwt/ram1.1.ebwt b/tests/testdata/counter/validation/ebwt/ram1.1.ebwt
diff --git a/tests/testdata/counter/validation/ebwt/ram1.2.ebwt b/tests/testdata/counter/validation/ebwt/ram1.2.ebwt
diff --git a/tests/testdata/counter/validation/ebwt/ram1.3.ebwt b/tests/testdata/counter/validation/ebwt/ram1.3.ebwt
diff --git a/tests/testdata/counter/validation/ebwt/ram1.4.ebwt b/tests/testdata/counter/validation/ebwt/ram1.4.ebwt
diff --git a/tests/testdata/counter/validation/ebwt/ram1.rev.1.ebwt b/tests/testdata/counter/validation/ebwt/ram1.rev.1.ebwt
diff --git a/tests/testdata/counter/validation/ebwt/ram1.rev.2.ebwt b/tests/testdata/counter/validation/ebwt/ram1.rev.2.ebwt
diff --git a/tests/testdata/counter/validation/genome/genome.fasta b/tests/testdata/counter/validation/genome/genome.fasta
@@ -0,0 +1,6 @@
+>chr1
+ATGC
+>chr2
+CGTA
+>chr3
+ATATAT
diff --git a/tests/unit_test_helpers.py b/tests/unit_test_helpers.py
@@ -135,7 +135,8 @@ def reassemble_gz_w(mock_calls):
 
 # Converts strand character to a boolean value
 def strand_to_bool(strand):
-    assert strand in ['+', '-']
+    assert strand in ('+', '-', None)
+    if strand is None: return None
     return strand == '+'
 
 

diff --git a/tests/unit_tests_collapser.py b/tests/unit_tests_collapser.py
@@ -35,9 +35,9 @@ def setUpClass(self):
         self.output = {"file": {
             "out": {"exists": "mockPrefixExists_collapsed.fa", "dne": "mockPrefixDNE_collapsed.fa"},
             "low": {"exists": "mockPrefixExists_collapsed_lowcounts.fa", "dne": "mockPrefixDNE_collapsed_lowcounts.fa"}}}
-        self.output["msg"] = {k: "Collapser critical error: "+v['exists']+" already exists.\n"
+        self.output["msg"] = {k: "tiny-collapse critical error: "+v['exists']+" already exists.\n"
                               for k,v in self.output['file'].items()}
-        self.prefix_required_msg = "Collapser critical error: an output file must be specified.\n"
+        self.prefix_required_msg = "tiny-collapse critical error: an output file must be specified.\n"
 
         # Min-length fastq/fasta (single record)
         self.min_seq = "GTTTTGTTGGGCTTTCGCGAAGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCT"
@@ -351,7 +351,7 @@ def test_collapser_command(self):
             with ShellCapture(f'tiny-collapse -i /dev/null -o {prefix}') as test:
                 test()
                 self.assertEqual(test.get_stdout(), '')
-                self.assertIn(f"Collapser critical error: {expected_out_file} already exists.\n", test.get_stderr())
+                self.assertIn(f"tiny-collapse critical error: {expected_out_file} already exists.\n", test.get_stderr())
                 # (Very) roughly tests that the output file of the last test (same prefix) was not modified by this call
                 self.assertEqual(test_collapsed_fa_size, os.path.getsize(expected_out_file))
         finally:

diff --git a/tests/unit_tests_hts_parsing.py b/tests/unit_tests_hts_parsing.py
@@ -204,7 +204,7 @@ def test_ref_tables_missing_id_attribute(self):
         gff_row_without_id = helpers.read(self.short_gff_file).replace('ID=Gene:WBGene00023193;', '')
         mock_reader = mock_open(read_data=gff_row_without_id)
 
-        expected_err = f"Feature WBGene00023193 does not contain an ID attribute.\n"
+        expected_err = f"Feature WBGene00023193 does not have an ID attribute.\n"
         expected_err += f"Error occurred on line 1 of {self.short_gff_file}"
 
         with patch('tiny.rna.counter.hts_parsing.HTSeq.utils.open', new=mock_reader):
@@ -732,5 +732,59 @@ def test_CaseInsensitiveAttrs_contains_ident_wildcard(self):
         self.assertFalse(cia.contains_ident(("attrkey4", Wildcard())))
         self.assertFalse(cia.contains_ident((Wildcard(), "attrval7")))
 
+
+@unittest.skip("Long-running test, execute manually")
+class GenomeParsingTests(unittest.TestCase):
+    """Runs full-scale, unmodified GFF3/GTF genomes for select species through the ReferenceTables class"""
+
+    @classmethod
+    def setUpClass(self):
+        import requests
+        release = "54"
+        baseurl = "http://ftp.ensemblgenomes.org/pub/release-"
+        self.data_dir = "./testdata/local_only/gff/"
+        if not os.path.exists(self.data_dir): os.makedirs(self.data_dir)
+
+        self.urls = {
+            'Arabidopsis':
+                baseurl + '%s/plants/{}/arabidopsis_thaliana/Arabidopsis_thaliana.TAIR10.%s.{}.gz' % (release, release),
+            'C. elegans':
+                baseurl + '%s/metazoa/{}/caenorhabditis_elegans/Caenorhabditis_elegans.WBcel235.%s.{}.gz' % (release, release)
+        }
+
+        self.genomes = {
+            species: os.path.join(self.data_dir, os.path.basename(url))
+            for species, url in self.urls.items()
+        }
+
+        # Download genome files if they aren't already in the data dir
+        for ftype in ('gff3', 'gtf'):
+            for species, file_template in self.genomes.items():
+                file = file_template.format(ftype)
+                if os.path.isfile(file): continue
+
+                print(f"Downloading {ftype} genome for {species} from Ensembl...")
+                url = self.urls[species].format(ftype, ftype)
+                with requests.get(url, stream=True) as r:
+                    r.raise_for_status()
+                    with open(file, 'wb') as f:
+                        for chunk in r.iter_content(chunk_size=16384):
+                            f.write(chunk)
+
+    """Does ReferenceTables.get() process all genomes without throwing any errors?"""
+    def test_gff_megazord(self):
+        print("Running GFF Megazord test. This will take a long time...")
+
+        # Single rule with all wildcard selectors, but only Identity is actually relevant within ReferenceTables
+        rules = [{'Identity': ('', ''), 'Tag': '', 'Hierarchy': 0, 'Overlap': 'partial', 'Strand': '', 'nt5end': '', 'Length': ''}]
+        files = {gff.format(ftype): [] for gff in self.genomes.values() for ftype in ('gff3', 'gtf')}
+
+        fs = FeatureSelector(rules, LibraryStats())
+        rt = ReferenceTables(files, fs)
+
+        # The test is passed if this command
+        # completes without throwing errors.
+        rt.get()
+
 if __name__ == '__main__':
     unittest.main()
-Original file line number
+Diff line change
@@ -0,0 +1,6 @@
+    >chr1
+    ATGC
+    >chr2
+    CGTA
+    >chr3
+    ATATAT