SecureDNA · lynn · Nov 14, 2022 · Nov 11, 2022 · Nov 11, 2022 · Nov 11, 2022
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,5 @@
+/dist
+/quickdna/*.pyd
 /target
 /.hypothesis
 /.vscode

diff --git a/README.md b/README.md
@@ -6,6 +6,8 @@ Quickdna is a simple, fast library for working with DNA sequences. It is up to 1
 translation tasks, in part because it uses a native Rust module (via PyO3) for the translation. However, it exposes
 an easy-to-use, type-annotated API that should still feel familiar for Biopython users.
 
+⚠ *Quickdna is "pre-1.0" software. Its API is still evolving. For now, if you're interested in using quickdna, we suggest you depend on an [exact version](https://python-poetry.org/docs/dependency-specification/#exact-requirements) or [git `rev`](https://python-poetry.org/docs/dependency-specification/#git-dependencies), so that new releases don't break your code.*
+
 ```python
 # These are the two main library types. Unlike Biopython, DnaSequence and
 # ProteinSequence are distinct, though they share a common BaseSequence base class
@@ -55,12 +57,20 @@ ProteinSequence(seq='WLNSLD'), ProteinSequence(seq='G*IVLI'))
 # There is a similar method, `translate_self_frames`, that only returns the
 # (up to 3) translated frames for this direction, without the reverse complement
 
-# The IUPAC ambiguity code 'N' is supported as well.
+# The IUPAC ambiguity codes are supported as well.
 # Codons with N will translate to a specific amino acid if it is unambiguous,
 # such as GGN -> G, or the ambiguous amino acid code 'X' if there are multiple
 # possible translations.
 >>> DnaSequence("GGNATN").translate()
 ProteinSequence(seq='GX')
+
+# The fine-grained ambiguity codes like "R = A or G" are accepted too, and
+# translation results are the same as Biopython. In the output, amino acid
+# ambiguity code 'B' means "either asparagine or aspartic acid" (N or D).
+>>> DnaSequence("RAT").translate()
+ProteinSequence(seq='B')
+
+# To disallow ambiguity codes in translation, try: `.translate(strict=True)`
 ```
 
 ## Benchmarks
@@ -91,10 +101,6 @@ reverse_complement_biopython(covid_genome) | 0.02928ms / iter | 121.55%
   * It's not yet 1.0 -- the API is liable to change in the future.
   * It doesn't support reading FASTA files or many of the other tasks Biopython can do,
     so you'll probably end up still using Biopython or something else to do those tasks.
-  * It doesn't support the (rarer) IUPAC ambiguity codes like B for non-A nucleotides,
-    instead only supporting the general N ambiguity code.
-      * If support for these codes is important to you, please make an issue! It may be possible
-        to support them, it just isn't a priority right now.
 
 ## Installation
 

diff --git a/quickdna/__init__.py b/quickdna/__init__.py
@@ -1,6 +1,6 @@
 import typing as ty
 
-from .quickdna import _translate, _reverse_complement  # type: ignore
+from .quickdna import _translate, _translate_strict, _reverse_complement, _reverse_complement_strict  # type: ignore
 
 T = ty.TypeVar("T", bound="BaseSequence")
 
@@ -102,22 +102,31 @@ class ProteinSequence(BaseSequence):
 
 class DnaSequence(BaseSequence):
     """
-    A DNA sequence is a sequence of A, T, C, G, or N (ambiguous) nucleotide ASCII bytes.
+    A DNA sequence is a sequence of A, T, C, G, or IUPAC nucleotide ambiguity code ASCII bytes.
     """
 
-    def translate(self, table: int = 1) -> ProteinSequence:
+    def translate(self, table: int = 1, strict: bool = False) -> ProteinSequence:
         """
         Translate a DNA sequence into a protein sequence, using the specified
         NCBI table ID.
 
         Raises ValueError if the table argument is invalid or any characters in
         this sequence are invalid nucleotides.
+
+        If `strict` is true, then the input must be all `ATCG`, with no
+        ambiguous nucleotides. Otherwise, a ValueError is raised.
         """
 
-        seq = _translate(table, self._seq)
+        if strict:
+            seq = _translate_strict(table, self._seq)
+        else:
+            seq = _translate(table, self._seq)
+
         return ProteinSequence(seq)
 
-    def translate_self_frames(self, table: int = 1) -> ty.List[ProteinSequence]:
+    def translate_self_frames(
+        self, table: int = 1, strict: bool = False
+    ) -> ty.List[ProteinSequence]:
         """
         Translate this DNA sequence into up to 3 protein sequences, one for each possible
         reading frame on this sense.
@@ -127,27 +136,32 @@ def translate_self_frames(self, table: int = 1) -> ty.List[ProteinSequence]:
         and a sequence of length 2 has none.
 
         Can raise ValueError, see `self.translate()`
+
+        If `strict` is true, then the input must be all `ATCG`, with no
+        ambiguous nucleotides. Otherwise, a ValueError is raised.
         """
 
         if len(self) >= 5:
             return [
-                self.translate(table),
-                self[1:].translate(table),
-                self[2:].translate(table),
+                self.translate(table, strict),
+                self[1:].translate(table, strict),
+                self[2:].translate(table, strict),
             ]
         elif len(self) == 4:
             return [
-                self.translate(table),
-                self[1:].translate(table),
+                self.translate(table, strict),
+                self[1:].translate(table, strict),
             ]
         elif len(self) == 3:
             return [
-                self.translate(table),
+                self.translate(table, strict),
             ]
         else:
             return []
 
-    def translate_all_frames(self, table: int = 1) -> ty.List[ProteinSequence]:
+    def translate_all_frames(
+        self, table: int = 1, strict: bool = False
+    ) -> ty.List[ProteinSequence]:
         """
         Translate this DNA sequence into at most 6 protein sequences, one for each possible
         reading frame on this sense and the reverse complement.
@@ -157,21 +171,32 @@ def translate_all_frames(self, table: int = 1) -> ty.List[ProteinSequence]:
         and a sequence of length 2 has none.
 
         Can raise ValueError, see `self.translate()`
+
+        If `strict` is true, then the input must be all `ATCG`, with no
+        ambiguous nucleotides. Otherwise, a ValueError is raised.
         """
 
         return [
-            *self.translate_self_frames(table=table),
-            *self.reverse_complement().translate_self_frames(table=table),
+            *self.translate_self_frames(table=table, strict=strict),
+            *self.reverse_complement().translate_self_frames(
+                table=table, strict=strict
+            ),
         ]
 
-    def reverse_complement(self) -> "DnaSequence":
+    def reverse_complement(self, strict: bool = False) -> "DnaSequence":
         """
         Takes the reverse complement of a DNA sequence.
 
         Raises ValueError if any character in this sequence is an invalid nucleotide.
+
+        If `strict` is true, then the input must be all `ATCG`, with no
+        ambiguous nucleotides. Otherwise, a ValueError is raised.
         """
 
-        seq = _reverse_complement(self._seq)
+        if strict:
+            seq = _reverse_complement_strict(self._seq)
+        else:
+            seq = _reverse_complement(self._seq)
         return DnaSequence(seq)
 
 

diff --git a/src/bin/gen_table.rs b/src/bin/gen_table.rs
@@ -2,11 +2,11 @@
 //! Should be run from the repository root.
 //! Translation table via Wikipedia.
 
-use std::fs;
+use std::{collections::HashSet, fs};
 
 use quickdna::{
     trans_table::{CodonIdx, TranslationTable},
-    Codon, Nucleotide,
+    CodonAmbiguous, Nucleotide, NucleotideAmbiguous, NucleotideLike,
 };
 
 // via https://en.wikipedia.org/wiki/List_of_genetic_codes
@@ -77,45 +77,36 @@ GGA || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G |
 GGG || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G
 ";
 
-fn codon_first_n_index(codon: Codon) -> Option<usize> {
-    let codon: [Nucleotide; 3] = codon.into();
-    codon
-        .iter()
-        .enumerate()
-        .find_map(|(idx, &n)| if n == Nucleotide::N { Some(idx) } else { None })
-}
-
 fn ambiguous_codon_protein(
-    codon: Codon,
+    codon: CodonAmbiguous,
     table_idx: usize,
-    translation_tables: [u8; TranslationTable::LOOKUP_SIZE],
+    translation_tables: &[u8; TranslationTable::LOOKUP_SIZE],
 ) -> u8 {
-    match codon.count(Nucleotide::N) {
-        0 => panic!("not an ambigious codon: {}", codon),
-        1 => {
-            // try and see if all possible permutations of this codon map to the same protein
-            let mut seen_protein = None;
-            let n_index = codon_first_n_index(codon).unwrap();
-
-            for possible in [Nucleotide::A, Nucleotide::T, Nucleotide::C, Nucleotide::G] {
-                let mut candidate: [Nucleotide; 3] = codon.into();
-                candidate[n_index] = possible;
-                let codon_idx: usize = CodonIdx::from(candidate).into();
+    let mut seen_proteins: HashSet<u8> = HashSet::new();
+
+    for &n1 in codon.0[0].possibilities() {
+        for &n2 in codon.0[1].possibilities() {
+            for &n3 in codon.0[2].possibilities() {
+                let codon_idx: usize = CodonIdx::from([n1, n2, n3]).into();
                 let candidate_protein =
                     translation_tables[table_idx * TranslationTable::CODONS_PER_TABLE + codon_idx];
 
-                match seen_protein {
-                    None => {
-                        seen_protein = Some(candidate_protein);
-                    }
-                    Some(p) if p == candidate_protein => {}
-                    _ => return b'X',
+                seen_proteins.insert(candidate_protein);
+                if seen_proteins.len() >= 3 {
+                    return b'X';
                 }
             }
-
-            seen_protein.unwrap()
         }
-        // all codons with 2 or more Ns always map to X
+    }
+
+    let mut seen_vec: Vec<u8> = seen_proteins.into_iter().collect();
+    seen_vec.sort();
+
+    match seen_vec.as_slice() {
+        [single] => *single,
+        [b'D', b'N'] => b'B', // B = Asx = Asparagine or Aspartic acid
+        [b'E', b'Q'] => b'Z', // Z = Glx = Glutamine or Glutamic acid
+        [b'I', b'L'] => b'J', // J = Xle = Leucine or Isoleucine
         _ => b'X',
     }
 }
@@ -141,15 +132,16 @@ fn gen_translation_tables() -> [u8; TranslationTable::LOOKUP_SIZE] {
     }
 
     // setup ambiguous codons
-    for &a in Nucleotide::NUCLEOTIDES.iter() {
-        for &b in Nucleotide::NUCLEOTIDES.iter() {
-            for &c in Nucleotide::NUCLEOTIDES.iter() {
-                if a == Nucleotide::N || b == Nucleotide::N || c == Nucleotide::N {
-                    let codon = Codon([a, b, c]);
+    for a in NucleotideAmbiguous::ALL {
+        for b in NucleotideAmbiguous::ALL {
+            for c in NucleotideAmbiguous::ALL {
+                if a.is_ambiguous() || b.is_ambiguous() || c.is_ambiguous() {
+                    let codon = CodonAmbiguous([a, b, c]);
                     let codon_idx: usize = CodonIdx::from(codon).into();
 
                     for table_idx in 0..TranslationTable::N_TRANS_TABLES {
-                        let protein = ambiguous_codon_protein(codon, table_idx, translation_tables);
+                        let protein =
+                            ambiguous_codon_protein(codon, table_idx, &translation_tables);
                         translation_tables
                             [table_idx * TranslationTable::CODONS_PER_TABLE + codon_idx] = protein;
                     }
@@ -164,4 +156,5 @@ fn gen_translation_tables() -> [u8; TranslationTable::LOOKUP_SIZE] {
 fn main() {
     let tables = gen_translation_tables();
     fs::write("src/tables.dat", tables).expect("failed to write tables.dat");
+    println!("Wrote codon translation table data to src/tables.dat.");
 }
diff --git a/src/errors.rs b/src/errors.rs
@@ -8,6 +8,8 @@ pub enum TranslationError {
     NonAsciiChar(char),
     #[error("bad nucleotide: {:?}", .0)]
     BadNucleotide(char),
+    #[error("unexpected ambiguous nucleotide: {:?}", .0)]
+    UnexpectedAmbiguousNucleotide(char),
     #[error("not a ncbi translation table: {}", .0)]
     BadTranslationTable(u8),
 }