Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
/dist
/quickdna/*.pyd
/target
/.hypothesis
/.vscode
Expand Down
16 changes: 11 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ Quickdna is a simple, fast library for working with DNA sequences. It is up to 1
translation tasks, in part because it uses a native Rust module (via PyO3) for the translation. However, it exposes
an easy-to-use, type-annotated API that should still feel familiar for Biopython users.

⚠ *Quickdna is "pre-1.0" software. Its API is still evolving. For now, if you're interested in using quickdna, we suggest you depend on an [exact version](https://python-poetry.org/docs/dependency-specification/#exact-requirements) or [git `rev`](https://python-poetry.org/docs/dependency-specification/#git-dependencies), so that new releases don't break your code.*

```python
# These are the two main library types. Unlike Biopython, DnaSequence and
# ProteinSequence are distinct, though they share a common BaseSequence base class
Expand Down Expand Up @@ -55,12 +57,20 @@ ProteinSequence(seq='WLNSLD'), ProteinSequence(seq='G*IVLI'))
# There is a similar method, `translate_self_frames`, that only returns the
# (up to 3) translated frames for this direction, without the reverse complement

# The IUPAC ambiguity code 'N' is supported as well.
# The IUPAC ambiguity codes are supported as well.
# Codons with N will translate to a specific amino acid if it is unambiguous,
# such as GGN -> G, or the ambiguous amino acid code 'X' if there are multiple
# possible translations.
>>> DnaSequence("GGNATN").translate()
ProteinSequence(seq='GX')

# The fine-grained ambiguity codes like "R = A or G" are accepted too, and
# translation results are the same as Biopython. In the output, amino acid
# ambiguity code 'B' means "either asparagine or aspartic acid" (N or D).
>>> DnaSequence("RAT").translate()
ProteinSequence(seq='B')

# To disallow ambiguity codes in translation, try: `.translate(strict=True)`
```

## Benchmarks
Expand Down Expand Up @@ -91,10 +101,6 @@ reverse_complement_biopython(covid_genome) | 0.02928ms / iter | 121.55%
* It's not yet 1.0 -- the API is liable to change in the future.
* It doesn't support reading FASTA files or many of the other tasks Biopython can do,
so you'll probably end up still using Biopython or something else to do those tasks.
* It doesn't support the (rarer) IUPAC ambiguity codes like B for non-A nucleotides,
instead only supporting the general N ambiguity code.
* If support for these codes is important to you, please make an issue! It may be possible
to support them, it just isn't a priority right now.

## Installation

Expand Down
57 changes: 41 additions & 16 deletions quickdna/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import typing as ty

from .quickdna import _translate, _reverse_complement # type: ignore
from .quickdna import _translate, _translate_strict, _reverse_complement, _reverse_complement_strict # type: ignore

T = ty.TypeVar("T", bound="BaseSequence")

Expand Down Expand Up @@ -102,22 +102,31 @@ class ProteinSequence(BaseSequence):

class DnaSequence(BaseSequence):
"""
A DNA sequence is a sequence of A, T, C, G, or N (ambiguous) nucleotide ASCII bytes.
A DNA sequence is a sequence of A, T, C, G, or IUPAC nucleotide ambiguity code ASCII bytes.
"""

def translate(self, table: int = 1) -> ProteinSequence:
def translate(self, table: int = 1, strict: bool = False) -> ProteinSequence:
"""
Translate a DNA sequence into a protein sequence, using the specified
NCBI table ID.

Raises ValueError if the table argument is invalid or any characters in
this sequence are invalid nucleotides.

If `strict` is true, then the input must be all `ATCG`, with no
ambiguous nucleotides. Otherwise, a ValueError is raised.
"""

seq = _translate(table, self._seq)
if strict:
seq = _translate_strict(table, self._seq)
else:
seq = _translate(table, self._seq)

return ProteinSequence(seq)

def translate_self_frames(self, table: int = 1) -> ty.List[ProteinSequence]:
def translate_self_frames(
self, table: int = 1, strict: bool = False
) -> ty.List[ProteinSequence]:
"""
Translate this DNA sequence into up to 3 protein sequences, one for each possible
reading frame on this sense.
Expand All @@ -127,27 +136,32 @@ def translate_self_frames(self, table: int = 1) -> ty.List[ProteinSequence]:
and a sequence of length 2 has none.

Can raise ValueError, see `self.translate()`

If `strict` is true, then the input must be all `ATCG`, with no
ambiguous nucleotides. Otherwise, a ValueError is raised.
"""

if len(self) >= 5:
return [
self.translate(table),
self[1:].translate(table),
self[2:].translate(table),
self.translate(table, strict),
self[1:].translate(table, strict),
self[2:].translate(table, strict),
]
elif len(self) == 4:
return [
self.translate(table),
self[1:].translate(table),
self.translate(table, strict),
self[1:].translate(table, strict),
]
elif len(self) == 3:
return [
self.translate(table),
self.translate(table, strict),
]
else:
return []

def translate_all_frames(self, table: int = 1) -> ty.List[ProteinSequence]:
def translate_all_frames(
self, table: int = 1, strict: bool = False
) -> ty.List[ProteinSequence]:
"""
Translate this DNA sequence into at most 6 protein sequences, one for each possible
reading frame on this sense and the reverse complement.
Expand All @@ -157,21 +171,32 @@ def translate_all_frames(self, table: int = 1) -> ty.List[ProteinSequence]:
and a sequence of length 2 has none.

Can raise ValueError, see `self.translate()`

If `strict` is true, then the input must be all `ATCG`, with no
ambiguous nucleotides. Otherwise, a ValueError is raised.
"""

return [
*self.translate_self_frames(table=table),
*self.reverse_complement().translate_self_frames(table=table),
*self.translate_self_frames(table=table, strict=strict),
*self.reverse_complement().translate_self_frames(
table=table, strict=strict
),
]

def reverse_complement(self) -> "DnaSequence":
def reverse_complement(self, strict: bool = False) -> "DnaSequence":
"""
Takes the reverse complement of a DNA sequence.

Raises ValueError if any character in this sequence is an invalid nucleotide.

If `strict` is true, then the input must be all `ATCG`, with no
ambiguous nucleotides. Otherwise, a ValueError is raised.
"""

seq = _reverse_complement(self._seq)
if strict:
seq = _reverse_complement_strict(self._seq)
else:
seq = _reverse_complement(self._seq)
return DnaSequence(seq)


Expand Down
69 changes: 31 additions & 38 deletions src/bin/gen_table.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
//! Should be run from the repository root.
//! Translation table via Wikipedia.

use std::fs;
use std::{collections::HashSet, fs};

use quickdna::{
trans_table::{CodonIdx, TranslationTable},
Codon, Nucleotide,
CodonAmbiguous, Nucleotide, NucleotideAmbiguous, NucleotideLike,
};

// via https://en.wikipedia.org/wiki/List_of_genetic_codes
Expand Down Expand Up @@ -77,45 +77,36 @@ GGA || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G |
GGG || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G || G
";

fn codon_first_n_index(codon: Codon) -> Option<usize> {
let codon: [Nucleotide; 3] = codon.into();
codon
.iter()
.enumerate()
.find_map(|(idx, &n)| if n == Nucleotide::N { Some(idx) } else { None })
}

fn ambiguous_codon_protein(
codon: Codon,
codon: CodonAmbiguous,
table_idx: usize,
translation_tables: [u8; TranslationTable::LOOKUP_SIZE],
translation_tables: &[u8; TranslationTable::LOOKUP_SIZE],
) -> u8 {
match codon.count(Nucleotide::N) {
0 => panic!("not an ambigious codon: {}", codon),
1 => {
// try and see if all possible permutations of this codon map to the same protein
let mut seen_protein = None;
let n_index = codon_first_n_index(codon).unwrap();

for possible in [Nucleotide::A, Nucleotide::T, Nucleotide::C, Nucleotide::G] {
let mut candidate: [Nucleotide; 3] = codon.into();
candidate[n_index] = possible;
let codon_idx: usize = CodonIdx::from(candidate).into();
let mut seen_proteins: HashSet<u8> = HashSet::new();

for &n1 in codon.0[0].possibilities() {
for &n2 in codon.0[1].possibilities() {
for &n3 in codon.0[2].possibilities() {
let codon_idx: usize = CodonIdx::from([n1, n2, n3]).into();
let candidate_protein =
translation_tables[table_idx * TranslationTable::CODONS_PER_TABLE + codon_idx];

match seen_protein {
None => {
seen_protein = Some(candidate_protein);
}
Some(p) if p == candidate_protein => {}
_ => return b'X',
seen_proteins.insert(candidate_protein);
if seen_proteins.len() >= 3 {
return b'X';
}
}

seen_protein.unwrap()
}
// all codons with 2 or more Ns always map to X
}

let mut seen_vec: Vec<u8> = seen_proteins.into_iter().collect();
seen_vec.sort();

match seen_vec.as_slice() {
[single] => *single,
[b'D', b'N'] => b'B', // B = Asx = Asparagine or Aspartic acid
[b'E', b'Q'] => b'Z', // Z = Glx = Glutamine or Glutamic acid
[b'I', b'L'] => b'J', // J = Xle = Leucine or Isoleucine
_ => b'X',
}
}
Expand All @@ -141,15 +132,16 @@ fn gen_translation_tables() -> [u8; TranslationTable::LOOKUP_SIZE] {
}

// setup ambiguous codons
for &a in Nucleotide::NUCLEOTIDES.iter() {
for &b in Nucleotide::NUCLEOTIDES.iter() {
for &c in Nucleotide::NUCLEOTIDES.iter() {
if a == Nucleotide::N || b == Nucleotide::N || c == Nucleotide::N {
let codon = Codon([a, b, c]);
for a in NucleotideAmbiguous::ALL {
for b in NucleotideAmbiguous::ALL {
for c in NucleotideAmbiguous::ALL {
if a.is_ambiguous() || b.is_ambiguous() || c.is_ambiguous() {
let codon = CodonAmbiguous([a, b, c]);
let codon_idx: usize = CodonIdx::from(codon).into();

for table_idx in 0..TranslationTable::N_TRANS_TABLES {
let protein = ambiguous_codon_protein(codon, table_idx, translation_tables);
let protein =
ambiguous_codon_protein(codon, table_idx, &translation_tables);
translation_tables
[table_idx * TranslationTable::CODONS_PER_TABLE + codon_idx] = protein;
}
Expand All @@ -164,4 +156,5 @@ fn gen_translation_tables() -> [u8; TranslationTable::LOOKUP_SIZE] {
fn main() {
let tables = gen_translation_tables();
fs::write("src/tables.dat", tables).expect("failed to write tables.dat");
println!("Wrote codon translation table data to src/tables.dat.");
}
2 changes: 2 additions & 0 deletions src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ pub enum TranslationError {
NonAsciiChar(char),
#[error("bad nucleotide: {:?}", .0)]
BadNucleotide(char),
#[error("unexpected ambiguous nucleotide: {:?}", .0)]
UnexpectedAmbiguousNucleotide(char),
#[error("not a ncbi translation table: {}", .0)]
BadTranslationTable(u8),
}
Loading