diff --git a/.gitignore b/.gitignore index e0714aa..39a28a7 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,7 @@ pip-log.txt # Unit test / coverage reports .coverage .tox +.cache # Translations *.mo diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000..1756fdc --- /dev/null +++ b/.pylintrc @@ -0,0 +1,2 @@ +[MESSAGES CONTROL] +disable=C0111,C0103 diff --git a/alignment/sequencealigner.py b/alignment/sequencealigner.py index ff79a1a..9d1ae27 100644 --- a/alignment/sequencealigner.py +++ b/alignment/sequencealigner.py @@ -1,3 +1,6 @@ +from itertools import zip_longest, islice +from collections import deque + from six import text_type from six.moves import range @@ -9,7 +12,7 @@ from abc import abstractmethod from .sequence import GAP_CODE -from .sequence import EncodedSequence +from .sequence import EncodedSequence, Sequence # Scoring --------------------------------------------------------------------- @@ -403,3 +406,147 @@ def backtraceFrom(self, first, second, f, i, j, alignments, alignment): self.backtraceFrom(first, second, f, i - 1, j, alignments, alignment) alignment.pop() + +class LinkedListNode(object): + def __init__(self, data, next_node=None): + self.data = data + self.next_node = next_node + + def __str__(self): + if self.next_node is not None: + return str(self.data) + ' -> ' + str(self.next_node) + else: + return str(self.data) + + def __iter__(self): + yield self.data + next_node = self.next_node + while next_node is not None: + yield next_node.data + next_node = next_node.next_node + +def _path_positions_to_indices(positions): + return [ + p - 1 if p_next is None or p_next != p else None + for p, p_next in zip_longest(positions, positions[1:]) + ] + +def _path_to_indices(path): + positions1, positions2 = zip(*path) + return ( + _path_positions_to_indices(positions1), + _path_positions_to_indices(positions2) + ) + +def _map_indices(indices, seq, default_value=None): + return [seq[i] if i is not None else default_value for i in indices] + +def _path_to_alignment(score_matrix, path, s1, s2, gap=GAP_CODE): + indices1, indices2 = _path_to_indices(path) + matched_seq1 = _map_indices(indices1, s1) + matched_seq2 = _map_indices(indices2, s2) + cum_scores = [score_matrix[i][j] for i, j in path] + scores = [score2 - score1 for score1, score2 in zip([0] + cum_scores, cum_scores)] + identical_count = sum([ + 1 if i1 is not None and i2 is not None and s1[i1] == s2[i2] else 0 + for i1, i2 in zip(indices1, indices2) + ]) + similar_count = sum([ + 1 if score > 0 else 0 + for score in scores + ]) + gap_count = sum([ + 1 if i1 is None or i2 is None else 0 + for i1, i2 in zip(indices1, indices2) + ]) + total_score = sum(scores) + if isinstance(s1, EncodedSequence): + first = EncodedSequence([c or gap for c in matched_seq1], id=s1.id) + second = EncodedSequence([c or gap for c in matched_seq2], id=s2.id) + else: + first = Sequence([c or gap for c in matched_seq1]) + second = Sequence([c or gap for c in matched_seq2]) + seq_alignment = SequenceAlignment(first, second) + seq_alignment.scores = scores + seq_alignment.score = total_score + seq_alignment.identicalCount = identical_count + seq_alignment.similarCount = similar_count + seq_alignment.gapCount = gap_count + seq_alignment.first_indices = indices1 + seq_alignment.second_indices = indices2 + return seq_alignment + +class SmithWatermanAligner(object): + def __init__(self, scoring, gap_score): + self.scoring = scoring + self.gap_score = gap_score + + def computeAlignmentMatrix(self, first, second): + m = len(first) + 1 + n = len(second) + 1 + f = numpy.zeros((m, n), int) + for i in range(1, m): + for j in range(1, n): + # Match elements. + ab = f[i - 1, j - 1] \ + + self.scoring(first[i - 1], second[j - 1]) + + # Gap on sequenceA. + ga = f[i, j - 1] + self.gap_score + + # Gap on sequenceB. + gb = f[i - 1, j] + self.gap_score + + f[i, j] = max(0, ab, ga, gb) + return f + + def _traceback(self, score_matrix, start_locs): + pending_roots = deque([ + LinkedListNode(tuple(loc)) + for loc in start_locs + ]) + while len(pending_roots) > 0: + n = pending_roots.pop() + i, j = n.data + next_locs = self._next_locs(score_matrix, i, j) + if len(next_locs) == 0: + yield n + else: + pending_roots.extend([ + LinkedListNode(next_loc, n) + for next_loc in next_locs + ]) + + def _next_locs(self, score_matrix, i, j): + diag_score = score_matrix[i - 1][j - 1] + up_score = score_matrix[i - 1][j] + left_score = score_matrix[i][j - 1] + max_score = max(diag_score, up_score, left_score) + if max_score == 0 or diag_score == 0: + return [] + if diag_score == max_score: + return [(i - 1, j - 1)] + locs = [] + if up_score == max_score: + locs.append((i - 1, j)) + if left_score == max_score: + locs.append((i, j - 1)) + return locs + + def align(self, s1, s2, backtrace=True, gap=GAP_CODE, limit=None): + score_matrix = self.computeAlignmentMatrix(s1, s2) + max_score = score_matrix.max() + + if not backtrace: + return max_score + + max_score_loc = numpy.argwhere(score_matrix == max_score) + paths = self._traceback(score_matrix, max_score_loc) + if limit is not None: + paths = islice(paths, limit) + seq_alignments = [ + _path_to_alignment(score_matrix, path, s1, s2, gap) + for path in paths + ] + score = seq_alignments[0].score if len(seq_alignments) > 0 else 0 + return score, seq_alignments diff --git a/alignment/sequencealigner_test.py b/alignment/sequencealigner_test.py new file mode 100644 index 0000000..34ecb89 --- /dev/null +++ b/alignment/sequencealigner_test.py @@ -0,0 +1,290 @@ +""" +Unit test +""" +from .vocabulary import Vocabulary + +from .sequence import Sequence + +from .sequencealigner import ( + GlobalSequenceAligner, + StrictGlobalSequenceAligner, + LocalSequenceAligner, + SmithWatermanAligner, + SimpleScoring +) + +DEFAULT_MATCH_SCORE = 3 +DEFAULT_MISMATCH_SCORE = -1 +DEFAULT_GAP_SCORE = -2 + +def _align( + first, + second, + aligner, + **kwargs): + vocab = Vocabulary() + score, encodeds = aligner.align( + vocab.encodeSequence(Sequence(first)), + vocab.encodeSequence(Sequence(second)), + backtrace=True, + **kwargs + ) + return score, [vocab.decodeSequenceAlignment(encoded) for encoded in encodeds] + +def _global_align( + first, second, + match_score=DEFAULT_MATCH_SCORE, + mismatch_score=DEFAULT_MISMATCH_SCORE, + gap_score=DEFAULT_GAP_SCORE): + scoring = SimpleScoring(match_score, mismatch_score) + return _align(first, second, aligner=GlobalSequenceAligner(scoring, gap_score)) + +def _strict_global_align( + first, second, + match_score=DEFAULT_MATCH_SCORE, + mismatch_score=DEFAULT_MISMATCH_SCORE, + gap_score=DEFAULT_GAP_SCORE): + scoring = SimpleScoring(match_score, mismatch_score) + return _align(first, second, aligner=StrictGlobalSequenceAligner(scoring, gap_score)) + +def _local_align( + first, second, + match_score=DEFAULT_MATCH_SCORE, + mismatch_score=DEFAULT_MISMATCH_SCORE, + gap_score=DEFAULT_GAP_SCORE): + scoring = SimpleScoring(match_score, mismatch_score) + return _align(first, second, aligner=LocalSequenceAligner(scoring, gap_score)) + + +class CommonSequenceAlignerTests(object): + def _align(self, _first, _second): + raise Exception('not implemented') + + # def test_no_match(self): + # score, alignments = self._align('ab', 'xy') + # assert len(alignments) == 0 + # assert score == 0 + + def test_exact_match(self): + score, alignments = self._align('ab', 'ab') + print("alignment:", alignments[0]) + assert len(alignments) == 1 + assert str(alignments[0].first) == 'a b' + assert str(alignments[0].second) == 'a b' + assert alignments[0].percentIdentity() == 100.0 + assert alignments[0].percentSimilarity() == 100.0 + assert alignments[0].percentGap() == 0.0 + assert score == DEFAULT_MATCH_SCORE * 2 + assert alignments[0].score == score + + def test_exact_left_partial_match(self): + score, alignments = self._align('xaby', 'ab') + assert len(alignments) == 1 + assert str(alignments[0].first) == 'a b' + assert str(alignments[0].second) == 'a b' + assert alignments[0].score == score + assert alignments[0].percentIdentity() == 100.0 + assert alignments[0].percentSimilarity() == 100.0 + assert alignments[0].percentGap() == 0.0 + assert score == DEFAULT_MATCH_SCORE * 2 + + def test_exact_right_partial_match(self): + score, alignments = self._align('ab', 'xaby') + assert len(alignments) == 1 + assert str(alignments[0].first) == 'a b' + assert str(alignments[0].second) == 'a b' + assert alignments[0].percentIdentity() == 100.0 + assert alignments[0].percentSimilarity() == 100.0 + assert alignments[0].percentGap() == 0.0 + assert score == DEFAULT_MATCH_SCORE * 2 + assert alignments[0].score == score + + def test_exact_left_partial_match_with_gap(self): + score, alignments = self._align('xaby', 'aob') + assert len(alignments) == 1 + assert str(alignments[0].first) == 'a - b' + assert str(alignments[0].second) == 'a o b' + assert alignments[0].percentIdentity() == 2 / 3 * 100.0 + assert alignments[0].percentSimilarity() == 2 / 3 * 100.0 + assert alignments[0].percentGap() == 1 / 3 * 100.0 + assert score == DEFAULT_MATCH_SCORE * 2 + DEFAULT_GAP_SCORE + assert alignments[0].score == score + + def test_exact_left_partial_match_with_mismatch(self): + score, alignments = self._align('xamby', 'aob') + assert len(alignments) == 1 + assert str(alignments[0].first) == 'a m b' + assert str(alignments[0].second) == 'a o b' + assert alignments[0].percentIdentity() == 2 / 3 * 100.0 + assert alignments[0].percentSimilarity() == 2 / 3 * 100.0 + assert alignments[0].percentGap() == 0.0 + assert score == DEFAULT_MATCH_SCORE * 2 + DEFAULT_MISMATCH_SCORE + assert alignments[0].score == score + + +class TestGlobalSequenceAligner(CommonSequenceAlignerTests): + def _align(self, first, second): + return _global_align(first, second) + + def test_multiple_alignments(self): + score, alignments = self._align('xabcabcy', 'abc') + assert len(alignments) == 1 + assert str(alignments[0].first) == 'a b c' + assert str(alignments[0].second) == 'a b c' + assert alignments[0].percentIdentity() == 3 / 3 * 100.0 + assert alignments[0].percentSimilarity() == 3 / 3 * 100.0 + assert alignments[0].percentGap() == 0.0 + assert score == DEFAULT_MATCH_SCORE * 3 + assert alignments[0].score == score + + def test_shortest_path_alignment(self): + # this tests that it doesn't pick longer paths on the way + # (e.g. goes up instead of diagonally) + score, alignments = self._align('aac', 'bac') + assert len(alignments) == 1 + assert str(alignments[0].first) == 'a a c' + assert str(alignments[0].second) == 'b a c' + assert alignments[0].percentIdentity() == 2 / 3 * 100.0 + assert alignments[0].percentSimilarity() == 2 / 3 * 100.0 + assert alignments[0].percentGap() == 0.0 + assert score == DEFAULT_MATCH_SCORE * 2 + DEFAULT_MISMATCH_SCORE + assert alignments[0].score == score + + +class TestStrictGlobalSequenceAligner(CommonSequenceAlignerTests): + def _align(self, first, second): + return _strict_global_align(first, second) + + def test_exact_left_partial_match(self): + score, alignments = self._align('xaby', 'ab') + assert len(alignments) == 1 + assert str(alignments[0].first) == 'x a b y' + assert str(alignments[0].second) == '- a b -' + assert alignments[0].score == score + assert alignments[0].percentIdentity() == 2 / 4 * 100.0 + assert alignments[0].percentSimilarity() == 2 / 4 * 100.0 + assert alignments[0].percentGap() == 2 / 4 * 100.0 + assert score == DEFAULT_MATCH_SCORE * 2 + DEFAULT_GAP_SCORE * 2 + + def test_exact_right_partial_match(self): + score, alignments = self._align('ab', 'xaby') + assert len(alignments) == 1 + assert str(alignments[0].first) == '- a b -' + assert str(alignments[0].second) == 'x a b y' + assert alignments[0].percentIdentity() == 2 / 4 * 100.0 + assert alignments[0].percentSimilarity() == 2 / 4 * 100.0 + assert alignments[0].percentGap() == 2 / 4 * 100.0 + assert score == DEFAULT_MATCH_SCORE * 2 + DEFAULT_GAP_SCORE * 2 + assert alignments[0].score == score + + def test_exact_left_partial_match_with_gap(self): + score, alignments = self._align('xaby', 'aob') + assert len(alignments) == 1 + assert str(alignments[0].first) == 'x a - b y' + assert str(alignments[0].second) == '- a o b -' + assert alignments[0].percentIdentity() == 2 / 5 * 100.0 + assert alignments[0].percentSimilarity() == 2 / 5 * 100.0 + assert alignments[0].percentGap() == 3 / 5 * 100.0 + assert score == DEFAULT_MATCH_SCORE * 2 + DEFAULT_GAP_SCORE * 3 + assert alignments[0].score == score + + def test_exact_left_partial_match_with_mismatch(self): + score, alignments = self._align('xamby', 'aob') + assert len(alignments) == 1 + assert str(alignments[0].first) == 'x a m b y' + assert str(alignments[0].second) == '- a o b -' + assert alignments[0].percentIdentity() == 2 / 5 * 100.0 + assert alignments[0].percentSimilarity() == 2 / 5 * 100.0 + assert alignments[0].percentGap() == 2 / 5 * 100.0 + assert score == DEFAULT_MATCH_SCORE * 2 + DEFAULT_MISMATCH_SCORE + DEFAULT_GAP_SCORE * 2 + assert alignments[0].score == score + + def test_multiple_alignments(self): + score, alignments = self._align('xabcabcy', 'abc') + assert len(alignments) == 1 + assert str(alignments[0].first) == 'x a b c a b c y' + assert str(alignments[0].second) == '- a b c - - - -' + assert alignments[0].percentIdentity() == 3 / 8 * 100.0 + assert alignments[0].percentSimilarity() == 3 / 8 * 100.0 + assert alignments[0].percentGap() == 5 / 8 * 100.0 + assert score == DEFAULT_MATCH_SCORE * 3 + DEFAULT_GAP_SCORE * 5 + assert alignments[0].score == score + + +class TestLocalSequenceAligner(CommonSequenceAlignerTests): + def _align(self, first, second): + return _local_align(first, second) + + def test_multiple_alignments(self): + score, alignments = self._align('xabcabcy', 'abc') + assert len(alignments) == 2 + assert str(alignments[0].first) == 'a b c' + assert str(alignments[0].second) == 'a b c' + assert alignments[0].percentIdentity() == 3 / 3 * 100.0 + assert alignments[0].percentSimilarity() == 3 / 3 * 100.0 + assert alignments[0].percentGap() == 0.0 + assert score == DEFAULT_MATCH_SCORE * 3 + assert alignments[0].score == score + + def test_multiple_gap_alignments(self): + score, alignments = self._align('abxc', 'axbc') + assert len(alignments) == 2 + assert ( + set([(str(a.first), str(a.second)) for a in alignments]) == + set([ + ('a b x - c', 'a - x b c'), + ('a - b x c', 'a x b - c') + ]) + ) + assert alignments[0].percentIdentity() == 3 / 5 * 100.0 + assert alignments[0].percentSimilarity() == 3 / 5 * 100.0 + assert alignments[0].percentGap() == 2 / 5 * 100.0 + assert score == DEFAULT_MATCH_SCORE * 3 + DEFAULT_GAP_SCORE * 2 + assert alignments[0].score == score + + def test_shortest_path_alignment(self): + # this tests that it doesn't pick longer paths on the way + # (e.g. goes up instead of diagonally) + score, alignments = self._align('aac', 'bac') + assert len(alignments) == 1 + assert str(alignments[0].first) == 'a c' + assert str(alignments[0].second) == 'a c' + assert alignments[0].percentIdentity() == 3 / 3 * 100.0 + assert alignments[0].percentSimilarity() == 3 / 3 * 100.0 + assert alignments[0].percentGap() == 0.0 + assert score == DEFAULT_MATCH_SCORE * 2 + assert alignments[0].score == score + +class TestSmithWatermannAligner(TestLocalSequenceAligner): + def _align(self, first, second, **kwargs): + scoring = SimpleScoring(DEFAULT_MATCH_SCORE, DEFAULT_MISMATCH_SCORE) + return _align( + first, second, aligner=SmithWatermanAligner(scoring, DEFAULT_GAP_SCORE), **kwargs) + + def test_limit_alignments(self): + score, alignments = self._align('abxc', 'axbc', limit=1) + assert len(alignments) == 1 + assert ( + (str(alignments[0].first), str(alignments[0].second)) in + set([ + ('a b x - c', 'a - x b c'), + ('a - b x c', 'a x b - c') + ]) + ) + assert alignments[0].percentIdentity() == 3 / 5 * 100.0 + assert alignments[0].percentSimilarity() == 3 / 5 * 100.0 + assert alignments[0].percentGap() == 2 / 5 * 100.0 + assert score == DEFAULT_MATCH_SCORE * 3 + DEFAULT_GAP_SCORE * 2 + assert alignments[0].score == score + +class TestSmithWatermannAlignerWithString(TestSmithWatermannAligner): + def _align(self, first, second, **kwargs): + scoring = SimpleScoring(DEFAULT_MATCH_SCORE, DEFAULT_MISMATCH_SCORE) + aligner = SmithWatermanAligner(scoring, DEFAULT_GAP_SCORE) + return aligner.align( + Sequence(first), + Sequence(second), + backtrace=True, + gap='-', + **kwargs + )