diff --git a/src/algorithms/chain_items.cpp b/src/algorithms/chain_items.cpp index c84e399ce1..8bd5a15d53 100644 --- a/src/algorithms/chain_items.cpp +++ b/src/algorithms/chain_items.cpp @@ -13,6 +13,7 @@ //#define debug_chaining //#define debug_transition +//#define debug_missing_transition //#define debug_dp namespace vg { @@ -262,6 +263,18 @@ transition_iterator zip_tree_transition_iterator(const std::vector filtered_transitions = calculate_transition_read_distances(all_transitions, to_chain, max_read_lookback_bases); @@ -385,6 +398,103 @@ std::vector generate_zip_tree_transitions( return all_transitions; } +bool find_missing_zip_tree_transitions( + const std::vector& seeds, + const ZipCodeTree& zip_code_tree, + size_t max_graph_lookback_bases, + const std::unordered_map& seed_to_starting, + const std::unordered_map& seed_to_ending, + const SnarlDistanceIndex& distance_index, + const std::vector& all_transitions) { + + // {source anchor : {dest anchor : dist}} + std::unordered_map> found; + for (const auto& transition : all_transitions) { + size_t dist_to_save = transition.graph_distance; + if (!found.count(transition.from_anchor)) { + found[transition.from_anchor] = std::unordered_map(); + } + if (found[transition.from_anchor].count(transition.to_anchor)) { + // If a transition appears multiple times, remember the min + dist_to_save = std::min(transition.graph_distance, + found[transition.from_anchor][transition.to_anchor]); + } + found[transition.from_anchor][transition.to_anchor] = transition.graph_distance; + } + + bool has_missing = false; + + // Helper function to check for a distance between two seeds + auto check_distance = [&] (const ZipCodeTree::oriented_seed_t& from_seed, bool rev_from, + const ZipCodeTree::oriented_seed_t& to_seed, bool rev_to) { + // XOR to get appropriate orientations + rev_from ^= from_seed.is_reversed; + rev_to ^= to_seed.is_reversed; + if (rev_from != rev_to) { + // Cannot be compared; incompatible orientations + return; + } + + // Look up appropriate anchors + auto from_anchor_itr = rev_from ? seed_to_starting.find(from_seed.seed) + : seed_to_ending.find(from_seed.seed); + if ((rev_from && from_anchor_itr == seed_to_starting.end()) + || (!rev_from && from_anchor_itr == seed_to_ending.end())) { + // No anchor exists + return; + } + auto to_anchor_itr = rev_to ? seed_to_ending.find(to_seed.seed) + : seed_to_starting.find(to_seed.seed); + if ((rev_to && to_anchor_itr == seed_to_ending.end()) + || (!rev_to && to_anchor_itr == seed_to_starting.end())) { + // No anchor exists + return; + } + + // Construct seed positions + pos_t from_pos = seeds.at(from_seed.seed).pos; + size_t from_length = distance_index.minimum_length(distance_index.get_node_net_handle(id(from_pos))); + from_pos = rev_from ? reverse(from_pos, from_length) + : from_pos; + pos_t to_pos = seeds.at(to_seed.seed).pos; + size_t to_length = distance_index.minimum_length(distance_index.get_node_net_handle(id(to_pos))); + to_pos = rev_to ? reverse(to_pos, to_length) + : to_pos; + + // Look up true minimum distance + size_t true_distance = minimum_nontrivial_distance(distance_index, from_pos, to_pos); + if (true_distance <= max_graph_lookback_bases) { + // We should've found this transition + auto from_anchor = from_anchor_itr->second; + auto to_anchor = to_anchor_itr->second; + if (!found.count(from_anchor) + || !found[from_anchor].count(to_anchor) + || found[from_anchor][to_anchor] != true_distance) { + has_missing = true; + cerr << "Missing transition " << from_pos << "->" + << to_pos << " dist " << true_distance << endl; + } + } + }; + + vector tree_seeds = zip_code_tree.get_all_seeds(); + for (size_t i = 0; i < tree_seeds.size(); i++) { + // Check self-loops + check_distance(tree_seeds[i], false, tree_seeds[i], false); + check_distance(tree_seeds[i], false, tree_seeds[i], true); + check_distance(tree_seeds[i], true, tree_seeds[i], false); + for (size_t j = i + 1; j < tree_seeds.size(); j++) { + // Check all orientation pairs + check_distance(tree_seeds[i], false, tree_seeds[j], false); + check_distance(tree_seeds[i], false, tree_seeds[j], true); + check_distance(tree_seeds[i], true, tree_seeds[j], false); + check_distance(tree_seeds[i], true, tree_seeds[j], true); + } + } + + return has_missing; +} + std::vector calculate_transition_read_distances( const std::vector& all_transitions, const VectorView& to_chain, diff --git a/src/algorithms/chain_items.hpp b/src/algorithms/chain_items.hpp index 9ac7b792e0..f4531d4201 100644 --- a/src/algorithms/chain_items.hpp +++ b/src/algorithms/chain_items.hpp @@ -477,6 +477,23 @@ std::vector generate_zip_tree_transitions( const std::unordered_map& seed_to_starting, const std::unordered_map& seed_to_ending); +/** + * Check if all possible transitions were actually found. + * + * Iterates over all pairs of seeds and uses the distance index + * to determine if there SHOULD have been a transition. + * + * Returns if any transitions were missing. + */ +bool find_missing_zip_tree_transitions( + const std::vector& seeds, + const ZipCodeTree& zip_code_tree, + size_t max_graph_lookback_bases, + const std::unordered_map& seed_to_starting, + const std::unordered_map& seed_to_ending, + const SnarlDistanceIndex& distance_index, + const std::vector& all_transitions); + /** * Calculate read distances for each of the zip tree's transitions. * Also filters out transitions that can't be used,