diff --git a/src/algorithms/chain_items.cpp b/src/algorithms/chain_items.cpp index c84e399ce1..76b0ddf21d 100644 --- a/src/algorithms/chain_items.cpp +++ b/src/algorithms/chain_items.cpp @@ -547,6 +547,8 @@ TracedScore chain_items_dp(vector& chain_scores, cerr << "Chaining group of " << to_chain.size() << " items" << endl; } + crash_unless(recomb_penalty >= 0); + // Compute a base seed average length. // TODO: Weight anchors differently? // TODO: Will this always be the same for all anchors in practice? @@ -557,6 +559,20 @@ TracedScore chain_items_dp(vector& chain_scores, base_seed_length /= to_chain.size(); chain_scores.resize(to_chain.size()); + + // We want to prefer to come from seeds where the transition preserves + // access to matching haplotypes, because we don't want to back ourselves + // into a corner where we need a recombination when we don't really have + // to. So we cheat on the dynamic programming by adding an "evaluation + // bonus" to the scores of the different DP options when comparing them. We + // keep this bonus out of the actual recorded scores because we don't want + // it raising the scores we actually get the more transitions we take. + // + // We store the bonus used to select the current winning predecessor for + // each seed in this vector, which runs alongside the DP table. + // + // Starting from nowhere means full path conservation, so bonus = recomb_penalty. + std::vector eval_bonuses(to_chain.size(), recomb_penalty); for (size_t i = 0; i < to_chain.size(); i++) { // Set up DP table so we can start anywhere with that item's score, scaled and with bonus applied. chain_scores[i] = {(int)(to_chain[i].score() * item_scale + item_bonus), TracedScore::nowhere(), to_chain[i].anchor_end_paths()}; @@ -586,8 +602,20 @@ TracedScore chain_items_dp(vector& chain_scores, } // If we come from nowhere, we get those points. - chain_scores[transition.to_anchor] = std::max(chain_scores[transition.to_anchor], - {(int)item_points, TracedScore::nowhere(), here.anchor_end_paths()}); + // This also has full path conservation (bonus = recomb_penalty). + { + TracedScore from_nowhere = {(int)item_points, TracedScore::nowhere(), here.anchor_end_paths()}; + int nowhere_bonus = recomb_penalty; + int eval_nowhere = from_nowhere.score + nowhere_bonus; + int eval_current = chain_scores[transition.to_anchor].score + eval_bonuses[transition.to_anchor]; + if (eval_nowhere > eval_current) { + chain_scores[transition.to_anchor] = from_nowhere; + eval_bonuses[transition.to_anchor] = nowhere_bonus; + } else if (eval_nowhere == eval_current && from_nowhere > chain_scores[transition.to_anchor]) { + chain_scores[transition.to_anchor] = from_nowhere; + eval_bonuses[transition.to_anchor] = nowhere_bonus; + } + } // For each source we could come from auto& source = to_chain[transition.from_anchor]; @@ -664,8 +692,34 @@ TracedScore chain_items_dp(vector& chain_scores, TracedScore from_source_score = source_score.add_points(jump_points + item_points) .set_shared_paths(here.anchor_paths()); - // Remember that we could make this jump - chain_scores[transition.to_anchor] = std::max(chain_scores[transition.to_anchor], from_source_score); + // Evaluate heuristic to preserve path flexibility without inflating actual scoring DP. + // Bonus = fraction of conserved paths * recomb_penalty. + // Bonus is 0 when recombination occurs (no shared paths). + int eval_bonus_from = 0; + if (recomb_penalty > 0) { + int pre_count = __builtin_popcountll(source_score.paths); + if (pre_count > 0 && (source_score.paths & here.anchor_start_paths()) != 0) { + // No recombination: bonus = fraction of paths conserved * penalty + int post_count = __builtin_popcountll(from_source_score.paths); + eval_bonus_from = (recomb_penalty * post_count) / pre_count; + } + // Recombination case (no shared paths): bonus stays 0 + } + + // Grab the DP table slot we are updating + auto& current_best = chain_scores[transition.to_anchor]; + // Compute the evaluation value for the new candidate + int eval_from = from_source_score.score + eval_bonus_from; + // Reconstruct the evaluation value for the current winner + int eval_best = current_best.score + eval_bonuses[transition.to_anchor]; + + if (eval_from > eval_best || (eval_from == eval_best && from_source_score > current_best)) { + // Using the evaluation values, and then if tied the real DP + // scores, this new candidate beats the previous winner, so + // replace it. + current_best = from_source_score; + eval_bonuses[transition.to_anchor] = eval_bonus_from; + } if (show_work) { #ifdef debug_dp diff --git a/src/alignment.cpp b/src/alignment.cpp index 567bfb016a..faf669ea69 100644 --- a/src/alignment.cpp +++ b/src/alignment.cpp @@ -3507,6 +3507,65 @@ pair aligned_interval(const Alignment& aln) { return pair(softclip_start(aln), aln.sequence().size() - softclip_end(aln)); } +void count_alignment_operations(const Alignment& aln, size_t& matches, size_t& mismatches, std::vector& gap_lengths) { + matches = 0; + mismatches = 0; + gap_lengths.clear(); + + enum class EditType { MATCH, MISMATCH, INS, DEL, COMPLEX, NONE }; + EditType prev_type = EditType::NONE; + size_t current_gap_length = 0; + + auto finish_gap = [&]() { + if (current_gap_length > 0) { + gap_lengths.push_back(current_gap_length); + current_gap_length = 0; + } + }; + + for (size_t i = 0; i < aln.path().mapping_size(); ++i) { + auto& mapping = aln.path().mapping(i); + for (size_t j = 0; j < mapping.edit_size(); ++j) { + auto& edit = mapping.edit(j); + if (edit.from_length() == edit.to_length() && edit.from_length() > 0) { + finish_gap(); + if (edit.sequence().empty()) { + matches += edit.from_length(); + prev_type = EditType::MATCH; + } else { + mismatches += edit.from_length(); + prev_type = EditType::MISMATCH; + } + } else if (edit.from_length() == 0 && edit.to_length() > 0) { + if (prev_type != EditType::INS) finish_gap(); + current_gap_length += edit.to_length(); + prev_type = EditType::INS; + } else if (edit.from_length() > 0 && edit.to_length() == 0) { + if (prev_type != EditType::DEL) finish_gap(); + current_gap_length += edit.from_length(); + prev_type = EditType::DEL; + } else { + finish_gap(); + mismatches += max(edit.from_length(), edit.to_length()); + prev_type = EditType::COMPLEX; + } + } + } + finish_gap(); +} + +int score_alignment_with_logged_gaps(const size_t& matches, const size_t& mismatches, const std::vector& gap_lengths) { + double d = max(0.02, static_cast(mismatches + gap_lengths.size()) / static_cast(matches + mismatches + gap_lengths.size())); + double non_match_penalty = static_cast(mismatches + gap_lengths.size()) / (2.0 * d); + + double indel_penalty = 0; + for (auto& gap_length : gap_lengths) { + indel_penalty += log2(1.0 + gap_length); + } + int adjusted_score = std::round(matches - non_match_penalty - indel_penalty); + return adjusted_score; +} + string mate_info(const string& path, int32_t pos, bool rev_strand, bool is_read1) { subrange_t subrange; string path_name = Paths::strip_subrange(path, &subrange); diff --git a/src/alignment.hpp b/src/alignment.hpp index cea9bc524d..1dbfc9870f 100644 --- a/src/alignment.hpp +++ b/src/alignment.hpp @@ -330,6 +330,15 @@ bool is_supplementary(const Alignment& alignment); // The indexes on the read sequence of the portion of the read that is aligned outside of soft clips pair aligned_interval(const Alignment& aln); +/// Count the various types of edits in an Alignment, including individual gap lengths. +void count_alignment_operations(const Alignment& aln, size_t& matches, size_t& mismatches, std::vector& gaps_lengths); +/// Compute an alignment score using minimap2 long indels penalty adjustment. +/// +/// This scoring method penalize long continous indels less, using the formula: +/// score = matches - (mismatches + gap_opens)/2d - sum_{i=1}^{gap_opens} (log_2(1 + gap_length_i)) +/// with d = max{0.02, (mismatches + gap_opens)/(matches + mismatches + gap_opens)} +int score_alignment_with_logged_gaps(const size_t& matches, const size_t& mismatches, const std::vector& gap_lengths); + // create an annotation string required to properly set the SAM fields/flags of a supplementary alignment // the arguments all refer to properties of the primary *mate* alignment // the path name saved in the info is the base path name, with any subrange info reflected in the position diff --git a/src/minimizer_mapper.hpp b/src/minimizer_mapper.hpp index 693f19cb8b..2291827383 100644 --- a/src/minimizer_mapper.hpp +++ b/src/minimizer_mapper.hpp @@ -871,7 +871,8 @@ class MinimizerMapper : public AlignerClient { */ void do_chaining_on_trees(Alignment& aln, const ZipCodeForest& zip_code_forest, const std::vector& seeds, const VectorView& minimizers, const vector& seed_anchors, - std::vector>& chains, std::vector>& chain_rec_flags, std::vector& chain_source_tree, + std::vector>& chains, std::vector>& chain_rec_flags, + std::vector& chain_rec_counts, std::vector& chain_source_tree, std::vector& chain_score_estimates, std::vector>& minimizer_kept_chain_count, std::vector& multiplicity_by_chain, std::vector& alignments, SmallBitset& minimizer_explored, vector& multiplicity_by_alignment, diff --git a/src/minimizer_mapper_from_chains.cpp b/src/minimizer_mapper_from_chains.cpp index 4ba5d3b909..cb12c8b11a 100644 --- a/src/minimizer_mapper_from_chains.cpp +++ b/src/minimizer_mapper_from_chains.cpp @@ -32,6 +32,7 @@ #include #include #include +#include // Turn on debugging prints //#define debug @@ -774,6 +775,8 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { std::vector> chains; // For each chain, mark per-seed whether it came from a recombinant anchor std::vector> chain_rec_flags; + // For each chain, track how many recombination events were used + std::vector chain_rec_counts; // The zip code tree it came from std::vector chain_source_tree; // An estimated alignment score @@ -784,7 +787,7 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { std::vector multiplicity_by_chain; do_chaining_on_trees(aln, zip_code_forest, seeds, minimizers, seed_anchors, - chains, chain_rec_flags, chain_source_tree, chain_score_estimates, + chains, chain_rec_flags, chain_rec_counts, chain_source_tree, chain_score_estimates, minimizer_kept_chain_count, multiplicity_by_chain, alignments, minimizer_explored, multiplicity_by_alignment, rng, funnel); @@ -844,6 +847,49 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { alignments_to_source, minimizer_explored, stats, funnel_depleted, rng, funnel); } + for (size_t alignment_index = 0; alignment_index < alignments.size(); ++alignment_index) { + // Rescore all the alignments using minimap2 logged-gap-length, read-identity-based scoring + + if (alignments[alignment_index].path().mapping_size() == 0) { + // Unmapped, so skip it. + continue; + } + + size_t matches, mismatches; + std::vector gap_lengths; + count_alignment_operations(alignments[alignment_index], matches, mismatches, gap_lengths); + + if (matches + mismatches + gap_lengths.size() == 0) { + continue; + } + + // Compute the logged-gaps score + auto logged_gaps_score = score_alignment_with_logged_gaps(matches, mismatches, gap_lengths); + alignments[alignment_index].set_score(logged_gaps_score); + if (show_work) { + #pragma omp critical (cerr) + { + cerr << log_name() << "Matches: " << matches << " Mismatches: " << mismatches << " Gap opens: " << gap_lengths.size() << " New score: " << logged_gaps_score << endl; + } + } + } + if (!chain_rec_counts.empty() && !alignments_to_source.empty()) { + for (size_t alignment_index = 0; alignment_index < alignments_to_source.size(); ++alignment_index) { + size_t chain_index = alignments_to_source[alignment_index]; + if (chain_index != std::numeric_limits::max() && chain_index < chain_rec_counts.size()) { + set_annotation(alignments[alignment_index], "chain.rec_count", (double) chain_rec_counts[chain_index]); + if (rec_penalty_chain != 0) { + // Penalize the score of alignment candidates according to the number of recombinations their chains required. + // This allows alignments that required fewer recombinations in their chains to win. + // TODO: We'd also eventaully like to count recombinations that we don't know are needed until base-level DP. + int64_t penalty = static_cast(rec_penalty_chain) * static_cast(chain_rec_counts[chain_index]); + int64_t penalized_score = static_cast(alignments[alignment_index].score()) - penalty; + alignments[alignment_index].set_score(static_cast(penalized_score)); + } + } + } + } + if (track_provenance) { // Now say we are finding the winner(s) @@ -1096,7 +1142,8 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& zip_code_forest, const std::vector& seeds, const VectorView& minimizers, const vector& seed_anchors, - std::vector>& chains, std::vector>& chain_rec_flags, std::vector& chain_source_tree, + std::vector>& chains, std::vector>& chain_rec_flags, + std::vector& chain_rec_counts, std::vector& chain_source_tree, std::vector& chain_score_estimates, std::vector>& minimizer_kept_chain_count, std::vector& multiplicity_by_chain, std::vector& alignments, SmallBitset& minimizer_explored, vector& multiplicity_by_alignment, @@ -1544,9 +1591,13 @@ void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& indel_limit, show_work ); +#ifdef debug_rec + if (true) { +#else if (show_work) { +#endif #pragma omp critical (cerr) - cerr << log_name() << "Found " << chain_results.chains.size() << " chains in zip code tree " << item_num + cerr << log_name() << "\t[" << aln.name() << "] Found " << chain_results.chains.size() << " chains in zip code tree " << item_num << " running " << anchors_to_chain[anchor_indexes.front()] << " to " << anchors_to_chain[anchor_indexes.back()] << std::endl; } @@ -1556,7 +1607,12 @@ void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& auto& entry = chain_results.chains[result]; auto& scored_chain = entry.scored_chain; auto& chain_rec_positions = entry.rec_positions; - if (show_work) { +#ifdef debug_rec + if (true) +#else + if (show_work) +#endif + { #ifdef debug if(true) #else @@ -1566,24 +1622,42 @@ void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& if (!scored_chain.second.empty()) { #pragma omp critical (cerr) { - cerr << log_name() << "\tChain with score " << scored_chain.first + cerr << log_name() << "\t[" << aln.name() << "] Chain " << result << " with score " << scored_chain.first << " (rec num =" << chain_rec_positions.size() << ") and length " << scored_chain.second.size() << " running " << anchor_view[scored_chain.second.front()] - << " to " << anchor_view[scored_chain.second.back()] << std::endl; + << " to " << anchor_view[scored_chain.second.back()]; if (!chain_rec_positions.empty()) { - { - cerr << log_name() << "\t\tRecombination introduced at anchors: "; - for (size_t pi = 0; pi < chain_rec_positions.size(); ++pi) { - if (pi) cerr << ", "; - cerr << chain_rec_positions[pi]; - } - cerr << std::endl; + cerr << " recombination introduced at anchors: "; + for (size_t pi = 0; pi < chain_rec_positions.size(); ++pi) { + if (pi) cerr << ", "; + cerr << chain_rec_positions[pi]; } } -#ifdef debug - - for (auto& anchor_number : scored_chain.second) { - std::cerr << log_name() << "\t\t" << anchor_view[anchor_number] << std::endl; + cerr << std::endl; +#ifdef debug_rec + algorithms::path_flags_t current_paths = 0; + bool first = true; + for (auto& selected_number : scored_chain.second) { + auto& anchor = anchor_view[selected_number]; + auto new_paths = anchor.anchor_paths(); + if (first) { + current_paths = new_paths.second; + first = false; + } else { + if (new_paths.first == new_paths.second) { + if ((current_paths & new_paths.first) == 0) { + current_paths = new_paths.first; + } else { + current_paths &= new_paths.first; + } + } else { + current_paths = new_paths.second; + } + } + + std::cerr << log_name() << "\t\t" << anchor + << " anchor_paths: " << std::bitset<64>(new_paths.first).count() << " " << std::bitset<64>(new_paths.first) + << " chain_paths: " << std::bitset<64>(current_paths).count() << " " << std::bitset<64>(current_paths) << std::endl; } #endif @@ -1591,7 +1665,7 @@ void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& } } else if (result == MANY_LIMIT) { #pragma omp critical (cerr) - std::cerr << log_name() << "\t<" << (chain_results.chains.size() - result) << " more chains>" << std::endl; + std::cerr << log_name() << "\t[" << aln.name() << "] <" << (chain_results.chains.size() - result) << " more chains>" << std::endl; } } @@ -1627,6 +1701,8 @@ void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& } // Remember the score chain_score_estimates.push_back(scored_chain.first); + // Remember how many recombinations were in this chain + chain_rec_counts.push_back(chain_rec_positions.size()); // Remember how we got it chain_source_tree.push_back(item_num); @@ -1773,7 +1849,7 @@ void MinimizerMapper::get_best_chain_stats(Alignment& aln, const ZipCodeForest& best_chain_longest_jump = std::max(best_chain_longest_jump, jump); best_chain_total_jump += jump; } - best_chain_average_jump = chains.at(best_chain).size() > 1 ? best_chain_total_jump / (chains.at(best_chain).size() - 1) : 0.0; + best_chain_average_jump = chains.at(best_chain).size() > 1 ? (double)best_chain_total_jump / (chains.at(best_chain).size() - 1) : 0.0; } // Also count anchors in the chain diff --git a/test/t/50_vg_giraffe.t b/test/t/50_vg_giraffe.t index b1136dceef..5ec9a33612 100644 --- a/test/t/50_vg_giraffe.t +++ b/test/t/50_vg_giraffe.t @@ -124,6 +124,8 @@ is "$(grep -c 'error.*are not compatible' log.txt)" "1" "appropriate error messa rm t1.bam t2.bam t3.bam t1.gaf tagged1.fq tagged2.fq rm -f read.fq read.gam +rm -rf explanation_* + vg giraffe -Z x.giraffe.gbz -f reads/small.middle.ref.indel.multi.fq --show-work --track-position -b chaining-sr > /dev/null 2>&1 # Check that at least some TSV files and directories were created is "$(find explanation_read1 -name 'chain*-dotplot*.tsv' 2>/dev/null | wc -l | tr -d ' ')" "1" "Chain explanation files are created per chain" @@ -297,8 +299,8 @@ vg index -j 1mb1kgp.dist 1mb1kgp.vg vg autoindex -p 1mb1kgp -w giraffe -P "VG w/ Variant Paths:1mb1kgp.vg" -P "Giraffe Distance Index:1mb1kgp.dist" -r 1mb1kgp/z.fa -v 1mb1kgp/z.vcf.gz vg giraffe -Z 1mb1kgp.giraffe.gbz -f reads/1mb1kgp_longread.fq >longread.gam -U 300 --track-provenance --align-from-chains --set-refpos # This is an 8001 bp read with 1 insert and 1 substitution -# 7999 * 1 + 1 * -4 + -6 + 5 + 5 = 7999 -is "$(vg view -aj longread.gam | jq -r '.score')" "7999" "A long read can be correctly aligned" +# We use minimap2-based scoring which awards that this many points. +is "$(vg view -aj longread.gam | jq -r '.score')" "7948" "A long read can be correctly aligned" is "$(vg view -aj longread.gam | jq -c '.path.mapping[].edit[] | select(.sequence)' | wc -l | sed 's/^[[:space:]]*//')" "2" "A long read has the correct edits found" is "$(vg view -aj longread.gam | jq -c '. | select(.annotation["filter_3_cluster-coverage_cluster_passed_size_total"] <= 300)' | wc -l | sed 's/^[[:space:]]*//')" "1" "Long read minimizer set is correctly restricted" is "$(vg view -aj longread.gam | jq -c '.refpos[]' | wc -l)" "$(vg view -aj longread.gam | jq -c '.path.mapping[]' | wc -l)" "Giraffe sets refpos for each reference node"