From 82678d47a1961baa7626763b9cd9068b534fc003 Mon Sep 17 00:00:00 2001 From: dmonti Date: Mon, 16 Mar 2026 02:58:32 -0700 Subject: [PATCH 01/11] added recombination info in output gam and log --- src/minimizer_mapper.hpp | 3 ++- src/minimizer_mapper_from_chains.cpp | 36 ++++++++++++++++++---------- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/src/minimizer_mapper.hpp b/src/minimizer_mapper.hpp index 41a376069e..2bf608be9d 100644 --- a/src/minimizer_mapper.hpp +++ b/src/minimizer_mapper.hpp @@ -867,7 +867,8 @@ class MinimizerMapper : public AlignerClient { */ void do_chaining_on_trees(Alignment& aln, const ZipCodeForest& zip_code_forest, const std::vector& seeds, const VectorView& minimizers, const vector& seed_anchors, - std::vector>& chains, std::vector>& chain_rec_flags, std::vector& chain_source_tree, + std::vector>& chains, std::vector>& chain_rec_flags, + std::vector& chain_rec_counts, std::vector& chain_source_tree, std::vector& chain_score_estimates, std::vector>& minimizer_kept_chain_count, std::vector& multiplicity_by_chain, std::vector& alignments, SmallBitset& minimizer_explored, vector& multiplicity_by_alignment, diff --git a/src/minimizer_mapper_from_chains.cpp b/src/minimizer_mapper_from_chains.cpp index 26c86dce02..2cd0a4bb34 100644 --- a/src/minimizer_mapper_from_chains.cpp +++ b/src/minimizer_mapper_from_chains.cpp @@ -756,6 +756,8 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { std::vector> chains; // For each chain, mark per-seed whether it came from a recombinant anchor std::vector> chain_rec_flags; + // For each chain, track how many recombination events were used + std::vector chain_rec_counts; // The zip code tree it came from std::vector chain_source_tree; // An estimated alignment score @@ -766,7 +768,7 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { std::vector multiplicity_by_chain; do_chaining_on_trees(aln, zip_code_forest, seeds, minimizers, seed_anchors, - chains, chain_rec_flags, chain_source_tree, chain_score_estimates, + chains, chain_rec_flags, chain_rec_counts, chain_source_tree, chain_score_estimates, minimizer_kept_chain_count, multiplicity_by_chain, alignments, minimizer_explored, multiplicity_by_alignment, rng, funnel); @@ -825,6 +827,15 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { minimizer_kept_chain_count, alignments, multiplicity_by_alignment, alignments_to_source, minimizer_explored, stats, funnel_depleted, rng, funnel); } + + if (!chain_rec_counts.empty() && !alignments_to_source.empty()) { + for (size_t alignment_index = 0; alignment_index < alignments_to_source.size(); ++alignment_index) { + size_t chain_index = alignments_to_source[alignment_index]; + if (chain_index != std::numeric_limits::max() && chain_index < chain_rec_counts.size()) { + set_annotation(alignments[alignment_index], "chain.rec_count", (double) chain_rec_counts[chain_index]); + } + } + } if (track_provenance) { @@ -1078,7 +1089,8 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& zip_code_forest, const std::vector& seeds, const VectorView& minimizers, const vector& seed_anchors, - std::vector>& chains, std::vector>& chain_rec_flags, std::vector& chain_source_tree, + std::vector>& chains, std::vector>& chain_rec_flags, + std::vector& chain_rec_counts, std::vector& chain_source_tree, std::vector& chain_score_estimates, std::vector>& minimizer_kept_chain_count, std::vector& multiplicity_by_chain, std::vector& alignments, SmallBitset& minimizer_explored, vector& multiplicity_by_alignment, @@ -1548,20 +1560,18 @@ void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& if (!scored_chain.second.empty()) { #pragma omp critical (cerr) { - cerr << log_name() << "\tChain with score " << scored_chain.first + cerr << log_name() << "\t[" << aln.name() << "] Chain " << result << " with score " << scored_chain.first << " (rec num =" << chain_rec_positions.size() << ") and length " << scored_chain.second.size() << " running " << anchor_view[scored_chain.second.front()] - << " to " << anchor_view[scored_chain.second.back()] << std::endl; + << " to " << anchor_view[scored_chain.second.back()]; if (!chain_rec_positions.empty()) { - { - cerr << log_name() << "\t\tRecombination introduced at anchors: "; - for (size_t pi = 0; pi < chain_rec_positions.size(); ++pi) { - if (pi) cerr << ", "; - cerr << chain_rec_positions[pi]; - } - cerr << std::endl; + cerr << " recombination introduced at anchors: "; + for (size_t pi = 0; pi < chain_rec_positions.size(); ++pi) { + if (pi) cerr << ", "; + cerr << chain_rec_positions[pi]; } } + cerr << std::endl; #ifdef debug for (auto& anchor_number : scored_chain.second) { @@ -1573,7 +1583,7 @@ void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& } } else if (result == MANY_LIMIT) { #pragma omp critical (cerr) - std::cerr << log_name() << "\t<" << (chain_results.chains.size() - result) << " more chains>" << std::endl; + std::cerr << log_name() << "\t[" << aln.name() << "] <" << (chain_results.chains.size() - result) << " more chains>" << std::endl; } } @@ -1609,6 +1619,8 @@ void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& } // Remember the score chain_score_estimates.push_back(scored_chain.first); + // Remember how many recombinations were in this chain + chain_rec_counts.push_back(chain_rec_positions.size()); // Remember how we got it chain_source_tree.push_back(item_num); From 39cc3a538d42f8b44de94935bea90032c765c3e0 Mon Sep 17 00:00:00 2001 From: dmonti Date: Tue, 17 Mar 2026 08:28:58 -0700 Subject: [PATCH 02/11] minimap2 like rescore, progressive recombination penalty --- src/algorithms/chain_items.cpp | 50 +++++++++++++-- src/minimizer_mapper_from_chains.cpp | 91 +++++++++++++++++++++++++++- 2 files changed, 136 insertions(+), 5 deletions(-) diff --git a/src/algorithms/chain_items.cpp b/src/algorithms/chain_items.cpp index 70b27ffab6..105a96cc37 100644 --- a/src/algorithms/chain_items.cpp +++ b/src/algorithms/chain_items.cpp @@ -556,6 +556,10 @@ TracedScore chain_items_dp(vector& chain_scores, base_seed_length /= to_chain.size(); chain_scores.resize(to_chain.size()); + // Track eval bonus for heuristic comparison (path conservation bonus, used for + // selection only without affecting the actual stored score). + // Starting from nowhere means full path conservation, so bonus = recomb_penalty. + std::vector eval_bonuses(to_chain.size(), std::max(0, recomb_penalty)); for (size_t i = 0; i < to_chain.size(); i++) { // Set up DP table so we can start anywhere with that item's score, scaled and with bonus applied. chain_scores[i] = {(int)(to_chain[i].score() * item_scale + item_bonus), TracedScore::nowhere(), to_chain[i].anchor_end_paths()}; @@ -583,8 +587,20 @@ TracedScore chain_items_dp(vector& chain_scores, } // If we come from nowhere, we get those points. - chain_scores[transition.to_anchor] = std::max(chain_scores[transition.to_anchor], - {(int)item_points, TracedScore::nowhere(), here.anchor_end_paths()}); + // This also has full path conservation (bonus = recomb_penalty). + { + TracedScore from_nowhere = {(int)item_points, TracedScore::nowhere(), here.anchor_end_paths()}; + int nowhere_bonus = std::max(0, recomb_penalty); + int eval_nowhere = from_nowhere.score + nowhere_bonus; + int eval_current = chain_scores[transition.to_anchor].score + eval_bonuses[transition.to_anchor]; + if (eval_nowhere > eval_current) { + chain_scores[transition.to_anchor] = from_nowhere; + eval_bonuses[transition.to_anchor] = nowhere_bonus; + } else if (eval_nowhere == eval_current && from_nowhere > chain_scores[transition.to_anchor]) { + chain_scores[transition.to_anchor] = from_nowhere; + eval_bonuses[transition.to_anchor] = nowhere_bonus; + } + } // For each source we could come from auto& source = to_chain[transition.from_anchor]; @@ -657,8 +673,34 @@ TracedScore chain_items_dp(vector& chain_scores, TracedScore from_source_score = source_score.add_points(jump_points + item_points) .set_shared_paths(here.anchor_paths()); - // Remember that we could make this jump - chain_scores[transition.to_anchor] = std::max(chain_scores[transition.to_anchor], from_source_score); + // Evaluate heuristic to preserve path flexibility without inflating actual scoring DP. + // Bonus = fraction of conserved paths * recomb_penalty. + // Bonus is 0 when recombination occurs (no shared paths). + int eval_bonus_from = 0; + if (recomb_penalty > 0) { + int pre_count = __builtin_popcountll(source_score.paths); + if (pre_count > 0 && (source_score.paths & here.anchor_start_paths()) != 0) { + // No recombination: bonus = fraction of paths conserved * penalty + int post_count = __builtin_popcountll(from_source_score.paths); + eval_bonus_from = (recomb_penalty * post_count) / pre_count; + } + // Recombination case (no shared paths): bonus stays 0 + } + + auto& current_best = chain_scores[transition.to_anchor]; + int eval_from = from_source_score.score + eval_bonus_from; + int eval_best = current_best.score + eval_bonuses[transition.to_anchor]; + + if (eval_from > eval_best) { + current_best = from_source_score; + eval_bonuses[transition.to_anchor] = eval_bonus_from; + } else if (eval_from == eval_best) { + // Tie-breaker using actual underlying standard formulations + if (from_source_score > current_best) { + current_best = from_source_score; + eval_bonuses[transition.to_anchor] = eval_bonus_from; + } + } if (show_work) { cerr << "\t\tWe can reach #" << transition.to_anchor << " with " << source_score << " + " << jump_points diff --git a/src/minimizer_mapper_from_chains.cpp b/src/minimizer_mapper_from_chains.cpp index 2cd0a4bb34..6bd5fda2ac 100644 --- a/src/minimizer_mapper_from_chains.cpp +++ b/src/minimizer_mapper_from_chains.cpp @@ -827,12 +827,101 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { minimizer_kept_chain_count, alignments, multiplicity_by_alignment, alignments_to_source, minimizer_explored, stats, funnel_depleted, rng, funnel); } - + // Implement the logic for minimap2 long indels penalty adjustment. + // The new alignment score penalize long continous indels less, using the formula: + // score = matches - (mismatches + gap_opens)/2d - sum_{i=1}^{gap_opens} (log_2(1 + gap_length_i)) + // with d = max{0.02, (mismatches + gap_opens)/(matches + mismatches + gap_opens)} + for (size_t alignment_index = 0; alignment_index < alignments.size(); ++alignment_index) { + if (alignments[alignment_index].path().mapping_size() == 0) { + // Unmapped, so skip it. + continue; + } + int matches = 0; + int mismatches = 0; + int gap_opens = 0; + vector gap_lengths; + for (size_t i = 0; i < alignments[alignment_index].path().mapping_size(); ++i) { + auto& mapping = alignments[alignment_index].path().mapping(i); + for (size_t j = 0; j < mapping.edit_size(); ++j) { + auto& edit = mapping.edit(j); + if (edit.from_length() == edit.to_length() && edit.from_length() > 0) { + if (edit.sequence().empty()) { + matches += edit.from_length(); + } else { + mismatches += edit.from_length(); + } + } else if (edit.from_length() == 0 && edit.to_length() > 0) { + gap_opens++; + gap_lengths.push_back(edit.to_length()); + } else if (edit.from_length() > 0 && edit.to_length() == 0) { + gap_opens++; + gap_lengths.push_back(edit.from_length()); + } else { + mismatches += max(edit.from_length(), edit.to_length()); + } + } + } + + double d = max(0.02, static_cast(mismatches + gap_opens) / static_cast(matches + mismatches + gap_opens)); + double non_match_penalty = static_cast(mismatches + gap_opens) / (2.0 * d); + + double indel_penalty = 0; + for (auto& gap_length : gap_lengths) { + indel_penalty += log2(1 + gap_length); + } + int adjusted_score = static_cast(matches - non_match_penalty - indel_penalty); + alignments[alignment_index].set_score(adjusted_score); + if (show_work) { + #pragma omp critical (cerr) + { + // cerr << log_name() << "Alignment " << alignment_index << " CIGAR:"; + // // print whole cigar compressed with separated matches/mismatches + // vector> cigar_ops; + // auto add_op = [&](int len, char op) { + // if (len == 0) return; + // if (!cigar_ops.empty() && cigar_ops.back().second == op) { + // cigar_ops.back().first += len; + // } else { + // cigar_ops.push_back({len, op}); + // } + // }; + + // for (size_t i = 0; i < alignments[alignment_index].path().mapping_size(); ++i) { + // auto& mapping = alignments[alignment_index].path().mapping(i); + // for (size_t j = 0; j < mapping.edit_size(); ++j) { + // auto& edit = mapping.edit(j); + // if (edit.from_length() == edit.to_length() && edit.from_length() > 0) { + // if (edit.sequence().empty()) { + // add_op(edit.from_length(), '='); + // } else { + // add_op(edit.from_length(), 'X'); + // } + // } else if (edit.from_length() == 0 && edit.to_length() > 0) { + // add_op(edit.to_length(), 'I'); + // } else if (edit.from_length() > 0 && edit.to_length() == 0) { + // add_op(edit.from_length(), 'D'); + // } else { + // add_op(max(edit.from_length(), edit.to_length()), 'X'); + // } + // } + // } + // for (auto& op : cigar_ops) { + // cerr << " " << op.first << op.second; + // } + cerr << log_name() << "Matches: " << matches << " Mismatches: " << mismatches << " Gap opens: " << gap_opens << " New score: " << adjusted_score << endl; + } + } + } if (!chain_rec_counts.empty() && !alignments_to_source.empty()) { for (size_t alignment_index = 0; alignment_index < alignments_to_source.size(); ++alignment_index) { size_t chain_index = alignments_to_source[alignment_index]; if (chain_index != std::numeric_limits::max() && chain_index < chain_rec_counts.size()) { set_annotation(alignments[alignment_index], "chain.rec_count", (double) chain_rec_counts[chain_index]); + if (rec_penalty_chain != 0) { + int64_t penalty = static_cast(rec_penalty_chain) * static_cast(chain_rec_counts[chain_index]); + int64_t adjusted_score = static_cast(alignments[alignment_index].score()) - penalty; + alignments[alignment_index].set_score(static_cast(adjusted_score)); + } } } } From 804df61b86c2e14a86f17447016d763e1757148d Mon Sep 17 00:00:00 2001 From: dmonti Date: Mon, 23 Mar 2026 07:30:52 -0700 Subject: [PATCH 03/11] bugfixes for rescoring --- src/minimizer_mapper_from_chains.cpp | 74 +++++++++++++--------------- 1 file changed, 33 insertions(+), 41 deletions(-) diff --git a/src/minimizer_mapper_from_chains.cpp b/src/minimizer_mapper_from_chains.cpp index 6bd5fda2ac..d20c49e09c 100644 --- a/src/minimizer_mapper_from_chains.cpp +++ b/src/minimizer_mapper_from_chains.cpp @@ -840,74 +840,65 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { int mismatches = 0; int gap_opens = 0; vector gap_lengths; + + enum class EditType { MATCH, MISMATCH, INS, DEL, COMPLEX, NONE }; + EditType prev_type = EditType::NONE; + size_t current_gap_length = 0; + + auto finish_gap = [&]() { + if (current_gap_length > 0) { + gap_opens++; + gap_lengths.push_back(current_gap_length); + current_gap_length = 0; + } + }; + for (size_t i = 0; i < alignments[alignment_index].path().mapping_size(); ++i) { auto& mapping = alignments[alignment_index].path().mapping(i); for (size_t j = 0; j < mapping.edit_size(); ++j) { auto& edit = mapping.edit(j); if (edit.from_length() == edit.to_length() && edit.from_length() > 0) { + finish_gap(); if (edit.sequence().empty()) { matches += edit.from_length(); + prev_type = EditType::MATCH; } else { mismatches += edit.from_length(); + prev_type = EditType::MISMATCH; } } else if (edit.from_length() == 0 && edit.to_length() > 0) { - gap_opens++; - gap_lengths.push_back(edit.to_length()); + if (prev_type != EditType::INS) finish_gap(); + current_gap_length += edit.to_length(); + prev_type = EditType::INS; } else if (edit.from_length() > 0 && edit.to_length() == 0) { - gap_opens++; - gap_lengths.push_back(edit.from_length()); + if (prev_type != EditType::DEL) finish_gap(); + current_gap_length += edit.from_length(); + prev_type = EditType::DEL; } else { + finish_gap(); mismatches += max(edit.from_length(), edit.to_length()); + prev_type = EditType::COMPLEX; } } } + finish_gap(); + if (matches + mismatches + gap_opens == 0) { + continue; + } + double d = max(0.02, static_cast(mismatches + gap_opens) / static_cast(matches + mismatches + gap_opens)); double non_match_penalty = static_cast(mismatches + gap_opens) / (2.0 * d); double indel_penalty = 0; for (auto& gap_length : gap_lengths) { - indel_penalty += log2(1 + gap_length); + indel_penalty += log2(1.0 + gap_length); } - int adjusted_score = static_cast(matches - non_match_penalty - indel_penalty); + int adjusted_score = std::round(matches - non_match_penalty - indel_penalty); alignments[alignment_index].set_score(adjusted_score); if (show_work) { #pragma omp critical (cerr) { - // cerr << log_name() << "Alignment " << alignment_index << " CIGAR:"; - // // print whole cigar compressed with separated matches/mismatches - // vector> cigar_ops; - // auto add_op = [&](int len, char op) { - // if (len == 0) return; - // if (!cigar_ops.empty() && cigar_ops.back().second == op) { - // cigar_ops.back().first += len; - // } else { - // cigar_ops.push_back({len, op}); - // } - // }; - - // for (size_t i = 0; i < alignments[alignment_index].path().mapping_size(); ++i) { - // auto& mapping = alignments[alignment_index].path().mapping(i); - // for (size_t j = 0; j < mapping.edit_size(); ++j) { - // auto& edit = mapping.edit(j); - // if (edit.from_length() == edit.to_length() && edit.from_length() > 0) { - // if (edit.sequence().empty()) { - // add_op(edit.from_length(), '='); - // } else { - // add_op(edit.from_length(), 'X'); - // } - // } else if (edit.from_length() == 0 && edit.to_length() > 0) { - // add_op(edit.to_length(), 'I'); - // } else if (edit.from_length() > 0 && edit.to_length() == 0) { - // add_op(edit.from_length(), 'D'); - // } else { - // add_op(max(edit.from_length(), edit.to_length()), 'X'); - // } - // } - // } - // for (auto& op : cigar_ops) { - // cerr << " " << op.first << op.second; - // } cerr << log_name() << "Matches: " << matches << " Mismatches: " << mismatches << " Gap opens: " << gap_opens << " New score: " << adjusted_score << endl; } } @@ -918,6 +909,7 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { if (chain_index != std::numeric_limits::max() && chain_index < chain_rec_counts.size()) { set_annotation(alignments[alignment_index], "chain.rec_count", (double) chain_rec_counts[chain_index]); if (rec_penalty_chain != 0) { + //int64_t penalty = min(static_cast(1), static_cast(rec_penalty_chain)/5) * static_cast(chain_rec_counts[chain_index]); int64_t penalty = static_cast(rec_penalty_chain) * static_cast(chain_rec_counts[chain_index]); int64_t adjusted_score = static_cast(alignments[alignment_index].score()) - penalty; alignments[alignment_index].set_score(static_cast(adjusted_score)); @@ -1852,7 +1844,7 @@ void MinimizerMapper::get_best_chain_stats(Alignment& aln, const ZipCodeForest& best_chain_longest_jump = std::max(best_chain_longest_jump, jump); best_chain_total_jump += jump; } - best_chain_average_jump = chains.at(best_chain).size() > 1 ? best_chain_total_jump / (chains.at(best_chain).size() - 1) : 0.0; + best_chain_average_jump = chains.at(best_chain).size() > 1 ? (double)best_chain_total_jump / (chains.at(best_chain).size() - 1) : 0.0; } // Also count anchors in the chain From 5145cd4da709df9796281d47ad60bba7a910eeb2 Mon Sep 17 00:00:00 2001 From: dmonti Date: Mon, 6 Apr 2026 09:52:00 -0700 Subject: [PATCH 04/11] debug for supported paths across chain --- src/minimizer_mapper_from_chains.cpp | 42 ++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/src/minimizer_mapper_from_chains.cpp b/src/minimizer_mapper_from_chains.cpp index d20c49e09c..631c637ab0 100644 --- a/src/minimizer_mapper_from_chains.cpp +++ b/src/minimizer_mapper_from_chains.cpp @@ -32,6 +32,7 @@ #include #include #include +#include // Turn on debugging prints //#define debug @@ -1619,9 +1620,13 @@ void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& indel_limit, show_work ); +#ifdef debug_rec + if (true) { +#else if (show_work) { +#endif #pragma omp critical (cerr) - cerr << log_name() << "Found " << chain_results.chains.size() << " chains in zip code tree " << item_num + cerr << log_name() << "\t[" << aln.name() << "] Found " << chain_results.chains.size() << " chains in zip code tree " << item_num << " running " << anchors_to_chain[anchor_indexes.front()] << " to " << anchors_to_chain[anchor_indexes.back()] << std::endl; } @@ -1631,7 +1636,12 @@ void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& auto& entry = chain_results.chains[result]; auto& scored_chain = entry.scored_chain; auto& chain_rec_positions = entry.rec_positions; - if (show_work) { +#ifdef debug_rec + if (true) +#else + if (show_work) +#endif + { #ifdef debug if(true) #else @@ -1653,10 +1663,30 @@ void MinimizerMapper::do_chaining_on_trees(Alignment& aln, const ZipCodeForest& } } cerr << std::endl; -#ifdef debug - - for (auto& anchor_number : scored_chain.second) { - std::cerr << log_name() << "\t\t" << anchor_view[anchor_number] << std::endl; +#ifdef debug_rec + algorithms::path_flags_t current_paths = 0; + bool first = true; + for (auto& selected_number : scored_chain.second) { + auto& anchor = anchor_view[selected_number]; + auto new_paths = anchor.anchor_paths(); + if (first) { + current_paths = new_paths.second; + first = false; + } else { + if (new_paths.first == new_paths.second) { + if ((current_paths & new_paths.first) == 0) { + current_paths = new_paths.first; + } else { + current_paths &= new_paths.first; + } + } else { + current_paths = new_paths.second; + } + } + + std::cerr << log_name() << "\t\t" << anchor + << " anchor_paths: " << std::bitset<64>(new_paths.first).count() << " " << std::bitset<64>(new_paths.first) + << " chain_paths: " << std::bitset<64>(current_paths).count() << " " << std::bitset<64>(current_paths) << std::endl; } #endif From 78ad8bbb0c66183705051f2255a181491d7d3d90 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 23 Apr 2026 17:00:31 -0400 Subject: [PATCH 05/11] Declare the minimap2-based scores to be right --- test/t/50_vg_giraffe.t | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/t/50_vg_giraffe.t b/test/t/50_vg_giraffe.t index b1136dceef..3c18fef34f 100644 --- a/test/t/50_vg_giraffe.t +++ b/test/t/50_vg_giraffe.t @@ -297,8 +297,8 @@ vg index -j 1mb1kgp.dist 1mb1kgp.vg vg autoindex -p 1mb1kgp -w giraffe -P "VG w/ Variant Paths:1mb1kgp.vg" -P "Giraffe Distance Index:1mb1kgp.dist" -r 1mb1kgp/z.fa -v 1mb1kgp/z.vcf.gz vg giraffe -Z 1mb1kgp.giraffe.gbz -f reads/1mb1kgp_longread.fq >longread.gam -U 300 --track-provenance --align-from-chains --set-refpos # This is an 8001 bp read with 1 insert and 1 substitution -# 7999 * 1 + 1 * -4 + -6 + 5 + 5 = 7999 -is "$(vg view -aj longread.gam | jq -r '.score')" "7999" "A long read can be correctly aligned" +# We use minimap2-based scoring which awards that this many points. +is "$(vg view -aj longread.gam | jq -r '.score')" "7948" "A long read can be correctly aligned" is "$(vg view -aj longread.gam | jq -c '.path.mapping[].edit[] | select(.sequence)' | wc -l | sed 's/^[[:space:]]*//')" "2" "A long read has the correct edits found" is "$(vg view -aj longread.gam | jq -c '. | select(.annotation["filter_3_cluster-coverage_cluster_passed_size_total"] <= 300)' | wc -l | sed 's/^[[:space:]]*//')" "1" "Long read minimizer set is correctly restricted" is "$(vg view -aj longread.gam | jq -c '.refpos[]' | wc -l)" "$(vg view -aj longread.gam | jq -c '.path.mapping[]' | wc -l)" "Giraffe sets refpos for each reference node" From 31db629b5dc1bea02f9e7f8a1426d10ce6dd7d52 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 23 Apr 2026 17:21:49 -0400 Subject: [PATCH 06/11] Clear explanations before making them to count --- test/t/50_vg_giraffe.t | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/t/50_vg_giraffe.t b/test/t/50_vg_giraffe.t index 3c18fef34f..5ec9a33612 100644 --- a/test/t/50_vg_giraffe.t +++ b/test/t/50_vg_giraffe.t @@ -124,6 +124,8 @@ is "$(grep -c 'error.*are not compatible' log.txt)" "1" "appropriate error messa rm t1.bam t2.bam t3.bam t1.gaf tagged1.fq tagged2.fq rm -f read.fq read.gam +rm -rf explanation_* + vg giraffe -Z x.giraffe.gbz -f reads/small.middle.ref.indel.multi.fq --show-work --track-position -b chaining-sr > /dev/null 2>&1 # Check that at least some TSV files and directories were created is "$(find explanation_read1 -name 'chain*-dotplot*.tsv' 2>/dev/null | wc -l | tr -d ' ')" "1" "Chain explanation files are created per chain" From d71dc9fa749ec7472c20173906d2e61e6e15d330 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Thu, 23 Apr 2026 18:12:41 -0400 Subject: [PATCH 07/11] Move minimap2-style scoring logic into functions we could use later, like for surject --- src/alignment.cpp | 59 +++++++++++++++++++++++++ src/alignment.hpp | 9 ++++ src/minimizer_mapper_from_chains.cpp | 65 ++++------------------------ 3 files changed, 76 insertions(+), 57 deletions(-) diff --git a/src/alignment.cpp b/src/alignment.cpp index 567bfb016a..faf669ea69 100644 --- a/src/alignment.cpp +++ b/src/alignment.cpp @@ -3507,6 +3507,65 @@ pair aligned_interval(const Alignment& aln) { return pair(softclip_start(aln), aln.sequence().size() - softclip_end(aln)); } +void count_alignment_operations(const Alignment& aln, size_t& matches, size_t& mismatches, std::vector& gap_lengths) { + matches = 0; + mismatches = 0; + gap_lengths.clear(); + + enum class EditType { MATCH, MISMATCH, INS, DEL, COMPLEX, NONE }; + EditType prev_type = EditType::NONE; + size_t current_gap_length = 0; + + auto finish_gap = [&]() { + if (current_gap_length > 0) { + gap_lengths.push_back(current_gap_length); + current_gap_length = 0; + } + }; + + for (size_t i = 0; i < aln.path().mapping_size(); ++i) { + auto& mapping = aln.path().mapping(i); + for (size_t j = 0; j < mapping.edit_size(); ++j) { + auto& edit = mapping.edit(j); + if (edit.from_length() == edit.to_length() && edit.from_length() > 0) { + finish_gap(); + if (edit.sequence().empty()) { + matches += edit.from_length(); + prev_type = EditType::MATCH; + } else { + mismatches += edit.from_length(); + prev_type = EditType::MISMATCH; + } + } else if (edit.from_length() == 0 && edit.to_length() > 0) { + if (prev_type != EditType::INS) finish_gap(); + current_gap_length += edit.to_length(); + prev_type = EditType::INS; + } else if (edit.from_length() > 0 && edit.to_length() == 0) { + if (prev_type != EditType::DEL) finish_gap(); + current_gap_length += edit.from_length(); + prev_type = EditType::DEL; + } else { + finish_gap(); + mismatches += max(edit.from_length(), edit.to_length()); + prev_type = EditType::COMPLEX; + } + } + } + finish_gap(); +} + +int score_alignment_with_logged_gaps(const size_t& matches, const size_t& mismatches, const std::vector& gap_lengths) { + double d = max(0.02, static_cast(mismatches + gap_lengths.size()) / static_cast(matches + mismatches + gap_lengths.size())); + double non_match_penalty = static_cast(mismatches + gap_lengths.size()) / (2.0 * d); + + double indel_penalty = 0; + for (auto& gap_length : gap_lengths) { + indel_penalty += log2(1.0 + gap_length); + } + int adjusted_score = std::round(matches - non_match_penalty - indel_penalty); + return adjusted_score; +} + string mate_info(const string& path, int32_t pos, bool rev_strand, bool is_read1) { subrange_t subrange; string path_name = Paths::strip_subrange(path, &subrange); diff --git a/src/alignment.hpp b/src/alignment.hpp index cea9bc524d..1dbfc9870f 100644 --- a/src/alignment.hpp +++ b/src/alignment.hpp @@ -330,6 +330,15 @@ bool is_supplementary(const Alignment& alignment); // The indexes on the read sequence of the portion of the read that is aligned outside of soft clips pair aligned_interval(const Alignment& aln); +/// Count the various types of edits in an Alignment, including individual gap lengths. +void count_alignment_operations(const Alignment& aln, size_t& matches, size_t& mismatches, std::vector& gaps_lengths); +/// Compute an alignment score using minimap2 long indels penalty adjustment. +/// +/// This scoring method penalize long continous indels less, using the formula: +/// score = matches - (mismatches + gap_opens)/2d - sum_{i=1}^{gap_opens} (log_2(1 + gap_length_i)) +/// with d = max{0.02, (mismatches + gap_opens)/(matches + mismatches + gap_opens)} +int score_alignment_with_logged_gaps(const size_t& matches, const size_t& mismatches, const std::vector& gap_lengths); + // create an annotation string required to properly set the SAM fields/flags of a supplementary alignment // the arguments all refer to properties of the primary *mate* alignment // the path name saved in the info is the base path name, with any subrange info reflected in the position diff --git a/src/minimizer_mapper_from_chains.cpp b/src/minimizer_mapper_from_chains.cpp index 20300fc99d..88e499ffc6 100644 --- a/src/minimizer_mapper_from_chains.cpp +++ b/src/minimizer_mapper_from_chains.cpp @@ -855,70 +855,21 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { // Unmapped, so skip it. continue; } - int matches = 0; - int mismatches = 0; - int gap_opens = 0; - vector gap_lengths; - - enum class EditType { MATCH, MISMATCH, INS, DEL, COMPLEX, NONE }; - EditType prev_type = EditType::NONE; - size_t current_gap_length = 0; - - auto finish_gap = [&]() { - if (current_gap_length > 0) { - gap_opens++; - gap_lengths.push_back(current_gap_length); - current_gap_length = 0; - } - }; - - for (size_t i = 0; i < alignments[alignment_index].path().mapping_size(); ++i) { - auto& mapping = alignments[alignment_index].path().mapping(i); - for (size_t j = 0; j < mapping.edit_size(); ++j) { - auto& edit = mapping.edit(j); - if (edit.from_length() == edit.to_length() && edit.from_length() > 0) { - finish_gap(); - if (edit.sequence().empty()) { - matches += edit.from_length(); - prev_type = EditType::MATCH; - } else { - mismatches += edit.from_length(); - prev_type = EditType::MISMATCH; - } - } else if (edit.from_length() == 0 && edit.to_length() > 0) { - if (prev_type != EditType::INS) finish_gap(); - current_gap_length += edit.to_length(); - prev_type = EditType::INS; - } else if (edit.from_length() > 0 && edit.to_length() == 0) { - if (prev_type != EditType::DEL) finish_gap(); - current_gap_length += edit.from_length(); - prev_type = EditType::DEL; - } else { - finish_gap(); - mismatches += max(edit.from_length(), edit.to_length()); - prev_type = EditType::COMPLEX; - } - } - } - finish_gap(); - - if (matches + mismatches + gap_opens == 0) { + + size_t matches, mismatches; + std::vector gap_lengths; + count_alignment_operations(alignments[alignment_index], matches, mismatches, gap_lengths); + + if (matches + mismatches + gap_lengths.size() == 0) { continue; } - double d = max(0.02, static_cast(mismatches + gap_opens) / static_cast(matches + mismatches + gap_opens)); - double non_match_penalty = static_cast(mismatches + gap_opens) / (2.0 * d); - - double indel_penalty = 0; - for (auto& gap_length : gap_lengths) { - indel_penalty += log2(1.0 + gap_length); - } - int adjusted_score = std::round(matches - non_match_penalty - indel_penalty); + int adjusted_score = score_alignment_with_logged_gaps(matches, mismatches, gap_lengths); alignments[alignment_index].set_score(adjusted_score); if (show_work) { #pragma omp critical (cerr) { - cerr << log_name() << "Matches: " << matches << " Mismatches: " << mismatches << " Gap opens: " << gap_opens << " New score: " << adjusted_score << endl; + cerr << log_name() << "Matches: " << matches << " Mismatches: " << mismatches << " Gap opens: " << gap_lengths.size() << " New score: " << adjusted_score << endl; } } } From 11e053eca828f4a3f400a102636c6d276d8eccd5 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 27 Apr 2026 13:58:48 -0400 Subject: [PATCH 08/11] Stop worrying about negative recombination penalties --- src/algorithms/chain_items.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/algorithms/chain_items.cpp b/src/algorithms/chain_items.cpp index e580b970bd..10d2ea4b44 100644 --- a/src/algorithms/chain_items.cpp +++ b/src/algorithms/chain_items.cpp @@ -547,6 +547,8 @@ TracedScore chain_items_dp(vector& chain_scores, cerr << "Chaining group of " << to_chain.size() << " items" << endl; } + crash_unless(recomb_penalty >= 0); + // Compute a base seed average length. // TODO: Weight anchors differently? // TODO: Will this always be the same for all anchors in practice? @@ -560,7 +562,7 @@ TracedScore chain_items_dp(vector& chain_scores, // Track eval bonus for heuristic comparison (path conservation bonus, used for // selection only without affecting the actual stored score). // Starting from nowhere means full path conservation, so bonus = recomb_penalty. - std::vector eval_bonuses(to_chain.size(), std::max(0, recomb_penalty)); + std::vector eval_bonuses(to_chain.size(), recomb_penalty); for (size_t i = 0; i < to_chain.size(); i++) { // Set up DP table so we can start anywhere with that item's score, scaled and with bonus applied. chain_scores[i] = {(int)(to_chain[i].score() * item_scale + item_bonus), TracedScore::nowhere(), to_chain[i].anchor_end_paths()}; @@ -593,7 +595,7 @@ TracedScore chain_items_dp(vector& chain_scores, // This also has full path conservation (bonus = recomb_penalty). { TracedScore from_nowhere = {(int)item_points, TracedScore::nowhere(), here.anchor_end_paths()}; - int nowhere_bonus = std::max(0, recomb_penalty); + int nowhere_bonus = recomb_penalty; int eval_nowhere = from_nowhere.score + nowhere_bonus; int eval_current = chain_scores[transition.to_anchor].score + eval_bonuses[transition.to_anchor]; if (eval_nowhere > eval_current) { From f665d80ec91fa5bc38c35e900d24a157e3e83e0d Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 27 Apr 2026 14:06:39 -0400 Subject: [PATCH 09/11] Make the evaluation bonus system clearer --- src/algorithms/chain_items.cpp | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/src/algorithms/chain_items.cpp b/src/algorithms/chain_items.cpp index 10d2ea4b44..76b0ddf21d 100644 --- a/src/algorithms/chain_items.cpp +++ b/src/algorithms/chain_items.cpp @@ -559,8 +559,18 @@ TracedScore chain_items_dp(vector& chain_scores, base_seed_length /= to_chain.size(); chain_scores.resize(to_chain.size()); - // Track eval bonus for heuristic comparison (path conservation bonus, used for - // selection only without affecting the actual stored score). + + // We want to prefer to come from seeds where the transition preserves + // access to matching haplotypes, because we don't want to back ourselves + // into a corner where we need a recombination when we don't really have + // to. So we cheat on the dynamic programming by adding an "evaluation + // bonus" to the scores of the different DP options when comparing them. We + // keep this bonus out of the actual recorded scores because we don't want + // it raising the scores we actually get the more transitions we take. + // + // We store the bonus used to select the current winning predecessor for + // each seed in this vector, which runs alongside the DP table. + // // Starting from nowhere means full path conservation, so bonus = recomb_penalty. std::vector eval_bonuses(to_chain.size(), recomb_penalty); for (size_t i = 0; i < to_chain.size(); i++) { @@ -695,20 +705,20 @@ TracedScore chain_items_dp(vector& chain_scores, } // Recombination case (no shared paths): bonus stays 0 } - + + // Grab the DP table slot we are updating auto& current_best = chain_scores[transition.to_anchor]; + // Compute the evaluation value for the new candidate int eval_from = from_source_score.score + eval_bonus_from; + // Reconstruct the evaluation value for the current winner int eval_best = current_best.score + eval_bonuses[transition.to_anchor]; - if (eval_from > eval_best) { + if (eval_from > eval_best || (eval_from == eval_best && from_source_score > current_best)) { + // Using the evaluation values, and then if tied the real DP + // scores, this new candidate beats the previous winner, so + // replace it. current_best = from_source_score; eval_bonuses[transition.to_anchor] = eval_bonus_from; - } else if (eval_from == eval_best) { - // Tie-breaker using actual underlying standard formulations - if (from_source_score > current_best) { - current_best = from_source_score; - eval_bonuses[transition.to_anchor] = eval_bonus_from; - } } if (show_work) { From 76791abbf5bc0df10766eb697db8e96fddcab07f Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 27 Apr 2026 14:07:53 -0400 Subject: [PATCH 10/11] Adjust comments --- src/minimizer_mapper_from_chains.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/minimizer_mapper_from_chains.cpp b/src/minimizer_mapper_from_chains.cpp index 88e499ffc6..4c99424877 100644 --- a/src/minimizer_mapper_from_chains.cpp +++ b/src/minimizer_mapper_from_chains.cpp @@ -846,11 +846,10 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { minimizer_kept_chain_count, alignments, multiplicity_by_alignment, alignments_to_source, minimizer_explored, stats, funnel_depleted, rng, funnel); } - // Implement the logic for minimap2 long indels penalty adjustment. - // The new alignment score penalize long continous indels less, using the formula: - // score = matches - (mismatches + gap_opens)/2d - sum_{i=1}^{gap_opens} (log_2(1 + gap_length_i)) - // with d = max{0.02, (mismatches + gap_opens)/(matches + mismatches + gap_opens)} + for (size_t alignment_index = 0; alignment_index < alignments.size(); ++alignment_index) { + // Rescore all the alignments using minimap2 logged-gap-length, read-identity-based scoring + if (alignments[alignment_index].path().mapping_size() == 0) { // Unmapped, so skip it. continue; @@ -879,7 +878,6 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { if (chain_index != std::numeric_limits::max() && chain_index < chain_rec_counts.size()) { set_annotation(alignments[alignment_index], "chain.rec_count", (double) chain_rec_counts[chain_index]); if (rec_penalty_chain != 0) { - //int64_t penalty = min(static_cast(1), static_cast(rec_penalty_chain)/5) * static_cast(chain_rec_counts[chain_index]); int64_t penalty = static_cast(rec_penalty_chain) * static_cast(chain_rec_counts[chain_index]); int64_t adjusted_score = static_cast(alignments[alignment_index].score()) - penalty; alignments[alignment_index].set_score(static_cast(adjusted_score)); From e6d032a0c7c718fd1c7b347ac525244cc9fed21f Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 27 Apr 2026 14:10:41 -0400 Subject: [PATCH 11/11] Rename variables --- src/minimizer_mapper_from_chains.cpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/minimizer_mapper_from_chains.cpp b/src/minimizer_mapper_from_chains.cpp index 4c99424877..cb12c8b11a 100644 --- a/src/minimizer_mapper_from_chains.cpp +++ b/src/minimizer_mapper_from_chains.cpp @@ -862,13 +862,14 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { if (matches + mismatches + gap_lengths.size() == 0) { continue; } - - int adjusted_score = score_alignment_with_logged_gaps(matches, mismatches, gap_lengths); - alignments[alignment_index].set_score(adjusted_score); + + // Compute the logged-gaps score + auto logged_gaps_score = score_alignment_with_logged_gaps(matches, mismatches, gap_lengths); + alignments[alignment_index].set_score(logged_gaps_score); if (show_work) { #pragma omp critical (cerr) { - cerr << log_name() << "Matches: " << matches << " Mismatches: " << mismatches << " Gap opens: " << gap_lengths.size() << " New score: " << adjusted_score << endl; + cerr << log_name() << "Matches: " << matches << " Mismatches: " << mismatches << " Gap opens: " << gap_lengths.size() << " New score: " << logged_gaps_score << endl; } } } @@ -878,9 +879,12 @@ vector MinimizerMapper::map_from_chains(Alignment& aln) { if (chain_index != std::numeric_limits::max() && chain_index < chain_rec_counts.size()) { set_annotation(alignments[alignment_index], "chain.rec_count", (double) chain_rec_counts[chain_index]); if (rec_penalty_chain != 0) { + // Penalize the score of alignment candidates according to the number of recombinations their chains required. + // This allows alignments that required fewer recombinations in their chains to win. + // TODO: We'd also eventaully like to count recombinations that we don't know are needed until base-level DP. int64_t penalty = static_cast(rec_penalty_chain) * static_cast(chain_rec_counts[chain_index]); - int64_t adjusted_score = static_cast(alignments[alignment_index].score()) - penalty; - alignments[alignment_index].set_score(static_cast(adjusted_score)); + int64_t penalized_score = static_cast(alignments[alignment_index].score()) - penalty; + alignments[alignment_index].set_score(static_cast(penalized_score)); } } }