vgteam · adamnovak · Apr 28, 2026 · Mar 16, 2026 · Mar 17, 2026 · Mar 23, 2026
diff --git a/src/algorithms/chain_items.cpp b/src/algorithms/chain_items.cpp
@@ -547,6 +547,8 @@ TracedScore chain_items_dp(vector<TracedScore>& chain_scores,
         cerr << "Chaining group of " << to_chain.size() << " items" << endl;
     }
 
+    crash_unless(recomb_penalty >= 0);
+
     // Compute a base seed average length.
     // TODO: Weight anchors differently?
     // TODO: Will this always be the same for all anchors in practice?
@@ -557,6 +559,20 @@ TracedScore chain_items_dp(vector<TracedScore>& chain_scores,
     base_seed_length /= to_chain.size();
 
     chain_scores.resize(to_chain.size());
+
+    // We want to prefer to come from seeds where the transition preserves
+    // access to matching haplotypes, because we don't want to back ourselves
+    // into a corner where we need a recombination when we don't really have
+    // to. So we cheat on the dynamic programming by adding an "evaluation
+    // bonus" to the scores of the different DP options when comparing them. We
+    // keep this bonus out of the actual recorded scores because we don't want
+    // it raising the scores we actually get the more transitions we take.
+    //
+    // We store the bonus used to select the current winning predecessor for
+    // each seed in this vector, which runs alongside the DP table.
+    //
+    // Starting from nowhere means full path conservation, so bonus = recomb_penalty.
+    std::vector<int> eval_bonuses(to_chain.size(), recomb_penalty);
     for (size_t i = 0; i < to_chain.size(); i++) {
         // Set up DP table so we can start anywhere with that item's score, scaled and with bonus applied.
         chain_scores[i] = {(int)(to_chain[i].score() * item_scale + item_bonus), TracedScore::nowhere(), to_chain[i].anchor_end_paths()};
@@ -586,8 +602,20 @@ TracedScore chain_items_dp(vector<TracedScore>& chain_scores,
         }
 
         // If we come from nowhere, we get those points.
-        chain_scores[transition.to_anchor] = std::max(chain_scores[transition.to_anchor], 
-                                                      {(int)item_points, TracedScore::nowhere(), here.anchor_end_paths()});
+        // This also has full path conservation (bonus = recomb_penalty).
+        {
+            TracedScore from_nowhere = {(int)item_points, TracedScore::nowhere(), here.anchor_end_paths()};
+            int nowhere_bonus = recomb_penalty;
+            int eval_nowhere = from_nowhere.score + nowhere_bonus;
+            int eval_current = chain_scores[transition.to_anchor].score + eval_bonuses[transition.to_anchor];
+            if (eval_nowhere > eval_current) {
+                chain_scores[transition.to_anchor] = from_nowhere;
+                eval_bonuses[transition.to_anchor] = nowhere_bonus;
+            } else if (eval_nowhere == eval_current && from_nowhere > chain_scores[transition.to_anchor]) {
+                chain_scores[transition.to_anchor] = from_nowhere;
+                eval_bonuses[transition.to_anchor] = nowhere_bonus;
+            }
+        }
 
         // For each source we could come from
         auto& source = to_chain[transition.from_anchor];
@@ -664,8 +692,34 @@ TracedScore chain_items_dp(vector<TracedScore>& chain_scores,
             TracedScore from_source_score = source_score.add_points(jump_points + item_points)
                                                         .set_shared_paths(here.anchor_paths());
 
-            // Remember that we could make this jump
-            chain_scores[transition.to_anchor] = std::max(chain_scores[transition.to_anchor], from_source_score);
+            // Evaluate heuristic to preserve path flexibility without inflating actual scoring DP.
+            // Bonus = fraction of conserved paths * recomb_penalty.
+            // Bonus is 0 when recombination occurs (no shared paths).
+            int eval_bonus_from = 0;
+            if (recomb_penalty > 0) {
+                int pre_count = __builtin_popcountll(source_score.paths);
+                if (pre_count > 0 && (source_score.paths & here.anchor_start_paths()) != 0) {
+                    // No recombination: bonus = fraction of paths conserved * penalty
+                    int post_count = __builtin_popcountll(from_source_score.paths);
+                    eval_bonus_from = (recomb_penalty * post_count) / pre_count;
+                }
+                // Recombination case (no shared paths): bonus stays 0
+            }
+
+            // Grab the DP table slot we are updating
+            auto& current_best = chain_scores[transition.to_anchor];
+            // Compute the evaluation value for the new candidate
+            int eval_from = from_source_score.score + eval_bonus_from;
+            // Reconstruct the evaluation value for the current winner
+            int eval_best = current_best.score + eval_bonuses[transition.to_anchor];
+
+            if (eval_from > eval_best || (eval_from == eval_best && from_source_score > current_best)) {
+                // Using the evaluation values, and then if tied the real DP
+                // scores, this new candidate beats the previous winner, so
+                // replace it.
+                current_best = from_source_score;
+                eval_bonuses[transition.to_anchor] = eval_bonus_from;
+            }
 
             if (show_work) {
 #ifdef debug_dp

diff --git a/src/alignment.cpp b/src/alignment.cpp
@@ -3507,6 +3507,65 @@ pair<int64_t, int64_t> aligned_interval(const Alignment& aln) {
     return pair<int64_t, int64_t>(softclip_start(aln), aln.sequence().size() - softclip_end(aln));
 }
 
+void count_alignment_operations(const Alignment& aln, size_t& matches, size_t& mismatches, std::vector<size_t>& gap_lengths) {
+    matches = 0;
+    mismatches = 0;
+    gap_lengths.clear();
+
+    enum class EditType { MATCH, MISMATCH, INS, DEL, COMPLEX, NONE };
+    EditType prev_type = EditType::NONE;
+    size_t current_gap_length = 0;
+
+    auto finish_gap = [&]() {
+        if (current_gap_length > 0) {
+            gap_lengths.push_back(current_gap_length);
+            current_gap_length = 0;
+        }
+    };
+
+    for (size_t i = 0; i < aln.path().mapping_size(); ++i) {
+        auto& mapping = aln.path().mapping(i);
+        for (size_t j = 0; j < mapping.edit_size(); ++j) {
+            auto& edit = mapping.edit(j);
+            if (edit.from_length() == edit.to_length() && edit.from_length() > 0) {
+                finish_gap();
+                if (edit.sequence().empty()) {
+                    matches += edit.from_length();
+                    prev_type = EditType::MATCH;
+                } else {
+                    mismatches += edit.from_length();
+                    prev_type = EditType::MISMATCH;
+                }
+            } else if (edit.from_length() == 0 && edit.to_length() > 0) {
+                if (prev_type != EditType::INS) finish_gap();
+                current_gap_length += edit.to_length();
+                prev_type = EditType::INS;
+            } else if (edit.from_length() > 0 && edit.to_length() == 0) {
+                if (prev_type != EditType::DEL) finish_gap();
+                current_gap_length += edit.from_length();
+                prev_type = EditType::DEL;
+            } else {
+                finish_gap();
+                mismatches += max(edit.from_length(), edit.to_length());
+                prev_type = EditType::COMPLEX;
+            }
+        }
+    }
+    finish_gap();
+}
+
+int score_alignment_with_logged_gaps(const size_t& matches, const size_t& mismatches, const std::vector<size_t>& gap_lengths) {
+    double d = max(0.02, static_cast<double>(mismatches + gap_lengths.size()) / static_cast<double>(matches + mismatches + gap_lengths.size()));
+    double non_match_penalty = static_cast<double>(mismatches + gap_lengths.size()) / (2.0 * d);
+
+    double indel_penalty = 0;
+    for (auto& gap_length : gap_lengths) {
+        indel_penalty += log2(1.0 + gap_length);
+    }
+    int adjusted_score = std::round(matches - non_match_penalty - indel_penalty);
+    return adjusted_score;
+}
+
 string mate_info(const string& path, int32_t pos, bool rev_strand, bool is_read1) {
     subrange_t subrange;
     string path_name = Paths::strip_subrange(path, &subrange);

diff --git a/src/alignment.hpp b/src/alignment.hpp
@@ -330,6 +330,15 @@ bool is_supplementary(const Alignment& alignment);
 // The indexes on the read sequence of the portion of the read that is aligned outside of soft clips
 pair<int64_t, int64_t> aligned_interval(const Alignment& aln);
 
+/// Count the various types of edits in an Alignment, including individual gap lengths.
+void count_alignment_operations(const Alignment& aln, size_t& matches, size_t& mismatches, std::vector<size_t>& gaps_lengths);
+/// Compute an alignment score using minimap2 long indels penalty adjustment.
+///
+/// This scoring method penalize long continous indels less, using the formula:
+/// score = matches - (mismatches + gap_opens)/2d - sum_{i=1}^{gap_opens} (log_2(1 + gap_length_i))
+/// with d = max{0.02, (mismatches + gap_opens)/(matches + mismatches + gap_opens)}
+int score_alignment_with_logged_gaps(const size_t& matches, const size_t& mismatches, const std::vector<size_t>& gap_lengths);
+
 // create an annotation string required to properly set the SAM fields/flags of a supplementary alignment
 // the arguments all refer to properties of the primary *mate* alignment
 // the path name saved in the info is the base path name, with any subrange info reflected in the position

diff --git a/src/minimizer_mapper.hpp b/src/minimizer_mapper.hpp
@@ -871,7 +871,8 @@ class MinimizerMapper : public AlignerClient {
      */
     void do_chaining_on_trees(Alignment& aln, const ZipCodeForest& zip_code_forest, const std::vector<Seed>& seeds, const VectorView<MinimizerMapper::Minimizer>& minimizers,
                               const vector<algorithms::Anchor>& seed_anchors,
-                              std::vector<std::vector<size_t>>& chains, std::vector<std::vector<bool>>& chain_rec_flags, std::vector<size_t>& chain_source_tree,
+                              std::vector<std::vector<size_t>>& chains, std::vector<std::vector<bool>>& chain_rec_flags,
+                              std::vector<size_t>& chain_rec_counts, std::vector<size_t>& chain_source_tree,
                               std::vector<int>& chain_score_estimates, std::vector<std::vector<size_t>>& minimizer_kept_chain_count,
                               std::vector<double>& multiplicity_by_chain,
                               std::vector<Alignment>& alignments, SmallBitset& minimizer_explored, vector<double>& multiplicity_by_alignment,