Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 58 additions & 4 deletions src/algorithms/chain_items.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -547,6 +547,8 @@ TracedScore chain_items_dp(vector<TracedScore>& chain_scores,
cerr << "Chaining group of " << to_chain.size() << " items" << endl;
}

crash_unless(recomb_penalty >= 0);

// Compute a base seed average length.
// TODO: Weight anchors differently?
// TODO: Will this always be the same for all anchors in practice?
Expand All @@ -557,6 +559,20 @@ TracedScore chain_items_dp(vector<TracedScore>& chain_scores,
base_seed_length /= to_chain.size();

chain_scores.resize(to_chain.size());

// We want to prefer to come from seeds where the transition preserves
// access to matching haplotypes, because we don't want to back ourselves
// into a corner where we need a recombination when we don't really have
// to. So we cheat on the dynamic programming by adding an "evaluation
// bonus" to the scores of the different DP options when comparing them. We
// keep this bonus out of the actual recorded scores because we don't want
// it raising the scores we actually get the more transitions we take.
//
// We store the bonus used to select the current winning predecessor for
// each seed in this vector, which runs alongside the DP table.
//
// Starting from nowhere means full path conservation, so bonus = recomb_penalty.
std::vector<int> eval_bonuses(to_chain.size(), recomb_penalty);
for (size_t i = 0; i < to_chain.size(); i++) {
// Set up DP table so we can start anywhere with that item's score, scaled and with bonus applied.
chain_scores[i] = {(int)(to_chain[i].score() * item_scale + item_bonus), TracedScore::nowhere(), to_chain[i].anchor_end_paths()};
Expand Down Expand Up @@ -586,8 +602,20 @@ TracedScore chain_items_dp(vector<TracedScore>& chain_scores,
}

// If we come from nowhere, we get those points.
chain_scores[transition.to_anchor] = std::max(chain_scores[transition.to_anchor],
{(int)item_points, TracedScore::nowhere(), here.anchor_end_paths()});
// This also has full path conservation (bonus = recomb_penalty).
{
TracedScore from_nowhere = {(int)item_points, TracedScore::nowhere(), here.anchor_end_paths()};
int nowhere_bonus = recomb_penalty;
int eval_nowhere = from_nowhere.score + nowhere_bonus;
int eval_current = chain_scores[transition.to_anchor].score + eval_bonuses[transition.to_anchor];
if (eval_nowhere > eval_current) {
chain_scores[transition.to_anchor] = from_nowhere;
eval_bonuses[transition.to_anchor] = nowhere_bonus;
} else if (eval_nowhere == eval_current && from_nowhere > chain_scores[transition.to_anchor]) {
chain_scores[transition.to_anchor] = from_nowhere;
eval_bonuses[transition.to_anchor] = nowhere_bonus;
}
}

// For each source we could come from
auto& source = to_chain[transition.from_anchor];
Expand Down Expand Up @@ -664,8 +692,34 @@ TracedScore chain_items_dp(vector<TracedScore>& chain_scores,
TracedScore from_source_score = source_score.add_points(jump_points + item_points)
.set_shared_paths(here.anchor_paths());

// Remember that we could make this jump
chain_scores[transition.to_anchor] = std::max(chain_scores[transition.to_anchor], from_source_score);
// Evaluate heuristic to preserve path flexibility without inflating actual scoring DP.
// Bonus = fraction of conserved paths * recomb_penalty.
// Bonus is 0 when recombination occurs (no shared paths).
int eval_bonus_from = 0;
if (recomb_penalty > 0) {
int pre_count = __builtin_popcountll(source_score.paths);
if (pre_count > 0 && (source_score.paths & here.anchor_start_paths()) != 0) {
// No recombination: bonus = fraction of paths conserved * penalty
int post_count = __builtin_popcountll(from_source_score.paths);
eval_bonus_from = (recomb_penalty * post_count) / pre_count;
}
// Recombination case (no shared paths): bonus stays 0
}

// Grab the DP table slot we are updating
auto& current_best = chain_scores[transition.to_anchor];
// Compute the evaluation value for the new candidate
int eval_from = from_source_score.score + eval_bonus_from;
// Reconstruct the evaluation value for the current winner
int eval_best = current_best.score + eval_bonuses[transition.to_anchor];

if (eval_from > eval_best || (eval_from == eval_best && from_source_score > current_best)) {
// Using the evaluation values, and then if tied the real DP
// scores, this new candidate beats the previous winner, so
// replace it.
current_best = from_source_score;
eval_bonuses[transition.to_anchor] = eval_bonus_from;
Comment on lines +710 to +721
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the main codepath but it doesn't have any explanation. eval here doesn't mean anything to me by itself. I guess it's meant to suggest "the score we actually use to evaluate the possible alternatives"?

}

if (show_work) {
#ifdef debug_dp
Expand Down
59 changes: 59 additions & 0 deletions src/alignment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3507,6 +3507,65 @@ pair<int64_t, int64_t> aligned_interval(const Alignment& aln) {
return pair<int64_t, int64_t>(softclip_start(aln), aln.sequence().size() - softclip_end(aln));
}

void count_alignment_operations(const Alignment& aln, size_t& matches, size_t& mismatches, std::vector<size_t>& gap_lengths) {
matches = 0;
mismatches = 0;
gap_lengths.clear();

enum class EditType { MATCH, MISMATCH, INS, DEL, COMPLEX, NONE };
EditType prev_type = EditType::NONE;
size_t current_gap_length = 0;

auto finish_gap = [&]() {
if (current_gap_length > 0) {
gap_lengths.push_back(current_gap_length);
current_gap_length = 0;
}
};

for (size_t i = 0; i < aln.path().mapping_size(); ++i) {
auto& mapping = aln.path().mapping(i);
for (size_t j = 0; j < mapping.edit_size(); ++j) {
auto& edit = mapping.edit(j);
if (edit.from_length() == edit.to_length() && edit.from_length() > 0) {
finish_gap();
if (edit.sequence().empty()) {
matches += edit.from_length();
prev_type = EditType::MATCH;
} else {
mismatches += edit.from_length();
prev_type = EditType::MISMATCH;
}
} else if (edit.from_length() == 0 && edit.to_length() > 0) {
if (prev_type != EditType::INS) finish_gap();
current_gap_length += edit.to_length();
prev_type = EditType::INS;
} else if (edit.from_length() > 0 && edit.to_length() == 0) {
if (prev_type != EditType::DEL) finish_gap();
current_gap_length += edit.from_length();
prev_type = EditType::DEL;
} else {
finish_gap();
mismatches += max(edit.from_length(), edit.to_length());
prev_type = EditType::COMPLEX;
}
}
}
finish_gap();
}

int score_alignment_with_logged_gaps(const size_t& matches, const size_t& mismatches, const std::vector<size_t>& gap_lengths) {
double d = max(0.02, static_cast<double>(mismatches + gap_lengths.size()) / static_cast<double>(matches + mismatches + gap_lengths.size()));
double non_match_penalty = static_cast<double>(mismatches + gap_lengths.size()) / (2.0 * d);

double indel_penalty = 0;
for (auto& gap_length : gap_lengths) {
indel_penalty += log2(1.0 + gap_length);
}
int adjusted_score = std::round(matches - non_match_penalty - indel_penalty);
return adjusted_score;
}

string mate_info(const string& path, int32_t pos, bool rev_strand, bool is_read1) {
subrange_t subrange;
string path_name = Paths::strip_subrange(path, &subrange);
Expand Down
9 changes: 9 additions & 0 deletions src/alignment.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,15 @@ bool is_supplementary(const Alignment& alignment);
// The indexes on the read sequence of the portion of the read that is aligned outside of soft clips
pair<int64_t, int64_t> aligned_interval(const Alignment& aln);

/// Count the various types of edits in an Alignment, including individual gap lengths.
void count_alignment_operations(const Alignment& aln, size_t& matches, size_t& mismatches, std::vector<size_t>& gaps_lengths);
/// Compute an alignment score using minimap2 long indels penalty adjustment.
///
/// This scoring method penalize long continous indels less, using the formula:
/// score = matches - (mismatches + gap_opens)/2d - sum_{i=1}^{gap_opens} (log_2(1 + gap_length_i))
/// with d = max{0.02, (mismatches + gap_opens)/(matches + mismatches + gap_opens)}
int score_alignment_with_logged_gaps(const size_t& matches, const size_t& mismatches, const std::vector<size_t>& gap_lengths);

// create an annotation string required to properly set the SAM fields/flags of a supplementary alignment
// the arguments all refer to properties of the primary *mate* alignment
// the path name saved in the info is the base path name, with any subrange info reflected in the position
Expand Down
3 changes: 2 additions & 1 deletion src/minimizer_mapper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -871,7 +871,8 @@ class MinimizerMapper : public AlignerClient {
*/
void do_chaining_on_trees(Alignment& aln, const ZipCodeForest& zip_code_forest, const std::vector<Seed>& seeds, const VectorView<MinimizerMapper::Minimizer>& minimizers,
const vector<algorithms::Anchor>& seed_anchors,
std::vector<std::vector<size_t>>& chains, std::vector<std::vector<bool>>& chain_rec_flags, std::vector<size_t>& chain_source_tree,
std::vector<std::vector<size_t>>& chains, std::vector<std::vector<bool>>& chain_rec_flags,
std::vector<size_t>& chain_rec_counts, std::vector<size_t>& chain_source_tree,
std::vector<int>& chain_score_estimates, std::vector<std::vector<size_t>>& minimizer_kept_chain_count,
std::vector<double>& multiplicity_by_chain,
std::vector<Alignment>& alignments, SmallBitset& minimizer_explored, vector<double>& multiplicity_by_alignment,
Expand Down
Loading
Loading