alignment e_value bug #77

xiaodli · 2024-12-05T15:29:23Z

Report a MCScanX bug(alignment e_value)

Introduction

The following is main.cc file's main function.

int main(int argc, char *argv[])
{
    /* Start the timer */
    uglyTime(NULL);

    print_banner();
    char align_fn[LABEL_LEN], block_fn[LABEL_LEN];
    FILE *fw;

    init_opt();
    read_opt(argc, argv);

    read_gff(prefix_fn);
    read_blast(prefix_fn);

    sprintf(align_fn, "%s.collinearity", prefix_fn);
    fw = mustOpen(align_fn, "w");

    progress("%d pairwise comparisons", (int) mol_pairs.size());
    fill_allg();
    map<string, int>::const_iterator ip;
    for (ip=mol_pairs.begin(); ip!=mol_pairs.end(); ip++)
    {
        if (ip->second >= MATCH_SIZE) feed_dag(string(ip->first));
    }

    progress("%d alignments generated", (int) seg_list.size());
    print_align(fw);
    fclose(fw);
    uglyTime("Pairwise collinear blocks written to %s", align_fn);

    if (IS_PAIRWISE) return 0;
    msa_main(prefix_fn);
    uglyTime("Done!");

    return 0;
}

Pay attention to the read_gff(prefix_fn); function called by the main function. (Below is the read_gff function code and part of the gff file)

void read_gff(const char *prefix_fn)
{
    char fn[LABEL_LEN];
    string mol,gn,line,word;
    Gene_feat gf;
    sprintf(fn, "%s.gff", prefix_fn);
    ifstream in(fn);
    while (!in.eof())
    {
        getline(in,line);
        if (line=="")
            break;
        istringstream test(line);
        getline(test,mol,'\t');
        gf.mol = mol;
        getline(test,gn,'\t');
        gf.name = gn;
        getline(test,word,'\t');
        gf.mid = atoi(word.c_str());
        gene_map[gf.name] = gf;
    }
    in.close();
}

os5    Os05t0507200    25150045    25150392
os2    Os02t0774100    33540257    33541838
os5    Os05t0480600    23720056    23723583
os6    Os06t0299300    11200887    11202509

The read_gff function stores the chromosome name on which the gene is located, the gene name and the starting position of the gene in the gff file in gene_map.

Initial the gene_id attribute of Gene_feat.

 struct Gene_feat
 {
     vector<int> cursor;
     string name;
     string mol;
     int mid;
     int gene_id;
 //   more_feat *m;
     bool operator < (const Gene_feat &g) const
     {
         return (mol == g.mol && mid < g.mid) || mol < g.mol || (mol == g.mol && mid == g.mid && name.compare(g.name)<0);
     }
 };

Note that Gene_feat also has an attribute gene_id. In the subsequent fill_allg() function (called in the main function), the gene_id attribute will be initialized. The following is the fill_allg() code.

    void fill_allg()
{
    Gene_feat *gf1;
    map<string, Gene_feat>::iterator it;
    for (it=gene_map.begin(); it!=gene_map.end(); it++)
    {   
        std::cout << it->first << std::endl;
        gf1 = &(it->second);
        allg.insert(gf1);
    }
    int i=0;
    geneSet::const_iterator it77=allg.begin();   
    for (; it77!=allg.end(); it77++)
    {
    (*it77)->gene_id=i;
    i++;
    }
}

gene_id refers to the line number (0-based) of a gene in a gff file, or an element of a gene_map (by default, gene_map is sorted by key, i.e., gene name). But in the is_sigificant function, gene_id was incorrectly compared with the start and end positions at the base pair level.

The following is is_sigificiant function(main -> feed_dag -> dag_main -> print_chains -> is_significant)

static bool is_significant(Seg_feat *sf, vector<Score_t>& score)
/* test if a syntenic block is significant, see description in permutation.cc */
{
    /* see formula in permutation.cc, unknowns are m, N, L1, L2, l_1i l_2i*/
    int s1_a, s1_b, s2_a, s2_b, m, N=0, L1, L2, i;
    double l1, l2, summation=0;

    /* get the start and stop coordinates on each syntenic segment */
    s1_a = sf->s1->mid, s1_b = sf->t1->mid;
    s2_a = sf->s2->mid, s2_b = sf->t2->mid;

    /* calculate m, number of anchor points */
    m = sf->pids.size();

    /* calculate N, number of matches in the defined region*/
    vector<Score_t>::const_iterator it;
    for (it=score.begin(); it!=score.end(); it++)
    {
        if (it->x >=s1_a && it->x <=s1_b && it->y >=s2_a && it->y <=s2_b)
            N++;
    }

    /* calculate l1, l2, distance between successive anchor points */
    int l1_pos1, l1_pos2, l2_pos1, l2_pos2;
    retrieve_pos(sf->pids[0], &l1_pos1, &l2_pos1);
    for (i=1; i<m; i++)
    {
        retrieve_pos(sf->pids[i], &l1_pos2, &l2_pos2);
        l1 = fabs(l1_pos2 - l1_pos1);
        l2 = fabs(l2_pos2 - l2_pos1);
        l1_pos1 = l1_pos2;
        l2_pos1 = l2_pos2;

        summation += log(l1)+log(l2);
    }

    /* calculate L1, L2, respective length of the matching region */
    L1 = s1_b - s1_a, L2 = s2_b - s2_a;

    /* this is the formula */
    sf->e_value = exp(M_LN2 + ln_perm(N, m) + \
                    summation - (m-1)*(log(L1)+log(L2)));

    return sf->e_value < E_VALUE;
}

The following is bug code.

/* calculate N, number of matches in the defined region*/
vector<Score_t>::const_iterator it;
for (it=score.begin(); it!=score.end(); it++)
{
    if (it->x >=s1_a && it->x <=s1_b && it->y >=s2_a && it->y <=s2_b)
        N++;
}

In the above code, it->x is gene_id, and it->y is also gene_id, but s1_a, s1_b, s2_a, and s2_b are the base pair positions of the four genes corresponding to the two gene pairs at the two ends of the alignment, which are obviously not in the same order of magnitude, so the N value is almost 0. So I think it is correct to use the attribute mid (the starting position at the base pair level) here.

Reproduce this bug

Add #include <iostream> at the top of the dagchainer.cc file.

Add std::cout << "N: " << N << std::endl; in is_sigificant function.

/* calculate N, number of matches in the defined region*/
vector<Score_t>::const_iterator it;
for (it=score.begin(); it!=score.end(); it++)
{
    if (it->x >=s1_a && it->x <=s1_b && it->y >=s2_a && it->y <=s2_b)
        N++;
}
std::cout << "N: " << N << std::endl;

Running the following command.
```
make 
./MCScanX ./os_sb
```

A solution

Add int x_mid, y_mid; in struct.h file

struct Score_t
{
    int pairID;  // identifier of match pair
    int x, y;  // x,y coordinates
    int x_mid, y_mid;
    float score;
    string gene1;
    string gene2;
    bool operator< (const Score_t & node) const
    {
        return  (x < node.x || (x == node.x && y < node.y));
    }
};

Add cur_score.x_mid = gene_map[it->gene1].mid; cur_score.y_mid = gene_map[it->gene2].mid; in read_data.cc file

void feed_dag(const string &mol_pair)
{
    // two additional filters will be applied here
    // best hsp (least e-value)
    // non-repetitive in a window of 50kb region
    vector<Blast_record>::const_iterator it;
    Score_t cur_score;

    for (it = match_list.begin(); it < match_list.end(); it++)
    {
        if (it->mol_pair != mol_pair) continue;

        cur_score.pairID = it->pair_id;
        cur_score.x = gene_map[it->gene1].gene_id;
        cur_score.y = gene_map[it->gene2].gene_id;
        cur_score.x_mid = gene_map[it->gene1].mid;
        cur_score.y_mid = gene_map[it->gene2].mid;
        cur_score.gene1=it->gene1;
        cur_score.gene2=it->gene2;
        cur_score.score = MATCH_SCORE;

        score.push_back(cur_score);
    }

    // sort by both axis and remove redundant matches within
    // a given window length (default 50kb)
    filter_matches_x();
    filter_matches_y();

    dag_main(score, mol_pair);
}

Modify the following code in dagchainer.cc file

for (it=score.begin(); it!=score.end(); it++)
{
    if (it->x >=s1_a && it->x <=s1_b && it->y >=s2_a && it->y <=s2_b)
        N++;
}

for (it=score.begin(); it!=score.end(); it++)
{
    if (it->x_mid >=s1_a && it->x_mid <=s1_b && it->y_mid >=s2_a && it->ymid <=s2_b)
        N++;
}

alignment e_value bug

f08e81c

tanghaibao approved these changes Dec 20, 2024

View reviewed changes

tanghaibao merged commit 8028209 into wyp1125:master Dec 20, 2024
2 checks passed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

alignment e_value bug #77

alignment e_value bug #77

Uh oh!

xiaodli commented Dec 5, 2024

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

2 participants

alignment e_value bug #77

alignment e_value bug #77

Uh oh!

Conversation

xiaodli commented Dec 5, 2024

Report a MCScanX bug(alignment e_value)

Introduction

Reproduce this bug

A solution

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

2 participants