Skip to content

Conversation

@xiaodli
Copy link
Contributor

@xiaodli xiaodli commented Dec 5, 2024

Report a MCScanX bug(alignment e_value)

Introduction

  1. The following is main.cc file's main function.

    int main(int argc, char *argv[])
    {
        /* Start the timer */
        uglyTime(NULL);
    
        print_banner();
        char align_fn[LABEL_LEN], block_fn[LABEL_LEN];
        FILE *fw;
    
        init_opt();
        read_opt(argc, argv);
    
        read_gff(prefix_fn);
        read_blast(prefix_fn);
    
        sprintf(align_fn, "%s.collinearity", prefix_fn);
        fw = mustOpen(align_fn, "w");
    
        progress("%d pairwise comparisons", (int) mol_pairs.size());
        fill_allg();
        map<string, int>::const_iterator ip;
        for (ip=mol_pairs.begin(); ip!=mol_pairs.end(); ip++)
        {
            if (ip->second >= MATCH_SIZE) feed_dag(string(ip->first));
        }
    
        progress("%d alignments generated", (int) seg_list.size());
        print_align(fw);
        fclose(fw);
        uglyTime("Pairwise collinear blocks written to %s", align_fn);
    
        if (IS_PAIRWISE) return 0;
        msa_main(prefix_fn);
        uglyTime("Done!");
    
        return 0;
    }
    
  2. Pay attention to the read_gff(prefix_fn); function called by the main function. (Below is the read_gff function code and part of the gff file)

    void read_gff(const char *prefix_fn)
    {
        char fn[LABEL_LEN];
        string mol,gn,line,word;
        Gene_feat gf;
        sprintf(fn, "%s.gff", prefix_fn);
        ifstream in(fn);
        while (!in.eof())
        {
            getline(in,line);
            if (line=="")
                break;
            istringstream test(line);
            getline(test,mol,'\t');
            gf.mol = mol;
            getline(test,gn,'\t');
            gf.name = gn;
            getline(test,word,'\t');
            gf.mid = atoi(word.c_str());
            gene_map[gf.name] = gf;
        }
        in.close();
    }
    
    os5    Os05t0507200    25150045    25150392
    os2    Os02t0774100    33540257    33541838
    os5    Os05t0480600    23720056    23723583
    os6    Os06t0299300    11200887    11202509
    

    The read_gff function stores the chromosome name on which the gene is located, the gene name and the starting position of the gene in the gff file in gene_map.

  3. Initial the gene_id attribute of Gene_feat.

     struct Gene_feat
     {
         vector<int> cursor;
         string name;
         string mol;
         int mid;
         int gene_id;
     //   more_feat *m;
         bool operator < (const Gene_feat &g) const
         {
             return (mol == g.mol && mid < g.mid) || mol < g.mol || (mol == g.mol && mid == g.mid && name.compare(g.name)<0);
         }
     };
    

    Note that Gene_feat also has an attribute gene_id. In the subsequent fill_allg() function (called in the main function), the gene_id attribute will be initialized. The following is the fill_allg() code.

        void fill_allg()
    {
        Gene_feat *gf1;
        map<string, Gene_feat>::iterator it;
        for (it=gene_map.begin(); it!=gene_map.end(); it++)
        {   
            std::cout << it->first << std::endl;
            gf1 = &(it->second);
            allg.insert(gf1);
        }
        int i=0;
        geneSet::const_iterator it77=allg.begin();   
        for (; it77!=allg.end(); it77++)
        {
        (*it77)->gene_id=i;
        i++;
        }
    }
    

    gene_id refers to the line number (0-based) of a gene in a gff file, or an element of a gene_map (by default, gene_map is sorted by key, i.e., gene name). But in the is_sigificant function, gene_id was incorrectly compared with the start and end positions at the base pair level.

  4. The following is is_sigificiant function(main -> feed_dag -> dag_main -> print_chains -> is_significant)

    static bool is_significant(Seg_feat *sf, vector<Score_t>& score)
    /* test if a syntenic block is significant, see description in permutation.cc */
    {
        /* see formula in permutation.cc, unknowns are m, N, L1, L2, l_1i l_2i*/
        int s1_a, s1_b, s2_a, s2_b, m, N=0, L1, L2, i;
        double l1, l2, summation=0;
    
        /* get the start and stop coordinates on each syntenic segment */
        s1_a = sf->s1->mid, s1_b = sf->t1->mid;
        s2_a = sf->s2->mid, s2_b = sf->t2->mid;
    
        /* calculate m, number of anchor points */
        m = sf->pids.size();
    
        /* calculate N, number of matches in the defined region*/
        vector<Score_t>::const_iterator it;
        for (it=score.begin(); it!=score.end(); it++)
        {
            if (it->x >=s1_a && it->x <=s1_b && it->y >=s2_a && it->y <=s2_b)
                N++;
        }
    
        /* calculate l1, l2, distance between successive anchor points */
        int l1_pos1, l1_pos2, l2_pos1, l2_pos2;
        retrieve_pos(sf->pids[0], &l1_pos1, &l2_pos1);
        for (i=1; i<m; i++)
        {
            retrieve_pos(sf->pids[i], &l1_pos2, &l2_pos2);
            l1 = fabs(l1_pos2 - l1_pos1);
            l2 = fabs(l2_pos2 - l2_pos1);
            l1_pos1 = l1_pos2;
            l2_pos1 = l2_pos2;
    
            summation += log(l1)+log(l2);
        }
    
        /* calculate L1, L2, respective length of the matching region */
        L1 = s1_b - s1_a, L2 = s2_b - s2_a;
    
        /* this is the formula */
        sf->e_value = exp(M_LN2 + ln_perm(N, m) + \
                        summation - (m-1)*(log(L1)+log(L2)));
    
        return sf->e_value < E_VALUE;
    }
    

    The following is bug code.

    /* calculate N, number of matches in the defined region*/
    vector<Score_t>::const_iterator it;
    for (it=score.begin(); it!=score.end(); it++)
    {
        if (it->x >=s1_a && it->x <=s1_b && it->y >=s2_a && it->y <=s2_b)
            N++;
    }
    

    In the above code, it->x is gene_id, and it->y is also gene_id, but s1_a, s1_b, s2_a, and s2_b are the base pair positions of the four genes corresponding to the two gene pairs at the two ends of the alignment, which are obviously not in the same order of magnitude, so the N value is almost 0. So I think it is correct to use the attribute mid (the starting position at the base pair level) here.

Reproduce this bug

  1. Add #include <iostream> at the top of the dagchainer.cc file.

  2. Add std::cout << "N: " << N << std::endl; in is_sigificant function.

    /* calculate N, number of matches in the defined region*/
    vector<Score_t>::const_iterator it;
    for (it=score.begin(); it!=score.end(); it++)
    {
        if (it->x >=s1_a && it->x <=s1_b && it->y >=s2_a && it->y <=s2_b)
            N++;
    }
    std::cout << "N: " << N << std::endl;
    
  3. Running the following command.

    make 
    ./MCScanX ./os_sb

A solution

  1. Add int x_mid, y_mid; in struct.h file

    struct Score_t
    {
        int pairID;  // identifier of match pair
        int x, y;  // x,y coordinates
        int x_mid, y_mid;
        float score;
        string gene1;
        string gene2;
        bool operator< (const Score_t & node) const
        {
            return  (x < node.x || (x == node.x && y < node.y));
        }
    };
    
  2. Add cur_score.x_mid = gene_map[it->gene1].mid; cur_score.y_mid = gene_map[it->gene2].mid; in read_data.cc file

    void feed_dag(const string &mol_pair)
    {
        // two additional filters will be applied here
        // best hsp (least e-value)
        // non-repetitive in a window of 50kb region
        vector<Blast_record>::const_iterator it;
        Score_t cur_score;
    
        for (it = match_list.begin(); it < match_list.end(); it++)
        {
            if (it->mol_pair != mol_pair) continue;
    
            cur_score.pairID = it->pair_id;
            cur_score.x = gene_map[it->gene1].gene_id;
            cur_score.y = gene_map[it->gene2].gene_id;
            cur_score.x_mid = gene_map[it->gene1].mid;
            cur_score.y_mid = gene_map[it->gene2].mid;
            cur_score.gene1=it->gene1;
            cur_score.gene2=it->gene2;
            cur_score.score = MATCH_SCORE;
    
            score.push_back(cur_score);
        }
    
        // sort by both axis and remove redundant matches within
        // a given window length (default 50kb)
        filter_matches_x();
        filter_matches_y();
    
        dag_main(score, mol_pair);
    }
    
  3. Modify the following code in dagchainer.cc file

    for (it=score.begin(); it!=score.end(); it++)
    {
        if (it->x >=s1_a && it->x <=s1_b && it->y >=s2_a && it->y <=s2_b)
            N++;
    }
    
    for (it=score.begin(); it!=score.end(); it++)
    {
        if (it->x_mid >=s1_a && it->x_mid <=s1_b && it->y_mid >=s2_a && it->ymid <=s2_b)
            N++;
    }
    

@tanghaibao tanghaibao merged commit 8028209 into wyp1125:master Dec 20, 2024
2 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants