diff --git a/alignment.cpp b/alignment.cpp index a0a27a6c..6abd56f8 100644 --- a/alignment.cpp +++ b/alignment.cpp @@ -446,6 +446,55 @@ Alignment::Alignment(char *filename, char *sequence_type, InputType &intype) : v } +Alignment::Alignment(char *filename, char *sequence_type, InputType &intype, bool &gap_as_new) : vector() { + num_states = 0; + frac_const_sites = 0.0; + codon_table = NULL; + genetic_code = NULL; + non_stop_codon = NULL; + seq_type = SEQ_UNKNOWN; + STATE_UNKNOWN = 126; + cout << "Reading alignment file " << filename << " ... "; + intype = detectInputFile(filename); + + try { + + if (intype == IN_NEXUS) { + cout << "Nexus format detected" << endl; + readNexus(filename); + } else if (intype == IN_FASTA) { + cout << "Fasta format detected" << endl; + readFastaGAN(filename, sequence_type, gap_as_new); + } else if (intype == IN_PHYLIP) { + cout << "Phylip format detected" << endl; + readPhylipGAN(filename, sequence_type, gap_as_new); + } else { + outError("Unknown sequence format, please use PHYLIP, FASTA, or NEXUS format"); + } + } catch (ios::failure) { + outError(ERR_READ_INPUT); + } catch (const char *str) { + outError(str); + } catch (string str) { + outError(str); + } + + if (getNSeq() < 3) + outError("Alignment must have at least 3 sequences"); + + cout << "Alignment has " << getNSeq() << " sequences with " << getNSite() << + " columns and " << getNPattern() << " patterns"<< endl; + buildSeqStates(); + checkSeqName(); + // OBSOLETE: identical sequences are handled later +// checkIdenticalSeq(); + //cout << "Number of character states is " << num_states << endl; + //cout << "Number of patterns = " << size() << endl; + countConstSite(); + //cout << "Fraction of constant sites: " << frac_const_sites << endl; + +} + void Alignment::buildSeqStates(bool add_unobs_const) { string unobs_const; if (add_unobs_const) unobs_const = getUnobservedConstPatterns(); @@ -512,6 +561,7 @@ void Alignment::computeUnknownState() { switch (seq_type) { case SEQ_DNA: STATE_UNKNOWN = 18; break; case SEQ_PROTEIN: STATE_UNKNOWN = 22; break; + case SEQ_GAN: STATE_UNKNOWN= 1+2+4+8+16+4+1; break; default: STATE_UNKNOWN = num_states; break; } } @@ -772,6 +822,39 @@ SeqType Alignment::detectSequenceType(StrVector &sequences) { return SEQ_UNKNOWN; } +SeqType Alignment::detectSequenceTypeGAN(StrVector &sequences, bool gap_as_new) { + int num_nuc = 0; + int num_ungap = 0; + int num_bin = 0; + int num_alpha = 0; + int num_digit = 0; + + for (StrVector::iterator it = sequences.begin(); it != sequences.end(); it++) + for (string::iterator i = it->begin(); i != it->end(); i++) { + if ((*i) != '?' && (*i) != '-' && (*i) != '.' && *i != 'N' && *i != 'X') num_ungap++; + if ((*i) == 'A' || (*i) == 'C' || (*i) == 'G' || (*i) == 'T' || (*i) == 'U') + num_nuc++; + if ((*i) == 'J' && gap_as_new) + num_nuc++; + if ((*i) == '0' || (*i) == '1') + num_bin++; + if (isalpha(*i)) num_alpha++; + if (isdigit(*i)) num_digit++; + } + if (((double)num_nuc) / num_ungap > 0.9) { + if (gap_as_new) return SEQ_GAN; + return SEQ_DNA; + } + if (((double)num_bin) / num_ungap > 0.9) + return SEQ_BINARY; + if (((double)num_alpha) / num_ungap > 0.9) + return SEQ_PROTEIN; + if (((double)(num_alpha+num_digit)) / num_ungap > 0.9) + return SEQ_MORPH; + + return SEQ_UNKNOWN; +} + void Alignment::buildStateMap(char *map, SeqType seq_type) { memset(map, STATE_INVALID, NUM_CHAR); assert(STATE_UNKNOWN < 126); @@ -822,12 +905,31 @@ void Alignment::buildStateMap(char *map, SeqType seq_type) { for (int i = 0; i < len; i++) map[(int)symbols_morph[i]] = i; return; + case SEQ_GAN: + map[(unsigned char)'A'] = 0; + map[(unsigned char)'C'] = 1; + map[(unsigned char)'G'] = 2; + map[(unsigned char)'T'] = 3; + map[(unsigned char)'U'] = 3; + map[(unsigned char)'J'] = 4; + map[(unsigned char)'R'] = 1+4+4; // A or G, Purine + map[(unsigned char)'Y'] = 2+8+4; // C or T, Pyrimidine + map[(unsigned char)'N'] = STATE_UNKNOWN; + map[(unsigned char)'X'] = STATE_UNKNOWN; + map[(unsigned char)'W'] = 1+8+4; // A or T, Weak + map[(unsigned char)'S'] = 2+4+4; // G or C, Strong + map[(unsigned char)'M'] = 1+2+4; // A or C, Amino + map[(unsigned char)'K'] = 4+8+4; // G or T, Keto + map[(unsigned char)'B'] = 2+4+8+4; // C or G or T + map[(unsigned char)'H'] = 1+2+8+4; // A or C or T + map[(unsigned char)'D'] = 1+4+8+4; // A or G or T + map[(unsigned char)'V'] = 1+2+4+4; // A or G or C + return; default: return; } } - /** convert a raw characer state into ID, indexed from 0 @param state input raw state @@ -908,6 +1010,46 @@ char Alignment::convertState(char state, SeqType seq_type) { if (!loc) return STATE_INVALID; // unrecognize character state = loc - symbols_morph; return state; + case SEQ_GAN: + switch (state) { + case 'A': + return 0; + case 'C': + return 1; + case 'G': + return 2; + case 'T': + return 3; + case 'U': + return 3; + case 'J': + return 4; + case 'R': + return 1+4+4; // A or G, Purine + case 'Y': + return 2+8+4; // C or T, Pyrimidine + case 'N': + return STATE_UNKNOWN; + case 'W': + return 1+8+4; // A or T, Weak + case 'S': + return 2+4+4; // G or C, Strong + case 'M': + return 1+2+4; // A or C, Amino + case 'K': + return 4+8+4; // G or T, Keto + case 'B': + return 2+4+8+4; // C or G or T + case 'H': + return 1+2+8+4; // A or C or T + case 'D': + return 1+4+8+4; // A or G or T + case 'V': + return 1+2+4+4; // A or G or C + default: + return STATE_INVALID; // unrecognize character + } + return state; default: return STATE_INVALID; } @@ -982,6 +1124,42 @@ char Alignment::convertStateBack(char state) { return symbols_morph[(int)state]; else return '-'; + case SEQ_GAN: + switch (state) { + case 0: + return 'A'; + case 1: + return 'C'; + case 2: + return 'G'; + case 3: + return 'T'; + case 4: + return 'J'; + case 1+4+4: + return 'R'; // A or G, Purine + case 2+8+4: + return 'Y'; // C or T, Pyrimidine + case 1+8+4: + return 'W'; // A or T, Weak + case 2+4+4: + return 'S'; // G or C, Strong + case 1+2+4: + return 'M'; // A or C, Amino + case 4+8+4: + return 'K'; // G or T, Keto + case 2+4+8+4: + return 'B'; // C or G or T + case 1+2+8+4: + return 'H'; // A or C or T + case 1+4+8+4: + return 'D'; // A or G or T + case 1+2+4+4: + return 'V'; // A or G or C + default: + return '?'; // unrecognize character + } + return state; default: // unknown return '*'; @@ -1231,6 +1409,171 @@ int Alignment::buildPattern(StrVector &sequences, char *sequence_type, int nseq, return 1; } +int Alignment::buildPatternGAN(StrVector &sequences, char *sequence_type, int nseq, int nsite, bool gap_as_new) { + int seq_id; + ostringstream err_str; + codon_table = NULL; + genetic_code = NULL; + non_stop_codon = NULL; + + + if (nseq != seq_names.size()) throw "Different number of sequences than specified"; + + /* now check that all sequence names are correct */ + for (seq_id = 0; seq_id < nseq; seq_id ++) { + ostringstream err_str; + if (seq_names[seq_id] == "") + err_str << "Sequence number " << seq_id+1 << " has no names\n"; + // check that all the names are different + for (int i = 0; i < seq_id; i++) + if (seq_names[i] == seq_names[seq_id]) + err_str << "The sequence name " << seq_names[seq_id] << " is dupplicated\n"; + } + if (err_str.str() != "") + throw err_str.str(); + + + /* now check that all sequences have the same length */ + for (seq_id = 0; seq_id < nseq; seq_id ++) { + if (sequences[seq_id].length() != nsite) { + err_str << "Sequence " << seq_names[seq_id] << " contains "; + if (sequences[seq_id].length() < nsite) + err_str << "not enough"; + else + err_str << "too many"; + + err_str << " characters (" << sequences[seq_id].length() << ")\n"; + } + } + + if (err_str.str() != "") + throw err_str.str(); + + /* now check data type */ + seq_type = detectSequenceTypeGAN(sequences, gap_as_new); + switch (seq_type) { + case SEQ_BINARY: + num_states = 2; + cout << "Alignment most likely contains binary sequences" << endl; + break; + case SEQ_DNA: + num_states = 4; + cout << "Alignment most likely contains DNA/RNA sequences" << endl; + break; + case SEQ_PROTEIN: + num_states = 20; + cout << "Alignment most likely contains protein sequences" << endl; + break; + case SEQ_MORPH: + num_states = getMaxObservedStates(sequences); + if (num_states < 2 || num_states > 32) throw "Invalid number of states."; + cout << "Alignment most likely contains " << num_states << "-state morphological data" << endl; + break; + case SEQ_GAN: + num_states = 5; + cout << "Alignment most likely contains 5 states DNA/RNA sequences" << endl; + break; + default: + if (!sequence_type) + throw "Unknown sequence type."; + } + + if (sequence_type && strcmp(sequence_type,"") != 0) { + SeqType user_seq_type; + if (strcmp(sequence_type, "BIN") == 0) { + num_states = 2; + user_seq_type = SEQ_BINARY; + } else if (strcmp(sequence_type, "DNA") == 0) { + num_states = 4; + user_seq_type = SEQ_DNA; + } else if (strcmp(sequence_type, "AA") == 0 || strcmp(sequence_type, "PROT") == 0) { + num_states = 20; + user_seq_type = SEQ_PROTEIN; + } else if (strcmp(sequence_type, "NUM") == 0 || strcmp(sequence_type, "MORPH") == 0) { + num_states = getMaxObservedStates(sequences); + if (num_states < 2 || num_states > 32) throw "Invalid number of states"; + user_seq_type = SEQ_MORPH; + } else if (strcmp(sequence_type, "GAN") == 0) { + num_states = 5; + user_seq_type = SEQ_GAN; + } else if (strcmp(sequence_type, "TINA") == 0 || strcmp(sequence_type, "MULTI") == 0) { + cout << "Multi-state data with " << num_states << " alphabets" << endl; + user_seq_type = SEQ_MULTISTATE; + } else if (strncmp(sequence_type, "CODON", 5) == 0) { + if (seq_type != SEQ_DNA) + outWarning("You want to use codon models but the sequences were not detected as DNA"); + seq_type = user_seq_type = SEQ_CODON; + initCodon(sequence_type); + } else + throw "Invalid sequence type."; + if (user_seq_type != seq_type && seq_type != SEQ_UNKNOWN) + outWarning("Your specified sequence type is different from the detected one"); + seq_type = user_seq_type; + } + + // now convert to patterns + int site, seq, num_gaps_only = 0; + + char char_to_state[NUM_CHAR]; + computeUnknownState(); + buildStateMap(char_to_state, seq_type); + + Pattern pat; + pat.resize(nseq); + int step = ((seq_type == SEQ_CODON) ? 3 : 1); + if (nsite % step != 0) + outError("Number of sites is not multiple of 3"); + site_pattern.resize(nsite/step, -1); + clear(); + pattern_index.clear(); + + for (site = 0; site < nsite; site+=step) { + for (seq = 0; seq < nseq; seq++) { + //char state = convertState(sequences[seq][site], seq_type); + char state = char_to_state[(int)(sequences[seq][site])]; + //cout<<(char)(sequences[seq][site])<<" "<<(int)state<> nseq >> nsite)) + throw "Invalid PHYLIP format. First line must contain number of sequences and sites"; + //cout << "nseq: " << nseq << " nsite: " << nsite << endl; + if (nseq < 3) + throw "There must be at least 3 sequences"; + if (nsite < 1) + throw "No alignment columns"; + + seq_names.resize(nseq, ""); + sequences.resize(nseq, ""); + + } else { // read sequence contents + if (seq_names[seq_id] == "") { // cut out the sequence name + string::size_type pos = line.find_first_of(" \t"); + if (pos == string::npos) pos = 10; // assume standard phylip + seq_names[seq_id] = line.substr(0, pos); + line.erase(0, pos); + } + int old_len = sequences[seq_id].length(); + if (tina_state) { + stringstream linestr(line); + int state; + while (!linestr.eof() ) { + state = -1; + linestr >> state; + if (state < 0) break; + sequences[seq_id].append(1, state); + if (num_states < state+1) num_states = state+1; + } + } else + for (string::iterator it = line.begin(); it != line.end(); it++) { + if ((*it) <= ' ') continue; + if ((*it) == '-' && gap_as_new) + (*it) = 'J'; + if (isalnum(*it) || (*it) == '-' || (*it) == '?'|| (*it) == '.') + sequences[seq_id].append(1, toupper(*it)); + else { + err_str << "Unrecognized character " << *it << " on line " << line_num; + throw err_str.str(); + } + } + if (sequences[seq_id].length() != sequences[0].length()) { + err_str << "Line " << line_num << ": alignment block has variable sequence lengths" << endl; + throw err_str.str(); + } + if (sequences[seq_id].length() > old_len) + seq_id++; + if (seq_id == nseq) { + seq_id = 0; + // make sure that all sequences have the same length at this moment + } + + } + //sequences. + } + in.clear(); + // set the failbit again + in.exceptions(ios::failbit | ios::badbit); + in.close(); + + return buildPatternGAN(sequences, sequence_type, nseq, nsite, gap_as_new); +} + int Alignment::readFasta(char *filename, char *sequence_type) { StrVector sequences; @@ -1360,6 +1790,53 @@ int Alignment::readFasta(char *filename, char *sequence_type) { return buildPattern(sequences, sequence_type, seq_names.size(), sequences.front().length()); } +int Alignment::readFastaGAN(char *filename, char *sequence_type, bool gap_as_new) { + + StrVector sequences; + ostringstream err_str; + ifstream in; + int line_num = 1; + string line; + + // set the failbit and badbit + in.exceptions(ios::failbit | ios::badbit); + in.open(filename); + // remove the failbit + in.exceptions(ios::badbit); + + for (; !in.eof(); line_num++) { + getline(in, line); + if (line == "") continue; + + //cout << line << endl; + if (line[0] == '>') { // next sequence + string::size_type pos = line.find_first_of(" \n\r\t"); + seq_names.push_back(line.substr(1, pos-1)); + sequences.push_back(""); + continue; + } + // read sequence contents + if (sequences.empty()) throw "First line must begin with '>' to define sequence name"; + for (string::iterator it = line.begin(); it != line.end(); it++) { + if ((*it) <= ' ') continue; + if (gap_as_new && (*it) == '-') + (*it) = 'J'; + if (isalnum(*it) || (*it) == '-' || (*it) == '?'|| (*it) == '.') + sequences.back().append(1, toupper(*it)); + else { + err_str << "Unrecognized character " << *it << " on line " << line_num; + throw err_str.str(); + } + } + } + in.clear(); + // set the failbit again + in.exceptions(ios::failbit | ios::badbit); + in.close(); + + return buildPatternGAN(sequences, sequence_type, seq_names.size(), sequences.front().length(), gap_as_new); +} + bool Alignment::getSiteFromResidue(int seq_id, int &residue_left, int &residue_right) { int i, j; int site_left = -1, site_right = -1; diff --git a/alignment.h b/alignment.h index 51ca71dc..7202ba19 100644 --- a/alignment.h +++ b/alignment.h @@ -23,7 +23,7 @@ const char STATE_INVALID = 127; const int NUM_CHAR = 256; enum SeqType { - SEQ_DNA, SEQ_PROTEIN, SEQ_BINARY, SEQ_MORPH, SEQ_MULTISTATE, SEQ_CODON, SEQ_UNKNOWN + SEQ_DNA, SEQ_PROTEIN, SEQ_BINARY, SEQ_MORPH, SEQ_MULTISTATE, SEQ_CODON, SEQ_UNKNOWN, SEQ_GAN }; @@ -60,6 +60,15 @@ class Alignment : public vector { */ Alignment(char *filename, char *sequence_type, InputType &intype); + /** + constructor + @param filename file name + @param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL + @param intype (OUT) input format of the file + @param gap_as_new check if gap character is considered as new character + */ + Alignment(char *filename, char *sequence_type, InputType &intype, bool &gap_as_new); + /** destructor */ @@ -93,6 +102,8 @@ class Alignment : public vector { int buildPattern(StrVector &sequences, char *sequence_type, int nseq, int nsite); + int buildPatternGAN(StrVector &sequences, char *sequence_type, int nseq, int nsite, bool gap_as_new); + /** read the alignment in PHYLIP format @param filename file name @@ -101,6 +112,15 @@ class Alignment : public vector { */ int readPhylip(char *filename, char *sequence_type); + /** + read the alignment in PHYLIP format + @param filename file name + @param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL + @param gap_as_new check if gap character is considered as new character + @return 1 on success, 0 on failure + */ + int readPhylipGAN(char *filename, char *sequence_type, bool gap_as_new); + /** read the alignment in FASTA format @param filename file name @@ -109,6 +129,15 @@ class Alignment : public vector { */ int readFasta(char *filename, char *sequence_type); + /** + read the alignment in FASTA format + @param filename file name + @param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL + @param gap_as_new check if gap character is considered as new character + @return 1 on success, 0 on failure + */ + int readFastaGAN(char *filename, char *sequence_type, bool gap_as_new); + /** extract the alignment from a nexus data block, called by readNexus() @param data_block data block of nexus file @@ -140,6 +169,8 @@ class Alignment : public vector { ****************************************************************************/ SeqType detectSequenceType(StrVector &sequences); + SeqType detectSequenceTypeGAN(StrVector &sequences, bool gap_as_new); + void computeUnknownState(); void buildStateMap(char *map, SeqType seq_type); diff --git a/iqtree.cpp b/iqtree.cpp index b1629713..652519b1 100644 --- a/iqtree.cpp +++ b/iqtree.cpp @@ -512,6 +512,8 @@ void IQTree::createPLLPartition(Params ¶ms, ostream &pllPartitionFileHandle) } else { model = "WAG"; } + } else if (aln->seq_type == SEQ_GAN) { + model = "GAN"; } else { model = "WAG"; //outError("PLL currently only supports DNA/protein alignments"); diff --git a/maalignment.h b/maalignment.h index d70072c8..53c0f510 100644 --- a/maalignment.h +++ b/maalignment.h @@ -35,6 +35,9 @@ class MaAlignment : public Alignment MaAlignment() : Alignment() {}; MaAlignment(char *filename, char *sequence_type, InputType &intype) : Alignment(filename, sequence_type, intype){}; + + MaAlignment(char *filename, char *sequence_type, InputType &intype, bool &gap_as_new) : Alignment(filename, sequence_type, intype, gap_as_new){}; + MaAlignment(Alignment &align) : Alignment(align){}; diff --git a/mexttree.cpp b/mexttree.cpp index 7eb183fd..343e9c6c 100644 --- a/mexttree.cpp +++ b/mexttree.cpp @@ -24,7 +24,11 @@ void MExtTree::generateRandomTree(TreeGenType tree_type, Params ¶ms, bool bi Alignment *alignment = NULL; if (params.aln_file) { // generate random tree with leaf sets taken from an alignment - alignment = new Alignment(params.aln_file, params.sequence_type, params.intype); + if (params.gap_as_new) { + alignment = new Alignment(params.aln_file, params.sequence_type, params.intype, params.gap_as_new); + } else { + alignment = new Alignment(params.aln_file, params.sequence_type, params.intype); + } params.sub_size = alignment->getNSeq(); } if (params.sub_size < 3) { diff --git a/model/modelfactory.cpp b/model/modelfactory.cpp index 801cd4ca..6e52abad 100644 --- a/model/modelfactory.cpp +++ b/model/modelfactory.cpp @@ -96,6 +96,8 @@ ModelSubst* ModelFactory::createModel(string model_str, StateFreqType freq_type, model = new ModelCodon(model_str.c_str(), model_params, freq_type, freq_params, tree, count_rates); } else if (tree->aln->seq_type == SEQ_MORPH) { model = new ModelMorphology(model_str.c_str(), model_params, freq_type, freq_params, tree); + } else if (tree->aln->seq_type == SEQ_GAN) { + model = new ModelMorphology(model_str.c_str(), model_params, freq_type, freq_params, tree); } else { outError("Unsupported model type"); } @@ -117,6 +119,7 @@ ModelFactory::ModelFactory(Params ¶ms, PhyloTree *tree) { else if (tree->aln->seq_type == SEQ_BINARY) model_str = "GTR2"; else if (tree->aln->seq_type == SEQ_CODON) model_str = "GY"; else if (tree->aln->seq_type == SEQ_MORPH) model_str = "MK"; + else if (tree->aln->seq_type == SEQ_GAN) model_str = "MK"; else model_str = "JC"; if(!params.maximum_parsimony) outWarning("Default model may be under-fitting. Use option '-m TEST' to select best-fit model."); @@ -129,6 +132,7 @@ ModelFactory::ModelFactory(Params ¶ms, PhyloTree *tree) { case SEQ_BINARY: freq_type = FREQ_ESTIMATE; break; // default for binary: optimized frequencies case SEQ_PROTEIN: freq_type = FREQ_USER_DEFINED; break; // default for protein: frequencies of the empirical AA matrix case SEQ_MORPH: freq_type = FREQ_EQUAL; break; + case SEQ_GAN: freq_type = FREQ_EQUAL; break; default: freq_type = FREQ_EMPIRICAL; break; // default for DNA and others: counted frequencies from alignment } } @@ -347,11 +351,11 @@ ModelFactory::ModelFactory(Params ¶ms, PhyloTree *tree) { } -int ModelFactory::getNParameters() { - int df = model->getNDim() + site_rate->getNDim() + site_rate->phylo_tree->branchNum; - if (model->freq_type == FREQ_EMPIRICAL) df += model->num_states-1; - return df; -} +int ModelFactory::getNParameters() { + int df = model->getNDim() + site_rate->getNDim() + site_rate->phylo_tree->branchNum; + if (model->freq_type == FREQ_EMPIRICAL) df += model->num_states-1; + return df; +} void ModelFactory::readSiteFreq(Alignment *aln, char* site_freq_file, IntVector &site_model, vector &freq_vec) { cout << "Reading site-specific state frequency file " << site_freq_file << " ..." << endl; diff --git a/parsmultistate.cpp b/parsmultistate.cpp index 5cf4c021..047e55c7 100644 --- a/parsmultistate.cpp +++ b/parsmultistate.cpp @@ -34,3 +34,15 @@ void doParsMultiState(Params ¶ms) { cout << "Parsimony score ver2 is: " << tree.computeParsimony() << endl; //tree.printParsimonyStates(); } + +void doParsMultiStateGAN(Params ¶ms) { + cout << "Here\n"; + Alignment alignment(params.aln_file, params.sequence_type, params.intype, params.gap_as_new); + TinaTree tree; + tree.readTree(params.user_file, params.is_rooted); + tree.setAlignment(&alignment); + tree.drawTree(cout); + cout << "Parsimony score is: " << tree.computeParsimonyScore() << endl; + cout << "Parsimony score ver2 is: " << tree.computeParsimony() << endl; + //tree.printParsimonyStates(); +} diff --git a/parsmultistate.h b/parsmultistate.h index c1c3eb1a..78abac1a 100644 --- a/parsmultistate.h +++ b/parsmultistate.h @@ -24,5 +24,6 @@ #include "tools.h" void doParsMultiState(Params ¶ms); +void doParsMultiStateGAN(Params ¶ms); #endif diff --git a/pda.cpp b/pda.cpp index a1f9f3f2..36947ae1 100644 --- a/pda.cpp +++ b/pda.cpp @@ -1699,6 +1699,43 @@ void guidedBootstrap(Params ¶ms) cout << "Log of the probability of the new alignment is printed to: " << outProb_name << endl; } +void guidedBootstrapGAN(Params ¶ms) +{ + MaAlignment inputAlign(params.aln_file,params.sequence_type, params.intype, params.gap_as_new); + inputAlign.readLogLL(params.siteLL_file); + + string outFre_name = params.out_prefix; + outFre_name += ".patInfo"; + + inputAlign.printPatObsExpFre(outFre_name.c_str()); + + string gboAln_name = params.out_prefix; + gboAln_name += ".gbo"; + + MaAlignment gboAlign; + double prob; + gboAlign.generateExpectedAlignment(&inputAlign, prob); + gboAlign.printPhylip(gboAln_name.c_str()); + + + string outProb_name = params.out_prefix; + outProb_name += ".gbo.logP"; + try { + ofstream outProb; + outProb.exceptions(ios::failbit | ios::badbit); + outProb.open(outProb_name.c_str()); + outProb.precision(10); + outProb << prob << endl; + outProb.close(); + } catch (ios::failure) { + outError(ERR_WRITE_OUTPUT, outProb_name); + } + + cout << "Information about patterns in the input alignment is printed to: " << outFre_name << endl; + cout << "A 'guided bootstrap' alignment is printed to: " << gboAln_name << endl; + cout << "Log of the probability of the new alignment is printed to: " << outProb_name << endl; +} + /**MINH ANH: to compute the probability of an alignment given the multinomial distribution of patterns frequencies derived from a reference alignment*/ void computeMulProb(Params ¶ms) { @@ -1723,6 +1760,29 @@ void computeMulProb(Params ¶ms) cout << "The probability is printed to: " << outProb_name << endl; } +void computeMulProbGAN(Params ¶ms) +{ + Alignment refAlign(params.second_align, params.sequence_type, params.intype, params.gap_as_new); + Alignment inputAlign(params.aln_file, params.sequence_type, params.intype, params.gap_as_new); + double prob; + inputAlign.multinomialProb(refAlign,prob); + //Printing + string outProb_name = params.out_prefix; + outProb_name += ".mprob"; + try { + ofstream outProb; + outProb.exceptions(ios::failbit | ios::badbit); + outProb.open(outProb_name.c_str()); + outProb.precision(10); + outProb << prob << endl; + outProb.close(); + } catch (ios::failure) { + outError(ERR_WRITE_OUTPUT, outProb_name); + } + cout << "Probability of alignment " << params.aln_file << " given alignment " << params.second_align << " is: " << prob << endl; + cout << "The probability is printed to: " << outProb_name << endl; +} + void processNCBITree(Params ¶ms) { NCBITree tree; Node *dad = tree.readNCBITree(params.user_file, params.ncbi_taxid, params.ncbi_taxon_level, params.ncbi_ignore_level); @@ -2327,19 +2387,35 @@ int main(int argc, char *argv[]) } else if (params.do_pars_multistate) { // cout << "Starting the test for computing concensus NOT from file:" << endl; // testCompConsensus("test.fa.boottrees.treefile", "test.out", ¶ms); // (const char * infile, const char * outfile, Params *params); - doParsMultiState(params); + if (params.gap_as_new) { + doParsMultiStateGAN(params); + } else { + doParsMultiState(params); + } } else if(params.test_mode){ test(params); } else if(params.print_site_pars_user_tree){ printSiteParsimonyUserTree(params); } else if (params.compute_parsimony) { - computeUserTreeParsimomy(params); + if (params.gap_as_new) { + computeUserTreeParsimomyGAN(params); + } else { + computeUserTreeParsimomy(params); + } } else if (params.newick_to_tnt) { - convertNewickToTnt(params); + if (params.gap_as_new) { + convertNewickToTntGAN(params); + } else { + convertNewickToTnt(params); + } } else if (params.newick_to_nexus) { - convertNewickToNexus(params); + if (params.gap_as_new) { + convertNewickToNexusGAN(params); + } else { + convertNewickToNexus(params); + } } else if (params.rf_dist_mode != 0) { computeRFDist(params); @@ -2365,10 +2441,22 @@ int main(int argc, char *argv[]) } else if (params.aln_file || params.partition_file) { if ((params.siteLL_file || params.second_align) && !params.gbo_replicates) { - if (params.siteLL_file) - guidedBootstrap(params); - if (params.second_align) - computeMulProb(params); + if (params.siteLL_file) { + if (params.gap_as_new) { + guidedBootstrapGAN(params); + } else { + guidedBootstrap(params); + } + } + + if (params.second_align) { + if (params.gap_as_new) { + computeMulProbGAN(params); + } else { + computeMulProb(params); + } + } + } else { runPhyloAnalysis(params); } diff --git a/phyloanalysis.cpp b/phyloanalysis.cpp index 9e1e8f44..a08e832c 100644 --- a/phyloanalysis.cpp +++ b/phyloanalysis.cpp @@ -510,6 +510,7 @@ void reportPhyloAnalysis(Params ¶ms, string &original_model, case SEQ_MORPH: out << "MORPH"; break; case SEQ_MULTISTATE: out << "TINA"; break; case SEQ_PROTEIN: out << "AA"; break; + case SEQ_GAN: out<<"GAN"; break; case SEQ_UNKNOWN: out << "???"; break; } out.width(5); @@ -1664,7 +1665,7 @@ void runTreeReconstruction(Params ¶ms, string &original_model, IQTree &iqtre for (PhyloSuperTree::iterator it = stree->begin(); it != stree->end(); it++) if ((*it)->aln->seq_type != SEQ_DNA && (*it)->aln->seq_type != SEQ_PROTEIN) params.start_tree = STT_PARSIMONY; - } else if (iqtree.aln->seq_type != SEQ_DNA && iqtree.aln->seq_type != SEQ_PROTEIN) + } else if (iqtree.aln->seq_type != SEQ_DNA && iqtree.aln->seq_type != SEQ_PROTEIN && iqtree.aln->seq_type != SEQ_GAN) params.start_tree = STT_PARSIMONY; } @@ -2095,13 +2096,23 @@ void convertAlignment(Params ¶ms, IQTree *iqtree) { } else if (params.gap_masked_aln) { Alignment out_aln; - Alignment masked_aln(params.gap_masked_aln, params.sequence_type, params.intype); - out_aln.createGapMaskedAlignment(&masked_aln, alignment); - out_aln.printPhylip(params.aln_output, false, params.aln_site_list, - params.aln_nogaps, params.aln_no_const_sites, params.ref_seq_name); - string str = params.gap_masked_aln; - str += ".sitegaps"; - out_aln.printSiteGaps(str.c_str()); + if (params.gap_as_new) { + Alignment masked_aln(params.gap_masked_aln, params.sequence_type, params.intype, params.gap_as_new); + out_aln.createGapMaskedAlignment(&masked_aln, alignment); + out_aln.printPhylip(params.aln_output, false, params.aln_site_list, + params.aln_nogaps, params.aln_no_const_sites, params.ref_seq_name); + string str = params.gap_masked_aln; + str += ".sitegaps"; + out_aln.printSiteGaps(str.c_str()); + } else { + Alignment masked_aln(params.gap_masked_aln, params.sequence_type, params.intype); + out_aln.createGapMaskedAlignment(&masked_aln, alignment); + out_aln.printPhylip(params.aln_output, false, params.aln_site_list, + params.aln_nogaps, params.aln_no_const_sites, params.ref_seq_name); + string str = params.gap_masked_aln; + str += ".sitegaps"; + out_aln.printSiteGaps(str.c_str()); + } } else if (params.aln_output_format == ALN_PHYLIP) alignment->printPhylip(params.aln_output, false, params.aln_site_list, params.aln_nogaps, params.aln_no_const_sites, params.ref_seq_name); @@ -2136,11 +2147,19 @@ void runPhyloAnalysis(Params ¶ms) { // this alignment will actually be of type SuperAlignment alignment = tree->aln; } else if(params.maximum_parsimony && params.sankoff_cost_file){ - alignment = new Alignment(params.aln_file, params.sequence_type, params.intype); + if (params.gap_as_new) { + alignment = new Alignment(params.aln_file, params.sequence_type, params.intype, params.gap_as_new); + } else { + alignment = new Alignment(params.aln_file, params.sequence_type, params.intype); + } tree = new ParsTree(alignment); dynamic_cast(tree)->initParsData(¶ms); }else { - alignment = new Alignment(params.aln_file, params.sequence_type, params.intype); + if (params.gap_as_new) { + alignment = new Alignment(params.aln_file, params.sequence_type, params.intype, params.gap_as_new); + } else { + alignment = new Alignment(params.aln_file, params.sequence_type, params.intype); + } if (params.maximum_parsimony && !params.sankoff_cost_file && params.condense_parsimony_equiv_sites) { Alignment *aln = new Alignment(); aln->condenseParsimonyEquivalentSites(alignment); @@ -2160,9 +2179,15 @@ void runPhyloAnalysis(Params ¶ms) { string original_model = params.model_name; if (params.concatenate_aln) { - Alignment aln(params.concatenate_aln, params.sequence_type, params.intype); - cout << "Concatenating " << params.aln_file << " with " << params.concatenate_aln << " ..." << endl; - alignment->concatenateAlignment(&aln); + if (params.gap_as_new) { + Alignment aln(params.concatenate_aln, params.sequence_type, params.intype, params.gap_as_new); + cout << "Concatenating " << params.aln_file << " with " << params.concatenate_aln << " ..." << endl; + alignment->concatenateAlignment(&aln); + } else { + Alignment aln(params.concatenate_aln, params.sequence_type, params.intype); + cout << "Concatenating " << params.aln_file << " with " << params.concatenate_aln << " ..." << endl; + alignment->concatenateAlignment(&aln); + } } if (params.aln_output) { @@ -2263,7 +2288,12 @@ void runPhyloAnalysis(Params ¶ms) { } void printSiteParsimonyUserTree(Params ¶ms) { - Alignment alignment(params.aln_file, params.sequence_type, params.intype); + Alignment alignment; + if (params.gap_as_new) { + Alignment alignment(params.aln_file, params.sequence_type, params.intype, params.gap_as_new); + } else { + Alignment alignment(params.aln_file, params.sequence_type, params.intype); + } IQTree * ptree; if(params.sankoff_cost_file){ diff --git a/phylosupertree.cpp b/phylosupertree.cpp index 731ef11d..6eaf0c60 100644 --- a/phylosupertree.cpp +++ b/phylosupertree.cpp @@ -73,7 +73,12 @@ void PhyloSuperTree::readPartition(Params ¶ms) { info.nniMoves[1].ptnlh = NULL; info.cur_ptnlh = NULL; part_info.push_back(info); - Alignment *part_aln = new Alignment((char*)info.aln_file.c_str(), (char*)info.sequence_type.c_str(), params.intype); + Alignment *part_aln; + if (params.gap_as_new) { + part_aln = new Alignment((char*)info.aln_file.c_str(), (char*)info.sequence_type.c_str(), params.intype, params.gap_as_new); + } else { + part_aln = new Alignment((char*)info.aln_file.c_str(), (char*)info.sequence_type.c_str(), params.intype); + } if (!info.position_spec.empty()) { Alignment *new_aln = new Alignment(); new_aln->extractSites(part_aln, info.position_spec.c_str()); @@ -108,7 +113,11 @@ void PhyloSuperTree::readPartitionNexus(Params ¶ms) { Alignment *input_aln = NULL; if (params.aln_file) { - input_aln = new Alignment(params.aln_file, params.sequence_type, params.intype); + if (params.gap_as_new) { + input_aln = new Alignment(params.aln_file, params.sequence_type, params.intype, params.gap_as_new); + } else { + input_aln = new Alignment(params.aln_file, params.sequence_type, params.intype); + } } bool empty_partition = true; @@ -145,7 +154,11 @@ void PhyloSuperTree::readPartitionNexus(Params ¶ms) { part_info.push_back(info); Alignment *part_aln; if (info.aln_file != "") { - part_aln = new Alignment((char*)info.aln_file.c_str(), (char*)info.sequence_type.c_str(), params.intype); + if (params.gap_as_new) { + part_aln = new Alignment((char*)info.aln_file.c_str(), (char*)info.sequence_type.c_str(), params.intype, params.gap_as_new); + } else { + part_aln = new Alignment((char*)info.aln_file.c_str(), (char*)info.sequence_type.c_str(), params.intype); + } } else { part_aln = input_aln; } diff --git a/phylotree.cpp b/phylotree.cpp index d834c3f9..7dd3647d 100644 --- a/phylotree.cpp +++ b/phylotree.cpp @@ -1032,7 +1032,7 @@ int PhyloTree::computeParsimonyBranch(PhyloNeighbor *dad_branch, PhyloNode *dad, _pattern_pars[ptn] = node_branch->partial_pars[ptn_pars_start_id + ptn] + dad_branch->partial_pars[ptn_pars_start_id + ptn] + 1; }else - _pattern_pars[ptn + i] = node_branch->partial_pars[ptn_pars_start_id + ptn] + + _pattern_pars[ptn] = node_branch->partial_pars[ptn_pars_start_id + ptn] + dad_branch->partial_pars[ptn_pars_start_id + ptn]; } delete[] bits_entry; diff --git a/pllrepo/src/globalVariables.h b/pllrepo/src/globalVariables.h index 220c9a48..88dec819 100644 --- a/pllrepo/src/globalVariables.h +++ b/pllrepo/src/globalVariables.h @@ -42,6 +42,8 @@ const char protStateNames[20] = {'A','R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'}; +const char ganStateNames[5] = {'A', 'C', 'G', 'T', 'J'}; + const char inverseMeaningBINARY[4] = {'_', '0', '1', '-'}; const char inverseMeaningDNA[16] = {'_', 'A', 'C', 'M', 'G', 'R', 'S', 'V', 'T', 'W', 'Y', 'H', 'K', 'D', 'B', '-'}; const char inverseMeaningPROT[23] = {'A','R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', @@ -56,6 +58,7 @@ const char inverseMeaningGeneric64[33] = {'0', '1', '2', '3', '4', '5', '6', '7' 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', '-'}; +const char inverseMeaningGAN[17] = {'_', 'A', 'C', 'M', 'G', 'R', 'S', 'V', 'T', 'W', 'Y', 'H', 'K', 'D', 'B', '-', 'J'}; const unsigned int bitVectorIdentity[256] = {0 ,1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10 ,11 ,12 ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,21 ,22 ,23 ,24 ,25 ,26 , 27 ,28 ,29 ,30 ,31 ,32 ,33 ,34 ,35 ,36 ,37 ,38 ,39 ,40 ,41 ,42 ,43 ,44 ,45 ,46 ,47 ,48 ,49 ,50 ,51 , @@ -101,6 +104,8 @@ const unsigned int bitVector32[33] = {1, 2, 4, 8, 16, 32, 64, 16777216, 33554432, 67108864, 134217728, 268435456, 536870912, 1073741824, 2147483648u, 4294967295u}; +const unsigned int bitVectorGAN[5] = {1, 2, 4, 8, 16}; + /*const unsigned int bitVector64[65] = {};*/ /** @brief Array for setting bits 0 .. 31 in a bit vector, used in saveMemory technique for the gapVector */ const unsigned int mask32[32] = {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, @@ -137,7 +142,10 @@ const partitionLengths pLengths[PLL_MAX_MODEL] = { {1024, 1024, 32, 1024, 1024, 496, 32, 1056, 496, 32, PLL_FALSE, PLL_FALSE, 32, inverseMeaningGeneric32, 32, PLL_TRUE, bitVector32}, /* 64 states */ - {4096, 4096, 64, 4096, 4096, 2016, 64, 4160, 64, 2016, PLL_FALSE, PLL_FALSE, 64, (char*)NULL, 64, PLL_TRUE, (unsigned int*)NULL} + {4096, 4096, 64, 4096, 4096, 2016, 64, 4160, 64, 2016, PLL_FALSE, PLL_FALSE, 64, (char*)NULL, 64, PLL_TRUE, (unsigned int*)NULL}, + + /*Gap as new state */ + {25, 25, 5, 25, 25, 10, 5, 1056, 10, 5, PLL_FALSE, PLL_FALSE, 31, inverseMeaningGAN, 5, PLL_FALSE, bitVectorIdentity}, }; diff --git a/pllrepo/src/models.c b/pllrepo/src/models.c index a838b4b6..0ca152b1 100644 --- a/pllrepo/src/models.c +++ b/pllrepo/src/models.c @@ -3383,6 +3383,7 @@ void pllInitReversibleGTR(pllInstance * tr, partitionList * pr, int model) switch(pr->partitionData[model]->dataType) { + case PLL_GAN_DATA: case PLL_GENERIC_32: case PLL_GENERIC_64: case PLL_SECONDARY_DATA_6: @@ -3855,7 +3856,8 @@ void initRateMatrix(pllInstance *tr, partitionList *pr) case PLL_SECONDARY_DATA_6: case PLL_SECONDARY_DATA_7: setRates(pr->partitionData[model]->substRates, rates); - break; + break; + case PLL_GAN_DATA: case PLL_GENERIC_32: case PLL_GENERIC_64: switch(tr->multiStateModel) diff --git a/pllrepo/src/newviewGenericSpecial.c b/pllrepo/src/newviewGenericSpecial.c index b5ff6e81..b5bbae8d 100644 --- a/pllrepo/src/newviewGenericSpecial.c +++ b/pllrepo/src/newviewGenericSpecial.c @@ -154,6 +154,7 @@ extern const char binaryStateNames[2]; /**< @brief Alphabet of binary states */ extern const char dnaStateNames[4]; /**< @brief DNA alphabet */ extern const char protStateNames[20]; /**< @brief Amino-acid alphabet */ extern const unsigned int mask32[32]; /**< @brief Contains the first 32 powers of 2, i.e. 2^0 upto 2^31 */ +extern const char ganStateNames[5]; static void ascertainmentBiasSequence(unsigned char tip[32], int numStates) { @@ -3955,7 +3956,10 @@ static char getStateCharacter(int dataType, int state) break; case PLL_AA_DATA: result = protStateNames[state]; - break; + break; + case PLL_GAN_DATA: + result = ganStateNames[state]; + break; default: assert(0); } diff --git a/pllrepo/src/optimizeModel.c b/pllrepo/src/optimizeModel.c index 2c557e1e..04add1db 100644 --- a/pllrepo/src/optimizeModel.c +++ b/pllrepo/src/optimizeModel.c @@ -1037,6 +1037,7 @@ void pllOptAlphasGeneric(pllInstance *tr, partitionList * pr, double modelEpsilo { switch(pr->partitionData[ll->ld[i].partitionList[0]]->dataType) { + case PLL_GAN_DATA: case PLL_DNA_DATA: case PLL_BINARY_DATA: case PLL_SECONDARY_DATA: @@ -1088,6 +1089,7 @@ void pllOptAlphasGeneric(pllInstance *tr, partitionList * pr, double modelEpsilo { switch(pr->partitionData[ll->ld[i].partitionList[0]]->dataType) { + case PLL_GAN_DATA: case PLL_DNA_DATA: case PLL_BINARY_DATA: case PLL_SECONDARY_DATA: @@ -1459,6 +1461,7 @@ void pllOptBaseFreqs(pllInstance *tr, partitionList * pr, double modelEpsilon, l else ll->ld[i].valid = PLL_FALSE; break; + case PLL_GAN_DATA: case PLL_DNA_DATA: case PLL_AA_DATA: case PLL_SECONDARY_DATA: @@ -1603,6 +1606,7 @@ void pllOptRatesGeneric(pllInstance *tr, partitionList *pr, double modelEpsilon, else ll->ld[i].valid = PLL_FALSE; break; + case PLL_GAN_DATA: case PLL_BINARY_DATA: case PLL_AA_DATA: case PLL_SECONDARY_DATA: diff --git a/pllrepo/src/parsePartition.c b/pllrepo/src/parsePartition.c index 5ea20b42..ad96a96d 100644 --- a/pllrepo/src/parsePartition.c +++ b/pllrepo/src/parsePartition.c @@ -160,7 +160,10 @@ static pllQueue * parse_partition (int * inp, pllHashTable * proteinModelsHash) if (!strcmp(modelptr, "DNAX")) pi->optimizeBaseFrequencies = PLL_TRUE; } - else + else if (!strcmp(modelptr, "GAN")) + { + pi->dataType = PLL_GAN_DATA; + } else { /* and protein data */ pi->dataType = PLL_AA_DATA; diff --git a/pllrepo/src/pll.h b/pllrepo/src/pll.h index 05212179..37d1e55f 100644 --- a/pllrepo/src/pll.h +++ b/pllrepo/src/pll.h @@ -243,7 +243,8 @@ extern "C" { #define PLL_SECONDARY_DATA_7 5 #define PLL_GENERIC_32 6 #define PLL_GENERIC_64 7 -#define PLL_MAX_MODEL 8 +#define PLL_GAN_DATA 8 +#define PLL_MAX_MODEL 9 #define PLL_SEC_6_A 0 #define PLL_SEC_6_B 1 diff --git a/pllrepo/src/utils.c b/pllrepo/src/utils.c index 39b87cd1..a855cfaa 100644 --- a/pllrepo/src/utils.c +++ b/pllrepo/src/utils.c @@ -135,7 +135,25 @@ static const char PLL_MAP_AA[256] = -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; - +static const char PLL_MAP_GAN[256] = + { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, + -1, 1, 14, 2, 13, -1, -1, 4, 11, -1, 16, 12, -1, 3, 15, 15, + -1, -1, 5, 6, 8, 8, 7, 9, 15, 10, -1, -1, -1, -1, -1, -1, + -1, 1, 14, 2, 13, -1, -1, 4, 11, -1, -1, 12, -1, 3, 15, 15, + -1, -1, 5, 6, 8, 8, 7, 9, 15, 10, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; @@ -2519,6 +2537,9 @@ void pllBaseSubstitute (pllInstance * tr, partitionList * partitions) case PLL_AA_DATA: d = PLL_MAP_AA; break; + case PLL_GAN_DATA: + d = PLL_MAP_GAN; + break; default: assert(0); } diff --git a/sprparsimony.cpp b/sprparsimony.cpp index 01feb077..4c98e174 100644 --- a/sprparsimony.cpp +++ b/sprparsimony.cpp @@ -3539,6 +3539,81 @@ void computeUserTreeParsimomy(Params ¶ms) { delete ptree; } +void computeUserTreeParsimomyGAN(Params ¶ms) { + Alignment alignment(params.aln_file, params.sequence_type, params.intype, params.gap_as_new); + + IQTree * ptree; + + if(params.sankoff_cost_file){ + ptree = new ParsTree(&alignment); + dynamic_cast(ptree)->initParsData(¶ms); + }else + ptree = new IQTree(&alignment); + + +// cout << "Read user tree... 1st time"; + ptree->readTree(params.user_file, params.is_rooted); + + ptree->setAlignment(&alignment); // IMPORTANT: Always call setAlignment() after readTree() + optimizeAlignment(ptree, params); + +// cout << "Read user tree... 2nd time"; + ptree->readTree(params.user_file, params.is_rooted); + + ptree->setAlignment(ptree->aln); // IMPORTANT: Always call setAlignment() after readTree() + ptree->initializeAllPartialPars(); + ptree->clearAllPartialLH(); + +// cout << "done" << endl; + cout << "Parsimony score (by IQTree kernel) is: "; + cout << ptree->computeParsimony() << endl; + + cout << "###################################################################" << endl << endl; + + if(params.sankoff_cost_file){ + ptree->initializePLL(params); + string tree_string = ptree->getTreeString(); + pllNewickTree *pll_tree = pllNewickParseString(tree_string.c_str()); + assert(pll_tree != NULL); + pllTreeInitTopologyNewick(ptree->pllInst, pll_tree, PLL_FALSE); + iqtree = ptree; + pllNewickParseDestroy(&pll_tree); + _allocateParsimonyDataStructures(ptree->pllInst, ptree->pllPartitions, false); + ptree->pllInst->bestParsimony = UINT_MAX; // Important because of early termination in evaluateSankoffParsimonyIterativeFastSIMD + unsigned int pll_score = evaluateParsimony(ptree->pllInst, ptree->pllPartitions, ptree->pllInst->start, PLL_TRUE, false); + cout << "Parsimony score (by PLL kernel) is: " << pll_score << endl; + + + if(ptree->pllPartitions->partitionData[0]->informativePtnWgt){ + int diffCount = 0; + unsigned int *ptnWgt = (unsigned int*)ptree->pllPartitions->partitionData[0]->informativePtnWgt; + for(int i = 0; i < ptree->aln->n_informative_patterns; i++){ + if(ptree->aln->at(i).frequency != ptnWgt[i]) diffCount++; + } + assert(diffCount == 0); + } + + _pllFreeParsimonyDataStructures(ptree->pllInst, ptree->pllPartitions); + }else{ + ptree->initializePLL(params); + string tree_string = ptree->getTreeString(); + pllNewickTree *pll_tree = pllNewickParseString(tree_string.c_str()); + assert(pll_tree != NULL); + pllTreeInitTopologyNewick(ptree->pllInst, pll_tree, PLL_FALSE); + iqtree = ptree; + pllNewickParseDestroy(&pll_tree); + _allocateParsimonyDataStructures(ptree->pllInst, ptree->pllPartitions, false); + ptree->pllInst->bestParsimony = UINT_MAX; // Important because of early termination in evaluateSankoffParsimonyIterativeFastSIMD + unsigned int pll_score = evaluateParsimony(ptree->pllInst, ptree->pllPartitions, ptree->pllInst->start, PLL_TRUE, false); + cout << "Parsimony score (by PLL kernel) is: " << pll_score << endl; + + + _pllFreeParsimonyDataStructures(ptree->pllInst, ptree->pllPartitions); + } + delete ptree; +} + + // Given an alignment A (-s) and a tree T (-nwtree) // Convert T into TNT format (clear all bootstrap supports, use tread syntax) // @param: Params @@ -3592,6 +3667,56 @@ void convertNewickToTnt(Params ¶ms) { delete ptree; } +void convertNewickToTntGAN(Params ¶ms) { + Alignment alignment(params.aln_file, params.sequence_type, params.intype, params.gap_as_new); + + IQTree * ptree; + + ptree = new IQTree(&alignment); + +// cout << "Read user tree... 1st time"; + ptree->readTree(params.user_file, params.is_rooted); + + ptree->setAlignment(&alignment); // IMPORTANT: Always call setAlignment() after readTree() + +// ptree->printTree(cout, WT_TAXON_ID | WT_SORT_TAXA); +// cout << endl; + + ostringstream ss; + ptree->printTree(ss, WT_TAXON_ID | WT_SORT_TAXA); + + string treestrboot = ss.str(); + string treestr = ""; + int len = treestrboot.length(); + int j; + for(int i = 0; i < len; i++){ + if(treestrboot[i] == ',') + treestr.append(1, ' '); + else + treestr.append(1, treestrboot[i]); + + if(treestrboot[i] == ')'){ + j = i; + do{ + j = j + 1; + } while(treestrboot[j] >= '0' && treestrboot[j] <= '9'); + i = j - 1; + } + } + + string filename = params.user_file; + filename += ".tnt"; + ofstream fout(filename.c_str()); + fout << "tread" << endl; + fout << treestr << endl; + fout << "proc /;" << endl; + fout.close(); + + cout << "Tree in TNT format is outputted to " << filename.c_str() << endl; + + delete ptree; +} + // Given an alignment A (-s) and a tree T @@ -3655,6 +3780,64 @@ void convertNewickToNexus(Params ¶ms) { delete ptree; } +void convertNewickToNexusGAN(Params ¶ms) { + Alignment alignment(params.aln_file, params.sequence_type, params.intype, params.gap_as_new); + + IQTree * ptree; + + ptree = new IQTree(&alignment); + +// cout << "Read user tree... 1st time"; + ptree->readTree(params.user_file, params.is_rooted); + + ptree->setAlignment(&alignment); // IMPORTANT: Always call setAlignment() after readTree() + + string filename = params.user_file; + filename += ".nex"; + ofstream fout(filename.c_str()); + fout << "#NEXUS" << endl << endl; + fout << "BEGIN TAXA;" << endl; + fout << "\tDIMENSIONS ntax=" << ptree->aln->getNSeq() << ";" << endl; + fout << "\tTAXLABELS "; + for(int i = 0; i < ptree->aln->getNSeq(); i++) + fout << ptree->aln->getSeqName(i) << " "; + fout << ";" << endl; + fout << "END;" << endl << endl << "BEGIN TREES;" << endl << "\tTREE tree_1="; + + ostringstream ss; + ptree->printTree(ss, WT_SORT_TAXA); + string treestr = ss.str(); + + string wtreestr = ""; + int len = treestr.length(); + int j; + int count=0; + for(int i = 0; i < len; i++){ + wtreestr.append(1, treestr[i]); + if(treestr[i] == ')' && i != len - 2){ + j = i; + do{ + j = j + 1; + } while(treestr[j] >= '0' && treestr[j] <= '9'); + i = j - 1; + wtreestr.append("inode"); + stringstream ssi; + ssi << count++; + wtreestr.append(ssi.str()); + } + if(treestr[i] == ')' && i == len - 2) wtreestr.append("root"); + } + + fout << wtreestr; + + fout << endl << "END;" << endl; + fout.close(); + + cout << "Tree in NEXUS format is outputted to " << filename.c_str() << endl; + + delete ptree; +} + /****************************************** UTILS ***************************/ /* Diep begin */ diff --git a/sprparsimony.h b/sprparsimony.h index c02b22ec..dbdd1923 100644 --- a/sprparsimony.h +++ b/sprparsimony.h @@ -44,8 +44,11 @@ int pllCalcMinParsScorePattern(pllInstance *tr, int dataType, int site); void testSiteParsimony(Params ¶ms); void computeUserTreeParsimomy(Params ¶ms); +void computeUserTreeParsimomyGAN(Params ¶ms); void convertNewickToTnt(Params ¶ms); +void convertNewickToTntGAN(Params ¶ms); void convertNewickToNexus(Params ¶ms); +void convertNewickToNexusGAN(Params ¶ms); // util function // act as pllAlignmentRemoveDups of PLL but for sorted alignment of IQTREE diff --git a/tools.cpp b/tools.cpp index 19ec5631..e5b6a1ef 100644 --- a/tools.cpp +++ b/tools.cpp @@ -804,6 +804,7 @@ void parseArg(int argc, char *argv[], Params ¶ms) { params.top_boot_concensus = false; params.do_first_rell = false; params.test_mode = false; + params.gap_as_new = false; #ifdef _OPENMP params.num_threads = 0; @@ -2462,6 +2463,10 @@ void parseArg(int argc, char *argv[], Params ¶ms) { params.test_mode = true; continue; + } + if(strcmp(argv[cnt], "-gap_as_new") == 0){ + params.gap_as_new = true; + continue; } if(strcmp(argv[cnt], "-opt_btree_spr") == 0){ cnt++; diff --git a/tools.h b/tools.h index 97f89afc..a7ea2fd7 100644 --- a/tools.h +++ b/tools.h @@ -1625,6 +1625,12 @@ struct Params { */ bool test_mode; + /* + * Duong: + * To check how to cheat the gap character + */ + bool gap_as_new; + #ifdef _OPENMP int num_threads; #endif