"Fossies" - the Fresh Open Source Software Archive

Member "augustus-3.3.3/src/statemodel.cc" (13 Sep 2019, 16080 Bytes) of package /linux/misc/augustus-3.3.3.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "statemodel.cc" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 3.3.2_vs_3.3.3.

    1 /*
    2  * statemodel.cc
    3  *
    4  * License: Artistic License, see file LICENSE.TXT or 
    5  *          https://opensource.org/licenses/artistic-license-1.0
    6  */
    7 
    8 #include "statemodel.hh"
    9 
   10 // project includes
   11 #include "exonmodel.hh"
   12 #include "intronmodel.hh"
   13 #include "igenicmodel.hh"
   14 #include "utrmodel.hh"
   15 #include "ncmodel.hh"
   16 #include "pp_scoring.hh"
   17 #include "extrinsicinfo.hh"
   18 
   19 // standard C/C++ includes
   20 #include <iostream>
   21 
   22 
   23 // definition of class variables
   24 vector<Boolean>*           StateModel::shortpattern = NULL;
   25 int                        StateModel::dnalen = -1;
   26 int                        StateModel::countStart = -1;
   27 int                        StateModel::countEnd = -1;
   28 const char*                StateModel::sequence = NULL;
   29 SequenceFeatureCollection* StateModel::seqFeatColl = NULL;
   30 PP::SubstateModel*         StateModel::profileModel = NULL;
   31 const vector<StateType>*   StateModel::stateMap = NULL;
   32 int                        StateModel::activeWinLen = 1;
   33 int                        StateModel::gcIdx = 0;
   34 ContentStairs*             StateModel::cs = NULL;
   35 
   36 /* --- StateModel methods ------------------------------------------ */
   37 
   38 /*
   39  * initPredecessors
   40  */
   41 void StateModel::initPredecessors(Matrix<Double>& trans, int self) {
   42     ancestor.clear();
   43     for( int i = 0; i < trans.getColSize(); i++ )
   44         if( trans[i][self] != 0 ) 
   45         ancestor.push_back(Ancestor(i, trans[i][self]));
   46 }
   47 
   48 /*
   49  * init
   50  */
   51 void StateModel::init() {
   52     ExonModel::init();
   53     IntronModel::init();
   54     UtrModel::init();
   55     IGenicModel::init();
   56     NcModel::init();
   57 
   58     // by linking submaps we reach at most as far as the length of 
   59     // equalD state + MAX_LINKCOUNT times geometric states + splicesite states
   60     // add this to the maximum exon length
   61     activeWinLen = ExonModel::getMaxStateLen() + IntronModel::getD() + MAX_LINKCOUNT;
   62 }
   63 
   64 /**
   65  * newStateModelPtr: Get a pointer to a StateModel object.
   66  *
   67  * creates a new StateModel descendant; the type is determined by
   68  * by the string _path_, currently one of:
   69  *  intronmodel, exonmodel, igenicmodel, utrmodel, ncmodel
   70  *
   71  */
   72 StateModel* StateModel::newStateModelPtr(const char* path) {
   73     string intronMdlStr = "intronmodel", exonMdlStr = "exonmodel", igenicMdlStr = "igenicmodel", utrMdlStr = "utrmodel", ncMdlStr = "ncmodel";
   74     string pathString(path);
   75     try {
   76     if (pathString == intronMdlStr)
   77         return new IntronModel();
   78     else if (pathString == exonMdlStr)
   79         return new ExonModel();
   80     else if (pathString == igenicMdlStr)
   81         return new IGenicModel();
   82     else if (pathString == utrMdlStr)
   83         return new UtrModel();
   84     else if (pathString == ncMdlStr)
   85         return new NcModel();
   86     } catch (ProjectError &e) {
   87     cerr << e.getMessage() << endl;
   88     }
   89     throw ProjectError("StateModelPtr: error creating model \"" + string(path) + "\".");
   90     return (StateModel*) 0;
   91 }
   92 
   93         
   94 /*
   95  * StateModel::determineShortPatterns
   96  * for i=0, .. , k-1 is patcounts[i] the number of patterns with base4 representation i
   97  * A ** midpn ** A
   98  * C             C
   99  * G             G
  100  * T             T
  101  *
  102  * If for some x the number sum_y #(midpn|y) is less than minCount, then 
  103  * the by 1 shorter pattern is used later: shortpattern[midpn] = true;
  104  */
  105 void StateModel::determineShortPatterns(const vector<Integer>& patcounts, int k, int minCount){
  106     if (k<2) {
  107     cerr << "StateModel::determineShortPatterns: "
  108          << "Ignoring the possibility of shortening the patterns." << endl;
  109     return;
  110     }
  111     int n;
  112     int shortsize = POWER4TOTHE(k);
  113     if (!shortpattern) {
  114     shortpattern = new vector<Boolean>(shortsize, false);
  115     }
  116     for (int pn = 0; pn < shortsize; pn++) {
  117     n = patcounts[4*pn + 0] + patcounts[4*pn + 1] + patcounts[4*pn + 2] + patcounts[4*pn + 3];
  118     (*shortpattern)[pn] = (n < minCount);
  119     }
  120     // cout << " #shortpatterns = " << numshort << " / " << shortsize << endl;
  121 }
  122 
  123 void StateModel::makeProbsFromCounts(vector<Double> &patprobs , const vector<Integer> &patcounts, 
  124                      int k, Double pseudocount, Boolean shorten){
  125     int size = POWER4TOTHE(k+1);
  126     if (size != patcounts.size() || size != patprobs.size() )
  127     throw ProjectError("StateModel::makeProbsFromCounts: size doesn't match k");
  128     if (shorten && !shortpattern) {
  129     cerr << "StateModel::makeProbsFromCounts: Couldn't shorten the patterns." <<endl;
  130     cerr << "Don't know which to shorten yet." << endl;
  131     shorten = false;
  132     }
  133 
  134     Double sum, normsum = 0.0;
  135     if (!shorten || k<2) { // normal estimation 
  136     for(int pn=0; pn < size; pn+=4) {
  137         sum = patcounts[pn+0]+patcounts[pn+1]+patcounts[pn+2]+patcounts[pn+3];
  138         for (int i=0; i<4; i++) {
  139         normsum += patprobs[pn+i] = patcounts[pn+i] + pseudocount;
  140         }
  141     }
  142     } else { // shorten the pattern by 1 if specified in the shortpattern vector
  143     int shortsize = POWER4TOTHE(k-1);
  144     for(int pn=0; pn < shortpattern->size(); pn++) {
  145         if (!((*shortpattern)[pn])) { // do it normally here also
  146         sum = patcounts[4*pn+0]+patcounts[4*pn+1]+patcounts[4*pn+2]+patcounts[4*pn+3];
  147         for (int i=0; i<4; i++) {
  148             normsum += patprobs[4*pn+i] = patcounts[4*pn+i] + pseudocount;
  149         }
  150         } else { // pool the data for 4 different nucs at the beginning
  151         sum = 0.0;
  152         int midpn = pn % shortsize;
  153         for (int a=0; a<4; a++) {// build the sum = *|midpn|*
  154             for(int b=0; b<4; b++) {
  155             sum += patcounts[4*(a*shortsize+midpn) + b];
  156             }  
  157         }
  158         for(int b=0; b<4; b++) {
  159             // p is the number of patterns "a|midpn|b" for any a
  160             Double p = patcounts[4*midpn + b]+
  161             patcounts[4*(  shortsize+midpn) + b]+
  162             patcounts[4*(2*shortsize+midpn) + b]+
  163             patcounts[4*(3*shortsize+midpn) + b];
  164             normsum += patprobs[4*pn+b] = p/4 + pseudocount;
  165             
  166         }
  167         }
  168     }
  169     }
  170     // normalize
  171     for (int pn=0; pn < size; pn++) 
  172     patprobs[pn] /= normsum;
  173 }
  174 
  175 void StateModel::computeEmiFromPat(const vector<Double>& patprobs, vector<Double>& emiprobs, Integer k){
  176     /*
  177      * Emission Probabilities for order k emissions only
  178      * They are computed using the Pls also.
  179      */
  180     int size = POWER4TOTHE(k+1);
  181     for( int i = 0; i < size; i += 4 ){
  182     Double sum = patprobs[i] + patprobs[i+1] + patprobs[i+2] + patprobs[i+3]; 
  183     for( int nuk = 0; nuk < 4; nuk++ )
  184         if (k>0) {
  185         emiprobs[i+nuk] = patprobs[i+nuk]/sum;
  186         } else {
  187         emiprobs[i+nuk] = patprobs[i+nuk];
  188         }
  189     }
  190 }
  191 
  192 // this is stuff that needs to be called once for each new sequence
  193 void StateModel::prepareViterbi(const char* dna, int len, 
  194                 const vector<StateType>& smap) {
  195     sequence = dna;
  196     dnalen = len;
  197     stateMap = &smap; // needed in exonmodel to determine predecessor type
  198     if (profileModel) {
  199     PP::DNA::initSeq(dna, len);
  200     profileModel->initScores();
  201     profileModel->initHitSequences();
  202     }
  203     ExonModel::setORF();
  204 }
  205 
  206 void StateModel::readProbabilities(int newParIndex) {
  207     ExonModel::readProbabilities(newParIndex);
  208     IntronModel::readProbabilities(newParIndex);
  209     IGenicModel::readProbabilities(newParIndex);
  210     UtrModel::readProbabilities(newParIndex);
  211 }
  212 
  213 void StateModel::resetPars(){
  214     ExonModel::resetPars();
  215     IntronModel::resetPars();
  216     IGenicModel::resetPars();
  217     UtrModel::resetPars();
  218     NcModel::resetPars();
  219 }
  220 
  221 void StateModel::updateToLocalGC(int idx, int from, int to){
  222     gcIdx = idx;
  223     ExonModel::updateToLocalGC(from, to);
  224     IntronModel::updateToLocalGC(from, to);
  225     IGenicModel::updateToLocalGC(from, to);
  226     if (Constant::utr_option_on)
  227     UtrModel::updateToLocalGC(from, to);
  228     if (Constant::nc_option_on)
  229     NcModel::updateToLocalGC(from, to);
  230 }
  231 
  232 void StateModel::readAllParameters(){
  233   ExonModel::readAllParameters();
  234   IntronModel::readAllParameters();
  235   IGenicModel::readAllParameters();
  236   UtrModel::readAllParameters();
  237   NcModel::readAllParameters();
  238 }
  239 
  240 void StateModel::storeGCPars(int idx){
  241   ExonModel::storeGCPars(idx);
  242   IntronModel::storeGCPars(idx);
  243   IGenicModel::storeGCPars(idx);
  244   UtrModel::storeGCPars(idx);
  245 }
  246 
  247 void StateModel::resetModelCounts() {
  248      // reset model count of the statemodels which have a count (all except igenic)
  249     ExonModel::resetModelCount();
  250     IntronModel::resetModelCount();
  251     UtrModel::resetModelCount();
  252     NcModel::resetModelCount();
  253 }
  254 
  255 int StateModel::getActiveWindowStart(int base) {
  256     int result = base - activeWinLen;
  257     if (seqFeatColl) {
  258     // check if we have hints that make lessD states longer
  259     // we might already be in the longass state following it
  260     int endOfBioIntron = base - Constant::ass_end;
  261     for (Feature* ihint = seqFeatColl->getFeatureListOvlpingRange(intronF, endOfBioIntron, endOfBioIntron, plusstrand); 
  262          ihint != NULL; ihint = ihint->next) {
  263         int endOfPred = ihint->start - 1 + DSS_MIDDLE + Constant::dss_end;
  264         if (result > endOfPred)
  265         result = endOfPred;
  266     }
  267     // analogously for rlessD state, followed by rlongdss
  268     endOfBioIntron = base - Constant::dss_start;
  269     for (Feature* ihint = seqFeatColl->getFeatureListOvlpingRange(intronF, endOfBioIntron, endOfBioIntron, minusstrand); 
  270          ihint != NULL; ihint = ihint->next) {
  271         int endOfPred = ihint->start - 1 + Constant::ass_upwindow_size + Constant::ass_start + ASS_MIDDLE;
  272         if (result > endOfPred)
  273         result = endOfPred;
  274     }
  275     }
  276     return result;
  277 }
  278 
  279 
  280 
  281 /* --- Snippet* methods -------------------------------------------- */
  282 
  283 Double SnippetProbs::getElemSeqProb(int base, int len){
  284     Double seqProb = 1.0;
  285     Seq2Int s2i(k+1);
  286     if (forwardStrand) {
  287     for (int curpos = base; curpos >= base - len + 1; curpos--) {
  288         try {
  289         if (curpos-k >= 0)
  290             seqProb *= (*emiprobs)[s2i(dna+curpos-k)];
  291         else 
  292             seqProb *= 0.25;
  293         } catch (InvalidNucleotideError &e) {
  294         seqProb *= 0.25;
  295         }
  296     }
  297     } else {
  298     for (int curpos = base; curpos >= base - len + 1; curpos--) {
  299         try {
  300         if (curpos >= 0 && curpos+k < n)
  301             seqProb *= (*emiprobs)[s2i.rc(dna+curpos)];
  302         else 
  303             seqProb *= 0.25;
  304         } catch (InvalidNucleotideError &e) {
  305         seqProb *= 0.25;
  306         }
  307     }
  308     }
  309     return seqProb;
  310 }
  311 
  312 Double SnippetProbs::getSeqProb(int base, int len){
  313     Double p;
  314     if (len == 0)
  315     return 1.0;
  316     if (base<0 || base > n-1)
  317     throw ProjectError(string("SnippetProbs::getSeqProb: Index out of range: ") + itoa(base));
  318     SnippetList * sl = snippetlist[base];
  319     if (sl != NULL) {
  320     if (sl->last->len < len) {
  321         p = getSeqProb(base - sl->last->len, len - sl->last->len) * sl->last->prob;
  322         addProb(base, len, p);
  323     } else {
  324         int partlen = len;
  325         p = sl->getProb(base, partlen); // search for previously computed value
  326         if (partlen != len) {                           // not exactly found,
  327         Double prest = 1.0;
  328         if (partlen == 0) {
  329             prest = getElemSeqProb(base, len); // compute rest
  330             addProb(base, len, prest); // and store
  331         } else {
  332             prest = getSeqProb(base-partlen, len-partlen); // compute rest
  333         }
  334         p *= prest;
  335         }
  336     }
  337     } else {
  338     p = getElemSeqProb(base, len);
  339     addProb(base, len, p);
  340     }
  341     return p;
  342 }
  343 
  344 void SnippetProbs::addProb(int base, int len, Double p) {
  345     SnippetListItem *sni = new SnippetListItem(p, len);
  346     if (snippetlist[base] == NULL) {
  347     snippetlist[base] = new SnippetList();
  348     snippetlist[base]->first = snippetlist[base]->last = sni;
  349     } else if (snippetlist[base]->last->len < len){ // insert at end
  350     snippetlist[base]->last->next = sni;
  351     snippetlist[base]->last = sni;
  352     } else { // sort into at the right position
  353     SnippetListItem *s = snippetlist[base]->first;
  354     if (s->len > len) { // insert at very beginning
  355         sni->next = s;
  356         snippetlist[base]->first = sni;
  357         return;// bugfix: inserted 060606
  358     }
  359     while (s && s->next && s->next->len < len)
  360         s = s->next;
  361     if (s->next && s->next->len == len) {
  362         cerr << "SnippetProb: tried to add snippet of same length: prob " << p << " original prob " << s->next->prob << " len " << len << endl;
  363         for (SnippetListItem *snit = snippetlist[base]->first; snit != NULL; snit=snit->next)
  364         cout << base <<  "\t" << snit->len << " " << snit->prob << endl;
  365     } else {
  366         sni->next = s->next;
  367         s->next = sni;
  368     }
  369     }
  370 }
  371 
  372 
  373 /*
  374  * getProb
  375  * if the prob for the len is not stored then partlen
  376  * is the largest length below len for wich the prob is stored
  377  * and prob is that probability.
  378  */
  379 Double SnippetList::getProb(int base, int &partlen) {
  380     SnippetListItem *s = first;
  381     if (!s || s->len > partlen) {
  382     partlen = 0;
  383     return 1.0;
  384     }
  385     while (s->next && s->next->len < partlen)
  386     s = s->next;
  387     if (!s->next || s->next->len > partlen) {
  388     partlen = s->len;
  389     return s->prob;
  390     } else {
  391     return s->next->prob;
  392     }
  393 }
  394 
  395 /*
  396  * SegProbs::setEmiProbs
  397  */
  398 void SegProbs::setEmiProbs(vector<Double> *emiprobs, int from, int to) {
  399     this->emiprobs = emiprobs;
  400     Double p;
  401     if (from < 0 || to < 0) { // if nothing else is requested, use the whole sequence
  402     from = 1;
  403     to = n;
  404     }
  405     if (from == 1)
  406     cumProds[0] = (float) .25;
  407     if (to > n)
  408     to = n;
  409     if (cumProds[from-1] == 0){
  410     cerr << "SegProbs::setEmiProbs: previous emiprobs not computed. This should not happen." << endl;
  411     cumProds[from-1] = 1;
  412     }
  413     for (int i = from; i <= to; i++){
  414     if (forwardStrand) {
  415         try {
  416         if (i<k)
  417             throw InvalidNucleotideError('\0');
  418         p = (*emiprobs)[s2i(dna + i - k)];
  419         } catch (InvalidNucleotideError&){
  420         p = (float) 0.25;
  421         }
  422     } else {
  423         try {
  424         if (i >= n-k)
  425             throw InvalidNucleotideError('\0');
  426         p = (*emiprobs)[s2i.rc(dna + i)];
  427         } catch (InvalidNucleotideError&){
  428         p = (float) 0.25;
  429         }
  430     }
  431     cumProds[i] = cumProds[i-1] * p;
  432     }
  433     //    cout << "setEmiProbs " << "cumProds[" << from << "]= " << cumProds[from] << "\t" << "cumProds[" << to << "]= " << cumProds[to] << endl;
  434 }
  435 
  436 /*
  437  * SegProbs::getSeqProb
  438  */
  439 Double SegProbs::getSeqProb(int from, int to) {
  440     if (from == to){
  441     try {
  442         if (forwardStrand) {
  443         if (to < k)
  444             return (float) .25;
  445         return (*emiprobs)[s2i(dna + to - k)];
  446         } else
  447         return (*emiprobs)[s2i.rc(dna + to)];
  448     } catch (InvalidNucleotideError&) {
  449         return (float) .25;
  450     }
  451     } else if (from > to)
  452        return 1;
  453     if (to > n)
  454     to = n;
  455     Double r = cumProds[to];
  456     if (from < 1)
  457     return r;
  458     else
  459     return r / cumProds[from - 1];
  460 }
  461 
  462 /*
  463  * make the Viterbi and Forward loop more efficient by memoizing the possible
  464  * endOfPred (left interval boundaries) for each state
  465  *    |     |        |  |   |           |
  466  *                   |  |   |           |    |  |  | (next time endPartProb > 0)
  467  * C                        A          D1      D2  B
  468  *  possibleEndOfPred is a decreasingly sorted list of endOfPred positions, for 
  469  *  which notEndPartProb > 0
  470  *  eopit points to the current list element and iterates from right to left
  471  */
  472 
  473 void EOPList::decrement(int &endOfPred){
  474    if (inCache && eopit != possibleEndOfPreds.end() && *eopit == endOfPred
  475        && ++eopit != possibleEndOfPreds.end())
  476       endOfPred = *eopit;
  477    else
  478       endOfPred--;
  479 }
  480 
  481 
  482 void EOPList::update(int endOfPred){
  483     if (possibleEndOfPreds.empty()) {
  484     possibleEndOfPreds.push_front(endOfPred);
  485     return;
  486     }
  487     if (endOfPred == *eopit) // case A
  488     return;
  489     if (endOfPred >= possibleEndOfPreds.front()){ // case B
  490     if (endOfPred > possibleEndOfPreds.front())
  491         possibleEndOfPreds.push_front(endOfPred);
  492     eopit = possibleEndOfPreds.begin();
  493     return;
  494     }
  495     if (endOfPred < possibleEndOfPreds.back()){ // case C
  496     eopit = possibleEndOfPreds.insert(possibleEndOfPreds.end(), endOfPred);
  497     return;
  498     }
  499     if (endOfPred < *eopit){ // case D
  500     list<int>::iterator nxt = eopit;
  501     ++nxt;
  502     if (nxt != possibleEndOfPreds.end() && endOfPred == *nxt){ // D1
  503         eopit = nxt;
  504         inCache = true;
  505         return;
  506     }
  507     if (nxt == possibleEndOfPreds.end() || endOfPred > *nxt){ // D2
  508         eopit = possibleEndOfPreds.insert(nxt, endOfPred);
  509         return;
  510     }
  511     // this line should very rarely be reached (e.g. when the support of the length distribution is not an interval)
  512     //cerr << "EOPList::update. Error: endOfPred = " << endOfPred << "\teopit=" << *eopit << "\tinCache=" << inCache << endl;
  513     while (eopit != possibleEndOfPreds.end() && *eopit > endOfPred)
  514         ++eopit;
  515     update(endOfPred);
  516     return;
  517     }
  518     // this should not happen either
  519     throw ProjectError("Error 2 in UtrModel::updatePossibleEOPs");
  520 }