"Fossies" - the Fresh Open Source Software Archive

Member "augustus-3.3.3/src/train_logReg_param.cc" (22 May 2019, 22075 Bytes) of package /linux/misc/augustus-3.3.3.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "train_logReg_param.cc", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code-changes report: 3.3.2_vs_3.3.3.

    1 /*
    2  * train_logReg_param.cc
    3  *
    4  * License: Artistic License, see file LICENSE.TXT or 
    5  *          https://opensource.org/licenses/artistic-license-1.0
    6  */
    7 
    8 // project includes
    9 
   10 // standard C/C++ includes
   11 #include <iostream>
   12 #include <fstream>
   13 #include <iomanip>
   14 #include <regex>
   15 
   16 #include "train_logReg_param.hh"
   17 
// number of species in the clade; set by the train_data constructor
int num_species;
// fraction of positively labeled (reference) samples in the current sample
// set; used as gamma by the label-noise robust regression (1 = disabled)
double prob_true_false;
// per-feature mean and standard deviation, filled by set_mean_std();
// used to standardize features in activation_f and to back-transform
// the trained coefficients
vector<double> mean;
vector<double> se;
   22 
   23 
   24 void train_OEscore_params(int numSpecies){
   25 
   26   unordered_map<string,int> ref_class;
   27   reference_from_file(&ref_class);
   28   train_data data(&Constant::logReg_feature, &ref_class, numSpecies);
   29   optimize_parameters(&data);
   30 
   31 }
   32 
   33 
   34 train_data::train_data(unordered_map<string, pair<int, vector<double> > > *s, unordered_map<string,int> *ref_class, int numSp){
   35   
   36   int numExonRef = 0;
   37   int numIntronRef = 0;
   38   num_species = numSp;
   39   exon_samples = new vector<pair<int, vector<double> > >;
   40   intron_samples = new vector<pair<int, vector<double> > >;
   41   while(!s->empty()){
   42     unordered_map<string, pair<int, vector<double> > >::iterator samp = s->begin();
   43     unordered_map<string, int>::iterator got = ref_class->find(samp->first);
   44     if(got == ref_class->end())
   45       samp->second.first = 0;
   46     else{
   47       samp->second.first = 1;
   48       if(samp->first.at(0) == 'C')
   49     numExonRef++;
   50       else
   51     numIntronRef++;
   52     }
   53     if(samp->first.at(0) == 'C')
   54       exon_samples->push_back(samp->second);
   55     else
   56       intron_samples->push_back(samp->second);
   57     s->erase(samp);
   58   }
   59   cout << "# " << numExonRef << " reference exon samples and " << numIntronRef << " reference intron samples in training set." << endl;
   60   cout << "# " << exon_samples->size() << " exon and " << intron_samples->size() << " intron candidates are used for training." << endl;
   61 
   62   int num_samples = exon_samples->size();
   63   for(int i=0; i<num_samples-1; i++){
   64     try{
   65       if( (*exon_samples)[i].second.size() != (*exon_samples)[i+1].second.size() ){
   66     throw ProjectError("Train data vectors for logistic regression have inconsistent length!");
   67       }
   68     }catch(...){
   69       throw ProjectError("Train data vectors for logistic regression have inconsistent length!");
   70     }
   71   }
   72 
   73   /*
   74    *  number of features
   75    */
   76 
   77   try{
   78     num_features = Properties::getIntProperty("/CompPred/num_features");
   79   }catch(...){
   80     num_features = 15;
   81   }
   82   mean.resize(num_features);
   83   se.resize(num_features);
   84 
   85   // feaures that are included in standardization
   86   exon_feature_idx   = {0,0,0,1,1,1,1,1,1,1,1,1,0,1,1,1};
   87   intron_feature_idx = {0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0};
   88 
   89   prob_true_false_exons = (double) numExonRef / exon_samples->size();
   90   prob_true_false_introns = (double) numIntronRef / intron_samples->size();
   91   prob_true_false = 1;
   92 
   93   /*      
   94     cout << "exon samples:" << endl << "class\t";
   95   for(int i=0; i<num_features; i++)
   96     cout << i << "\t";
   97   cout << endl;
   98   for(int i=0; i<exon_samples->size(); i++){
   99     cout << (*exon_samples)[i].first << "\t";
  100     for(int j=0; j<(*exon_samples)[i].second.size(); j++){
  101       cout << (*exon_samples)[i].second[j] << "\t";
  102     }
  103     cout << endl;
  104   }
  105 
  106   cout << "intron samples:" << endl << "class\t" << endl;
  107   for(int i=0; i<num_features; i++)
  108     cout << i << "\t";
  109   cout << endl;
  110   for(int i=0; i<intron_samples->size(); i++){
  111     cout << (*intron_samples)[i].first << "\t";
  112     for(int j=0; j<(*intron_samples)[i].second.size(); j++){
  113       cout << (*intron_samples)[i].second[j] << "\t";
  114     }
  115     cout << endl;
  116   }
  117   
  118   */
  119 
  120 }
  121 
  122 
  123 /*
  124  * get mean and std of sample features for standardization
  125  */
  126 void train_data::set_mean_std(vector<pair<int, vector<double> > > *samples, vector<int> feature_idx){
  127 
  128   fill(mean.begin(), mean.end(), 0);
  129   fill(se.begin(), se.end(), 0);
  130 
  131   int m = samples->size();
  132   for(int i=0; i<m; i++){
  133     for(int j=0; j<num_features; j++){
  134       if(feature_idx[j]){
  135     double f = get_feature(&(*samples)[i].second, j);
  136     mean[j] += f;
  137     se[j] += pow(f, 2);
  138       }else{
  139     se[j] = 1;
  140       }
  141     }
  142   }
  143   for(int j=0; j<num_features; j++){
  144     if(feature_idx[j]){
  145       mean[j] /= m;
  146       se[j] = sqrt( se[j]/m - pow(mean[j], 2) );
  147     }
  148   }
  149   
  150   cout << "mean and std of features" << endl;
  151   for(int i=0; i<num_features; i++){
  152     cout << i << "\t" << mean[i] << "\t" << se[i] << endl;
  153   } 
  154    
  155 }
  156 
  157 double activation_f(const gsl_vector *theta, vector<double> *f){
  158 
  159   double theta_x = gsl_vector_get(theta, 0);
  160   int n = theta->size;
  161 
  162   for(int i=1; i<n; i++){
  163     theta_x += gsl_vector_get(theta, i) * ( (get_feature(f, i) - mean[i]) / se[i] ) ;
  164     //cout << "f" << i << ":" << setw(10) << gsl_vector_get(theta, i) << ":" << get_feature(f, i) << "(" << ( (get_feature(f, i) - mean[i]) / se[i] ) << ")\t"; 
  165   }
  166   //cout << endl;
  167   return theta_x;
  168 }
  169 
  170 double get_feature(vector<double> *f, int i){
  171 
  172   double x;
  173   switch(i){
  174   case 0: x = 1;                                          // intercept
  175     break;
  176   case 1: x = ( ((*f)[3] > 0) ? 0 : 1 );                  // not having omega
  177     break;
  178   case 2: x = ( 1 - (*f)[9] );                            // is not an OE 
  179     break;
  180   case 3: x = log((*f)[0]);                               // log exon length
  181     break;
  182   case 4: x = (*f)[1];                                    // posterior probability  
  183     break;
  184   case 5: x = (*f)[2];                                    // mean base probability 
  185     break;
  186   case 6: x = (*f)[3];                                    // omega
  187     break;
  188   case 7: x = (*f)[4];                                    // variance of omega
  189     break;
  190   case 8: x = (*f)[5];                                    // conservation
  191     break;
  192   case 9: x = (*f)[7];                                    // containment
  193     break;
  194   case 10: x = (*f)[6];                                   // diversity
  195     break;
  196   case 11: x = (*f)[8];                                   // number of exons involved in OE 
  197     break;
  198   case 12: x = ( ((*f)[1] <= 0) ? 1 : 0 );                // is not sampled 
  199     break;
  200   case 13: x = (*f)[5] * (*f)[6];                         // conservation * diversity
  201     break; 
  202   case 14: x = (*f)[3] * (*f)[6];                         // omega * diversity 
  203     break;
  204     //  case 15: x = (*f)[8]/num_species;                       // number of species involved in this ortho exon divided by clade size
  205     //break;
  206   case 15: x = ((*f)[10]>0 && (*f)[11]>0 && (*f)[12]>0 && (*f)[13]>0) ? ( (log((*f)[11]) - log((*f)[10])) * exp(Constant::lambda * (log((*f)[11]) - log((*f)[10]))) + (log((*f)[12]) - log((*f)[13])) * exp(Constant::lambda * (log((*f)[12]) - log((*f)[13]))) )/( exp(Constant::lambda * (log((*f)[11]) - log((*f)[10]))) + exp(Constant::lambda * (log((*f)[12]) - log((*f)[13]))) ) : 0;
  207     break;
  208   default: throw ProjectError("feature number " + itoa(i) + " does not exist!");
  209     break;
  210   }
  211   return x;
  212 }
  213 
  214 
  215 double cross_entropy_error_f(const gsl_vector *theta, void *features){
  216 
  217   double cross_entropy_error = 0;
  218   double curr_CEE = 0;
  219   vector<pair<int, vector<double> > > *x  = (vector<pair<int, vector<double> > > *)features;
  220 
  221   int m = x->size();
  222   //  cout << "########### extreme candidates ################" << endl;
  223   for(int i=0; i<m; i++){
  224     double theta_x = activation_f(theta, &(*x)[i].second);
  225     curr_CEE = -1 * (*x)[i].first * log(1 + exp(-theta_x)) + (1-(*x)[i].first) * (-theta_x - log(1 + exp(-theta_x)));
  226     cross_entropy_error += curr_CEE;
  227     //cout.precision(10); 
  228     //   cout << "cross entropy error of sample " << i << " = " << ( -1 * (*x)[i].first * log(1 + exp(-theta_x)) + (1-(*x)[i].first) * (-theta_x - log(1 + exp(-theta_x)))) << endl;
  229  
  230     // test output of "extreme" candidates: labels contradict prediction
  231     // double sigma = 1/(1+exp(-theta_x));
  232      //    if(abs(sigma - (*x)[i].first) > 0.8){
  233      //     cout << "feature_" << i << "\t" << (*x)[i].first << "\t" << theta_x << "\t" << sigma << "\t" << curr_CEE << endl;
  234      //}
  235   }
  236   //cout << "##############################################" << endl;
  237   //  cout << "CEE: " << (-1 * cross_entropy_error) << endl;  
  238   return (-1 * cross_entropy_error);
  239 }
  240 
  241 // The gradient of f, df = (df/dx_1, df/dx_2, ..., df/dx_n). 
  242 void cross_entropy_error_df(const gsl_vector *theta, void *features, gsl_vector *df){
  243   
  244   vector<pair<int, vector<double> > > *x = (vector<pair<int, vector<double> > > *)features;
  245   int m = x->size();
  246   int n = theta->size;
  247 
  248   vector<double> grad(n,0);
  249   for(int i=0; i<m; i++){
  250     double sigma = 1/(1 + exp(-1 * activation_f(theta, &(*x)[i].second)));
  251     for(int j=0; j<n; j++){
  252       grad[j] += ( (get_feature(&(*x)[i].second, j) - mean[j]) / se[j] ) * (sigma - (*x)[i].first);
  253       //cout << "gradient of exon 0 ... " << i << " for feature " << j << ": " << grad[j] << endl;
  254     }
  255   }
  256   for(int j=0; j<n; j++){
  257     gsl_vector_set(df, j, grad[j]);
  258   }
  259 }
  260 
  261 // Compute both f and df together. 
  262 void cross_entropy_error_fdf(const gsl_vector *params, void *features, double *f, gsl_vector *df){
  263 
  264   *f = cross_entropy_error_f(params, features);
  265   cross_entropy_error_df(params, features, df);
  266 }
  267 
  268 
  269 // label-noise robust version 
  270 
  271 double rob_cross_entropy_error_f(const gsl_vector *theta, void *features){
  272 
  273   double cross_entropy_error = 0;
  274   double curr_CEE = 0;
  275   vector<pair<int, vector<double> > > *x  = (vector<pair<int, vector<double> > > *)features;
  276   double epsilon = Constant::label_flip_prob; 
  277   double gamma = prob_true_false;
  278   int m = x->size();
  279   for(int i=0; i<m; i++){
  280     double sigma = 1/(1 + exp(-1 * activation_f(theta, &(*x)[i].second)));
  281     curr_CEE = (*x)[i].first * log( (1 - sigma)*epsilon*gamma + sigma*(1 - epsilon) ) + (1-(*x)[i].first) * log( (1 - sigma)*(1 - epsilon*gamma) + sigma*epsilon );
  282     //cout << "feature_" << i << "\t" << (*x)[i].first << "\t" << sigma << "\t" << curr_CEE << endl;
  283     cross_entropy_error += curr_CEE;
  284   }
  285   return (-1 * cross_entropy_error);
  286 }
  287 
  288 void rob_cross_entropy_error_df(const gsl_vector *theta, void *features, gsl_vector *df){
  289 
  290   vector<pair<int, vector<double> > > *x = (vector<pair<int, vector<double> > > *)features;
  291   int m = x->size();
  292   int n = theta->size;
  293   double epsilon = Constant::label_flip_prob;
  294   double gamma = prob_true_false;
  295   vector<double> grad(n,0);
  296   for(int i=0; i<m; i++){
  297     double sigma = 1/(1 + exp(-1 * activation_f(theta, &(*x)[i].second)));
  298     double g = -1 * ( (*x)[i].first * (1 - epsilon - epsilon*gamma) / (epsilon*gamma + sigma * (1 - epsilon - epsilon*gamma)) + (1 - (*x)[i].first)*(epsilon + epsilon*gamma - 1) / (1 - epsilon*gamma + sigma*(epsilon + epsilon*gamma - 1)) ) * sigma * (1 - sigma);
  299     for(int j=0; j<n; j++){
  300       grad[j] += ( (get_feature(&(*x)[i].second, j) - mean[j]) / se[j] ) * g;
  301       //cout << "gradient of exon 0 ... " << i << " for feature " << j << ": " << grad[j] << endl;
  302     }
  303    
  304   }
  305   for(int j=0; j<n; j++){
  306     gsl_vector_set(df, j, grad[j]);
  307   }
  308 }
  309 
  310 // Compute both f and df together. 
  311 void rob_cross_entropy_error_fdf(const gsl_vector *params, void *features, double *f, gsl_vector *df){
  312 
  313   *f = rob_cross_entropy_error_f(params, features);
  314   rob_cross_entropy_error_df(params, features, df);
  315 }
  316 
  317 
  318 void optimize_parameters(train_data *data){
  319 
  320   // output file for trained log_reg parameter
  321   ofstream paramfile;
  322   if(Properties::hasProperty("param_outfile"))
  323     paramfile.open(Properties::getProperty("param_outfile"));
  324   else
  325     paramfile.open(Constant::configPath + "/cgp/log_reg_parameters_trained.cfg");
  326 
  327   int n = data->num_features;
  328 
  329   gsl_vector *theta;
  330   theta = gsl_vector_alloc(n);
  331   gsl_vector_set_all(theta, 0);
  332   
  333   bool useTFprob = 0;
  334     
  335   if(Constant::rLogReg){
  336     cout << "# using lable-noise robust logistic regression with label flip probability " << Constant::label_flip_prob << endl;
  337     try{
  338       useTFprob = Properties::getBoolProperty("useTFprob");
  339     } catch(...){
  340       useTFprob = 0;
  341     }
  342     if(useTFprob)
  343       prob_true_false = data->prob_true_false_exons;
  344   }
  345 
  346   cout << "# exon parameter training" << endl;
  347  
  348   // random allocation
  349   srand(time(NULL));
  350   cout << "random theta: ";
  351   for(int i =0; i<n; i++){
  352     double r = ( (double) rand() / RAND_MAX );
  353     gsl_vector_set(theta, i, r);
  354     cout << r << "\t";
  355   }
  356   cout << endl;
  357 
  358   data->set_mean_std(data->exon_samples, data->exon_feature_idx);
  359 
  360   bool use_sgd;
  361   try{
  362     use_sgd = Properties::getBoolProperty("use_sgd");
  363   } catch(...){
  364     use_sgd = 0;
  365   }
  366   
  367   if(use_sgd)
  368     sgd(data->exon_samples, theta, n);
  369   else
  370     gsl_minimizer(data->exon_samples, theta, n);
  371 
  372   // output back transformed coefficients
  373   double theta_sum = 0;
  374   for(int i=1; i<n; i++){
  375     theta_sum += gsl_vector_get(theta, i) * mean[i] / se[i];
  376   }
  377   paramfile << "/CompPred/exon_score0\t" << ( gsl_vector_get(theta, 0) - theta_sum ) << endl; 
  378   for(int i=1; i<n; i++)
  379     paramfile << "/CompPred/exon_score" << i << "\t" << ( gsl_vector_get(theta, i) / se[i] ) << endl;
  380   
  381 
  382   cout << "# intron parameter training" << endl;
  383 
  384   // random allocation                                                                                                                
  385   srand(time(NULL));
  386   cout << "random theta: ";
  387   for(int i =0; i<n; i++){
  388     double r = ( (double) rand() / RAND_MAX );
  389     gsl_vector_set(theta, i, r);
  390     cout << r << "\t";
  391   }
  392   cout << endl;
  393 
  394   data->set_mean_std(data->intron_samples, data->intron_feature_idx);
  395 
  396   if(Constant::rLogReg && useTFprob)
  397     prob_true_false = data->prob_true_false_introns;
  398 
  399   if(use_sgd)
  400     sgd(data->intron_samples, theta, n);
  401   else
  402     gsl_minimizer(data->intron_samples, theta, n);
  403   
  404   // output back transformed coefficients
  405   theta_sum = 
  406       (gsl_vector_get(theta, 4) * mean[4] / se[4])
  407     + (gsl_vector_get(theta, 5) * mean[5] / se[5]) 
  408     + (gsl_vector_get(theta, 3) * mean[3] / se[3]);
  409   
  410 
  411   paramfile << "/CompPred/intron_score" << 0 << "\t" << ( gsl_vector_get (theta, 0) - theta_sum ) << endl;
  412   paramfile << "/CompPred/intron_score" << 1 << "\t" << ( gsl_vector_get (theta, 4) / se[4] ) << endl;
  413   paramfile << "/CompPred/intron_score" << 2 << "\t" << ( gsl_vector_get (theta, 5) / se[5] ) << endl;
  414   paramfile << "/CompPred/intron_score" << 3 << "\t" << ( gsl_vector_get (theta, 3) / se[3] ) << endl;
  415   
  416   paramfile.close();
  417 
  418   
  419 }
  420 
/*
 * gsl_minimizer
 *
 * Minimizes the cross entropy error (or its label-noise robust variant if
 * Constant::rLogReg is set) over the given samples with the GSL BFGS2
 * multidimensional minimizer. theta holds the start values and receives
 * the final parameters; n is the number of parameters/features.
 */
void gsl_minimizer(vector<pair<int, vector<double> > > *samples, gsl_vector *theta, int n){

  int iter = 0;
  int status;

  const gsl_multimin_fdfminimizer_type *T;
  gsl_multimin_fdfminimizer *s;

  // optional overrides from the properties file; defaults are kept silently
  try{
    Constant::rLogReg = Properties::getBoolProperty("rLogReg");
    Constant::label_flip_prob = Properties::getdoubleProperty("label_flip_prob");
  }catch(...){}

  // select the plain or the label-noise robust objective and gradient
  gsl_multimin_function_fdf CE_error_f;
  CE_error_f.n = n;
  if(Constant::rLogReg){
    CE_error_f.f = &rob_cross_entropy_error_f;
    CE_error_f.df = &rob_cross_entropy_error_df;
    CE_error_f.fdf = &rob_cross_entropy_error_fdf;
  }else{
    CE_error_f.f = &cross_entropy_error_f;
    CE_error_f.df = &cross_entropy_error_df;
    CE_error_f.fdf = &cross_entropy_error_fdf;  
  }
  
  CE_error_f.params = (void *)samples;

  // NOTE: always prints the non-robust error here, even when rLogReg is set
  cout << "cross entropy test error: " << cross_entropy_error_f(theta, samples) << endl;

  T = gsl_multimin_fdfminimizer_vector_bfgs2;
  s = gsl_multimin_fdfminimizer_alloc (T, n);

  try{
    Constant::GD_stepsize = Properties::getdoubleProperty("GD_stepsize");
  }catch(...){}
  // GD_stepsize is the initial step size; 1e-4 is the line-search tolerance
  gsl_multimin_fdfminimizer_set (s, &CE_error_f, theta, Constant::GD_stepsize, 1e-4);
  
  // iterate until the gradient norm drops below 1e-3, an iteration fails,
  // or the iteration limit is reached
  do{
    iter++;
    status = gsl_multimin_fdfminimizer_iterate (s);
    
    if (status){  // nonzero: the minimizer cannot make progress; stop here
      cout << "gsl error_code: " << status << endl; 
      break;
    }
    status = gsl_multimin_test_gradient (s->gradient, 1e-3);
   
    if (status == GSL_SUCCESS)
      cout << "Minimum found at:\n";
    
    // progress line: iteration number, current parameters, current error
    cout << setw(10) << iter;
    for(int i=0; i<n; i++)
      cout << setw(10) << gsl_vector_get(s->x, i);
    cout << setw(10) << s->f << endl;
    

  }while (status == GSL_CONTINUE && iter < 10000);

  // copy the best point found back into theta for the caller
  for(int i=0; i<n; i++)
    gsl_vector_set(theta, i, gsl_vector_get(s->x, i));

  gsl_multimin_fdfminimizer_free (s);  
  
}
  486 
  487 
  488 void sgd(vector<pair<int, vector<double> > > *samples, gsl_vector *theta, int n){
  489 
  490   random_shuffle ( samples->begin(), samples->end() );
  491 
  492   double learning_rate;
  493   try{
  494     learning_rate = Properties::getdoubleProperty("learning_rate");
  495   }catch(...){
  496     learning_rate = 0.001;
  497   }
  498   int maxInter;
  499   try{
  500     maxInter = Properties::getIntProperty("max_sgd_inter");
  501   }catch(...){
  502     maxInter = 10;
  503   }
  504   int iter = 0;
  505   int num_samples = samples->size();
  506   double curr_lowest_error = numeric_limits<double>::max();
  507   gsl_vector *best_theta = gsl_vector_alloc(n);
  508 
  509   if(iter < maxInter*num_samples){
  510     for(vector<pair<int, vector<double> > >::iterator it = samples->begin(); it != samples->end(); it++){
  511       //cout << "++++++++++ exon with class " <<  it->first << endl;
  512       for(int i=0; i<n; i++){
  513     double sigma = 1/(1 + exp(-1 * activation_f(theta, &it->second)));
  514     //cout << "x_" << i << " = " << ( (get_feature(&it->second, i) - mean[i]) / se[i] ) << "; pred = " << sigma << endl;
  515     double grad = ( it->first - sigma ) * ( (get_feature(&it->second, i) - mean[i]) / se[i] );
  516     //      cout << "gradient = " << grad << endl;
  517     double new_theta_i =  gsl_vector_get(theta, i) + learning_rate * grad;
  518     gsl_vector_set(theta, i, new_theta_i);
  519     //      cout << "theta[" << i << "] = " << gsl_vector_get(theta, i) << endl;
  520       }
  521       
  522       if(iter % 1000 == 0){
  523     double cee = cross_entropy_error_f(theta, samples);
  524     if(cee < curr_lowest_error){
  525       gsl_vector_memcpy(best_theta, theta);
  526       curr_lowest_error = cee;
  527     }
  528     cout << setw(10) << iter;
  529     for(int i=0; i<n; i++)
  530       cout << setw(10) << gsl_vector_get(theta, i);
  531     cout << setw(10) << cee << endl;
  532       }
  533       iter++;
  534     }
  535   }
  536   gsl_vector_memcpy(theta, best_theta);
  537   gsl_vector_free(best_theta);
  538 }
  539 
  540 void trainFeature_from_file(){
  541 
  542   ifstream trainfile;
  543   string buffer;
  544   string filename = Properties::getProperty("trainFeatureFile");
  545   trainfile.open(filename, ifstream::in);
  546   if(!trainfile){
  547     string errmsg = "Could not open the train feature file " + filename + ".";
  548     throw PropertiesError(errmsg);
  549   }
  550   while(getline(trainfile,buffer)){
  551     if(buffer[0] == '#')
  552       continue;
  553     stringstream linestream(buffer);
  554     string chr;
  555     string type;
  556     int startPos;
  557     int endPos;
  558     char strand;
  559     char frame;
  560     string attributes;
  561     linestream >> chr >> buffer >> type >> startPos >> endPos >> buffer >> strand >> frame >> attributes;
  562 
  563     if(type == "exon")
  564       type = "CDS";
  565     if(type == "intron" || type == "CDS"){
  566       stringstream str;
  567       str << type << "\t" << chr << "\t" << startPos<< "\t" << endPos << "\t" << strand << "\t" << frame;
  568       string key = str.str();
  569       
  570       unordered_map<string, pair<int, vector<double> > >::iterator got = Constant::logReg_feature.find(key);
  571       if ( got == Constant::logReg_feature.end() ){
  572     vector<double> feature(14,0);
  573     feature[0] = endPos - startPos + 1;   // exon length    
  574     pair<int, vector<double> > p;
  575     p = make_pair(-1, feature);
  576     pair<string, pair<int, vector<double> > > entry;
  577     entry = make_pair(key, p);
  578     got = Constant::logReg_feature.insert(entry).first;
  579       }
  580 
  581       stringstream s(attributes);
  582       string ap;
  583       vector<string> attr;
  584       while(std::getline(s, ap, ';')){
  585     attr.push_back(ap);
  586       }
  587       for(int i=0; i<attr.size(); i++){
  588     vector<string> attr_pair;
  589     stringstream ss(attr[i]);
  590     while(std::getline(ss, ap, '=')){
  591       attr_pair.push_back(ap);
  592     }
  593     
  594     if (attr_pair[0] == "postProb")
  595       got->second.second[1] = stod(attr_pair[1]); 
  596     else if (attr_pair[0] == "avgBaseProb" || attr_pair[0] == "mbp")
  597       got->second.second[2] = stod(attr_pair[1]);
  598     else if (attr_pair[0] == "Eomega" || attr_pair[0] == "PMomega")
  599       got->second.second[3] = stod(attr_pair[1]);
  600     else if (attr_pair[0] == "VarOmega" || attr_pair[0] == "varOmega")
  601       got->second.second[4] = stod(attr_pair[1]);
  602     else if (attr_pair[0] == "cons")
  603           got->second.second[5] = stod(attr_pair[1]);
  604     else if (attr_pair[0] == "div")
  605           got->second.second[6] = stod(attr_pair[1]);
  606     else if (attr_pair[0] == "containment" || attr_pair[0] == "contain")
  607           got->second.second[7] = stod(attr_pair[1]);
  608     else if (attr_pair[0] == "n" || attr_pair[0] == "numSpecies")
  609           got->second.second[8] = stod(attr_pair[1]);
  610     else if (attr_pair[0] == "oescore")
  611       got->second.second[9] = 1;
  612     else if (attr_pair[0] == "leftBoundaryExtOmega")
  613       got->second.second[10] = stod(attr_pair[1]);
  614     else if (attr_pair[0] == "leftBoundaryIntOmega")
  615           got->second.second[11] = stod(attr_pair[1]);
  616     else if (attr_pair[0] == "rightBoundaryIntOmega")
  617       got->second.second[12] = stod(attr_pair[1]);
  618     else if (attr_pair[0] == "rightBoundaryExtOmega")
  619       got->second.second[13] = stod(attr_pair[1]);
  620       }
  621     }
  622   }
  623 }
  624 
  625 
  626 void reference_from_file(unordered_map<string,int> *ref_class){
  627 
  628   ifstream reffile;
  629   string buffer;
  630   bool found_other_feature = 0;
  631   reffile.open(Constant::referenceFile, ifstream::in);
  632   if(!reffile){
  633     string errmsg = "Could not open the reference exon file " + Constant::referenceFile + ".";
  634     throw PropertiesError(errmsg);
  635   }
  636   while(getline(reffile,buffer)){
  637     if(buffer[0] == '#')  
  638       continue;
  639     stringstream linestream(buffer);
  640     string chr;
  641     string type;
  642     int startPos;
  643     int endPos;
  644     char strand;
  645     char frame;
  646     linestream >> chr >> buffer >> type >> startPos >> endPos >> buffer >> strand >> frame >> buffer;
  647     /*size_t pos = buffer.find("class=");
  648     int ref_class = buffer[pos+6] - '0';
  649     if(ref_class < 0){
  650       ref_class = buffer[pos+7] - '0';
  651       ref_class*= -1;
  652     }
  653     */
  654    
  655     if(type != "intron" && type != "CDS"){
  656       found_other_feature = 1;
  657     }else{
  658       stringstream str;
  659       str << type << "\t" << chr << "\t" << startPos<< "\t" << endPos << "\t" << strand << "\t" << frame;
  660       string key = str.str();
  661       //      cout << "ref_feature: " << key << endl;
  662       pair<string, int> k(key,1);
  663       ref_class->insert(k);
  664     }
  665   }
  666   if(found_other_feature)
  667     cerr << "Warning: reference feature(s) of type other than CDS or intron found! Make sure that relevant features have the correct type definition, e.g. intron or CDS. Note, that the program is case sensitive." << endl;
  668 
  669 }
  670