"Fossies" - the Fresh Open Source Software Archive

Member "speech_tools/stats/wagon/wagon.cc" (22 Nov 2017, 39225 Bytes) of package /linux/misc/speech_tools-2.5.0-release.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "wagon.cc" see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 2.4-release_vs_2.5.0-release.

/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                      Copyright (c) 1996,1997                          */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*                     Author :  Alan W Black                            */
/*                     Date   :  May 1996                                */
/*-----------------------------------------------------------------------*/
/*  A Classification and Regression Tree (CART) Program                  */
/*  A basic implementation of many of the techniques in                  */
/*  Breiman et al. 1984                                                  */
/*                                                                       */
/*  Added decision list support, Feb 1997                                */
/*  Added stepwise use of features, Oct 1997                             */
/*                                                                       */
/*=======================================================================*/

#include <cstdlib>
#include <iostream>
#include <fstream>
#include <cstring>
#include "EST_Token.h"
#include "EST_FMatrix.h"
#include "EST_multistats.h"
#include "EST_Wagon.h"
#include "EST_math.h"

Discretes wgn_discretes;

WDataSet wgn_dataset;
WDataSet wgn_test_dataset;
EST_FMatrix wgn_DistMatrix;
EST_Track wgn_VertexTrack;
EST_Track wgn_VertexFeats;
EST_Track wgn_UnitTrack;

int wgn_min_cluster_size = 50;
int wgn_max_questions = 2000000; /* not ideal, but adequate */
int wgn_held_out = 0;
float wgn_dropout_feats = 0.0;
float wgn_dropout_samples = 0.0;
int wgn_cos = 1;
int wgn_prune = TRUE;
int wgn_quiet = FALSE;
int wgn_verbose = FALSE;
int wgn_count_field = -1;
EST_String wgn_count_field_name = "";
int wgn_predictee = 0;
EST_String wgn_predictee_name = "";
float wgn_float_range_split = 10;
float wgn_balance = 0;
EST_String wgn_opt_param = "";
EST_String wgn_vertex_output = "mean";
EST_String wgn_vertex_otype = "mean";

static float do_summary(WNode &tree,WDataSet &ds,ostream *output);
static float test_tree_float(WNode &tree,WDataSet &ds,ostream *output);
static float test_tree_class(WNode &tree,WDataSet &ds,ostream *output);
static float test_tree_cluster(WNode &tree,WDataSet &dataset,ostream *output);
static float test_tree_vector(WNode &tree,WDataSet &dataset,ostream *output);
static float test_tree_trajectory(WNode &tree,WDataSet &dataset,ostream *output);
static float test_tree_ols(WNode &tree,WDataSet &dataset,ostream *output);
static int wagon_split(int margin,WNode &node);
static WQuestion find_best_question(WVectorVector &dset);
static void construct_binary_ques(int feat,WQuestion &test_ques);
static float construct_float_ques(int feat,WQuestion &ques,WVectorVector &ds);
static float construct_class_ques(int feat,WQuestion &ques,WVectorVector &ds);
static void wgn_set_up_data(WVectorVector &data,const WVectorList &ds,int held_out,int in);
static WNode *wagon_stepwise_find_next_best(float &bscore,int &best_feat);

Declare_TList_T(WVector *, WVectorP)

Declare_TVector_Base_T(WVector *,NULL,NULL,WVectorP)

#if defined(INSTANTIATE_TEMPLATES)
// Instantiate class
#include "../base_class/EST_TList.cc"
#include "../base_class/EST_TVector.cc"

Instantiate_TList_T(WVector *, WVectorP)

Instantiate_TVector(WVector *)

#endif

void wgn_load_datadescription(EST_String fname,LISP ignores)
{
    // Load field description for a file
    wgn_dataset.load_description(fname,ignores);
    wgn_test_dataset.load_description(fname,ignores);
}

void wgn_load_dataset(WDataSet &dataset,EST_String fname)
{
    // Read the data set from a filename.  One vector per line.
    // Assume numeric fields are numbers and non-numeric fields are categorical.
    EST_TokenStream ts;
    WVector *v;
    int nvec=0,i;

    if (ts.open(fname) == -1)
        wagon_error(EST_String("unable to open data file \"")+
                    fname+"\"");
    ts.set_PunctuationSymbols("");
    ts.set_PrePunctuationSymbols("");
    ts.set_SingleCharSymbols("");

    for ( ;!ts.eof(); )
    {
        v = new WVector(dataset.width());
        i = 0;
        do
        {
            int type = dataset.ftype(i);
            if ((type == wndt_float) ||
                (type == wndt_ols) ||
                (wgn_count_field == i))
            {
                // need to ensure this is not NaN or Infinity
                float f = atof(ts.get().string());
                if (isfinite(f))
                    v->set_flt_val(i,f);
                else
                {
                    cout << fname << ": bad float " << f
                         << " in field " <<
                        dataset.feat_name(i) << " vector " <<
                        dataset.samples() << endl;
                    v->set_flt_val(i,0.0);
                }
            }
            else if (type == wndt_binary)
                v->set_int_val(i,atoi(ts.get().string()));
            else if (type == wndt_cluster)  /* index into distmatrix */
                v->set_int_val(i,atoi(ts.get().string()));
            else if (type == wndt_vector)   /* index into VertexTrack */
                v->set_int_val(i,atoi(ts.get().string()));
            else if (type == wndt_trajectory) /* index to index and length */
            {   /* a number pointing to a vector in UnitTrack that */
                /* has an index into VertexTrack and a number of Vertices. */
                /* Thus if it's 15, UnitTrack.a(15,0) is the start frame in */
                /* VertexTrack and UnitTrack.a(15,1) is the number of */
                /* frames in the unit                                 */
                v->set_int_val(i,atoi(ts.get().string()));
            }
            else if (type == wndt_ignore)
            {
                ts.get();  // skip it
                v->set_int_val(i,0);
            }
            else // should check the different classes
            {
                EST_String s = ts.get().string();
                int n = wgn_discretes.discrete(type).name(s);
                if (n == -1)
                {
                    cout << fname << ": bad value " << s << " in field " <<
                        dataset.feat_name(i) << " vector " <<
                        dataset.samples() << endl;
                    n = 0;
                }
                v->set_int_val(i,n);
            }
            i++;
        }
        while (!ts.eoln() && i<dataset.width());
        nvec++;
        if (i != dataset.width())
        {
            wagon_error(fname+": data vector "+itoString(nvec)+" contains "
                        +itoString(i)+" parameters instead of "+
                        itoString(dataset.width()));
        }
        if (!ts.eoln())
        {
            cerr << fname << ": data vector " << nvec <<
                " contains more parameters than the expected " <<
                dataset.width() << endl;
            wagon_error(EST_String("extra parameter(s) from ")+
                        ts.peek().string());
        }
        dataset.append(v);
    }

    cout << "Dataset of " << dataset.samples() << " vectors of " <<
        dataset.width() << " parameters from: " << fname << endl;
    ts.close();
}

float summary_results(WNode &tree,ostream *output)
{
    if (wgn_test_dataset.samples() != 0)
        return do_summary(tree,wgn_test_dataset,output);
    else
        return do_summary(tree,wgn_dataset,output);
}

static float do_summary(WNode &tree,WDataSet &ds,ostream *output)
{
    if (wgn_dataset.ftype(wgn_predictee) == wndt_cluster)
        return test_tree_cluster(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) == wndt_vector)
        return test_tree_vector(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) == wndt_trajectory)
        return test_tree_trajectory(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
        return test_tree_ols(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) >= wndt_class)
        return test_tree_class(tree,ds,output);
    else
        return test_tree_float(tree,ds,output);
}

WNode *wgn_build_tree(float &score)
{
    // Build the initial node and split it while reducing the impurity
    WNode *top = new WNode();
    int margin = 0;

    wgn_set_up_data(top->get_data(),wgn_dataset,wgn_held_out,TRUE);

    margin = 0;
    wagon_split(margin,*top);  // recursively split data

    if (wgn_held_out > 0)
    {
        wgn_set_up_data(top->get_data(),wgn_dataset,wgn_held_out,FALSE);
        top->held_out_prune();
    }

    if (wgn_prune)
        top->prune();

    score = summary_results(*top,0);

    return top;
}

static void wgn_set_up_data(WVectorVector &data,const WVectorList &ds,int held_out,int in)
{
    // Set the data omitting held_out percent if in is true
    // or only including 100-held_out percent if in is false
    int i,j;
    EST_Litem *d;

    // Make it definitely big enough
    data.resize(ds.length());

    for (j=i=0,d=ds.head(); d != 0; d=d->next(),j++)
    {
        if ((in) && ((j%100) >= held_out))
            data[i++] = ds(d);
//      else if ((!in) && ((j%100 < held_out)))
//          data[i++] = ds(d);
        else if (!in)
            data[i++] = ds(d);
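        // Note: with the stricter (j%100 < held_out) filter commented out
        // above, the !in case copies every vector, so held-out pruning is
        // currently evaluated on the full dataset rather than only the
        // held-out portion.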
//      if ((in) && (j < held_out))
//          data[i++] = ds(d);
//      else if ((!in) && (j >= held_out))
//          data[i++] = ds(d);
    }
    // make it the actual size, but don't reset values
    data.resize(i,1);
}

static float test_tree_class(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results
    EST_StrStr_KVL pairs;
    EST_StrList lex;
    EST_Litem *p;
    EST_String predict,real;
    WNode *pnode;
    double H=0,prob;
    int i,type;
    float correct=0,total=0,count=0;

    float bcorrect=0, bpredicted=0, bactual=0;
    float precision=0, recall=0;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        pnode = tree.predict_node((*dataset(p)));
        predict = (EST_String)pnode->get_impurity().value();
        if (wgn_count_field == -1)
            count = 1.0;
        else
            count = dataset(p)->get_flt_val(wgn_count_field);
        prob = pnode->get_impurity().pd().probability(predict);
        H += (log(prob))*count;
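        // Note: H is accumulated with the natural log, but the perplexity
        // below is reported as 2^(-H/total); the bases appear to be mixed
        // (exp() would match the natural log used here).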
        type = dataset.ftype(wgn_predictee);
        real = wgn_discretes[type].name(dataset(p)->get_int_val(wgn_predictee));

        if (wgn_opt_param == "B_NB_F1")
        {
            //cout << real << " " << predict << endl;
            if (real == "B")
                bactual += count;
            if (predict == "B")
            {
                bpredicted += count;
                if (real == predict)
                    bcorrect += count;
            }
            //cout << bactual << " " << bpredicted << " " << bcorrect << endl;
        }
        if (real == predict)
            correct += count;
        total += count;
        pairs.add_item(real,predict,1);
    }
    for (i=0; i<wgn_discretes[dataset.ftype(wgn_predictee)].length(); i++)
        lex.append(wgn_discretes[dataset.ftype(wgn_predictee)].name(i));

    const EST_FMatrix &m = confusion(pairs,lex);

    if (output != NULL)
    {
        print_confusion(m,pairs,lex);  // should be to output not stdout
        *output << ";; entropy " << (-1*(H/total)) << " perplexity " <<
            pow(2.0,(-1*(H/total))) << endl;
    }

    // Negate it so bigger is better
    if (wgn_opt_param == "entropy")
        return -pow(2.0,(-1*(H/total)));
    else if (wgn_opt_param == "B_NB_F1")
    {
        if (bpredicted == 0)
            precision = 1;
        else
            precision = bcorrect/bpredicted;
        if (bactual == 0)
            recall = 1;
        else
            recall = bcorrect/bactual;
        float fmeasure = 0;
        if ((precision+recall) != 0)
            fmeasure = 2*(precision*recall)/(precision+recall);
        cout << "F1 :" << fmeasure << " Prec:" << precision << " Rec:" << recall
             << " B-Pred:" << bpredicted << " B-Actual:" << bactual
             << " B-Correct:" << bcorrect << endl;
        return fmeasure;
    }
    else
        return (float)correct/(float)total;
}

static float test_tree_vector(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results VECTOR
    // distance is calculated in zscores (as the values in the vector may
    // have quite different ranges)
    WNode *leaf;
    EST_Litem *p;
    float predict, actual;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    EST_SuffStats b;
    int i,j,pos;
    double cor,error;
    double count;
    EST_Litem *pp;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        pos = dataset(p)->get_int_val(wgn_predictee);
        for (j=0; j<wgn_VertexFeats.num_channels(); j++)
            if (wgn_VertexFeats.a(0,j) > 0.0)
            {
                b.reset();
                for (pp=leaf->get_impurity().members.head(); pp != 0; pp=pp->next())
                {
                    i = leaf->get_impurity().members.item(pp);
                    b += wgn_VertexTrack.a(i,j);
                }
                predict = b.mean();
                actual = wgn_VertexTrack.a(pos,j);
                if (wgn_count_field == -1)
                    count = 1.0;
                else
                    count = dataset(p)->get_flt_val(wgn_count_field);
                x.cumulate(predict,count);
                y.cumulate(actual,count);
                /* Normalize the error by the standard deviation */
                if (b.stddev() == 0)
                    error = predict-actual;
                else
                    error = (predict-actual)/b.stddev();
                error = predict-actual; /* awb_debug */
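                // Note: this awb_debug line overrides the z-score
                // normalization computed just above, so the raw error
                // is what actually gets accumulated.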
                se.cumulate((error*error),count);
                e.cumulate(fabs(error),count);
                xx.cumulate(predict*predict,count);
                yy.cumulate(actual*actual,count);
                xy.cumulate(predict*actual,count);
            }
    }

    // Pearson's product moment correlation coefficient
//    cor = (xy.mean() - (x.mean()*y.mean()))/
//  (sqrt(xx.mean()-(x.mean()*x.mean())) *
//   sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, thus causing the sqrts to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)   // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // * -1 so bigger is better
    else
        return cor;  // should really be % variance, I think
}

static float test_tree_trajectory(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results TRAJECTORY
    // distance is calculated in zscores (as the values in the vector may
    // have quite different ranges)
    // NOT WRITTEN YET
    WNode *leaf;
    EST_Litem *p;
    float predict, actual;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    EST_SuffStats b;
    int i,j,pos;
    double cor,error;
    double count;
    EST_Litem *pp;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        pos = dataset(p)->get_int_val(wgn_predictee);
        for (j=0; j<wgn_VertexFeats.num_channels(); j++)
            if (wgn_VertexFeats.a(0,j) > 0.0)
            {
                b.reset();
                for (pp=leaf->get_impurity().members.head(); pp != 0; pp=pp->next())
                {
                    i = leaf->get_impurity().members.item(pp);
                    b += wgn_VertexTrack.a(i,j);
                }
                predict = b.mean();
                actual = wgn_VertexTrack.a(pos,j);
                if (wgn_count_field == -1)
                    count = 1.0;
                else
                    count = dataset(p)->get_flt_val(wgn_count_field);
                x.cumulate(predict,count);
                y.cumulate(actual,count);
                /* Normalize the error by the standard deviation */
                if (b.stddev() == 0)
                    error = predict-actual;
                else
                    error = (predict-actual)/b.stddev();
                error = predict-actual; /* awb_debug */
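                // Note: as in test_tree_vector, this awb_debug line
                // overrides the z-score normalization above.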
                se.cumulate((error*error),count);
                e.cumulate(fabs(error),count);
                xx.cumulate(predict*predict,count);
                yy.cumulate(actual*actual,count);
                xy.cumulate(predict*actual,count);
            }
    }

    // Pearson's product moment correlation coefficient
//    cor = (xy.mean() - (x.mean()*y.mean()))/
//  (sqrt(xx.mean()-(x.mean()*x.mean())) *
//   sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, thus causing the sqrts to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)   // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // * -1 so bigger is better
    else
        return cor;  // should really be % variance, I think
}

static float test_tree_cluster(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results for cluster trees
    WNode *leaf;
    int real;
    int right_cluster=0;
    EST_SuffStats ranking, meandist;
    EST_Litem *p;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        real = dataset(p)->get_int_val(wgn_predictee);
        meandist += leaf->get_impurity().cluster_distance(real);
        right_cluster += leaf->get_impurity().in_cluster(real);
        ranking += leaf->get_impurity().cluster_ranking(real);
    }

    if (output != NULL)
    {
        // Want number in right class, mean distance in sds, mean ranking
        if (output != &cout)   // save in output file
            *output << ";; Right cluster " << right_cluster << " (" <<
                (int)(100.0*(float)right_cluster/(float)dataset.length()) <<
                "%) mean ranking " << ranking.mean() << " mean distance "
                << meandist.mean() << endl;
        cout << "Right cluster " << right_cluster << " (" <<
            (int)(100.0*(float)right_cluster/(float)dataset.length()) <<
            "%) mean ranking " << ranking.mean() << " mean distance "
            << meandist.mean() << endl;
    }

    return 10000-meandist.mean();  // this doesn't work but I tested it
}

static float test_tree_float(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results FLOAT
    EST_Litem *p;
    float predict,real;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    double cor,error;
    double count;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        predict = tree.predict((*dataset(p)));
        real = dataset(p)->get_flt_val(wgn_predictee);
        if (wgn_count_field == -1)
            count = 1.0;
        else
            count = dataset(p)->get_flt_val(wgn_count_field);
        x.cumulate(predict,count);
        y.cumulate(real,count);
        error = predict-real;
        se.cumulate((error*error),count);
        e.cumulate(fabs(error),count);
        xx.cumulate(predict*predict,count);
        yy.cumulate(real*real,count);
        xy.cumulate(predict*real,count);
    }

    // Pearson's product moment correlation coefficient
//    cor = (xy.mean() - (x.mean()*y.mean()))/
//  (sqrt(xx.mean()-(x.mean()*x.mean())) *
//   sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, thus causing the sqrts to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)   // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // * -1 so bigger is better
    else
        return cor;  // should really be % variance, I think
}

static float test_tree_ols(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results OLS
    EST_Litem *p;
    WNode *leaf;
    float predict,real;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    double cor,error;
    double count;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        // do ols to get predict
        predict = 0.0;  // This is incomplete!  You need to use leaf
        real = dataset(p)->get_flt_val(wgn_predictee);
        if (wgn_count_field == -1)
            count = 1.0;
        else
            count = dataset(p)->get_flt_val(wgn_count_field);
        x.cumulate(predict,count);
        y.cumulate(real,count);
        error = predict-real;
        se.cumulate((error*error),count);
        e.cumulate(fabs(error),count);
        xx.cumulate(predict*predict,count);
        yy.cumulate(real*real,count);
        xy.cumulate(predict*real,count);
    }

    // Pearson's product moment correlation coefficient
//    cor = (xy.mean() - (x.mean()*y.mean()))/
//  (sqrt(xx.mean()-(x.mean()*x.mean())) *
//   sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, thus causing the sqrts to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)   // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // * -1 so bigger is better
    else
        return cor;  // should really be % variance, I think
}

static int wagon_split(int margin, WNode &node)
{
    // Split given node (if possible)
    WQuestion q;
    WNode *l,*r;

    node.set_impurity(WImpurity(node.get_data()));
    if (wgn_max_questions < 1)
        return FALSE;

    q = find_best_question(node.get_data());

/*    printf("q.score() %f impurity %f\n",
       q.get_score(),
       node.get_impurity().measure()); */

    double impurity_measure = node.get_impurity().measure();
    double question_score = q.get_score();

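    // A split is accepted only when some question was found (score below
    // WGN_HUGE_VAL) and it strictly reduces the node's impurity.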
    if ((question_score < WGN_HUGE_VAL) &&
        (question_score < impurity_measure))
    {
        // OK, it's worth a split
        l = new WNode();
        r = new WNode();
        wgn_find_split(q,node.get_data(),l->get_data(),r->get_data());
        node.set_subnodes(l,r);
        node.set_question(q);
        if (wgn_verbose)
        {
            int i;
            for (i=0; i < margin; i++)
                cout << " ";
            cout << q << endl;
        }
        wgn_max_questions--;
        margin++;
        wagon_split(margin,*l);
        margin++;
        wagon_split(margin,*r);
        margin--;
        return TRUE;
    }
    else
    {
        if (wgn_verbose)
        {
            int i;
            for (i=0; i < margin; i++)
                cout << " ";
            cout << "stopped samples: " << node.samples() << " impurity: "
                 << node.get_impurity() << endl;
        }
        margin--;
        return FALSE;
    }
}

void wgn_find_split(WQuestion &q,WVectorVector &ds,
                    WVectorVector &y,WVectorVector &n)
{
    int i, iy, in;

    if (wgn_dropout_samples > 0.0)
    {
        // We need to count the number of yes/no again over all of ds
        for (iy=in=i=0; i < ds.n(); i++)
            if (q.ask(*ds(i)) == TRUE)
                iy++;
            else
                in++;
    }
    else
    {
        // The current counts are correct (as all the data was used)
        iy = q.get_yes();
        in = q.get_no();
    }

    y.resize(iy);
    n.resize(in);

    for (iy=in=i=0; i < ds.n(); i++)
        if (q.ask(*ds(i)) == TRUE)
            y[iy++] = ds(i);
        else
            n[in++] = ds(i);
}

static float wgn_random_number(float x)
{
    // Returns a random number between 0 and x
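    // (assumes RAND_MAX matches the range of random(); this holds on
    // glibc, where both rand() and random() top out at 2^31-1)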
    return (((float)random())/RAND_MAX)*x;
}

#ifdef OMP_WAGON
static WQuestion find_best_question(WVectorVector &dset)
{
    //  Ask all possible questions and find the best one
    int i;
    float bscore,tscore;
    WQuestion test_ques, best_ques;
    WQuestion **questions = new WQuestion*[wgn_dataset.width()];
    float *scores = new float[wgn_dataset.width()];
    bscore = tscore = WGN_HUGE_VAL;
    best_ques.set_score(bscore);
#pragma omp parallel
#pragma omp for
    for (i=0; i < wgn_dataset.width(); i++)
    {
        questions[i] = new WQuestion;
        questions[i]->set_score(bscore);
    }
#pragma omp parallel
#pragma omp for
    for (i=0; i < wgn_dataset.width(); i++)
    {
        if ((wgn_dataset.ignore(i) == TRUE) ||
            (i == wgn_predictee))
            scores[i] = WGN_HUGE_VAL;     // ignore this feature this time
        else if (wgn_random_number(1.0) < wgn_dropout_feats)
            scores[i] = WGN_HUGE_VAL;     // randomly drop out this feature
        else if (wgn_dataset.ftype(i) == wndt_binary)
        {
            construct_binary_ques(i,*questions[i]);
            scores[i] = wgn_score_question(*questions[i],dset);
        }
        else if (wgn_dataset.ftype(i) == wndt_float)
        {
            scores[i] = construct_float_ques(i,*questions[i],dset);
        }
        else if (wgn_dataset.ftype(i) == wndt_ignore)
            scores[i] = WGN_HUGE_VAL;    // always ignore this feature
#if 0
        // This doesn't work reasonably
        else if (wgn_csubset && (wgn_dataset.ftype(i) >= wndt_class))
        {
            wagon_error("subset selection temporarily deleted");
            tscore = construct_class_ques_subset(i,test_ques,dset);
        }
#endif
        else if (wgn_dataset.ftype(i) >= wndt_class)
            scores[i] = construct_class_ques(i,*questions[i],dset);
    }
    for (i=0; i < wgn_dataset.width(); i++)
    {
        if (scores[i] < bscore)
        {
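            // Note: memcpy here assumes WQuestion is trivially copyable;
            // plain assignment (as in the non-OMP version below) would be
            // the safer choice.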
            memcpy(&best_ques,questions[i],sizeof(*questions[i]));
            best_ques.set_score(scores[i]);
            bscore = scores[i];
        }
        delete questions[i];
    }
    delete [] questions;
    delete [] scores;
    return best_ques;
}
#else
// No OMP parallelism
static WQuestion find_best_question(WVectorVector &dset)
{
    //  Ask all possible questions and find the best one
    int i;
    float bscore,tscore;
    WQuestion test_ques, best_ques;

    bscore = tscore = WGN_HUGE_VAL;
    best_ques.set_score(bscore);
    // test each feature with each possible question
    for (i=0; i < wgn_dataset.width(); i++)
    {
        if ((wgn_dataset.ignore(i) == TRUE) ||
            (i == wgn_predictee))
            tscore = WGN_HUGE_VAL;     // ignore this feature this time
        else if (wgn_random_number(1.0) < wgn_dropout_feats)
            tscore = WGN_HUGE_VAL;     // randomly drop out this feature
        else if (wgn_dataset.ftype(i) == wndt_binary)
        {
            construct_binary_ques(i,test_ques);
            tscore = wgn_score_question(test_ques,dset);
        }
        else if (wgn_dataset.ftype(i) == wndt_float)
        {
            tscore = construct_float_ques(i,test_ques,dset);
        }
        else if (wgn_dataset.ftype(i) == wndt_ignore)
            tscore = WGN_HUGE_VAL;    // always ignore this feature
#if 0
        // This doesn't work reasonably
        else if (wgn_csubset && (wgn_dataset.ftype(i) >= wndt_class))
        {
            wagon_error("subset selection temporarily deleted");
            tscore = construct_class_ques_subset(i,test_ques,dset);
        }
#endif
        else if (wgn_dataset.ftype(i) >= wndt_class)
            tscore = construct_class_ques(i,test_ques,dset);
        if (tscore < bscore)
        {
            best_ques = test_ques;
            best_ques.set_score(tscore);
            bscore = tscore;
        }
    }

    return best_ques;
}
#endif

static float construct_class_ques(int feat,WQuestion &ques,WVectorVector &ds)
{
    // Find out which member of a class gives the best split
    float tscore,bscore = WGN_HUGE_VAL;
    int cl;
    WQuestion test_q;

    test_q.set_fp(feat);
    test_q.set_oper(wnop_is);
    ques = test_q;

    for (cl=0; cl < wgn_discretes[wgn_dataset.ftype(feat)].length(); cl++)
    {
        test_q.set_operand1(EST_Val(cl));
        tscore = wgn_score_question(test_q,ds);
        if (tscore < bscore)
        {
            ques = test_q;
            bscore = tscore;
        }
    }

    return bscore;
}

#if 0
static float construct_class_ques_subset(int feat,WQuestion &ques,
                                         WVectorVector &ds)
{
    // Find out which subset of a class gives the best split.
    // We first measure the split of the data for each member of
    // the class.  Then we order those splits and go through the
    // ordered list finding where the best split is.  This is described
    // on page 247 of Breiman et al.
    float tscore,bscore = WGN_HUGE_VAL;
    LISP l;
    int cl;

    ques.set_fp(feat);
    ques.set_oper(wnop_is);
    float *scores = new float[wgn_discretes[wgn_dataset.ftype(feat)].length()];

    // Only do it for existing values
    for (cl=0; cl < wgn_discretes[wgn_dataset.ftype(feat)].length(); cl++)
    {
        ques.set_operand(flocons(cl));
        scores[cl] = wgn_score_question(ques,ds);
    }

    LISP order = sort_class_scores(feat,scores);
    if (order == NIL)
        return WGN_HUGE_VAL;
    if (siod_llength(order) == 1)
    {   // Only one so we know the best "split"
        ques.set_oper(wnop_is);
        ques.set_operand(car(order));
        return scores[get_c_int(car(order))];
    }

    ques.set_oper(wnop_in);
    LISP best_l = NIL;
    for (l=cdr(order); CDR(l) != NIL; l = cdr(l))
    {
        ques.set_operand(l);
        tscore = wgn_score_question(ques,ds);
        if (tscore < bscore)
        {
            best_l = l;
            bscore = tscore;
        }
    }

    if (best_l != NIL)
    {
        if (siod_llength(best_l) == 1)
        {
            ques.set_oper(wnop_is);
            ques.set_operand(car(best_l));
        }
        else if (equal(cdr(order),best_l) != NIL)
        {
            ques.set_oper(wnop_is);
            ques.set_operand(car(order));
        }
        else
        {
            cout << "Found a good subset" << endl;
            ques.set_operand(best_l);
        }
    }
    return bscore;
}

static LISP sort_class_scores(int feat,float *scores)
{
    // returns a sorted list of the (non WGN_HUGE_VAL) items
    int i;
    LISP items = NIL;
    LISP l;

    for (i=0; i < wgn_discretes[wgn_dataset.ftype(feat)].length(); i++)
    {
        if (scores[i] != WGN_HUGE_VAL)
        {
            if (items == NIL)
                items = cons(flocons(i),NIL);
            else
            {
                for (l=items; l != NIL; l=cdr(l))
                {
                    if (scores[i] < scores[get_c_int(car(l))])
                    {
                        CDR(l) = cons(car(l),cdr(l));
                        CAR(l) = flocons(i);
                        break;
                    }
                }
                if (l == NIL)
                    items = l_append(items,cons(flocons(i),NIL));
            }
        }
    }
    return items;
}
#endif

static float construct_float_ques(int feat,WQuestion &ques,WVectorVector &ds)
{
    // Find a split of the range that gives the best score
    // Naively does this by partitioning the range into float_range_split slots
    float tscore,bscore = WGN_HUGE_VAL;
    int d, i;
    float p;
    WQuestion test_q;
    float max,min,val,incr;

    test_q.set_fp(feat);
    test_q.set_oper(wnop_lessthan);
    ques = test_q;

    min = max = ds(0)->get_flt_val(feat);  /* set up some value */
    for (d=0; d < ds.n(); d++)
    {
        val = ds(d)->get_flt_val(feat);
        if (val < min)
            min = val;
        else if (val > max)
            max = val;
    }
    if (max == min)  // we're pure
        return WGN_HUGE_VAL;
    incr = (max-min)/wgn_float_range_split;
    // so do float_range-1 splits
    /* We calculate this based on the number of splits, not the increments, */
    /* because incr can be so small it doesn't increment p */
    for (i=0,p=min+incr; i < wgn_float_range_split; i++,p += incr)
    {
        test_q.set_operand1(EST_Val(p));
        tscore = wgn_score_question(test_q,ds);
        if (tscore < bscore)
        {
            ques = test_q;
            bscore = tscore;
        }
    }

    return bscore;
}

static void construct_binary_ques(int feat,WQuestion &test_ques)
{
    // construct a question.  Not sure about this in general;
    // of course continuous/categorical features will require a different
    // rule and non-binary ones will require some test point

    test_ques.set_fp(feat);
    test_ques.set_oper(wnop_binary);
    test_ques.set_operand1(EST_Val(""));
}

static float score_question_set(WQuestion &q, WVectorVector &ds, int ignorenth)
{
    // score this question as a possible split by finding
    // the sum of the impurities when ds is split with this question
    WImpurity y,n;
    int d, num_yes, num_no;
    float count;
    WVector *wv;

    num_yes = num_no = 0;
    y.data = &ds;
    n.data = &ds;
    for (d=0; d < ds.n(); d++)
    {
        if (wgn_random_number(1.0) < wgn_dropout_samples)
        {
            continue;  // drop out this sample
        }
        else if ((ignorenth < 2) ||
                 (d%ignorenth != ignorenth-1))
        {
            wv = ds(d);
            if (wgn_count_field == -1)
                count = 1.0;
            else
                count = (*wv)[wgn_count_field];

            if (q.ask(*wv) == TRUE)
            {
                num_yes++;
                if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
                    y.cumulate(d,count);  // note the sample number, not the value
                else
                    y.cumulate((*wv)[wgn_predictee],count);
            }
            else
            {
                num_no++;
                if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
                    n.cumulate(d,count);  // note the sample number, not the value
                else
                    n.cumulate((*wv)[wgn_predictee],count);
            }
        }
    }

    q.set_yes(num_yes);
    q.set_no(num_no);

    int min_cluster;

    if ((wgn_balance == 0.0) ||
        (ds.n()/wgn_balance < wgn_min_cluster_size))
        min_cluster = wgn_min_cluster_size;
    else
        min_cluster = (int)(ds.n()/wgn_balance);

    if ((y.samples() < min_cluster) ||
        (n.samples() < min_cluster))
        return WGN_HUGE_VAL;

    float ym,nm,bm;
    //    printf("awb_debug score_question_set X %f Y %f\n",
    //           y.samples(), n.samples());
    ym = y.measure();
    nm = n.measure();
    bm = ym + nm;

    /*    cout << q << endl;
    printf("test question y %f n %f b %f\n",
           ym, nm, bm); */

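    // Halving the combined impurity rescales every candidate's score by
    // the same constant, so it does not change which question wins.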
    return bm/2.0;
}

float wgn_score_question(WQuestion &q, WVectorVector &ds)
{
    // This level of indirection was introduced for later expansion

    return score_question_set(q,ds,1);
}

WNode *wagon_stepwise(float limit)
{
    // Find the best single feature and then incrementally add the features
    // that best improve the result until it no longer improves.
    // This is basically to automate what Kurt was doing in building
    // trees; he then automated it in Perl, and as it seemed to work
    // I put it into wagon itself.
    // This can be pretty computationally intensive.
    WNode *best = 0,*new_best = 0;
    float bscore,best_score = -WGN_HUGE_VAL;
    int best_feat,i;
    int nf = 1;

    // Set all features to ignore
    for (i=0; i < wgn_dataset.width(); i++)
        wgn_dataset.set_ignore(i,TRUE);

    for (i=0; i < wgn_dataset.width(); i++)
    {
        if ((wgn_dataset.ftype(i) == wndt_ignore) || (i == wgn_predictee))
        {
            // Skip this round: not because the feature is (user specified)
            // ignored, but because it means there is one less cycle that
            // is necessary
            continue;
        }
        new_best = wagon_stepwise_find_next_best(bscore,best_feat);

        if ((bscore - fabs(bscore * (limit/100))) <= best_score)
        {
            // gone as far as we can
            delete new_best;
            break;
        }
        else
        {
            best_score = bscore;
            delete best;
            best = new_best;
            wgn_dataset.set_ignore(best_feat,FALSE);
            if (!wgn_quiet)
            {
                fprintf(stdout,"FEATURE    %d %s: %2.4f\n",
                        nf,
                        (const char *)wgn_dataset.feat_name(best_feat),
                        best_score);
                fflush(stdout);
                nf++;
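                // (nf only advances when printing; harmless, since it is
                // only used in this progress message)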
            }
        }
    }

    return best;
}

static WNode *wagon_stepwise_find_next_best(float &bscore,int &best_feat)
{
    // Find which of the currently ignored features will best improve
    // the result
    WNode *best = 0;
    float best_score = -WGN_HUGE_VAL;
    int best_new_feat = -1;
    int i;

    for (i=0; i < wgn_dataset.width(); i++)
    {
        if (wgn_dataset.ftype(i) == wndt_ignore)
            continue; // user wants me to ignore this completely
        else if (i == wgn_predictee) // can't use the answer
            continue;
        else if (wgn_dataset.ignore(i) == TRUE)
        {
            WNode *current;
            float score;

            // Allow this feature to participate
            wgn_dataset.set_ignore(i,FALSE);

            current = wgn_build_tree(score);

            if (score > best_score)
            {
                best_score = score;
                delete best;
                best = current;
                best_new_feat = i;
//              fprintf(stdout,"BETTER FEATURE    %d %s: %2.4f\n",
//                      i,
//                      (const char *)wgn_dataset.feat_name(i),
//                      best_score);
//              fflush(stdout);
            }
            else
                delete current;

            // switch it off again
            wgn_dataset.set_ignore(i,TRUE);
        }
    }

    bscore = best_score;
    best_feat = best_new_feat;
    return best;
}