"Fossies" - the Fresh Open Source Software Archive 
/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                       Copyright (c) 1996,1997                         */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*                 Author :  Alan W Black                                */
/*                 Date   :  May 1996                                    */
/*-----------------------------------------------------------------------*/
/*  A Classification and Regression Tree (CART) Program                  */
/*  A basic implementation of many of the techniques in                  */
/*  Breiman et al. 1984                                                  */
/*                                                                       */
/*  Added decision list support, Feb 1997                                */
/*  Added stepwise use of features, Oct 1997                             */
/*                                                                       */
/*=======================================================================*/

#include <cstdlib>
#include <iostream>
#include <fstream>
#include <cstring>
#include "EST_Token.h"
#include "EST_FMatrix.h"
#include "EST_multistats.h"
#include "EST_Wagon.h"
#include "EST_math.h"

Discretes wgn_discretes;

WDataSet wgn_dataset;
WDataSet wgn_test_dataset;
EST_FMatrix wgn_DistMatrix;
EST_Track wgn_VertexTrack;
EST_Track wgn_VertexFeats;
EST_Track wgn_UnitTrack;

int wgn_min_cluster_size = 50;
int wgn_max_questions = 2000000; /* not ideal, but adequate */
int wgn_held_out = 0;
float wgn_dropout_feats = 0.0;
float wgn_dropout_samples = 0.0;
int wgn_cos = 1;
int wgn_prune = TRUE;
int wgn_quiet = FALSE;
int wgn_verbose = FALSE;
int wgn_count_field = -1;
EST_String wgn_count_field_name = "";
int wgn_predictee = 0;
EST_String wgn_predictee_name = "";
float wgn_float_range_split = 10;
float wgn_balance = 0;
EST_String wgn_opt_param = "";
EST_String wgn_vertex_output = "mean";
EST_String wgn_vertex_otype = "mean";
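
/* Typical driver sequence (a sketch only; wagon's command-line main()
 * does roughly this -- the variable names here are illustrative):
 *   wgn_load_datadescription(descfname, ignore_list);
 *   wgn_load_dataset(wgn_dataset, datafname);
 *   float score;
 *   WNode *tree = wgn_build_tree(score);
 */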

static float do_summary(WNode &tree,WDataSet &ds,ostream *output);
static float test_tree_float(WNode &tree,WDataSet &ds,ostream *output);
static float test_tree_class(WNode &tree,WDataSet &ds,ostream *output);
static float test_tree_cluster(WNode &tree,WDataSet &dataset,ostream *output);
static float test_tree_vector(WNode &tree,WDataSet &dataset,ostream *output);
static float test_tree_trajectory(WNode &tree,WDataSet &dataset,ostream *output);
static float test_tree_ols(WNode &tree,WDataSet &dataset,ostream *output);
static int wagon_split(int margin,WNode &node);
static WQuestion find_best_question(WVectorVector &dset);
static void construct_binary_ques(int feat,WQuestion &test_ques);
static float construct_float_ques(int feat,WQuestion &ques,WVectorVector &ds);
static float construct_class_ques(int feat,WQuestion &ques,WVectorVector &ds);
static void wgn_set_up_data(WVectorVector &data,const WVectorList &ds,int held_out,int in);
static WNode *wagon_stepwise_find_next_best(float &bscore,int &best_feat);

Declare_TList_T(WVector *, WVectorP)

Declare_TVector_Base_T(WVector *,NULL,NULL,WVectorP)

#if defined(INSTANTIATE_TEMPLATES)
// Instantiate class
#include "../base_class/EST_TList.cc"
#include "../base_class/EST_TVector.cc"

Instantiate_TList_T(WVector *, WVectorP)

Instantiate_TVector(WVector *)

#endif

void wgn_load_datadescription(EST_String fname,LISP ignores)
{
    // Load field description for a file
    wgn_dataset.load_description(fname,ignores);
    wgn_test_dataset.load_description(fname,ignores);
}

void wgn_load_dataset(WDataSet &dataset,EST_String fname)
{
    // Read the data set from a file, one vector per line.
    // Tokens that parse as numbers are treated as numeric values;
    // anything else is treated as categorical.
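    // A vector is one line of whitespace-separated tokens, one per
    // declared field, e.g. (illustrative values only):
    //   0.231 sil 1 0.02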
    EST_TokenStream ts;
    WVector *v;
    int nvec=0,i;

    if (ts.open(fname) == -1)
        wagon_error(EST_String("unable to open data file \"")+
                    fname+"\"");
    ts.set_PunctuationSymbols("");
    ts.set_PrePunctuationSymbols("");
    ts.set_SingleCharSymbols("");

    for ( ; !ts.eof(); )
    {
        v = new WVector(dataset.width());
        i = 0;
        do
        {
            int type = dataset.ftype(i);
            if ((type == wndt_float) ||
                (type == wndt_ols) ||
                (wgn_count_field == i))
            {
                // need to ensure this is not NaN or Infinity
                float f = atof(ts.get().string());
                if (isfinite(f))
                    v->set_flt_val(i,f);
                else
                {
                    cout << fname << ": bad float " << f
                         << " in field " <<
                        dataset.feat_name(i) << " vector " <<
                        dataset.samples() << endl;
                    v->set_flt_val(i,0.0);
                }
            }
            else if (type == wndt_binary)
                v->set_int_val(i,atoi(ts.get().string()));
            else if (type == wndt_cluster)   /* index into distmatrix  */
                v->set_int_val(i,atoi(ts.get().string()));
            else if (type == wndt_vector)    /* index into VertexTrack */
                v->set_int_val(i,atoi(ts.get().string()));
            else if (type == wndt_trajectory) /* index to index and length */
            {   /* a number pointing to a vector in UnitTrack that has   */
                /* an index into VertexTrack and a number of vertices.   */
                /* Thus if it is 15, UnitTrack.a(15,0) is the start      */
                /* frame in VertexTrack and UnitTrack.a(15,1) is the     */
                /* number of frames in the unit                          */
                v->set_int_val(i,atoi(ts.get().string()));
            }
            else if (type == wndt_ignore)
            {
                ts.get();  // skip it
                v->set_int_val(i,0);
            }
            else // should check the different classes
            {
                EST_String s = ts.get().string();
                int n = wgn_discretes.discrete(type).name(s);
                if (n == -1)
                {
                    cout << fname << ": bad value " << s << " in field " <<
                        dataset.feat_name(i) << " vector " <<
                        dataset.samples() << endl;
                    n = 0;
                }
                v->set_int_val(i,n);
            }
            i++;
        }
        while (!ts.eoln() && i<dataset.width());
        nvec++;
        if (i != dataset.width())
        {
            wagon_error(fname+": data vector "+itoString(nvec)+" contains "
                        +itoString(i)+" parameters instead of "+
                        itoString(dataset.width()));
        }
        if (!ts.eoln())
        {
            cerr << fname << ": data vector " << nvec <<
                " contains more than the expected "
                 << dataset.width() << " parameters" << endl;
            wagon_error(EST_String("extra parameter(s) from ")+
                        ts.peek().string());
        }
        dataset.append(v);
    }

    cout << "Dataset of " << dataset.samples() << " vectors of " <<
        dataset.width() << " parameters from: " << fname << endl;
    ts.close();
}

float summary_results(WNode &tree,ostream *output)
{
    if (wgn_test_dataset.samples() != 0)
        return do_summary(tree,wgn_test_dataset,output);
    else
        return do_summary(tree,wgn_dataset,output);
}

static float do_summary(WNode &tree,WDataSet &ds,ostream *output)
{
    if (wgn_dataset.ftype(wgn_predictee) == wndt_cluster)
        return test_tree_cluster(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) == wndt_vector)
        return test_tree_vector(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) == wndt_trajectory)
        return test_tree_trajectory(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
        return test_tree_ols(tree,ds,output);
    else if (wgn_dataset.ftype(wgn_predictee) >= wndt_class)
        return test_tree_class(tree,ds,output);
    else
        return test_tree_float(tree,ds,output);
}

WNode *wgn_build_tree(float &score)
{
    // Build init node and split it while reducing the impurity
    WNode *top = new WNode();
    int margin = 0;

    wgn_set_up_data(top->get_data(),wgn_dataset,wgn_held_out,TRUE);

    margin = 0;
    wagon_split(margin,*top);  // recursively split data

    if (wgn_held_out > 0)
    {
        wgn_set_up_data(top->get_data(),wgn_dataset,wgn_held_out,FALSE);
        top->held_out_prune();
    }

    if (wgn_prune)
        top->prune();

    score = summary_results(*top,0);

    return top;
}

static void wgn_set_up_data(WVectorVector &data,const WVectorList &ds,int held_out,int in)
{
    // If in is true, include the data omitting the first held_out
    // percent of each 100 vectors.  If in is false the code below
    // currently includes *all* vectors (the variant that selects only
    // the held-out portion is left commented out).
    int i,j;
    EST_Litem *d;

    // Make it definitely big enough
    data.resize(ds.length());

    for (j=i=0,d=ds.head(); d != 0; d=d->next(),j++)
    {
        if ((in) && ((j%100) >= held_out))
            data[i++] = ds(d);
        // else if ((!in) && ((j%100 < held_out)))
        //     data[i++] = ds(d);
        else if (!in)
            data[i++] = ds(d);
        // if ((in) && (j < held_out))
        //     data[i++] = ds(d);
        // else if ((!in) && (j >= held_out))
        //     data[i++] = ds(d);
    }
    // make it the actual size, but don't reset values
    data.resize(i,1);
}

static float test_tree_class(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results
    EST_StrStr_KVL pairs;
    EST_StrList lex;
    EST_Litem *p;
    EST_String predict,real;
    WNode *pnode;
    double H=0,prob;
    int i,type;
    float correct=0,total=0,count=0;

    float bcorrect=0, bpredicted=0, bactual=0;
    float precision=0, recall=0;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        pnode = tree.predict_node((*dataset(p)));
        predict = (EST_String)pnode->get_impurity().value();
        if (wgn_count_field == -1)
            count = 1.0;
        else
            count = dataset(p)->get_flt_val(wgn_count_field);
        prob = pnode->get_impurity().pd().probability(predict);
        H += (log(prob))*count;
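        // (H accumulates count-weighted log-probabilities of the
        // predicted class; it is reported below as entropy -H/total
        // and perplexity 2^(-H/total))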
        type = dataset.ftype(wgn_predictee);
        real = wgn_discretes[type].name(dataset(p)->get_int_val(wgn_predictee));

        if (wgn_opt_param == "B_NB_F1")
        {
            //cout << real << " " << predict << endl;
            if (real == "B")
                bactual += count;
            if (predict == "B")
            {
                bpredicted += count;
                if (real == predict)
                    bcorrect += count;
            }
            // cout << bactual << " " << bpredicted << " " << bcorrect << endl;
        }
        if (real == predict)
            correct += count;
        total += count;
        pairs.add_item(real,predict,1);
    }
    for (i=0; i<wgn_discretes[dataset.ftype(wgn_predictee)].length(); i++)
        lex.append(wgn_discretes[dataset.ftype(wgn_predictee)].name(i));

    const EST_FMatrix &m = confusion(pairs,lex);

    if (output != NULL)
    {
        print_confusion(m,pairs,lex);  // should be to output not stdout
        *output << ";; entropy " << (-1*(H/total)) << " perplexity " <<
            pow(2.0,(-1*(H/total))) << endl;
    }

    // Minus it so bigger is better
    if (wgn_opt_param == "entropy")
        return -pow(2.0,(-1*(H/total)));
    else if (wgn_opt_param == "B_NB_F1")
    {
        if (bpredicted == 0)
            precision = 1;
        else
            precision = bcorrect/bpredicted;
        if (bactual == 0)
            recall = 1;
        else
            recall = bcorrect/bactual;
        float fmeasure = 0;
        if ((precision+recall) != 0)
            fmeasure = 2*(precision*recall)/(precision+recall);
        cout << "F1: " << fmeasure << " Prec: " << precision
             << " Rec: " << recall << " B-Pred: " << bpredicted
             << " B-Actual: " << bactual << " B-Correct: " << bcorrect << endl;
        return fmeasure;
    }
    else
        return (float)correct/(float)total;
}

static float test_tree_vector(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results VECTOR
    // distance is calculated in zscores (as the values in the vector
    // may have quite different ranges)
    WNode *leaf;
    EST_Litem *p;
    float predict, actual;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    EST_SuffStats b;
    int i,j,pos;
    double cor,error;
    double count;
    EST_Litem *pp;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        pos = dataset(p)->get_int_val(wgn_predictee);
        for (j=0; j<wgn_VertexFeats.num_channels(); j++)
            if (wgn_VertexFeats.a(0,j) > 0.0)
            {
                b.reset();
                for (pp=leaf->get_impurity().members.head(); pp != 0; pp=pp->next())
                {
                    i = leaf->get_impurity().members.item(pp);
                    b += wgn_VertexTrack.a(i,j);
                }
                predict = b.mean();
                actual = wgn_VertexTrack.a(pos,j);
                if (wgn_count_field == -1)
                    count = 1.0;
                else
                    count = dataset(p)->get_flt_val(wgn_count_field);
                x.cumulate(predict,count);
                y.cumulate(actual,count);
                /* Normalize the error by the standard deviation */
                if (b.stddev() == 0)
                    error = predict-actual;
                else
                    error = (predict-actual)/b.stddev();
                error = predict-actual; /* awb_debug */
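                // NB: the awb_debug line above overwrites the
                // normalised error, so the raw error is what is scored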
                se.cumulate((error*error),count);
                e.cumulate(fabs(error),count);
                xx.cumulate(predict*predict,count);
                yy.cumulate(actual*actual,count);
                xy.cumulate(predict*actual,count);
            }
    }

    // Pearson's product moment correlation coefficient
    // cor = (xy.mean() - (x.mean()*y.mean()))/
    //       (sqrt(xx.mean()-(x.mean()*x.mean())) *
    //        sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, thus causing the sqrt's to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)  // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // * -1 so bigger is better
    else
        return cor;  // should really be % variance, I think
}

static float test_tree_trajectory(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results TRAJECTORY
    // distance is calculated in zscores (as the values in the vector
    // may have quite different ranges)
    // NOT WRITTEN YET: currently identical to test_tree_vector
    WNode *leaf;
    EST_Litem *p;
    float predict, actual;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    EST_SuffStats b;
    int i,j,pos;
    double cor,error;
    double count;
    EST_Litem *pp;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        pos = dataset(p)->get_int_val(wgn_predictee);
        for (j=0; j<wgn_VertexFeats.num_channels(); j++)
            if (wgn_VertexFeats.a(0,j) > 0.0)
            {
                b.reset();
                for (pp=leaf->get_impurity().members.head(); pp != 0; pp=pp->next())
                {
                    i = leaf->get_impurity().members.item(pp);
                    b += wgn_VertexTrack.a(i,j);
                }
                predict = b.mean();
                actual = wgn_VertexTrack.a(pos,j);
                if (wgn_count_field == -1)
                    count = 1.0;
                else
                    count = dataset(p)->get_flt_val(wgn_count_field);
                x.cumulate(predict,count);
                y.cumulate(actual,count);
                /* Normalize the error by the standard deviation */
                if (b.stddev() == 0)
                    error = predict-actual;
                else
                    error = (predict-actual)/b.stddev();
                error = predict-actual; /* awb_debug */
                se.cumulate((error*error),count);
                e.cumulate(fabs(error),count);
                xx.cumulate(predict*predict,count);
                yy.cumulate(actual*actual,count);
                xy.cumulate(predict*actual,count);
            }
    }

    // Pearson's product moment correlation coefficient
    // cor = (xy.mean() - (x.mean()*y.mean()))/
    //       (sqrt(xx.mean()-(x.mean()*x.mean())) *
    //        sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, thus causing the sqrt's to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)  // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // * -1 so bigger is better
    else
        return cor;  // should really be % variance, I think
}

static float test_tree_cluster(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results for cluster trees
    WNode *leaf;
    int real;
    int right_cluster=0;
    EST_SuffStats ranking, meandist;
    EST_Litem *p;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        real = dataset(p)->get_int_val(wgn_predictee);
        meandist += leaf->get_impurity().cluster_distance(real);
        right_cluster += leaf->get_impurity().in_cluster(real);
        ranking += leaf->get_impurity().cluster_ranking(real);
    }

    if (output != NULL)
    {
        // Want number in right class, mean distance in sds, mean ranking
        if (output != &cout)  // save in output file
            *output << ";; Right cluster " << right_cluster << " (" <<
                (int)(100.0*(float)right_cluster/(float)dataset.length()) <<
                "%) mean ranking " << ranking.mean() << " mean distance "
                << meandist.mean() << endl;
        cout << "Right cluster " << right_cluster << " (" <<
            (int)(100.0*(float)right_cluster/(float)dataset.length()) <<
            "%) mean ranking " << ranking.mean() << " mean distance "
            << meandist.mean() << endl;
    }

    return 10000-meandist.mean();  // this doesn't work but I tested it
}

static float test_tree_float(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results FLOAT
    EST_Litem *p;
    float predict,real;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    double cor,error;
    double count;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        predict = tree.predict((*dataset(p)));
        real = dataset(p)->get_flt_val(wgn_predictee);
        if (wgn_count_field == -1)
            count = 1.0;
        else
            count = dataset(p)->get_flt_val(wgn_count_field);
        x.cumulate(predict,count);
        y.cumulate(real,count);
        error = predict-real;
        se.cumulate((error*error),count);
        e.cumulate(fabs(error),count);
        xx.cumulate(predict*predict,count);
        yy.cumulate(real*real,count);
        xy.cumulate(predict*real,count);
    }

    // Pearson's product moment correlation coefficient
    // cor = (xy.mean() - (x.mean()*y.mean()))/
    //       (sqrt(xx.mean()-(x.mean()*x.mean())) *
    //        sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, thus causing the sqrt's to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)  // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // * -1 so bigger is better
    else
        return cor;  // should really be % variance, I think
}

static float test_tree_ols(WNode &tree,WDataSet &dataset,ostream *output)
{
    // Test tree against data to get summary of results OLS
    EST_Litem *p;
    WNode *leaf;
    float predict,real;
    EST_SuffStats x,y,xx,yy,xy,se,e;
    double cor,error;
    double count;

    for (p=dataset.head(); p != 0; p=p->next())
    {
        leaf = tree.predict_node((*dataset(p)));
        // do ols to get predict
        predict = 0.0;  // This is incomplete! you need to use leaf
        real = dataset(p)->get_flt_val(wgn_predictee);
        if (wgn_count_field == -1)
            count = 1.0;
        else
            count = dataset(p)->get_flt_val(wgn_count_field);
        x.cumulate(predict,count);
        y.cumulate(real,count);
        error = predict-real;
        se.cumulate((error*error),count);
        e.cumulate(fabs(error),count);
        xx.cumulate(predict*predict,count);
        yy.cumulate(real*real,count);
        xy.cumulate(predict*real,count);
    }

    // Pearson's product moment correlation coefficient
    // cor = (xy.mean() - (x.mean()*y.mean()))/
    //       (sqrt(xx.mean()-(x.mean()*x.mean())) *
    //        sqrt(yy.mean()-(y.mean()*y.mean())));
    // Because when the variation in X is very small we can
    // go negative, thus causing the sqrt's to give an FPE
    double v1 = xx.mean()-(x.mean()*x.mean());
    double v2 = yy.mean()-(y.mean()*y.mean());

    double v3 = v1*v2;

    if (v3 <= 0)
        // happens when there's very little variation in x
        cor = 0;
    else
        cor = (xy.mean() - (x.mean()*y.mean()))/sqrt(v3);

    if (output != NULL)
    {
        if (output != &cout)  // save in output file
            *output
                << ";; RMSE " << ftoString(sqrt(se.mean()),4,1)
                << " Correlation is " << ftoString(cor,4,1)
                << " Mean (abs) Error " << ftoString(e.mean(),4,1)
                << " (" << ftoString(e.stddev(),4,1) << ")" << endl;

        cout << "RMSE " << ftoString(sqrt(se.mean()),4,1)
             << " Correlation is " << ftoString(cor,4,1)
             << " Mean (abs) Error " << ftoString(e.mean(),4,1)
             << " (" << ftoString(e.stddev(),4,1) << ")" << endl;
    }

    if (wgn_opt_param == "rmse")
        return -sqrt(se.mean());  // * -1 so bigger is better
    else
        return cor;  // should really be % variance, I think
}

static int wagon_split(int margin, WNode &node)
{
    // Split given node (if possible)
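    // margin tracks the recursion depth and is used only to indent
    // verbose output; wgn_max_questions caps the total number of
    // splits made over the whole tree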
    WQuestion q;
    WNode *l,*r;

    node.set_impurity(WImpurity(node.get_data()));
    if (wgn_max_questions < 1)
        return FALSE;

    q = find_best_question(node.get_data());

    /* printf("q.score() %f impurity %f\n",
              q.get_score(),
              node.get_impurity().measure()); */

    double impurity_measure = node.get_impurity().measure();
    double question_score = q.get_score();

    if ((question_score < WGN_HUGE_VAL) &&
        (question_score < impurity_measure))
    {
        // Ok, it's worth a split
        l = new WNode();
        r = new WNode();
        wgn_find_split(q,node.get_data(),l->get_data(),r->get_data());
        node.set_subnodes(l,r);
        node.set_question(q);
        if (wgn_verbose)
        {
            int i;
            for (i=0; i < margin; i++)
                cout << " ";
            cout << q << endl;
        }
        wgn_max_questions--;
        margin++;
        wagon_split(margin,*l);
        margin++;
        wagon_split(margin,*r);
        margin--;
        return TRUE;
    }
    else
    {
        if (wgn_verbose)
        {
            int i;
            for (i=0; i < margin; i++)
                cout << " ";
            cout << "stopped samples: " << node.samples() << " impurity: "
                 << node.get_impurity() << endl;
        }
        margin--;
        return FALSE;
    }
}

void wgn_find_split(WQuestion &q,WVectorVector &ds,
                    WVectorVector &y,WVectorVector &n)
{
    int i, iy, in;

    if (wgn_dropout_samples > 0.0)
    {
        // We need to count the number of yes/no again over all of ds
        for (iy=in=i=0; i < ds.n(); i++)
            if (q.ask(*ds(i)) == TRUE)
                iy++;
            else
                in++;
    }
    else
    {
        // Current counts are correct (as all data was used)
        iy = q.get_yes();
        in = q.get_no();
    }

    y.resize(iy);
    n.resize(in);

    for (iy=in=i=0; i < ds.n(); i++)
        if (q.ask(*ds(i)) == TRUE)
            y[iy++] = ds(i);
        else
            n[in++] = ds(i);

}

static float wgn_random_number(float x)
{
    // Returns a random number between 0 and x
    return (((float)random())/RAND_MAX)*x;
}

#ifdef OMP_WAGON
static WQuestion find_best_question(WVectorVector &dset)
{
    // Ask all possible questions and find the best one
    int i;
    float bscore,tscore;
    WQuestion test_ques, best_ques;
    WQuestion **questions = new WQuestion*[wgn_dataset.width()];
    float *scores = new float[wgn_dataset.width()];
    bscore = tscore = WGN_HUGE_VAL;
    best_ques.set_score(bscore);
#pragma omp parallel
#pragma omp for
    for (i=0; i < wgn_dataset.width(); i++)
    {
        questions[i] = new WQuestion;
        questions[i]->set_score(bscore);
    }
#pragma omp parallel
#pragma omp for
    for (i=0; i < wgn_dataset.width(); i++)
    {
        if ((wgn_dataset.ignore(i) == TRUE) ||
            (i == wgn_predictee))
            scores[i] = WGN_HUGE_VAL;  // ignore this feature this time
        else if (wgn_random_number(1.0) < wgn_dropout_feats)
            scores[i] = WGN_HUGE_VAL;  // randomly drop out this feature
        else if (wgn_dataset.ftype(i) == wndt_binary)
        {
            construct_binary_ques(i,*questions[i]);
            scores[i] = wgn_score_question(*questions[i],dset);
        }
        else if (wgn_dataset.ftype(i) == wndt_float)
        {
            scores[i] = construct_float_ques(i,*questions[i],dset);
        }
        else if (wgn_dataset.ftype(i) == wndt_ignore)
            scores[i] = WGN_HUGE_VAL;  // always ignore this feature
#if 0
        // This doesn't work reasonably
        else if (wgn_csubset && (wgn_dataset.ftype(i) >= wndt_class))
        {
            wagon_error("subset selection temporarily deleted");
            tscore = construct_class_ques_subset(i,test_ques,dset);
        }
#endif
        else if (wgn_dataset.ftype(i) >= wndt_class)
            scores[i] = construct_class_ques(i,*questions[i],dset);
    }
    for (i=0; i < wgn_dataset.width(); i++)
    {
        if (scores[i] < bscore)
        {
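            // NB: raw memcpy assumes WQuestion is trivially copyable;
            // the non-OMP version below uses assignment instead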
            memcpy(&best_ques,questions[i],sizeof(*questions[i]));
            best_ques.set_score(scores[i]);
            bscore = scores[i];
        }
        delete questions[i];
    }
    delete [] questions;
    delete [] scores;
    return best_ques;
}
#else
// No OMP parallelism
static WQuestion find_best_question(WVectorVector &dset)
{
    // Ask all possible questions and find the best one
    int i;
    float bscore,tscore;
    WQuestion test_ques, best_ques;

    bscore = tscore = WGN_HUGE_VAL;
    best_ques.set_score(bscore);
    // test each feature with each possible question
    for (i=0; i < wgn_dataset.width(); i++)
    {
        if ((wgn_dataset.ignore(i) == TRUE) ||
            (i == wgn_predictee))
            tscore = WGN_HUGE_VAL;  // ignore this feature this time
        else if (wgn_random_number(1.0) < wgn_dropout_feats)
            tscore = WGN_HUGE_VAL;  // randomly drop out this feature
        else if (wgn_dataset.ftype(i) == wndt_binary)
        {
            construct_binary_ques(i,test_ques);
            tscore = wgn_score_question(test_ques,dset);
        }
        else if (wgn_dataset.ftype(i) == wndt_float)
        {
            tscore = construct_float_ques(i,test_ques,dset);
        }
        else if (wgn_dataset.ftype(i) == wndt_ignore)
            tscore = WGN_HUGE_VAL;  // always ignore this feature
#if 0
        // This doesn't work reasonably
        else if (wgn_csubset && (wgn_dataset.ftype(i) >= wndt_class))
        {
            wagon_error("subset selection temporarily deleted");
            tscore = construct_class_ques_subset(i,test_ques,dset);
        }
#endif
        else if (wgn_dataset.ftype(i) >= wndt_class)
            tscore = construct_class_ques(i,test_ques,dset);
        if (tscore < bscore)
        {
            best_ques = test_ques;
            best_ques.set_score(tscore);
            bscore = tscore;
        }
    }

    return best_ques;
}
#endif

static float construct_class_ques(int feat,WQuestion &ques,WVectorVector &ds)
{
    // Find out which member of a class gives the best split
    float tscore,bscore = WGN_HUGE_VAL;
    int cl;
    WQuestion test_q;

    test_q.set_fp(feat);
    test_q.set_oper(wnop_is);
    ques = test_q;

    for (cl=0; cl < wgn_discretes[wgn_dataset.ftype(feat)].length(); cl++)
    {
        test_q.set_operand1(EST_Val(cl));
        tscore = wgn_score_question(test_q,ds);
        if (tscore < bscore)
        {
            ques = test_q;
            bscore = tscore;
        }
    }

    return bscore;
}

#if 0
static float construct_class_ques_subset(int feat,WQuestion &ques,
                                         WVectorVector &ds)
{
    // Find out which subset of a class gives the best split.
    // We first measure the split of the data for each member of
    // the class.  Then we order those splits and go through the
    // ordered list finding where its best split is.  This is
    // described on page 247 of Breiman et al.
    float tscore,bscore = WGN_HUGE_VAL;
    LISP l;
    int cl;

    ques.set_fp(feat);
    ques.set_oper(wnop_is);
    float *scores = new float[wgn_discretes[wgn_dataset.ftype(feat)].length()];

    // Only do it for existing values
    for (cl=0; cl < wgn_discretes[wgn_dataset.ftype(feat)].length(); cl++)
    {
        ques.set_operand(flocons(cl));
        scores[cl] = wgn_score_question(ques,ds);
    }

    LISP order = sort_class_scores(feat,scores);
    if (order == NIL)
        return WGN_HUGE_VAL;
    if (siod_llength(order) == 1)
    {   // Only one, so we know the best "split"
        ques.set_oper(wnop_is);
        ques.set_operand(car(order));
        return scores[get_c_int(car(order))];
    }

    ques.set_oper(wnop_in);
    LISP best_l = NIL;
    for (l=cdr(order); CDR(l) != NIL; l = cdr(l))
    {
        ques.set_operand(l);
        tscore = wgn_score_question(ques,ds);
        if (tscore < bscore)
        {
            best_l = l;
            bscore = tscore;
        }

    }

    if (best_l != NIL)
    {
        if (siod_llength(best_l) == 1)
        {
            ques.set_oper(wnop_is);
            ques.set_operand(car(best_l));
        }
        else if (equal(cdr(order),best_l) != NIL)
        {
            ques.set_oper(wnop_is);
            ques.set_operand(car(order));
        }
        else
        {
            cout << "Found a good subset" << endl;
            ques.set_operand(best_l);
        }
    }
    return bscore;
}

static LISP sort_class_scores(int feat,float *scores)
{
    // returns a sorted list of the (non WGN_HUGE_VAL) items
    int i;
    LISP items = NIL;
    LISP l;

    for (i=0; i < wgn_discretes[wgn_dataset.ftype(feat)].length(); i++)
    {
        if (scores[i] != WGN_HUGE_VAL)
        {
            if (items == NIL)
                items = cons(flocons(i),NIL);
            else
            {
                for (l=items; l != NIL; l=cdr(l))
                {
                    if (scores[i] < scores[get_c_int(car(l))])
                    {
                        CDR(l) = cons(car(l),cdr(l));
                        CAR(l) = flocons(i);
                        break;
                    }
                }
                if (l == NIL)
                    items = l_append(items,cons(flocons(i),NIL));
            }
        }
    }
    return items;
}
#endif

static float construct_float_ques(int feat,WQuestion &ques,WVectorVector &ds)
{
    // Find a split of the range that gives the best score.
    // Naively does this by partitioning the range into float_range_split slots
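    // e.g. with min=0, max=10 and wgn_float_range_split=10, the
    // candidate thresholds tried are 1,2,...,10 (each as "< p")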
    float tscore,bscore = WGN_HUGE_VAL;
    int d, i;
    float p;
    WQuestion test_q;
    float max,min,val,incr;

    test_q.set_fp(feat);
    test_q.set_oper(wnop_lessthan);
    ques = test_q;

    min = max = ds(0)->get_flt_val(feat);  /* set up some value */
    for (d=0; d < ds.n(); d++)
    {
        val = ds(d)->get_flt_val(feat);
        if (val < min)
            min = val;
        else if (val > max)
            max = val;
    }
    if (max == min)  // we're pure
        return WGN_HUGE_VAL;
    incr = (max-min)/wgn_float_range_split;
    // so do float_range-1 splits
    /* We calculate this based on the number of splits, not the */
    /* increments, because incr can be so small it doesn't increment p */
    for (i=0,p=min+incr; i < wgn_float_range_split; i++,p += incr)
    {
        test_q.set_operand1(EST_Val(p));
        tscore = wgn_score_question(test_q,ds);
        if (tscore < bscore)
        {
            ques = test_q;
            bscore = tscore;
        }
    }

    return bscore;
}

static void construct_binary_ques(int feat,WQuestion &test_ques)
{
    // Construct a question.  Not sure about this in general: of course
    // continuous/categorical features will require a different rule,
    // and non-binary ones will require some test point

    test_ques.set_fp(feat);
    test_ques.set_oper(wnop_binary);
    test_ques.set_operand1(EST_Val(""));
}

static float score_question_set(WQuestion &q, WVectorVector &ds, int ignorenth)
{
    // score this question as a possible split by finding
    // the sum of the impurities when ds is split with this question
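    // (if ignorenth >= 2 every nth sample is skipped; the returned
    // score is the mean of the two sides' impurities, or WGN_HUGE_VAL
    // if either side would fall below the minimum cluster size)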
    WImpurity y,n;
    int d, num_yes, num_no;
    float count;
    WVector *wv;

    num_yes = num_no = 0;
    y.data = &ds;
    n.data = &ds;
    for (d=0; d < ds.n(); d++)
    {
        if (wgn_random_number(1.0) < wgn_dropout_samples)
        {
            continue;  // drop out this sample
        }
        else if ((ignorenth < 2) ||
                 (d%ignorenth != ignorenth-1))
        {
            wv = ds(d);
            if (wgn_count_field == -1)
                count = 1.0;
            else
                count = (*wv)[wgn_count_field];

            if (q.ask(*wv) == TRUE)
            {
                num_yes++;
                if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
                    y.cumulate(d,count);  // note the sample number, not value
                else
                    y.cumulate((*wv)[wgn_predictee],count);
            }
            else
            {
                num_no++;
                if (wgn_dataset.ftype(wgn_predictee) == wndt_ols)
                    n.cumulate(d,count);  // note the sample number, not value
                else
                    n.cumulate((*wv)[wgn_predictee],count);
            }
        }
    }

    q.set_yes(num_yes);
    q.set_no(num_no);

    int min_cluster;

    if ((wgn_balance == 0.0) ||
        (ds.n()/wgn_balance < wgn_min_cluster_size))
        min_cluster = wgn_min_cluster_size;
    else
        min_cluster = (int)(ds.n()/wgn_balance);

    if ((y.samples() < min_cluster) ||
        (n.samples() < min_cluster))
        return WGN_HUGE_VAL;

    float ym,nm,bm;
    // printf("awb_debug score_question_set X %f Y %f\n",
    //        y.samples(), n.samples());
    ym = y.measure();
    nm = n.measure();
    bm = ym + nm;

    /* cout << q << endl;
       printf("test question y %f n %f b %f\n",
              ym, nm, bm); */

    return bm/2.0;
}

float wgn_score_question(WQuestion &q, WVectorVector &ds)
{
    // This level of indirection was introduced for later expansion

    return score_question_set(q,ds,1);
}

WNode *wagon_stepwise(float limit)
{
    // Find the best single feature, then incrementally add the
    // features that best improve the result until it no longer
    // improves.
    // This basically automates what Kurt was doing in building trees;
    // he automated it in Perl, and as it seemed to work I put it into
    // wagon itself.
    // This can be pretty computationally intensive.
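    // The search stops when adding the best remaining feature
    // improves the score by less than limit percent over the
    // current best.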
    WNode *best = 0,*new_best = 0;
    float bscore,best_score = -WGN_HUGE_VAL;
    int best_feat,i;
    int nf = 1;

    // Set all features to ignore
    for (i=0; i < wgn_dataset.width(); i++)
        wgn_dataset.set_ignore(i,TRUE);

    for (i=0; i < wgn_dataset.width(); i++)
    {
        if ((wgn_dataset.ftype(i) == wndt_ignore) || (i == wgn_predictee))
        {
            // This skips the round, not because this has anything to
            // do with this feature being (user specified) ignored,
            // but because it indicates there is one less cycle that
            // is necessary
            continue;
        }
        new_best = wagon_stepwise_find_next_best(bscore,best_feat);

        if ((bscore - fabs(bscore * (limit/100))) <= best_score)
        {
            // gone as far as we can
            delete new_best;
            break;
        }
        else
        {
            best_score = bscore;
            delete best;
            best = new_best;
            wgn_dataset.set_ignore(best_feat,FALSE);
            if (!wgn_quiet)
            {
                fprintf(stdout,"FEATURE %d %s: %2.4f\n",
                        nf,
                        (const char *)wgn_dataset.feat_name(best_feat),
                        best_score);
                fflush(stdout);
                nf++;
            }
        }
    }

    return best;
}

static WNode *wagon_stepwise_find_next_best(float &bscore,int &best_feat)
{
    // Find which of the currently ignored features will best improve
    // the result
    WNode *best = 0;
    float best_score = -WGN_HUGE_VAL;
    int best_new_feat = -1;
    int i;

    for (i=0; i < wgn_dataset.width(); i++)
    {
        if (wgn_dataset.ftype(i) == wndt_ignore)
            continue;  // user wants me to ignore this completely
        else if (i == wgn_predictee)  // can't use the answer
            continue;
        else if (wgn_dataset.ignore(i) == TRUE)
        {
            WNode *current;
            float score;

            // Allow this feature to participate
            wgn_dataset.set_ignore(i,FALSE);

            current = wgn_build_tree(score);

            if (score > best_score)
            {
                best_score = score;
                delete best;
                best = current;
                best_new_feat = i;
                // fprintf(stdout,"BETTER FEATURE %d %s: %2.4f\n",
                //         i,
                //         (const char *)wgn_dataset.feat_name(i),
                //         best_score);
                // fflush(stdout);
            }
            else
                delete current;

            // switch it off again
            wgn_dataset.set_ignore(i,TRUE);
        }
    }

    bscore = best_score;
    best_feat = best_new_feat;
    return best;
}