"Fossies" - the Fresh Open Source Software Archive 
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996-2006 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : May 1996 */
35 /*-----------------------------------------------------------------------*/
36 /* A Classification and Regression Tree (CART) Program */
37 /* A basic implementation of many of the techniques in */
38 /* Briemen et al. 1984 */
39 /* */
40 /* Added decision list support, Feb 1997 */
41 /* */
42 /* Added vector support for Clustergen 2005/2006 */
43 /* */
44 /*=======================================================================*/
45 #include <cstdlib>
46 #include <iostream>
47 #include <fstream>
48 #include <cstring>
49 #include "EST_Wagon.h"
50 #include "EST_cmd_line.h"
51
52 enum wn_strategy_type {wn_decision_list, wn_decision_tree};
53
54 static wn_strategy_type wagon_type = wn_decision_tree;
55
56 static int wagon_main(int argc, char **argv);
57
58 /** @name <command>wagon</command> <emphasis>CART building program</emphasis>
59 @id wagon_manual
60 * @toc
61 */
62
63 //@{
64
65
66 /**@name Synopsis
67 */
68 //@{
69
70 //@synopsis
71
72 /**
73 wagon is used to build CART tress from feature data, its basic
74 features include:
75
76 <itemizedlist>
77 <listitem><para>both decisions trees and decision lists are supported</para></listitem>
78 <listitem><para>predictees can be discrete or continuous</para></listitem>
79 <listitem><para>input features may be discrete or continuous</para></listitem>
80 <listitem><para>many options for controlling tree building</para>
81 <itemizedlist>
82 <listitem><para>fixed stop value</para></listitem>
83 <listitem><para>balancing</para></listitem>
84 <listitem><para>held-out data and pruning</para></listitem>
85 <listitem><para>stepwise use of input features</para></listitem>
86 <listitem><para>choice of optimization criteria correct/entropy (for
87 classification and rmse/correlation (for regression)</para></listitem>
88 </itemizedlist>
89 </listitem>
90 </itemizedlist>
91
92 A detailed description of building CART models can be found in the
93 <link linkend="cart-overview">CART model overview</link> section.
94
95 */
96
97 //@}
98
99 /**@name OPTIONS
100 */
101 //@{
102
103 //@options
104
105 //@}
106
107 int main(int argc, char **argv)
108 {
109
110 wagon_main(argc,argv);
111
112 exit(0);
113 return 0;
114 }
115
116 static int set_Vertex_Feats(EST_Track &wgn_VertexFeats,
117 EST_String &wagon_track_features)
118 {
119 int i,s=0,e;
120 EST_TokenStream ts;
121
122 for (i=0; i<wgn_VertexFeats.num_channels(); i++)
123 wgn_VertexFeats.a(0,i) = 0.0;
124
125 ts.open_string(wagon_track_features);
126 ts.set_WhiteSpaceChars(",- ");
127 ts.set_PunctuationSymbols("");
128 ts.set_PrePunctuationSymbols("");
129 ts.set_SingleCharSymbols("");
130
131 while (!ts.eof())
132 {
133 EST_Token &token = ts.get();
134 const EST_String ws = (const char *)token.whitespace();
135 if (token == "all")
136 {
137 for (i=0; i<wgn_VertexFeats.num_channels(); i++)
138 wgn_VertexFeats.a(0,i) = 1.0;
139 break;
140 } else if ((ws == ",") || (ws == ""))
141 {
142 s = atoi(token.string());
143 wgn_VertexFeats.a(0,s) = 1.0;
144 } else if (ws == "-")
145 {
146 if (token == "")
147 e = wgn_VertexFeats.num_channels()-1;
148 else
149 e = atoi(token.string());
150 for (i=s; i<=e && i<wgn_VertexFeats.num_channels(); i++)
151 wgn_VertexFeats.a(0,i) = 1.0;
152 } else
153 {
154 printf("wagon: track_feats invalid: %s at position %d\n",
155 (const char *)wagon_track_features,
156 ts.filepos());
157 exit(-1);
158 }
159 }
160
161 return 0;
162 }
163
164 static int wagon_main(int argc, char **argv)
165 {
166 // Top level function sets up data and creates a tree
167 EST_Option al;
168 EST_StrList files;
169 EST_String wgn_oname;
170 ostream *wgn_coutput = 0;
171 float stepwise_limit = 0;
172 int feats_start=0, feats_end=0;
173 int i;
174
175 parse_command_line
176 (argc, argv,
177 EST_String("[options]\n") +
178 "Summary: CART building program\n"+
179 "-desc <ifile> Field description file\n"+
180 "-data <ifile> Datafile, one vector per line\n"+
181 "-stop <int> {50} Minimum number of examples for leaf nodes\n"+
182 "-test <ifile> Datafile to test tree on\n"+
183 "-frs <float> {10} Float range split, number of partitions to\n"+
184 " split a float feature range into\n"+
185 "-dlist Build a decision list (rather than tree)\n"+
186 "-dtree Build a decision tree (rather than list) default\n"+
187 "-output <ofile> \n"+
188 "-o <ofile> File to save output tree in\n"+
189 "-distmatrix <ifile>\n"+
190 " A distance matrix for clustering\n"+
191 "-track <ifile>\n"+
192 " track for vertex indices\n"+
193 "-track_start <int>\n"+
194 " start channel vertex indices\n"+
195 "-track_end <int>\n"+
196 " end (inclusive) channel for vertex indices\n"+
197 "-track_feats <string>\n"+
198 " Track features to use, comma separated list\n"+
199 " with feature numbers and/or ranges, 0 start\n"+
200 "-unittrack <ifile>\n"+
201 " track for unit start and length in vertex track\n"+
202 "-quiet No questions printed during building\n"+
203 "-verbose Lost of information printing during build\n"+
204 "-predictee <string>\n"+
205 " name of field to predict (default is first field)\n"+
206 "-ignore <string>\n"+
207 " Filename or bracket list of fields to ignore\n"+
208 "-count_field <string>\n"+
209 " Name of field containing count weight for samples\n"+
210 "-stepwise Incrementally find best features\n"+
211 "-swlimit <float> {0.0}\n"+
212 " Percentage necessary improvement for stepwise,\n"+
213 " may be negative.\n"+
214 "-swopt <string> Parameter to optimize for stepwise, for \n"+
215 " classification options are correct or entropy\n"+
216 " for regression options are rmse or correlation\n"+
217 " correct and correlation are the defaults\n"+
218 "-balance <float> For derived stop size, if dataset at node, divided\n"+
219 " by balance is greater than stop it is used as stop\n"+
220 " if balance is 0 (default) always use stop as is.\n"+
221 "-cos Use mean cosine distance rather than gausian (TBD).\n"+
222 "-dof <float> Randomly dropout feats in training (prob).\n"+
223 "-dos <float> Randomly dropout samples in training (prob).\n"+
224 "-vertex_output <string> Output <mean> or <best> of cluster\n"+
225 "-held_out <int> Percent to hold out for pruning\n"+
226 "-max_questions <int> Maximum number of questions in tree\n"+
227 "-heap <int> {210000}\n"+
228 " Set size of Lisp heap, should not normally need\n"+
229 " to be changed from its default, only with *very*\n"+
230 " large description files (> 1M)\n"+
231 "-omp_nthreads <int> {1}\n"+
232 " Set number of OMP threads to run wagon in\n"+
233 " tree building; this overrides $OMP_NUM_THREADS\n"+
234 " (ignored if not supported)\n"+
235 "-noprune No (same class) pruning required\n",
236 files, al);
237
238 if (al.present("-held_out"))
239 wgn_held_out = al.ival("-held_out");
240 if (al.present("-dof"))
241 wgn_dropout_feats = al.fval("-dof");
242 if (al.present("-dos"))
243 wgn_dropout_samples = al.fval("-dos");
244 if (al.present("-cos"))
245 wgn_cos = al.ival("-cos");
246 if (al.present("-balance"))
247 wgn_balance = al.fval("-balance");
248 if ((!al.present("-desc")) || ((!al.present("-data"))))
249 {
250 cerr << argv[0] << ": missing description and/or datafile" << endl;
251 cerr << "use -h for description of arguments" << endl;
252 }
253
254 if (al.present("-quiet"))
255 wgn_quiet = TRUE;
256 if (al.present("-verbose"))
257 wgn_verbose = TRUE;
258
259 if (al.present("-stop"))
260 wgn_min_cluster_size = atoi(al.val("-stop"));
261 if (al.present("-max_questions"))
262 wgn_max_questions = atoi(al.val("-max_questions"));
263 if (al.present("-noprune"))
264 wgn_prune = FALSE;
265 if (al.present("-predictee"))
266 wgn_predictee_name = al.val("-predictee");
267 if (al.present("-count_field"))
268 wgn_count_field_name = al.val("-count_field");
269 if (al.present("-swlimit"))
270 stepwise_limit = al.fval("-swlimit");
271 if (al.present("-frs")) // number of partitions to try in floats
272 wgn_float_range_split = atof(al.val("-frs"));
273 if (al.present("-swopt"))
274 wgn_opt_param = al.val("-swopt");
275 if (al.present("-vertex_output"))
276 wgn_vertex_output = al.val("-vertex_output");
277 if (al.present("-output") || al.present("-o"))
278 {
279 if (al.present("-o"))
280 wgn_oname = al.val("-o");
281 else
282 wgn_oname = al.val("-output");
283 wgn_coutput = new ofstream(wgn_oname);
284 if (!(*wgn_coutput))
285 {
286 cerr << "Wagon: can't open file \"" << wgn_oname <<
287 "\" for output " << endl;
288 exit(-1);
289 }
290 }
291 else
292 wgn_coutput = &cout;
293 if (al.present("-distmatrix"))
294 {
295 if (wgn_DistMatrix.load(al.val("-distmatrix")) != 0)
296 {
297 cerr << "Wagon: failed to load Distance Matrix from \"" <<
298 al.val("-distmatrix") << "\"\n" << endl;
299 exit(-1);
300 }
301 }
302 if (al.present("-dlist"))
303 wagon_type = wn_decision_list;
304
305 WNode *tree;
306 float score;
307 LISP ignores = NIL;
308
309 siod_init(al.ival("-heap"));
310
311 if (al.present("-ignore"))
312 {
313 EST_String ig = (const char *)al.sval("-ignore");
314 if (ig[0] == '(')
315 ignores = read_from_string(ig);
316 else
317 ignores = vload(ig,1);
318 }
319 // Load in the data
320 wgn_load_datadescription(al.val("-desc"),ignores);
321 wgn_load_dataset(wgn_dataset,al.val("-data"));
322 if (al.present("-distmatrix") &&
323 (wgn_DistMatrix.num_rows() < wgn_dataset.length()))
324 {
325 cerr << "wagon: distance matrix is smaller than number of training elements\n";
326 exit(-1);
327 }
328 else if (al.present("-track"))
329 {
330 wgn_VertexTrack.load(al.val("-track"));
331 wgn_VertexFeats.resize(1,wgn_VertexTrack.num_channels());
332 for (i=0; i<wgn_VertexFeats.num_channels(); i++)
333 wgn_VertexFeats.a(0,i) = 1.0;
334 }
335
336 if (al.present("-track_start"))
337 {
338 feats_start = al.ival("-track_start");
339 if ((feats_start < 0) ||
340 (feats_start > wgn_VertexTrack.num_channels()))
341 {
342 printf("wagon: track_start invalid: %d out of %d channels\n",
343 feats_start,
344 wgn_VertexTrack.num_channels());
345 exit(-1);
346 }
347 for (i=0; i<feats_start; i++)
348 wgn_VertexFeats.a(0,i) = 0.0; /* don't do feats up to start */
349
350 }
351
352 if (al.present("-track_end"))
353 {
354 feats_end = al.ival("-track_end");
355 if ((feats_end < feats_start) ||
356 (feats_end > wgn_VertexTrack.num_channels()))
357 {
358 printf("wagon: track_end invalid: %d between start %d out of %d channels\n",
359 feats_end,
360 feats_start,
361 wgn_VertexTrack.num_channels());
362 exit(-1);
363 }
364 for (i=feats_end+1; i<wgn_VertexTrack.num_channels(); i++)
365 wgn_VertexFeats.a(0,i) = 0.0; /* don't do feats after end */
366 }
367 if (al.present("-track_feats"))
368 { /* overrides start and end numbers */
369 EST_String wagon_track_features = (const char *)al.val("-track_feats");
370 set_Vertex_Feats(wgn_VertexFeats,wagon_track_features);
371 }
372
373 // printf("Track feats\n");
374 // for (i=0; i<wgn_VertexTrack.num_channels(); i++)
375 // if (wgn_VertexFeats.a(0,i) > 0.0)
376 // printf("%d ",i);
377 // printf("\n");
378
379 if (al.present("-unittrack"))
380 { /* contains two features, a start and length. start indexes */
381 /* into VertexTrack to the first vector in the segment */
382 wgn_UnitTrack.load(al.val("-unittrack"));
383 }
384
385 #ifdef OMP_WAGON
386 if (al.present ("-omp_nthreads"))
387 {
388 omp_set_num_threads(atoi(al.val("-omp_nthreads")));
389 }else{
390 omp_set_num_threads(1);
391 }
392 #else
393 if (al.present ("-omp_nthreads"))
394 {
395 printf("wagon: -omp_nthreads ignored: not supported in this build.\n");
396 }
397 #endif
398
399 if (al.present("-test"))
400 wgn_load_dataset(wgn_test_dataset,al.val("-test"));
401
402 // Build and test the model
403 if (al.present("-stepwise"))
404 tree = wagon_stepwise(stepwise_limit);
405 else if (wagon_type == wn_decision_tree)
406 tree = wgn_build_tree(score); // default operation
407 else if (wagon_type == wn_decision_list)
408 // dlist is printed with build_dlist rather than returned
409 tree = wgn_build_dlist(score,wgn_coutput);
410 else
411 {
412 cerr << "Wagon: unknown operation, not tree or list" << endl;
413 exit(-1);
414 }
415
416 if (tree != 0)
417 {
418 *wgn_coutput << *tree;
419 summary_results(*tree,wgn_coutput);
420 }
421
422 if (wgn_coutput != &cout)
423 delete wgn_coutput;
424 return 0;
425 }
426
427 /** @name Building Trees
428
429 To build a decision tree (or list) Wagon requires data and a description
430 of it. A data file consists a set of samples, one per line each
431 consisting of the same set of features. Features may be categorial
432 or continuous. By default the first feature is the predictee and the
433 others are used as predictors. A typical data file will look like
434 this
435 </para>
436 <para>
437 <screen>
438 0.399 pau sh 0 0 0 1 1 0 0 0 0 0 0
439 0.082 sh iy pau onset 0 1 0 0 1 1 0 0 1
440 0.074 iy hh sh coda 1 0 1 0 1 1 0 0 1
441 0.048 hh ae iy onset 0 1 0 1 1 1 0 1 1
442 0.062 ae d hh coda 1 0 0 1 1 1 0 1 1
443 0.020 d y ae coda 2 0 1 1 1 1 0 1 1
444 0.082 y ax d onset 0 1 0 1 1 1 1 1 1
445 0.082 ax r y coda 1 0 0 1 1 1 1 1 1
446 0.036 r d ax coda 2 0 1 1 1 1 1 1 1
447 ...
448 </screen>
449 </para>
450 <para>
451 The data may come from any source, such as the festival script
452 dumpfeats which allows the creation of such files easily from utterance
453 files.
454 </para><para>
455 In addition to a data file a description file is also require that
456 gives a name and a type to each of the features in the datafile.
457 For the above example it would look like
458 </para><para>
459 <screen>
460 ((segment_duration float)
461 ( name aa ae ah ao aw ax ay b ch d dh dx eh el em en er ey f g
462 hh ih iy jh k l m n nx ng ow oy p r s sh t th uh uw v w y z zh pau )
463 ( n.name 0 aa ae ah ao aw ax ay b ch d dh dx eh el em en er ey f g
464 hh ih iy jh k l m n nx ng ow oy p r s sh t th uh uw v w y z zh pau )
465 ( p.name 0 aa ae ah ao aw ax ay b ch d dh dx eh el em en er ey f g
466 hh ih iy jh k l m n nx ng ow oy p r s sh t th uh uw v w y z zh pau )
467 (position_type 0 onset coda)
468 (pos_in_syl float)
469 (syl_initial 0 1)
470 (syl_final 0 1)
471 (R:Sylstructure.parent.R:Syllable.p.syl_break float)
472 (R:Sylstructure.parent.syl_break float)
473 (R:Sylstructure.parent.R:Syllable.n.syl_break float)
474 (R:Sylstructure.parent.R:Syllable.p.stress 0 1)
475 (R:Sylstructure.parent.stress 0 1)
476 (R:Sylstructure.parent.R:Syllable.n.stress 0 1)
477 )
478 </screen>
479 </para><para>
480 The feature names are arbitrary, but as they appear in the generated
481 trees is most useful if the trees are to be used in prediction of
482 an utterance that the names are features and/or pathnames.
483 </para><para>
484 Wagon can be used to build a tree with such files with the command
485 <screen>
486 wagon -data feats.data -desc fest.desc -stop 10 -output feats.tree
487 </screen>
488 A test data set may also be given which must match the given data description.
489 If specified the built tree will be tested on the test set and results
490 on that will be presented on completion, without a test set the
491 results are given with respect to the training data. However in
492 stepwise case the test set is used in the multi-level training process
493 thus it cannot be considered as true test data and more reasonable
494 results should found on applying the generate tree to truly
495 held out data (via the program wagon_test).
496
497 */
498
499 //@}