"Fossies" - the Fresh Open Source Software Archive

Member "speech_tools/main/viterbi_main.cc" (4 Sep 2017, 21805 Bytes) of package /linux/misc/speech_tools-2.5.0-release.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "viterbi_main.cc" see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 2.4-release_vs_2.5.0-release.

    1 /*************************************************************************/
    2 /*                                                                       */
    3 /*                Centre for Speech Technology Research                  */
    4 /*                     University of Edinburgh, UK                       */
    5 /*                      Copyright (c) 1995,1996                          */
    6 /*                        All Rights Reserved.                           */
    7 /*                                                                       */
    8 /*  Permission is hereby granted, free of charge, to use and distribute  */
    9 /*  this software and its documentation without restriction, including   */
   10 /*  without limitation the rights to use, copy, modify, merge, publish,  */
   11 /*  distribute, sublicense, and/or sell copies of this work, and to      */
   12 /*  permit persons to whom this work is furnished to do so, subject to   */
   13 /*  the following conditions:                                            */
   14 /*   1. The code must retain the above copyright notice, this list of    */
   15 /*      conditions and the following disclaimer.                         */
   16 /*   2. Any modifications must be clearly marked as such.                */
   17 /*   3. Original authors' names are not deleted.                         */
   18 /*   4. The authors' names are not used to endorse or promote products   */
   19 /*      derived from this software without specific prior written        */
   20 /*      permission.                                                      */
   21 /*                                                                       */
   22 /*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
   23 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
   24 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
   25 /*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
   26 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
   27 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
   28 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
   29 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
   30 /*  THIS SOFTWARE.                                                       */
   31 /*                                                                       */
   32 /*************************************************************************/
   33 /*                 Authors:  Alan W Black and Simon King                 */
   34 /*                 Date   :  January 1997                                */
   35 /*-----------------------------------------------------------------------*/
   36 /*  A simple use of the Viterbi decoder                                  */
   37 /*                                                                       */
   38 /*=======================================================================*/
   39 
   40 #include <cstdlib>
   41 #include <cstdio>
   42 #include <cmath>
   43 #include "EST.h"
   44 
   45 EST_read_status load_TList_of_StrVector(EST_TList<EST_StrVector> &w,
   46                     const EST_String &filename,
   47                     const int vec_len);
   48 
   49 static void print_results(EST_Relation &wstream);
   50 static bool do_search(EST_Relation &wstream);
   51 static EST_VTPath *vit_npath(EST_VTPath *p,EST_VTCandidate *c,EST_Features &f);
   52 static EST_VTCandidate *vit_candlist(EST_Item *s,EST_Features &f);
   53 static void top_n_candidates(EST_VTCandidate* &all_c);
   54 static void load_vocab(const EST_String &vfile);
   55 
   56 static void add_word(EST_Relation &w, const EST_String &word, int pos);
   57 
   58 static void load_wstream(const EST_String &filename,
   59              const EST_String &vfile,
   60              EST_Relation &w,
   61              EST_Track &obs);
   62 
   63 static void load_given(const EST_String &filename,
   64                const int ngram_order);
   65                
   66 static double find_gram_prob(EST_VTPath *p,int *state);
   67 
   68 // special stuff for non-sliding window ngrams
   69 static double find_extra_gram_prob(EST_VTPath *p,int *state, int time);
   70 static void get_history(EST_StrVector &history, EST_VTPath *p);
   71 static void fill_window(EST_StrVector &window,EST_StrVector &history,
   72             EST_VTPath *p,const int time);
   73 static int is_a_special(const EST_String &s, int &val);
   74 static int max_history=0;
   75 
   76 static EST_Ngrammar ngram;
   77 static EST_String pstring = SENTENCE_START_MARKER;
   78 static EST_String ppstring = SENTENCE_END_MARKER;
   79 static float lm_scale = 1.0;
   80 static float ob_scale = 1.0;
   81 static float ob_scale2 = 1.0;
   82 
   83 // pruning beam widths
   84 static float beam=-1;
   85 static float ob_beam=-1;
   86 static int n_beam = -1;
   87 
   88 static bool trace_on = FALSE;
   89 
   90 // always logs
   91 static double ob_log_prob_floor = SAFE_LOG_ZERO;  
   92 static double ob_log_prob_floor2 = SAFE_LOG_ZERO;  
   93 static double lm_log_prob_floor = SAFE_LOG_ZERO;  
   94 
   95 int btest_debug = FALSE;
   96 static EST_String out_file = "";
   97 static EST_StrList vocab;
   98 static EST_Track observations;  
   99 static EST_Track observations2;  
  100 static EST_TList<EST_StrVector> given; // to do : convert to array for speed
  101 int using_given=FALSE;
  102 
  103 // default is that obs are already logs
  104 int take_logs = FALSE;
  105 int num_obs = 1;
  106 
  107 
  108 
  109 
  110 /** @name  <command>viterbi</command> <emphasis>Combine n-gram model and likelihoods to estimate posterior probabilities</emphasis>
  111   * @id viterbi-manual
  112   * @toc
  113  */
  114 
  115 //@{
  116 
  117 /**@name Synopsis
  118   */
  119 //@{
  120 
  121 //@synopsis
  122 
  123 /**
  124 viterbi is a simple time-synchronous Viterbi decoder. It finds the
  125 most likely sequence of items drawn from a fixed vocabulary, given
  126 frame-by-frame observation probabilities for each item in that
  127 vocabulary, and a ngram grammar. Possible uses include:
  128 
  129 <itemizedlist>
  130 <listitem><para>Simple speech recogniser back end</para></listitem>
  131 </itemizedlist>
  132 
  133 viterbi can optionally use two sets of frame-by-frame observation
  134 probabilities in a weighted-sum fashion. Also, the ngram language model
  135 is not restricted to the conventional sliding window type in which the
  136 previous n-1 items are the ngram context. Items in the ngram context
  137 at each frame may be given. In this case, the user must provide a file
  138 containing the ngram context: one (n-1) tuple per line. To include
  139 items from the partial Viterbi path so far (i.e. found at recognition
  140 time, not given) the special notation <-N> is used where N indicates
  141 the distance back to the item required. For example <-1> would
  142 indicate the item on the partial Viterbi path at the last frame. See
  143 \Ref{Examples}.
  144 
  145 <formalpara>
  146 <para><title>Pruning</title></para>
  147 
  148 <para>
  149 Three types of pruning are available to reduce the size of the search
  150 space and therefore speed up the search:
  151 
  152 <itemizedlist>
  153 <listitem><para>Observation pruning</para></listitem>
  154 <listitem><para>Top-N pruning at each frame</para></listitem>
  155 <listitem><para>Fixed width beam pruning</para></listitem>
  156 </itemizedlist>
  157 
  158 </para>
  159 </formalpara>
  160 
  161 */
  162 
  163 //@}
  164 
  165 /**@name Options
  166   */
  167 //@{
  168 
  169 //@options
  170 
  171 
  172 //@}
  173 
  174 int main(int argc, char **argv)
  175 {
  176     EST_StrList files;
  177     EST_Option al;
  178     EST_Relation wstream;
  179     double floor; // a temporary
  180 
  181     parse_command_line(argc, argv, 
  182        EST_String("[observations file] -o [output file]\n")+
  183        "Summary: find the most likely path through a sequence of\n"+
  184        "         observations, constrained by a language model.\n"+
  185        "-ngram <string>     Grammar file, required\n"+
  186        "-given <string>     ngram left contexts, per frame\n"+
  187        "-vocab <string>     File with names of vocabulary, this\n"+
  188        "                    must be same number as width of observations, required\n"+
  189        "-ob_type <string>   Observation type : likelihood .... and change doc\"probs\" or \"logs\" (default is \"logs\")\n"+
  190        "\nFloor values and scaling (scaling is applied after floor value)\n"+
  191        "-lm_floor <float>   LM floor probability\n"+
  192        "-lm_scale <float>   LM scale factor factor (applied to log prob)\n"+
  193        "-ob_floor <float>   Observations floor probability\n"+
  194        "-ob_scale <float>   Observation scale factor (applied to prob or log prob, depending on -ob_type)\n\n"+
  195        "-prev_tag <string>\n"+
  196        "                 tag before sentence start\n"+
  197        "-prev_prev_tag <string>\n"+
  198        "                 all words before 'prev_tag'\n"+
  199        "-last_tag <string>\n"+
  200        "                 after sentence end\n"+
  201        "-default_tags    use default tags of "+SENTENCE_START_MARKER+","
  202             SENTENCE_END_MARKER+" and "+SENTENCE_END_MARKER+"\n"+
  203        "                 respectively\n\n"+
  204 
  205        "-observes2  <string> second observations (overlays first, ob_type must be same)\n"+
  206        "-ob_floor2 <float>  \n"+
  207        "-ob_scale2 <float>  \n\n"+
  208        "-ob_prune  <float> observation pruning beam width (log) probability\n"+
  209        "-n_prune   <int>   top-n pruning of observations\n"+
  210        "-prune     <float> pruning beam width (log) probability\n"+
  211        "-trace             show details of search as it proceeds\n",
  212             files, al);
  213 
  214     out_file = al.present("-o") ? al.val("-o") : (EST_String)"-";
  215 
  216     if (files.length() != 1)
  217       {
  218     cerr << argv[0];
  219     cerr << ": you must give exactly one observations file on the command line";
  220     cerr << endl;
  221     cerr << "(use -observes2 for optional second observations)" << endl;
  222     exit(-1);
  223       }
  224 
  225     if (al.present("-ngram"))
  226     {
  227     ngram.load(al.val("-ngram"));
  228     }
  229     else
  230     {
  231     cerr << argv[0] << ": no ngram specified" << endl;
  232     exit(-1);
  233     }
  234 
  235     if(!al.present("-vocab"))
  236       {
  237     cerr << "You must provide a vocabulary file !" << endl;
  238     exit(-1);
  239       }
  240 
  241     load_wstream(files.first(),al.val("-vocab"),wstream,observations);
  242     if (al.present("-observes2"))
  243     {
  244     load_wstream(al.val("-observes2"),al.val("-vocab"),wstream,observations2);
  245     num_obs = 2;
  246     }
  247 
  248     if (al.present("-given"))
  249     {
  250     load_given(al.val("-given"),ngram.order());
  251     using_given=TRUE;
  252     }
  253 
  254     if (al.present("-lm_scale"))
  255     lm_scale = al.fval("-lm_scale");
  256     else
  257     lm_scale = 1.0;
  258 
  259     if (al.present("-ob_scale"))
  260     ob_scale = al.fval("-ob_scale");
  261     else
  262     ob_scale = 1.0;
  263 
  264     if (al.present("-ob_scale2"))
  265     ob_scale2 = al.fval("-ob_scale2");
  266     else
  267     ob_scale2 = 1.0;
  268 
  269     if (al.present("-prev_tag"))
  270     pstring = al.val("-prev_tag");
  271     if (al.present("-prev_prev_tag"))
  272     ppstring = al.val("-prev_prev_tag");
  273 
  274     // pruning
  275     if (al.present("-prune"))
  276     beam = al.fval("-prune");
  277     else
  278     beam = -1; // no pruning
  279 
  280     if (al.present("-ob_prune"))
  281     ob_beam = al.fval("-ob_prune");
  282     else
  283     ob_beam = -1; // no pruning
  284 
  285     if (al.present("-n_prune"))
  286     {
  287     n_beam = al.ival("-n_prune");
  288     if(n_beam <= 0)
  289     {
  290         cerr << "WARNING : " << n_beam;
  291         cerr << " is not a reasonable value for -n_prune !" << endl;
  292     }
  293     }
  294     else
  295     n_beam = -1; // no pruning
  296     
  297 
  298 
  299     if (al.present("-trace"))
  300     trace_on = TRUE;
  301 
  302     // language model floor
  303     if (al.present("-lm_floor"))
  304     {
  305     floor = al.fval("-lm_floor");
  306     if(floor < 0)
  307     {
  308         cerr << "Error : LM floor probability is negative !" << endl;
  309         exit(-1);
  310     }
  311     else if(floor > 1)
  312     {
  313         cerr << "Error : LM floor probability > 1 " << endl;
  314         exit(-1);
  315     }
  316     lm_log_prob_floor = safe_log(floor);
  317     }
  318 
  319     // observations floor
  320     if (al.present("-ob_floor"))
  321     {
  322     floor = al.fval("-ob_floor");
  323     if(floor < 0)
  324     {
  325         cerr << "Error : Observation floor probability is negative !" << endl;
  326         exit(-1);
  327     }
  328     else if(floor > 1)
  329     {
  330         cerr << "Error : Observation floor probability > 1 " << endl;
  331         exit(-1);
  332     }
  333     ob_log_prob_floor = safe_log(floor);
  334     }
  335 
  336     if (al.present("-ob_floor2"))
  337     {
  338     floor = al.fval("-ob_floor2");
  339     if(floor < 0)
  340     {
  341         cerr << "Error : Observation2 floor probability is negative !" << endl;
  342         exit(-1);
  343     }
  344     else if(floor > 1)
  345     {
  346         cerr << "Error : Observation2 floor probability > 1 " << endl;
  347         exit(-1);
  348     }
  349     ob_log_prob_floor2 = safe_log(floor);
  350     }
  351     
  352 
  353     if (al.present("-ob_type"))
  354     {
  355     if(al.val("-ob_type") == "logs")
  356         take_logs = false;
  357     else if(al.val("-ob_type") == "probs")
  358         take_logs = true;
  359     else
  360     {
  361         cerr << "\"" << al.val("-ob_type") 
  362         << "\" is not a valid ob_type : try \"logs\" or \"probs\"" << endl;
  363         exit(-1);
  364     }
  365     }
  366 
  367     if(do_search(wstream))
  368     print_results(wstream);
  369     else
  370     cerr << "No path could be found." << endl;
  371 
  372     return 0;
  373 }
  374 
  375 static void print_results(EST_Relation &wstream)
  376 {
  377     EST_Item *s;
  378     float pscore;
  379     EST_String predict;
  380     FILE *fd;
  381 
  382     if (out_file == "-")
  383     fd = stdout;
  384     else if ((fd = fopen(out_file,"wb")) == NULL)
  385     {
  386     cerr << "can't open \"" << out_file << "\" for output" << endl;
  387     exit(-1);
  388     }
  389 
  390     for (s=wstream.head(); s != 0 ; s=inext(s))
  391     {
  392     predict = s->f("best").string();
  393     pscore = s->f("best_score");
  394     fprintf(fd,"%s %f\n",(const char *)predict,pscore);
  395     }
  396 
  397     if (out_file != "")
  398     fclose(fd);
  399 
  400 }
  401 
  402 static bool do_search(EST_Relation &wstream)
  403 {
  404     // Apply Ngram to matrix of probs 
  405     int states;
  406 
  407     states = ngram.num_states();
  408     EST_Viterbi_Decoder vc(vit_candlist,vit_npath,states);
  409 
  410     vc.initialise(&wstream);
  411 
  412     if((beam > 0) || (ob_beam > 0))
  413     vc.set_pruning_parameters(beam,ob_beam);
  414 
  415     if(trace_on)
  416     {
  417     vc.turn_on_trace();
  418     cerr << "Starting Viterbi search..." << endl;
  419     }
  420 
  421     vc.search();
  422 
  423     return vc.result("best");  // adds fields to w with best values 
  424 
  425 }
  426 
  427 static void load_wstream(const EST_String &filename,
  428              const EST_String &vfile, 
  429              EST_Relation &w,
  430              EST_Track &obs)
  431 {
  432     // Load in vocab and probs into Stream (this isn't general)
  433     EST_String word, pos;
  434     int i=-1;
  435 
  436     if(vocab.empty())
  437     load_vocab(vfile);
  438 
  439     if (obs.load(filename,0.10) != 0)
  440     {
  441     cerr << "can't find observations file \"" << filename << "\"" << endl;
  442     exit(-1);
  443     }
  444 
  445     if (vocab.length() != obs.num_channels())
  446     {
  447     cerr << "Number in vocab (" << vocab.length() << 
  448         ") not equal to observation's width (" <<
  449         obs.num_channels() << ")" << endl;
  450     exit(-1);
  451     }
  452     
  453     if(w.empty())
  454     {
  455     for (i=0; i < obs.num_frames(); i++)
  456         {
  457         add_word(w,itoString(i),i);
  458         }
  459 
  460     }
  461 }
  462 
  463 
  464 static void load_given(const EST_String &filename,
  465                const int ngram_order)
  466 {
  467 
  468     EST_String word, pos;
  469     EST_Litem *p;
  470     int i,j;
  471 
  472     if (load_TList_of_StrVector(given,filename,ngram_order-1) != 0)
  473     {
  474     cerr << "can't load given file \"" << filename << "\"" << endl;
  475     exit(-1);
  476     }
  477 
  478     // set max history
  479     for (p = given.head(); p; p = p->next())
  480     {
  481     for(i=0;i<given(p).length();i++)
  482         if( is_a_special( given(p)(i), j) && (-j > max_history))
  483         max_history = -j;
  484     
  485     }
  486     
  487 }
  488 
  489 static void load_vocab(const EST_String &vfile)
  490 {
  491     // Load vocabulary (strings)
  492     EST_TokenStream ts;
  493 
  494     if (ts.open(vfile) == -1)
  495     {
  496     cerr << "can't find vocab file \"" << vfile << "\"" << endl;
  497     exit(-1);
  498     }
  499 
  500     while (!ts.eof())
  501     {
  502     if (ts.peek() != "")
  503         {
  504         vocab.append(ts.get().string());
  505         }
  506     }
  507 
  508     ts.close();
  509 }
  510 
  511 static void add_word(EST_Relation &w, const EST_String &word, int pos)
  512 {
  513     EST_Item *item = w.append();
  514     
  515     item->set_name(word);
  516     item->set("pos",pos);
  517 } 
  518 
  519 static EST_VTCandidate *vit_candlist(EST_Item *s,EST_Features &f)
  520 {
  521     // Return a list of new candidates from this point 
  522     double prob=1.0,prob2=1.0;
  523     int i;
  524     EST_Litem *p;
  525     int observe;
  526     EST_VTCandidate *all_c = 0;
  527     EST_VTCandidate *c;
  528     (void)f;
  529 
  530     observe = s->f("pos");  // index for observations TRACK
  531     for (i=0,p=vocab.head(); i < observations.num_channels(); i++,p=p->next())
  532     {
  533     c = new EST_VTCandidate;
  534     c->name = vocab(p);  // to be more efficient this could be the index
  535     prob = observations.a(observe,i);
  536     if(num_obs == 2)
  537         prob2 = observations2.a(observe,i);
  538 
  539     if(take_logs)
  540     {
  541         prob = safe_log10(prob);
  542         if (prob < ob_log_prob_floor)
  543         prob = ob_log_prob_floor;
  544 
  545         if(num_obs == 2)
  546         {
  547         prob2 = safe_log10(prob2);
  548         if (prob2 < ob_log_prob_floor2)
  549             prob2 = ob_log_prob_floor2;
  550         }
  551     }
  552     else // already in logs
  553     {
  554         if (prob < ob_log_prob_floor)
  555         prob = ob_log_prob_floor;
  556         if ((num_obs == 2) && (prob2 < ob_log_prob_floor2))
  557         prob2 = ob_log_prob_floor2;
  558     }
  559 
  560     prob *= ob_scale;
  561     prob2 *= ob_scale2;
  562 
  563     if(num_obs == 2)
  564         c->score = prob + prob2;
  565     else
  566         c->score = prob;
  567 
  568     c->next = all_c;
  569     c->s = s;
  570     all_c = c;
  571     }
  572 
  573     if(n_beam > 0)
  574     {
  575     // N.B. this might be very time-consuming
  576     top_n_candidates(all_c);
  577     }
  578 
  579     return all_c;
  580 }
  581 
  582 static EST_VTPath *vit_npath(EST_VTPath *p,EST_VTCandidate *c,
  583                  EST_Features &f)
  584 {
  585     // Build a (potential) new path link from this previous path and 
  586     // This candidate 
  587     EST_VTPath *np = new EST_VTPath;
  588     double lprob,prob;
  589     EST_String prev,ttt;
  590     (void)f;
  591 
  592     np->c = c;
  593     np->from = p;
  594 
  595     // are we using extra info ?
  596     if(using_given)
  597     // time of candidate is
  598     // c->s->f("pos");
  599     prob = find_extra_gram_prob(np,&np->state,c->s->f("pos"));
  600     else
  601     prob = find_gram_prob(np,&np->state);
  602 
  603     lprob = safe_log10(prob);
  604     if (lprob < lm_log_prob_floor)
  605     lprob = lm_log_prob_floor;
  606 
  607     lprob *= lm_scale;
  608 
  609     np->f.set("lscore",(c->score+lprob)); // simonk : changed prob to lprob
  610     if (p==0)
  611     np->score = (c->score+lprob);
  612     else
  613     np->score = (c->score+lprob) + p->score;
  614 
  615     return np;
  616 }
  617 
  618 static double find_gram_prob(EST_VTPath *p,int *state)
  619 {
  620     // Look up transition probability from *state for name.
  621     // Return probability and update state
  622     double prob=0.0,nprob;
  623     int i,f=FALSE;
  624     EST_VTPath *pp;
  625     
  626     EST_StrVector window(ngram.order());
  627     for (pp=p->from,i=ngram.order()-2; i >= 0; i--)
  628     {
  629     if (pp != 0)
  630     {
  631         window[i] = pp->c->name.string();
  632         pp = pp->from;
  633     }
  634     else if (f)
  635         window[i] = ppstring;
  636     else
  637     {
  638         window[i] = pstring;
  639         f = TRUE;
  640     }
  641     }
  642     window[ngram.order()-1] = p->c->name.string();
  643     const EST_DiscreteProbDistribution &pd = ngram.prob_dist(window);
  644     if (pd.samples() == 0)
  645     prob = 0;
  646     else
  647     prob = (double)pd.probability(p->c->name.string());
  648     
  649     for (i=0; i < ngram.order()-1; i++)
  650     window[i] = window(i+1);
  651     ngram.predict(window,&nprob,state);
  652 
  653     return prob;
  654 }
  655 
  656 
  657 static double find_extra_gram_prob(EST_VTPath *p,int *state,int time)
  658 {
  659 
  660     int i;
  661     double prob=0.0,nprob;
  662     EST_StrVector window(ngram.order());
  663     EST_StrVector history(max_history);
  664 
  665     get_history(history,p);
  666 
  667     fill_window(window,history,p,time);
  668 
  669     /*
  670     cerr << "Looking up ngram ";
  671     for(i=0;i<window.num_points();i++)
  672     cerr << window(i) << " ";
  673     cerr << endl;
  674     */
  675 
  676     const EST_DiscreteProbDistribution &pd = ngram.prob_dist(window);
  677     if (pd.samples() == 0)
  678     prob = 0;
  679     else
  680     prob = (double)pd.probability(p->c->name.string());
  681 
  682     // shift history, adding latest item at 'end' (0)
  683     if(max_history>0)
  684     {
  685     for(i=history.length()-1;i>0;i--)
  686         history[i] = history(i-1);
  687     history[0] = p->c->name.string();
  688     }
  689 
  690     fill_window(window,history,p,time+1);
  691     ngram.predict(window,&nprob,state);
  692 
  693     //cerr << endl << endl;
  694 
  695     return prob;
  696 
  697 }
  698 
  699 static void get_history(EST_StrVector &history, EST_VTPath *p)
  700 {
  701 
  702     EST_VTPath *pp;
  703     int i,f=FALSE;
  704     for (pp=p->from,i=0; i < history.length(); i++)
  705     {
  706     
  707     if (pp != 0)
  708     {
  709         history[i] = pp->c->name.string();
  710         pp = pp->from;
  711     }
  712     else if (f)
  713         history[i] = ppstring;
  714     else
  715     {
  716         history[i] = pstring;
  717         f = TRUE;
  718     }
  719     }
  720 
  721 }
  722 
  723 static void fill_window(EST_StrVector &window,EST_StrVector &history,
  724             EST_VTPath *p,const int time)
  725 {
  726     // Look up transition probability from *state for name.
  727     // Return probability and update state
  728     int i,j;
  729     EST_String s;
  730 
  731     // can we even do it?
  732     if( time >= given.length() )
  733     return;
  734 
  735     // format should be run-time defined, but try this for now
  736     // first n-1 things in window come from 'given'
  737     // last one is predictee
  738 
  739     // also want vocab and grammar mismatch allowed !!!!!!
  740 
  741     // predictee
  742     window[ngram.order()-1] = p->c->name.string();
  743 
  744     // given info for this time
  745     EST_StrVector *this_g = &(given.nth(time)); // inefficient to count down a list
  746 
  747 
  748     for(i=0;i<ngram.order()-1;i++)
  749     {
  750 
  751     if( is_a_special( (*this_g)(i), j))
  752         window[i] = history(-1-j); // j=-1 -> (0)   j=-2 -> (1)   etc.
  753     else
  754         window[i] = (*this_g)(i);
  755     }
  756 }
  757 
  758 
  759 
  760 static int is_a_special(const EST_String &s, int &val)
  761 {
  762 
  763     // special is "<int>"
  764 
  765     EST_String tmp;
  766     if(s.contains("<") && s.contains(">"))
  767     {
  768     tmp = s.after("<");
  769     tmp = tmp.before(">");
  770     val = atoi(tmp);
  771     //cerr << "special " << tmp << "=" << val << endl;
  772     return TRUE;
  773     }
  774     return FALSE;
  775 }
  776 
  777 static void top_n_candidates(EST_VTCandidate* &all_c)
  778 {
  779     // keep the n most likely candidates
  780     // avoiding a full sort of the (potentially long) list
  781 
  782     EST_VTCandidate *top_c=NULL,*p,*q,*this_best, *prev_to_best;
  783     int i;
  784     if(n_beam < 1)
  785     return; // do nothing
  786 
  787     // here we assume big is always good
  788     //if(!big_is_good)
  789     //score_multiplier = -1;
  790 
  791     for(i=0;i<n_beam;i++)
  792     {
  793 
  794     // head of the list is best guess
  795     this_best=all_c;
  796     prev_to_best=NULL;
  797 
  798     // find best candidate in all_c
  799     q=NULL;;
  800     for(p=all_c;p!= NULL;q=p,p=p->next)
  801     {
  802         //cerr << "item : " << p->score << endl;
  803         if(p->score > this_best->score)
  804         {
  805         this_best = p;
  806         prev_to_best=q;
  807         }
  808     }
  809     
  810     if(this_best == NULL)
  811         break; // give up now - must have run out of candidates
  812 
  813     // move best candidate over to new list
  814     if(prev_to_best == NULL)
  815         // best was head of list
  816         all_c = this_best->next;
  817     else
  818         {
  819         // best was not head of all_c
  820         prev_to_best->next = this_best->next;
  821 
  822         this_best->next = top_c;
  823         top_c = this_best;
  824         }
  825     }
  826 
  827     delete all_c;
  828     all_c = top_c;
  829 
  830 /*
  831     cerr << "Here are the top " << n_beam << " candidates" << endl;
  832     for(p=all_c;p != NULL;p=p->next)
  833     cerr << p->score << endl;
  834 */
  835 }
  836 
  837 
  838 /**@name Examples
  839 
  840 Example 'given' file (items f and g are in the vocabulary), the ngram
  841 is a 4-gram.
  842 
  843 <para>
  844 <screen>
  845 <-2> g g
  846 <-1> g f
  847 <-1> f g
  848 <-2> g g
  849 <-3> g g
  850 <-1> g f
  851 </screen>
  852 </para>
  853 
  854 */
  855 //@{
  856 //@}
  857 
  858 
  859 
  860 //@}