"Fossies" - the Fresh Open Source Software Archive

Member "speech_tools/grammar/ngram/ngrammar_io.cc" (4 Sep 2017, 22960 Bytes) of package /linux/misc/speech_tools-2.5.0-release.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "ngrammar_io.cc" see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 2.4-release_vs_2.5.0-release.

/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                      Copyright (c) 1996,1997                          */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*                     Author :  Simon King & Alan W Black               */
/*                     Date   :  February 1997                           */
/*-----------------------------------------------------------------------*/
/*                                                                       */
/* IO functions for EST_Ngram class                                      */
/*                                                                       */
/*=======================================================================*/

#include <cstdlib>
#include <fstream>
#include <iostream>
#include "EST_unix.h"
#include <cstring>
#include <climits>
#include <cfloat>
#include "EST_String.h"
#include "EST_Ngrammar.h"
#include "EST_Token.h"
#include "EST_cutils.h"

EST_read_status
load_ngram_htk_ascii(const EST_String filename, EST_Ngrammar &n)
{
    (void)filename;
    (void)n;
    return wrong_format;
}

EST_read_status
load_ngram_htk_binary(const EST_String filename, EST_Ngrammar &n)
{
    (void)filename;
    (void)n;
    return wrong_format;
}

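// The loader below scans for ARPA / MIT-LL style section markers:
// "\data\", "ngram N=<count>", "\N-grams:" and "\end\".  A minimal header
// it would accept looks roughly like this (an illustrative sketch only,
// not taken from a real file):
//
//   \data\       <- start of the header
//   ngram 1=4
//   ngram 2=2
//
//   \1-grams:
//   ...
//   \2-grams:
//   ...
//   \end\        <- end of the data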
EST_read_status
load_ngram_arpa(const EST_String filename, EST_Ngrammar &n, const EST_StrList &vocab)
{
    EST_TokenStream ts;
    EST_String s;
    int i,j,k, order=0;
    double occur,weight;
    int this_num,this_order;

    if (ts.open(filename) == -1)
        return misc_read_error;

    // find the \data\ marker
    while ((!ts.eof()) && !ts.get().string().contains("\\data\\"));

    if (ts.eof())
    {
        ts.close();
        return wrong_format;
    }

    // find order and numbers of ngrams

    // somewhere to keep numbers
    EST_IVector nums(100); // not going to have anything bigger than a 100-gram !

    while (!ts.eof())
    {
        // have we got to the next section ?
        if (ts.peek().string().contains("-grams:"))
            break;

        s=ts.get_upto_eoln().string();

        if (s.contains("ngram ") && s.contains("="))
        {
            s=s.after("ngram ");
            this_order=atoi(s.before("="));
            this_num=atoi(s.after("="));

            //cerr << "There are " << this_num << " " << this_order
            //<< "-grams" << endl;

            nums[this_order] = this_num;

            if (this_order > order)
                order = this_order;
        }
    }

    if (order==0)
    {
        //cerr << "No ngram ?=? in header !" << endl;
        ts.close();
        return wrong_format;
    }

    //cerr << "Initialising " << order << "-grammar" << endl;
    if (!n.init(order,EST_Ngrammar::backoff,vocab))
        return misc_read_error;

    // read data
    for (i=1; i<=order; i++)
    {
        EST_StrVector window(i);

        // find start of data for this order "<order>-grams:"
        EST_String tmp = "\\" + itoString(i) + "-grams:";
        while (!ts.eof())
        {
            s=ts.get().string();
            if (s.contains(tmp))
                break;
        }

        if (ts.eof())
        {
            cerr << "Unexpected end of grammar file whilst looking for '"
                 << tmp << "'" << endl;
            return misc_read_error;
        }

        //cerr << "Found order " << i << " : " << tmp << endl;
        //cerr << "Looking for " << nums(i) << " ngrams" << endl;
        // look for nums(i) ngrams

        for (j=0; j<nums(i); j++)
        {
            for (k=0; ((k<i) && !ts.eof()); k++)
                window[k] = ts.get().string();

            if (ts.eof())
            {
                cerr << "Unexpected end of file whilst reading " << i
                     << "-grams !" << endl;
                return misc_read_error;
            }

            // can't accumulate for backoff grammars, need to set probs directly

            cerr << "ooooooooops" << endl;
            return wrong_format;

            occur = atof(ts.get().string());
            n.accumulate(window,occur);

            // backoff weight ?
            if (!ts.eoln())
            {
                weight = atof(ts.get().string());
                n.set_backoff_weight(window,weight);
            }

            if (!ts.eoln())
            {
                cerr << "EST_Ngrammar:load_ngram_arpa expect end of line at filepos "
                     << ts.filepos() << endl;
                ts.close();
                return misc_read_error;
            }
        }

    } // loop through orders

    // find the \end\ marker
    while (!ts.eof())
        if (ts.get().string() == "\\end\\")
        {
            ts.close();
            return format_ok;
        }

    cerr << "Missing \\end\\ !" << endl;

    ts.close();
    return misc_read_error;
}

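// The "cstr ascii" format read below (and written by save_ngram_cstr_ascii
// later in this file) looks roughly like this, for a bigram over a tiny
// vocabulary (an illustrative sketch only, not real data):
//
//   Ngram_2 2
//   <s> a b </s>
//   <s> a b </s>
//   a a : 3
//   a b : 1
//   ...
//
// i.e. a magic word plus the order, the vocabulary line, the prediction
// vocabulary line (which may differ), then one
// "<context words> <predicted word> : <frequency>" line per ngram.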
EST_read_status
load_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n)
{
    EST_TokenStream ts;
    int i, order;
    double occur;

    if (ts.open(filename) == -1)
        return misc_read_error;

    if (ts.peek().string() != "Ngram_2")
    {
        ts.close();
        return wrong_format;
    }
    ts.get();           // skip magic number

    order = atoi(ts.get().string());
    ts.get_upto_eoln();     // skip to next line
    EST_StrList vocab;
    EST_StrList pred_vocab; // may be different

    while (!ts.eoln())
        vocab.append(ts.get().string());
    ts.get_upto_eoln();     // skip to next line
    while (!ts.eoln())
        pred_vocab.append(ts.get().string());

    if (!n.init(order,EST_Ngrammar::dense,vocab,pred_vocab))
    {
        cerr << "Something may be wrong with the vocab lists in '"
             << filename << "'" << endl;
        return misc_read_error;
    }

    EST_StrVector window(order);

    while (!ts.eof())
    {
        for (i=0; i < order; i++)
            window[i] = ts.get().string();
        if (ts.get().string() != ":")
        {
            cerr << "EST_Ngrammar:load_ngram_cstr_ascii missing colon at filepos "
                 << ts.filepos() << endl;
            return misc_read_error;
        }
        occur = atof(ts.get().string());
        n.accumulate(window,occur);
        if (!ts.eoln())
        {
            cerr << "EST_Ngrammar:load_ngram_cstr_ascii expect end of line at filepos "
                 << ts.filepos() << endl;
            return misc_read_error;
        }
    }

    ts.close();

    return format_ok;
}

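// The "cstr binary" format read below (written by save_ngram_cstr_bin at
// the end of this file) is, roughly: a raw int magic number
// (EST_NGRAMBIN_MAGIC, possibly byte-swapped), an ASCII header
// "mBin_2 <order>\n", the vocabulary and prediction vocabulary as one
// space-separated line each, and then the frequencies as raw doubles,
// run-length encoded as described in save_ngram_cstr_bin.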
EST_read_status
load_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n)
{
    EST_TokenStream ts;
    int i,j,order;
    EST_Litem *k;
    int num_entries;
    double approx_num_samples = 0.0;
    long freq_data_start, freq_data_end;
    FILE *ifd;
    int magic = 0;
    int swap = FALSE;

    if ((ifd=fopen(filename,"rb")) == NULL)
        return misc_read_error;
    fread(&magic,sizeof(int),1,ifd);

    if (SWAPINT(magic) == EST_NGRAMBIN_MAGIC)
        swap = TRUE;
    else if (magic != EST_NGRAMBIN_MAGIC)
        return wrong_format;
    if (ts.open(ifd, FALSE) == -1)
        return misc_read_error;

    ts.set_SingleCharSymbols("\n");
    ts.set_WhiteSpaceChars(" \t\r");

    if (ts.peek().string() != "mBin_2")
    {
        fclose(ifd);
        ts.close();
        return wrong_format;
    }
    ts.get();           // skip magic number

    order = atoi(ts.get().string());
    if (ts.get() != "\n")
    {
        fclose(ifd);
        ts.close();
        return misc_read_error;
    }
    EST_StrList vocab;
    EST_StrList pred_vocab; // may be different

    while ((ts.peek() != "\n") && (!ts.eof()))
        vocab.append(ts.get().string());
    ts.get();           // skip newline
    while ((ts.peek() != "\n") && (!ts.eof()))
        pred_vocab.append(ts.get().string());

    // Need to get to the position one after the newline and
    // who knows what TokenStream has already read.
    fseek(ifd,(long)(ts.peek().filepos()+5),SEEK_SET);

    if (!n.init(order,EST_Ngrammar::dense,vocab,pred_vocab))
    {
        ts.close();
        fclose(ifd);
        return misc_read_error;
    }

    EST_StrVector window(order);

    freq_data_start = ftell(ifd);
    fseek(ifd,0,SEEK_END);
    freq_data_end = ftell(ifd);
    num_entries = (freq_data_end-freq_data_start)/sizeof(double);
    double *dd = new double[num_entries];

    // Go back to start of data
    fseek(ifd,freq_data_start,SEEK_SET);

    if (fread(dd,sizeof(double),num_entries,ifd) != (unsigned)num_entries)
    {
        cerr << "EST_Ngrammar::load_ngram_cstr_bin format does not have expected number of entries" << endl;
        ts.close();
        fclose(ifd);
        return misc_read_error;
    }
    if (swap)
        swap_bytes_double(dd,num_entries);

    for (j=i=0; i<n.num_states(); i++)
    {
        if (j >= num_entries)
        {
            cerr << "EST_Ngrammar::load_ngram_cstr_bin unexpected end of frequency data" << endl;
            ts.close();
            fclose(ifd);
            return misc_read_error;
        }
        for (k=n.p_states[i].pdf().item_start();
             (!n.p_states[i].pdf().item_end(k)) && (j < num_entries) ;
             k = n.p_states[i].pdf().item_next(k))
        {
            n.p_states[i].pdf().set_frequency(k,dd[j]);
            // Update global info too
            approx_num_samples += dd[j]; // probably not right
            n.vocab_pdf.cumulate(k,dd[j]);

            // The number of consecutive occurrences of this frequency is
            // given in dd[j+1] if it is a negative number
            if (j+1 >= num_entries)
                j++;
            else if (dd[j+1] < -1)
                dd[j+1]++;
            else if (dd[j+1] == -1)
                j += 2;
            else
                j++;
        }
    }

    // With smoothing num_samples might not be as exact as you like
    n.p_num_samples = (int)approx_num_samples;

    delete [] dd;

    ts.close();
    fclose(ifd);

    return format_ok;
}

// ====================================================================

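// Note on the probability flooring used by save_ngram_htk_ascii_sub below
// (a sketch derived from the code, with V = n.pred_vocab->length()):
// each of the V-1 predicted words gets at least `floor', and each
// maximum-likelihood probability p = freq/total_freq is rescaled as
//
//     output_prob = floor + p * (1 - floor*(V-1))
//
// so that a row still sums to floor*(V-1) + (1 - floor*(V-1)) = 1.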
EST_write_status
save_ngram_htk_ascii_sub(const EST_String &word, ostream *ost,
                         EST_Ngrammar &n, double floor)
{
    EST_Litem *k;
    EST_String name;
    double freq;
    EST_StrVector this_ngram(2); // assumes bigram
    this_ngram[0] = word;
    EST_DiscreteProbDistribution this_pdf;
    this_pdf = n.prob_dist(this_ngram);

    double lfreq=-1;
    int lcount=0;
    double total_freq=0;

    double floor_prob_total = floor * (n.pred_vocab->length()-1);

    if (word == n.p_sentence_end_marker)
    {
        *ost << word;
        *ost << " 0*" << n.pred_vocab->length()-1 << " " << 1 << endl;
        return write_ok;
    }

    if (floor_prob_total > 1)
    {
        cerr << "ERROR : floor is impossibly large, scaling it !" << endl;
        floor = 1.0 / (double)(n.pred_vocab->length()-1);
        floor_prob_total = 1;
    }

    // not efficient but who cares ?
    for (k=this_pdf.item_start();
         !this_pdf.item_end(k);
         k = this_pdf.item_next(k))
    {
        this_pdf.item_freq(k,name,freq);
        if (name != n.p_sentence_start_marker)
        {
            total_freq += freq;
        }
    }

    // 0 for prob(word,start marker)
    *ost << word << " 0 ";

    if (total_freq <= 0)
    {
        *ost << 1.0 / (double)(n.pred_vocab->length()-1) << "*";
        *ost << n.pred_vocab->length()-1 << " " << endl;
    }
    else
    {
        lfreq=-1;

        for (k=this_pdf.item_start();
             !this_pdf.item_end(k);
             k = this_pdf.item_next(k))
        {
            this_pdf.item_freq(k,name,freq);

            if ( (name == n.p_sentence_start_marker) ||
                 (name == n.p_sentence_end_marker) ||
                 (name == OOV_MARKER) )
                continue;

            if (freq == lfreq)
                lcount++;
            else
            {
                if (lcount > 1)
                    *ost << "*" << lcount << " ";
                else
                    *ost << " ";

                lcount=1;
                lfreq = freq;

                if (freq > 0)
                {
                    double base_prob = freq / total_freq;

                    // and floor/scale it
                    *ost << floor + ( base_prob * (1-floor_prob_total) );
                }
                else
                    *ost << floor;
            }
        }

    } // total_freq > 0

    if (!n.closed_vocab())
    {
        // not fully tested !!!!!!!!

        *ost << 0 << " ERROR !!!!!!!! ";
    }

    if (total_freq > 0)
    {
        freq = this_pdf.frequency(n.p_sentence_end_marker);

        if (freq == lfreq)
        {
            lcount++;
            *ost << "*" << lcount << " " << endl;
        }
        else
        {
            if (lcount > 1)
                *ost << "*" << lcount << " ";
            else
                *ost << " ";

            if (freq > 0)
            {
                double base_prob = freq / total_freq;

                // and floor/scale it
                *ost << floor + ( base_prob * (1-floor_prob_total) ) << endl;
            }
            else
                *ost << floor << endl;
        }
    }

    return write_ok;
}

EST_write_status
save_ngram_htk_ascii(const EST_String filename,
                     EST_Ngrammar &n, double floor)
{
    ostream *ost;

    // only for bigram
    if (n.order() != 2)
    {
        cerr << "Can only save bigrams in htk_ascii format" << endl;
        return misc_write_error;
    }

    if (floor < 0)
    {
        cerr << "Negative floor probability does not make sense !" << endl;
        return misc_write_error;
    }

    if (filename == "-")
        ost = &cout;
    else
        ost = new ofstream(filename);

    if (!(*ost))
        return write_fail;

    if (floor * (n.pred_vocab->length()-1) > 1)
    {
        floor = 1.0 / (double)(n.pred_vocab->length()-1);
        cerr << "ERROR : floor is impossibly large, scaling it to ";
        cerr << floor << endl;
    }

    int i;

    if (n.p_sentence_start_marker == "")
    {
        cerr << "Can't save in HTK format as no sentence start/end tags"
             << " were given !" << endl;
        return misc_write_error;
    }

    // need '!ENTER' (or whatever) as first word - that's HTK for you
    save_ngram_htk_ascii_sub(n.p_sentence_start_marker,ost,n,floor);

    // the real words
    for (i=0; i<n.vocab->length(); i++)
    {
        if ( (n.vocab->name(i) != n.p_sentence_start_marker) &&
             (n.vocab->name(i) != n.p_sentence_end_marker) &&
             (n.vocab->name(i) != OOV_MARKER) )
            save_ngram_htk_ascii_sub(n.vocab->name(i),ost,n,floor);
    }

    if (!n.closed_vocab())
        save_ngram_htk_ascii_sub(OOV_MARKER,ost,n,floor);

    save_ngram_htk_ascii_sub(n.p_sentence_end_marker,ost,n,floor);

    if (ost != &cout)
        delete ost;

    return write_ok;
}

/*
   EST_write_status
   save_ngram_htk_binary(const EST_String filename, EST_Ngrammar &n)
   {
   return write_ok;
   }
   */

void
count_ngram_arpa_sub(EST_Ngrammar *n, EST_StrVector &ngram, void *count)
{
    if (n->ngram_exists(ngram))
        *((double*)count) += 1;
}

void
save_ngram_arpa_sub(EST_Ngrammar *n, EST_StrVector &ngram, void *ost)
{
    int i;

    if (n->ngram_exists(ngram))
    {
        *((ostream*)(ost)) << safe_log10(n->probability(ngram)) << " ";
        for (i=0; i<ngram.n(); i++)
            *((ostream*)(ost)) << ngram(i) << " ";

        if ((n->representation() == EST_Ngrammar::backoff) &&
            (n->order() > ngram.n()) )
            *((ostream*)(ost)) << safe_log10(n->get_backoff_weight(ngram));
        //<< " = "
        //<< n->get_backoff_weight(ngram) << " ";

        *((ostream*)(ost)) << endl;
    }
}

EST_write_status
save_ngram_arpa(const EST_String filename, EST_Ngrammar &n)
{
    // ARPA MIT-LL format - see HTK manual !!

    ostream *ost;
    int i,o;

    if (filename == "-")
        ost = &cout;
    else
        ost = new ofstream(filename);

    if (!(*ost))
        return write_fail;

    //n.set_entry_type(EST_Ngrammar::probabilities);
    //n.make_htk_compatible(); // fix enter/exit probs
    //*ost << *(n.vocab) << endl;

    *ost << "\\data\\" << endl;

    double *count = new double;

    if (n.representation() == EST_Ngrammar::backoff)
    {
        for (o=1; o<=n.order(); o++)
        {
            EST_StrVector ngram(o);
            for (i=0; i<o; i++)
                ngram[i] = "";
            *count = 0;

            // this is a deeply silly way to count them,
            // we could traverse the tree directly !
            n.iterate(ngram,&count_ngram_arpa_sub,(void*)count);
            *ost << "ngram " << o << "=" << *count << endl;
        }

        for (o=1; o<=n.order(); o++)
        {
            *ost << endl;
            *ost << "\\" << o << "-grams:" << endl;
            EST_StrVector ngram(o);
            for (i=0; i<o; i++)
                ngram[i] = "";
            n.iterate(ngram,&save_ngram_arpa_sub,(void*)ost);
        }

    }
    else
    {
        EST_StrVector ngram(n.order());
        for (i=0; i<n.order(); i++)
            ngram[i] = "";
        *count = 0;
        n.iterate(ngram,&count_ngram_arpa_sub,(void*)count);
        *ost << "ngram " << n.order() << "=" << *count << endl;

        *ost << endl;
        *ost << "\\" << n.order() << "-grams:" << endl;

        for (i=0; i<n.order(); i++)
            ngram[i] = "";
        n.iterate(ngram,&save_ngram_arpa_sub,ost);

    }

    *ost << "\\end\\" << endl;

    if (ost != &cout)
        delete ost;

    return write_ok;
}

EST_write_status
save_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n,
                      const bool trace, double floor)
{
    // awb's format
    (void)trace;
    ostream *ost;
    int i;
    EST_Litem *k;

    if (filename == "-")
        ost = &cout;
    else
        ost = new ofstream(filename);

    if (!(*ost))
        return write_fail;

    *ost << "Ngram_2 " << n.order() << endl;
    for (i=0; i < n.vocab->length(); i++)
        *ost << n.vocab->name(i) << " ";
    *ost << endl;
    for (i=0; i < n.pred_vocab->length(); i++)
        *ost << n.pred_vocab->name(i) << " ";
    *ost << endl;

    if (n.representation() == EST_Ngrammar::dense)
        n.print_freqs(*ost,floor);
    else if (n.representation() == EST_Ngrammar::backoff)
    {
        int total_ngrams = (int)pow(float(n.get_vocab_length()),float(n.order()-1));

        for (i=0; i<total_ngrams; i++)
        {
            EST_DiscreteProbDistribution this_pdf;
            const EST_StrVector this_ngram = n.make_ngram_from_index(i);
            this_pdf = n.prob_dist(this_ngram);

            for (k=this_pdf.item_start();
                 !this_pdf.item_end(k);
                 k = this_pdf.item_next(k))
            {
                double freq;
                EST_String name;
                this_pdf.item_freq(k,name,freq);

                for (int jj=0; jj < this_ngram.n(); jj++)
                    *ost << this_ngram(jj) << " ";
                *ost << name << " : " << freq << endl;
            }
        }
    }

    if (ost != &cout)
        delete ost;

    return write_ok;
}

EST_write_status
save_ngram_wfst(const EST_String filename, EST_Ngrammar &n)
{
    // Save as a WFST
    FILE *ost;
    int i;

    if ((ost = fopen(filename,"wb")) == NULL)
    {
        cerr << "Ngrammar save: unable to open \"" << filename <<
            "\" for writing" << endl;
        return write_fail;
    }

    fprintf(ost,"EST_File fst\n");
    fprintf(ost,"DataType ascii\n");
    fprintf(ost,"in \"(");
    for (i=0; i < n.vocab->length(); i++)
        fprintf(ost," %s\n",(const char *)n.vocab->name(i));
    fprintf(ost," )\"\n");
    fprintf(ost,"out \"(");
    for (i=0; i < n.vocab->length(); i++)
        fprintf(ost," %s\n",(const char *)n.vocab->name(i));
    fprintf(ost," )\"\n");
    fprintf(ost,"NumStates %d\n",n.num_states());
    fprintf(ost,"EST_Header_End\n");

    for (i=0; i<n.num_states(); i++)
    {
        fprintf(ost,"((%d nonfinal %d)\n",i,i);
        fprintf(ost,")\n");
    }

    fclose(ost);

    return write_ok;
}

EST_write_status
save_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n,
                    const bool trace, double floor)
{
    if (n.representation() == EST_Ngrammar::sparse)
        return misc_write_error;

    int i;
    EST_Litem *k;
    FILE *ofd;
    double lfreq = -1;
    double count = -1;
    int magic = EST_NGRAMBIN_MAGIC;

    if (filename == "-")
    {
        if ((ofd=stdout) == NULL)
            return misc_write_error;
    }
    else
    {
        if ((ofd=fopen(filename,"wb")) == NULL)
            return misc_write_error;
    }

    fwrite(&magic,sizeof(int),1,ofd);
    fprintf(ofd,"mBin_2 %d\n",n.order());
    for (i=0; i < n.vocab->length(); i++)
        fprintf(ofd,"%s ",(const char *)n.vocab->name(i));
    fprintf(ofd,"\n");
    for (i=0; i < n.pred_vocab->length(); i++)
        fprintf(ofd,"%s ",(const char *)n.pred_vocab->name(i));
    fprintf(ofd,"\n");

    // We use a simple form of run-length encoding: if consecutive
    // values are equal only a length is printed.  Lengths are
    // negative, as frequencies (even smoothed ones) can never be -ve.
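    //
    // For example (a sketch, not data from a real file): the frequency
    // sequence  5 5 5 2 7 7  is written as the five doubles
    //     5 -3 2 7 -2
    // i.e. each new value, followed by minus its run length whenever the
    // run is longer than one; load_ngram_cstr_bin above expands this again.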

    if ( trace )
        cerr << "Saving ..." << endl;

    if (n.representation() == EST_Ngrammar::dense)
    {
        for (i=0; i<n.num_states(); i++)
        {
            if ( trace )
                cerr << "\r" << i*100/n.num_states() << "%";

            for (k=n.p_states[i].pdf().item_start();
                 !n.p_states[i].pdf().item_end(k);
                 k = n.p_states[i].pdf().item_next(k))
            {
                double freq;
                EST_String name;
                n.p_states[i].pdf().item_freq(k,name,freq);
                if (freq == 0.0)
                    freq = floor;
                if (freq == lfreq)
                    count--;
                else
                {
                    if (count < -1)
                        fwrite(&count,sizeof(double),1,ofd);
                    fwrite(&freq,sizeof(double),1,ofd);
                    count = -1;
                }
                lfreq = freq;
            }
        }
        if (count < -1)
            fwrite(&count,sizeof(double),1,ofd);
    }
    else if (n.representation() == EST_Ngrammar::backoff)
    {
        // need to construct pdfs in right order
        // noting that dense states are indexed s.t. the last
        // word in the ngram is the least significant 'bit'

        // number of ngrams, excluding last word, is
        int total_ngrams = (int)pow(float(n.get_vocab_length()),float(n.order()-1));
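        // (for example, a trigram model over a 100-word vocabulary gives
        // 100^2 = 10000 contexts; make_ngram_from_index(i) below supplies
        // the context words for index i)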

        for (i=0; i<total_ngrams; i++)
        {
            if ( trace )
                cerr << "\r" << i*100/total_ngrams << "%";

            EST_DiscreteProbDistribution this_pdf;
            const EST_StrVector this_ngram = n.make_ngram_from_index(i);
            this_pdf = n.prob_dist(this_ngram);

            for (k=this_pdf.item_start();
                 !this_pdf.item_end(k);
                 k = this_pdf.item_next(k))
            {
                double freq;
                EST_String name;
                this_pdf.item_freq(k,name,freq);
                if (freq == lfreq)
                    count--;
                else
                {
                    if (count < -1)
                        fwrite(&count,sizeof(double),1,ofd);
                    fwrite(&freq,sizeof(double),1,ofd);
                    count = -1;
                }
                lfreq = freq;
            }

        }

    }
    if ( trace )
        cerr << "\r      \r" << endl;

    fclose(ofd);

    return write_ok;
}