"Fossies" - the Fresh Open Source Software Archive 
Member "speech_tools/grammar/ngram/ngrammar_io.cc" (4 Sep 2017, 22960 Bytes) of package /linux/misc/speech_tools-2.5.0-release.tar.gz:
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996,1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Simon King & Alan W Black */
34 /* Date : February 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* IO functions for EST_Ngram class */
38 /* */
39 /*=======================================================================*/
40
41 #include <cstdlib>
42 #include <fstream>
43 #include <iostream>
44 #include "EST_unix.h"
45 #include <cstring>
46 #include <climits>
47 #include <cfloat>
48 #include "EST_String.h"
49 #include "EST_Ngrammar.h"
50 #include "EST_Token.h"
51 #include "EST_cutils.h"
52
53 EST_read_status
54 load_ngram_htk_ascii(const EST_String filename, EST_Ngrammar &n)
55 {
56 (void)filename;
57 (void)n;
58 return wrong_format;
59 }
60
61 EST_read_status
62 load_ngram_htk_binary(const EST_String filename, EST_Ngrammar &n)
63 {
64 (void)filename;
65 (void)n;
66 return wrong_format;
67 }
68
69 EST_read_status
70 load_ngram_arpa(const EST_String filename, EST_Ngrammar &n, const EST_StrList &vocab)
71 {
72
73 EST_TokenStream ts;
74 EST_String s;
75 int i,j,k, order=0;
76 double occur,weight;
77 int this_num,this_order;
78
79 if (ts.open(filename) == -1)
80 return misc_read_error;
81
82 // find backslash data backslash
83 while ((!ts.eof()) && !ts.get().string().contains("\\data\\"));
84
85 if (ts.eof())
86 {
87 ts.close();
88 return wrong_format;
89 }
90
91 // find order and numbers of ngrams
92
93 // somewhere to keep numbers
94 EST_IVector nums(100); // not going to have anything bigger than a 100-gram !
95
96 while (!ts.eof())
97 {
98 // have we got to next section
99 if (ts.peek().string().contains("-grams:"))
100 break;
101
102 s=ts.get_upto_eoln().string();
103
104 if(s.contains("ngram ") && s.contains("="))
105 {
106
107 s=s.after("ngram ");
108 this_order=atoi(s.before("="));
109 this_num=atoi(s.after("="));
110
111 //cerr << "There are " << this_num << " " << this_order
112 //<< "-grams" << endl;
113
114 nums[this_order] = this_num;
115
116 if(this_order > order)
117 order = this_order;
118 }
119
120 }
121
122
123 if(order==0)
124 {
125 //cerr << "No ngram ?=? in header !" << endl;
126 ts.close();
127 return wrong_format;
128 }
129
130 //cerr << "Initialising " << order << "-grammar" << endl;
131 if(!n.init(order,EST_Ngrammar::backoff,vocab))
132 return misc_read_error;
133
134 // read data
135 for(i=1;i<=order;i++)
136 {
137
138 EST_StrVector window(i);
139
140 // find start of data for this order "<order>-grams:"
141 EST_String tmp = "\\" + itoString(i) + "-grams:";
142 while (!ts.eof())
143 {
144 s=ts.get().string();
145 if (s.contains(tmp))
146 break;
147 }
148
149
150 if(ts.eof())
151 {
152 cerr << "Unexpected end of grammar file whilst looking for '"
153 << tmp << "'" << endl;
154 return misc_read_error;
155 }
156
157 //cerr << "Found order " << i << " : " << tmp << endl;
158 //cerr << "Looking for " << nums(i) << " ngrams" << endl;
159 // look for nums(i) ngrams
160
161 for(j=0;j<nums(i);j++)
162 {
163
164 for (k=0; ((k<i) && !ts.eof()); k++)
165 window[k] = ts.get().string();
166
167 if(ts.eof())
168 {
169 cerr << "Unexpected end of file whilst reading " << i
170 << "-grams !" << endl;
171 return misc_read_error;
172 }
173
174 // can't for backoff grammars, need to set probs directly
175
176 cerr << "ooooooooops" << endl;
177 return wrong_format;
178
179 occur = atof(ts.get().string());
180 n.accumulate(window,occur);
181
182 // backoff weight ?
183 if (!ts.eoln())
184 {
185 weight = atof(ts.get().string());
186 n.set_backoff_weight(window,weight);
187 }
188
189 if (!ts.eoln())
190 {
191 cerr << "EST_Ngrammar:load_ngram_arpa expect end of line at filepos "
192 << ts.filepos() << endl;
193 ts.close();
194 return misc_read_error;
195 }
196 }
197
198 } // loop through orders
199
200
201 // find backslash end backslash
202 while (!ts.eof())
203 if (ts.get().string() == "\\end\\")
204 {
205 ts.close();
206 return format_ok;
207
208 }
209
210 cerr << "Missing \\end\\ !" << endl;
211
212 ts.close();
213 return misc_read_error;
214
215 }
216
217 EST_read_status
218 load_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n)
219 {
220 EST_TokenStream ts;
221 int i, order;
222 double occur;
223
224 if (ts.open(filename) == -1)
225 return misc_read_error;
226
227 if (ts.peek().string() != "Ngram_2")
228 {
229 ts.close();
230 return wrong_format;
231 }
232 ts.get(); // skip magic number
233
234 order = atoi(ts.get().string());
235 ts.get_upto_eoln(); // skip to next line
236 EST_StrList vocab;
237 EST_StrList pred_vocab; // may be different
238
239 while (!ts.eoln())
240 vocab.append(ts.get().string());
241 ts.get_upto_eoln(); // skip to next line
242 while (!ts.eoln())
243 pred_vocab.append(ts.get().string());
244
245 if(!n.init(order,EST_Ngrammar::dense,vocab,pred_vocab))
246 {
247 cerr << "Something may be wrong with the vocab lists in '"
248 << filename << "'" << endl;
249 return misc_read_error;
250 }
251
252 EST_StrVector window(order);
253
254 while(!ts.eof())
255 {
256 for (i=0; i < order; i++)
257 window[i] = ts.get().string();
258 if (ts.get().string() != ":")
259 {
260 cerr << "EST_Ngrammar:load_ngram_cstr_ascii missing colon at filepos "
261 << ts.filepos() << endl;
262 return misc_read_error;
263 }
264 occur = atof(ts.get().string());
265 n.accumulate(window,occur);
266 if (!ts.eoln())
267 {
268 cerr << "EST_Ngrammar:load_ngram_cstr_ascii expect end of line at filepos "
269 << ts.filepos() << endl;
270 return misc_read_error;
271 }
272 }
273
274 ts.close();
275
276 return format_ok;
277 }
278
279 EST_read_status
280 load_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n)
281 {
282 EST_TokenStream ts;
283 int i,j,order;
284 EST_Litem *k;
285 int num_entries;
286 double approx_num_samples = 0.0;
287 long freq_data_start, freq_data_end;
288 FILE *ifd;
289 int magic = 0;
290 int swap = FALSE;
291
292 if ((ifd=fopen(filename,"rb")) == NULL)
293 return misc_read_error;
294 fread(&magic,sizeof(int),1,ifd);
295
296 if (SWAPINT(magic) == EST_NGRAMBIN_MAGIC)
297 swap = TRUE;
298 else if (magic != EST_NGRAMBIN_MAGIC)
299 return wrong_format;
300 if (ts.open(ifd, FALSE) == -1)
301 return misc_read_error;
302
303 ts.set_SingleCharSymbols("\n");
304 ts.set_WhiteSpaceChars(" \t\r");
305
306 if (ts.peek().string() != "mBin_2")
307 {
308 fclose(ifd);
309 ts.close();
310 return wrong_format;
311 }
312 ts.get(); // skip magic number
313
314 order = atoi(ts.get().string());
315 if (ts.get() != "\n")
316 {
317 fclose(ifd);
318 ts.close();
319 return misc_read_error;
320 }
321 EST_StrList vocab;
322 EST_StrList pred_vocab; // may be different
323
324 while ((ts.peek() != "\n") && (!ts.eof()))
325 vocab.append(ts.get().string());
326 ts.get(); // skip newline
327 while ((ts.peek() != "\n") && (!ts.eof()))
328 pred_vocab.append(ts.get().string());
329
330 // Need to get to the position one after the newline and
331 // who knows what TokenStream has already read,
332 fseek(ifd,(long)(ts.peek().filepos()+5),SEEK_SET);
333
334 if(!n.init(order,EST_Ngrammar::dense,vocab,pred_vocab))
335 {
336 ts.close();
337 fclose(ifd);
338 return misc_read_error;
339 }
340
341 EST_StrVector window(order);
342
343 freq_data_start = ftell(ifd);
344 fseek(ifd,0,SEEK_END);
345 freq_data_end = ftell(ifd);
346 num_entries = (freq_data_end-freq_data_start)/sizeof(double);
347 double *dd = new double[num_entries];
348
349 // Go back to start of data
350 fseek(ifd,freq_data_start,SEEK_SET);
351
352 if (fread(dd,sizeof(double),num_entries,ifd) != (unsigned)num_entries)
353 {
354 cerr << "EST_Ngrammar::load_ngram_cstr_bin format does not have expected number of entries" << endl;
355 ts.close();
356 fclose(ifd);
357 return misc_read_error;
358 }
359 if (swap)
360 swap_bytes_double(dd,num_entries);
361
362 for(j=i=0;i<n.num_states();i++)
363 {
364 if (j >= num_entries)
365 {
366 cerr << "EST_Ngrammar::load_ngram_cstr_bin unexpected end of frequency data" << endl;
367 ts.close();
368 fclose(ifd);
369 return misc_read_error;
370 }
371 for (k=n.p_states[i].pdf().item_start();
372 (!n.p_states[i].pdf().item_end(k)) && (j < num_entries) ;
373 k = n.p_states[i].pdf().item_next(k))
374 {
375 n.p_states[i].pdf().set_frequency(k,dd[j]);
376 // Update global info too
377 approx_num_samples += dd[j]; // probably not right
378 n.vocab_pdf.cumulate(k,dd[j]);
379
380 // Number of consecutive occurrences of this frequency as in
381 // dd[j+1] if its a negative number
382 if (j+1 >= num_entries)
383 j++;
384 else if (dd[j+1] < -1)
385 dd[j+1]++;
386 else if (dd[j+1] == -1)
387 j +=2;
388 else
389 j++;
390 }
391 }
392
393 // With smoothing num_samples might not be as exact as you like
394 n.p_num_samples = (int)approx_num_samples;
395
396 delete [] dd;
397
398 ts.close();
399 fclose(ifd);
400
401 return format_ok;
402 }
403
404 // ====================================================================
405
// Write one row of an HTK-ascii bigram matrix: the probabilities of every
// predicted word following 'word'.  Runs of equal probabilities are
// compressed as "p*count".  'floor' is added to every probability and the
// remaining mass rescaled so the row still sums to one.
// NOTE(review): assumes n is a bigram — this_ngram is hard-sized to 2.
EST_write_status
save_ngram_htk_ascii_sub(const EST_String &word, ostream *ost,
			 EST_Ngrammar &n, double floor)
{
    EST_Litem *k;
    EST_String name;
    double freq;
    EST_StrVector this_ngram(2); // assumes bigram
    this_ngram[0] = word;
    EST_DiscreteProbDistribution this_pdf;
    this_pdf = n.prob_dist(this_ngram);

    double lfreq=-1;       // frequency of the previous item (for run detection)
    int lcount=0;          // length of the current run of equal frequencies
    double total_freq=0;

    // total probability mass handed out as floors
    double floor_prob_total = floor * (n.pred_vocab->length()-1);

    // nothing ever follows the sentence-end marker: emit an all-zero row
    // (except probability 1 in the final column) and stop
    if (word == n.p_sentence_end_marker)
    {
	*ost << word;
	*ost << " 0*" << n.pred_vocab->length()-1 << " " << 1 << endl;
	return write_ok;
    }

    // if flooring would exceed total probability, shrink the floor
    if(floor_prob_total > 1)
    {
	cerr << "ERROR : floor is impossibly large, scaling it !" << endl;
	floor = 1.0 / (double)(n.pred_vocab->length()-1);
	floor_prob_total = 1;
    }

    // sum frequencies over all successors except the start marker
    // not efficient but who cares ?
    for (k=this_pdf.item_start();
	 !this_pdf.item_end(k);
	 k = this_pdf.item_next(k))
    {
	this_pdf.item_freq(k,name,freq);
	if(name != n.p_sentence_start_marker)
	{
	    total_freq += freq;
	}
    }

    // 0 for prob(word,start marker)
    *ost << word << " 0 ";

    if (total_freq <= 0)
    {
	// no data for this word: spread probability uniformly
	*ost << 1.0 / (double)(n.pred_vocab->length()-1) << "*";
	*ost << n.pred_vocab->length()-1 << " " << endl;
    }
    else
    {
	lfreq=-1;

	for (k=this_pdf.item_start();
	     !this_pdf.item_end(k);
	     k = this_pdf.item_next(k))
	{
	    this_pdf.item_freq(k,name,freq);

	    // markers and OOV are handled separately, skip them here
	    if ( (name == n.p_sentence_start_marker) ||
		 (name == n.p_sentence_end_marker) ||
		 (name == OOV_MARKER) )
		continue;

	    if (freq == lfreq)
		lcount++;           // extend the current run
	    else
	    {
		// flush the previous run's repeat count, then start a new one
		if (lcount > 1)
		    *ost << "*" << lcount << " ";
		else
		    *ost << " ";

		lcount=1;
		lfreq = freq;

		if(freq > 0)
		{
		    double base_prob = freq / total_freq;

		    // and floor/scale it
		    *ost << floor + ( base_prob * (1-floor_prob_total) );

		}
		else
		    *ost << floor;

	    }


	}

    } // total_freq > 0


    if(!n.closed_vocab())
    {

	// not fully tested !!!!!!!!

	*ost << 0 << " ERROR !!!!!!!! ";
    }


    // finally the probability of the sentence-end marker (last column)
    if (total_freq > 0)
    {
	freq = this_pdf.frequency(n.p_sentence_end_marker);

	if(freq == lfreq)
	{
	    // extends the last run from the main loop
	    lcount++;
	    *ost << "*" << lcount << " " << endl;
	}
	else
	{

	    if (lcount > 1)
		*ost << "*" << lcount << " ";
	    else
		*ost << " ";

	    if(freq > 0)
	    {
		double base_prob = freq / total_freq;

		// and floor/scale it
		*ost << floor + ( base_prob * (1-floor_prob_total) ) << endl;

	    }
	    else
		*ost << floor << endl;
	}
    }

    return write_ok;
}
546
547 EST_write_status
548 save_ngram_htk_ascii(const EST_String filename,
549 EST_Ngrammar &n, double floor)
550 {
551
552 ostream *ost;
553
554 // only for bigram
555 if(n.order() != 2)
556 {
557 cerr << "Can only save bigrams in htk_ascii format" << endl;
558 return misc_write_error;
559 }
560
561 if (floor < 0)
562 {
563 cerr << "Negative floor probability does not make sense !" << endl;
564 return misc_write_error;
565 }
566
567 if (filename == "-")
568 ost = &cout;
569 else
570 ost = new ofstream(filename);
571
572 if(!(*ost))
573 return write_fail;
574
575 if(floor * (n.pred_vocab->length()-1) > 1)
576 {
577 floor = 1.0 / (double)(n.pred_vocab->length()-1);
578 cerr << "ERROR : floor is impossibly large, scaling it to ";
579 cerr << floor << endl;
580 }
581
582 int i;
583
584 if(n.p_sentence_start_marker == "")
585 {
586 cerr << "Can't save in HTK format as no sentence start/end tags"
587 << " were given !" << endl;
588 return misc_write_error;
589 }
590
591 // need '!ENTER' (or whatever) as first word- that's HTK for you
592 save_ngram_htk_ascii_sub(n.p_sentence_start_marker,ost,n,floor);
593
594 // the real words
595 for(i=0;i<n.vocab->length();i++)
596 {
597 if ( (n.vocab->name(i) != n.p_sentence_start_marker) &&
598 (n.vocab->name(i) != n.p_sentence_end_marker) &&
599 (n.vocab->name(i) != OOV_MARKER) )
600 save_ngram_htk_ascii_sub(n.vocab->name(i),ost,n,floor);
601 }
602
603 if(!n.closed_vocab())
604 save_ngram_htk_ascii_sub(OOV_MARKER,ost,n,floor);
605
606 save_ngram_htk_ascii_sub(n.p_sentence_end_marker,ost,n,floor);
607
608 if(ost != &cout)
609 delete ost;
610
611 return write_ok;
612 }
613
614 /*
615 EST_write_status
616 save_ngram_htk_binary(const EST_String filename, EST_Ngrammar &n)
617 {
618 return write_ok;
619 }
620 */
621
622 void
623 count_ngram_arpa_sub(EST_Ngrammar *n, EST_StrVector &ngram, void *count)
624 {
625 if(n->ngram_exists(ngram))
626 *((double*)count) += 1;
627 }
628
629 void
630 save_ngram_arpa_sub(EST_Ngrammar *n, EST_StrVector &ngram, void *ost)
631 {
632
633 int i;
634
635 if(n->ngram_exists(ngram))
636 {
637 *((ostream*)(ost)) << safe_log10(n->probability(ngram)) << " ";
638 for(i=0;i<ngram.n();i++)
639 *((ostream*)(ost)) << ngram(i) << " ";
640
641 if ((n->representation() == EST_Ngrammar::backoff) &&
642 (n->order() > ngram.n()) )
643 *((ostream*)(ost)) << safe_log10(n->get_backoff_weight(ngram));
644 //<< " = "
645 //<< n->get_backoff_weight(ngram) << " ";
646
647 *((ostream*)(ost)) << endl;
648
649 }
650 }
651
652 EST_write_status
653 save_ngram_arpa(const EST_String filename, EST_Ngrammar &n)
654 {
655 // ARPA MIT-LL format - see HTK manual !!
656
657 ostream *ost;
658 int i,o;
659
660 if (filename == "-")
661 ost = &cout;
662 else
663 ost = new ofstream(filename);
664
665 if (!(*ost))
666 return write_fail;
667
668 //n.set_entry_type(EST_Ngrammar::probabilities);
669 //n.make_htk_compatible(); // fix enter/exit probs
670 //*ost << *(n.vocab) << endl;
671
672 *ost << "\\data\\" << endl;
673
674 double *count = new double;
675
676 if (n.representation() == EST_Ngrammar::backoff)
677 {
678 for(o=1;o<=n.order();o++)
679 {
680 EST_StrVector ngram(o);
681 for(i=0;i<o;i++)
682 ngram[i] = "";
683 *count =0;
684
685 // this is a deeply silly way to count them,
686 // we could traverse the tree directly !
687 n.iterate(ngram,&count_ngram_arpa_sub,(void*)count);
688 *ost << "ngram " << o << "=" << *count << endl;
689 }
690
691 for(o=1;o<=n.order();o++)
692 {
693 *ost << endl;
694 *ost << "\\" << o << "-grams:" << endl;
695 EST_StrVector ngram(o);
696 for(i=0;i<o;i++)
697 ngram[i] = "";
698 n.iterate(ngram,&save_ngram_arpa_sub,(void*)ost);
699 }
700
701 }
702 else
703 {
704 EST_StrVector ngram(n.order());
705 for(i=0;i<n.order();i++)
706 ngram[i] = "";
707 *count =0;
708 n.iterate(ngram,&count_ngram_arpa_sub,(void*)count);
709 *ost << "ngram " << n.order() << "=" << *count << endl;
710
711 *ost << endl;
712 *ost << "\\" << n.order() << "-grams:" << endl;
713
714 for(i=0;i<n.order();i++)
715 ngram[i] = "";
716 n.iterate(ngram,&save_ngram_arpa_sub,ost);
717
718 }
719
720 *ost << "\\end\\" << endl;
721
722 if (ost != &cout)
723 delete ost;
724
725 return write_ok;
726 }
727
728 EST_write_status
729 save_ngram_cstr_ascii(const EST_String filename, EST_Ngrammar &n,
730 const bool trace, double floor)
731 {
732 // awb's format
733 (void)trace;
734 ostream *ost;
735 int i;
736 EST_Litem *k;
737
738 if (filename == "-")
739 ost = &cout;
740 else
741 ost = new ofstream(filename);
742
743 if(!(*ost))
744 return write_fail;
745
746 *ost << "Ngram_2 " << n.order() << endl;
747 for (i=0; i < n.vocab->length(); i++)
748 *ost << n.vocab->name(i) << " ";
749 *ost << endl;
750 for (i=0; i < n.pred_vocab->length(); i++)
751 *ost << n.pred_vocab->name(i) << " ";
752 *ost << endl;
753
754 if (n.representation() == EST_Ngrammar::dense)
755 n.print_freqs(*ost,floor);
756 else if (n.representation() == EST_Ngrammar::backoff)
757 {
758 int total_ngrams = (int)pow(float(n.get_vocab_length()),float(n.order()-1));
759
760 for(i=0;i<total_ngrams;i++)
761 {
762 EST_DiscreteProbDistribution this_pdf;
763 const EST_StrVector this_ngram = n.make_ngram_from_index(i);
764 this_pdf = n.prob_dist(this_ngram);
765
766 for (k=this_pdf.item_start();
767 !this_pdf.item_end(k);
768 k = this_pdf.item_next(k))
769 {
770 double freq;
771 EST_String name;
772 this_pdf.item_freq(k,name,freq);
773
774 for (int jj=0; jj < this_ngram.n(); jj++)
775 *ost << this_ngram(jj) << " ";
776 *ost << name << " : " << freq << endl;
777 }
778 }
779 }
780
781 if(ost != &cout)
782 delete ost;
783
784 return write_ok;
785 }
786
787 EST_write_status
788 save_ngram_wfst(const EST_String filename, EST_Ngrammar &n)
789 {
790 // Save as a WFST
791 FILE *ost;
792 int i;
793
794 if ((ost = fopen(filename,"wb")) == NULL)
795 {
796 cerr << "Ngrammar save: unable to open \"" << filename <<
797 "\" for writing" << endl;
798 return write_fail;
799 }
800
801 fprintf(ost,"EST_File fst\n");
802 fprintf(ost,"DataType ascii\n");
803 fprintf(ost,"in \"(");
804 for (i=0; i < n.vocab->length(); i++)
805 fprintf(ost," %s\n",(const char *)n.vocab->name(i));
806 fprintf(ost," )\"\n");
807 fprintf(ost,"out \"(");
808 for (i=0; i < n.vocab->length(); i++)
809 fprintf(ost," %s\n",(const char *)n.vocab->name(i));
810 fprintf(ost," )\"\n");
811 fprintf(ost,"NumStates %d\n",n.num_states());
812 fprintf(ost,"EST_Header_End\n");
813
814 for (i=0; i<n.num_states(); i++)
815 {
816 fprintf(ost,"((%d nonfinal %d)\n",i,i);
817 fprintf(ost,")\n");
818 }
819
820 fclose(ost);
821
822 return write_ok;
823 }
824
// Save the ngram in CSTR binary ("mBin_2") format: 4-byte magic, text header
// (order, vocab line, predicted-vocab line), then the frequency table as
// run-length-encoded doubles.  A run of m equal values is written as the
// value followed by -(m) (counts are negative since frequencies never are).
EST_write_status
save_ngram_cstr_bin(const EST_String filename, EST_Ngrammar &n,
		    const bool trace, double floor)
{

    // sparse grammars are not supported by this format
    if (n.representation() == EST_Ngrammar::sparse)
	return misc_write_error;

    int i;
    EST_Litem *k;
    FILE *ofd;
    double lfreq = -1;   // previous frequency written (run detection)
    double count = -1;   // current run length, stored negated
    int magic = EST_NGRAMBIN_MAGIC;

    // '-' means stdout
    if (filename == "-")
    {
	if ((ofd=stdout) == NULL)
	    return misc_write_error;
    }
    else
    {
	if ((ofd=fopen(filename,"wb")) == NULL)
	    return misc_write_error;
    }

    // magic number then text header (mirrors load_ngram_cstr_bin)
    fwrite(&magic,sizeof(int),1,ofd);
    fprintf(ofd,"mBin_2 %d\n",n.order());
    for (i=0; i < n.vocab->length(); i++)
	fprintf(ofd,"%s ",(const char *)n.vocab->name(i));
    fprintf(ofd,"\n");
    for (i=0; i < n.pred_vocab->length(); i++)
	fprintf(ofd,"%s ",(const char *)n.pred_vocab->name(i));
    fprintf(ofd,"\n");

    // We use a simple form of run-length encoding, if consecutive
    // values are equal only a length is printed. lengths are
    // negative as frequencies (even smoothed ones) can never be -ve

    if ( trace )
	cerr << "Saving ..." << endl;

    if (n.representation() == EST_Ngrammar::dense)
    {
	for(i=0;i<n.num_states();i++)
	{

	    if ( trace )
		cerr << "\r" << i*100/n.num_states() << "%";

	    for (k=n.p_states[i].pdf().item_start();
		 !n.p_states[i].pdf().item_end(k);
		 k = n.p_states[i].pdf().item_next(k))
	    {
		double freq;
		EST_String name;
		n.p_states[i].pdf().item_freq(k,name,freq);
		// zero frequencies are written as the floor value
		if (freq == 0.0)
		    freq = floor;
		if (freq == lfreq)
		    count--;          // extend the current run
		else
		{
		    // flush any pending run length, then the new value
		    if (count < -1)
			fwrite(&count,sizeof(double),1,ofd);
		    fwrite(&freq,sizeof(double),1,ofd);
		    count = -1;
		}
		lfreq = freq;
	    }
	}
	// flush the final run length, if any
	if (count < -1)
	    fwrite(&count,sizeof(double),1,ofd);
    }
    else if (n.representation() == EST_Ngrammar::backoff)
    {
	// need to construct pdfs in right order
	// noting that dense states are indexed s.t. the last
	// word in the ngram is the least significant 'bit'

	// number of ngrams, excluding last word, is
	int total_ngrams = (int)pow(float(n.get_vocab_length()),float(n.order()-1));

	for(i=0;i<total_ngrams;i++)
	{

	    if ( trace )
		cerr << "\r" << i*100/total_ngrams << "%";

	    EST_DiscreteProbDistribution this_pdf;
	    const EST_StrVector this_ngram = n.make_ngram_from_index(i);
	    this_pdf = n.prob_dist(this_ngram);

	    for (k=this_pdf.item_start();
		 !this_pdf.item_end(k);
		 k = this_pdf.item_next(k))
	    {

		double freq;
		EST_String name;
		this_pdf.item_freq(k,name,freq);
		if (freq == lfreq)
		    count--;          // extend the current run
		else
		{
		    if (count < -1)
			fwrite(&count,sizeof(double),1,ofd);
		    fwrite(&freq,sizeof(double),1,ofd);
		    count = -1;
		}
		lfreq = freq;
	    }


	}
	// NOTE(review): unlike the dense branch, a trailing run length is
	// never flushed here — looks like a lost final "count" record if the
	// data ends in a run; confirm against load_ngram_cstr_bin.

    }
    if ( trace )
	cerr << "\r \r" << endl;

    fclose(ofd);

    return write_ok;
}