"Fossies" - the Fresh Open Source Software Archive 
Member "speech_tools/grammar/scfg/EST_SCFG_Chart.cc" (4 Sep 2017, 15071 Bytes) of package /linux/misc/speech_tools-2.5.0-release.tar.gz:
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : June 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* A SCFG chart parser */
38 /* */
39 /*=======================================================================*/
40 #include <cstdlib>
41 #include "siod.h"
42 #include "EST_math.h"
43 #include "EST_SCFG.h"
44 #include "EST_SCFG_Chart.h"
45
46 EST_SCFG_Chart_Edge::EST_SCFG_Chart_Edge()
47 {
48 }
49
50 EST_SCFG_Chart_Edge::EST_SCFG_Chart_Edge(double prob,
51 int d1, int d2,
52 int pos)
53 {
54 p_d1 = d1;
55 p_d2 = d2;
56 p_pos = pos;
57 p_prob = prob;
58 }
59
60 EST_SCFG_Chart_Edge::~EST_SCFG_Chart_Edge()
61 {
62 }
63
64 LISP EST_SCFG_Chart::print_edge(int start, int end, int p,
65 EST_SCFG_Chart_Edge *e)
66 {
67 // Return a lisp representation of the edge
68
69 if (e->prob() == 0)
70 {
71 return NIL; // failed
72 }
73 else if (start+1 == end)
74 {
75 // unary rule, preterminal
76 LISP r = cons(rintern(grammar->nonterminal(p)),
77 cons(flocons(e->prob()),
78 cons(flocons(start),
79 cons(flocons(end),
80 cons(rintern(grammar->terminal(e->d1())),
81 NIL)))));
82 return r;
83 }
84 else
85 {
86 //name prob start end daughters
87 EST_SCFG_Chart_Edge *d1, *d2;
88
89 d1 = edges[start][e->pos()][e->d1()];
90 d2 = edges[e->pos()][end][e->d2()];
91
92 LISP daughters =
93 cons(print_edge(start,e->pos(),e->d1(),d1),
94 cons(print_edge(e->pos(),end,e->d2(),d2),
95 NIL));
96 LISP r = cons(rintern(grammar->nonterminal(p)),
97 cons(flocons(e->prob()),
98 cons(flocons(start),
99 cons(flocons(end),
100 daughters))));
101 return r;
102 }
103 }
104
105 EST_SCFG_Chart::EST_SCFG_Chart()
106 {
107 n_vertices = 0;
108 edges = 0;
109 wfst = 0;
110 emptyedge = 0;
111 grammar_local = TRUE;
112 grammar = new EST_SCFG;
113 }
114
115 EST_SCFG_Chart::~EST_SCFG_Chart()
116 {
117 // delete all the vertices
118
119 delete_edge_table();
120 if (grammar_local)
121 delete grammar;
122 }
123
124 void EST_SCFG_Chart::set_grammar_rules(EST_SCFG &imported_grammar)
125 {
126 if (grammar_local)
127 delete grammar;
128 grammar_local = FALSE;
129 grammar = &imported_grammar;
130 }
131
132 void EST_SCFG_Chart::set_grammar_rules(LISP r)
133 {
134 grammar->set_rules(r);
135 }
136
137 void EST_SCFG_Chart::setup_wfst(EST_Relation *s,const EST_String &name)
138 {
139 // Set up well formed substring table from feature name in each
140 // stream item in s
141 setup_wfst(s->head(),0,name);
142 }
143
144 void EST_SCFG_Chart::setup_wfst(EST_Item *s, EST_Item *e,const EST_String &name)
145 {
146 // Set up well formed substring table from feature name in each
147 // stream item in s
148 EST_Item *p;
149 int n;
150
151 delete_edge_table();
152 for (n_vertices=1,p=s; p != e; p=inext(p))
153 n_vertices++;
154 setup_edge_table();
155
156 for (n=0,p=s; p != e; p=inext(p),n++)
157 {
158 int term = grammar->terminal(p->f(name).string());
159 if (term == -1)
160 {
161 cerr << "SCFG_Chart: unknown terminal symbol \"" <<
162 p->f(name).string() << "\"" << endl;
163 term = 0; // to avoid crashing
164 }
165 wfst[n] = new EST_SCFG_Chart_Edge(1.0,term,0,-1);
166 }
167 }
168
169 void EST_SCFG_Chart::delete_edge_table()
170 {
171 int i,j,k;
172
173 if (wfst == 0) return;
174
175 for (i=0; i < n_vertices; i++)
176 {
177 delete wfst[i];
178 for (j=0; j < n_vertices; j++)
179 {
180 for (k=0; k < grammar->num_nonterminals(); k++)
181 if (edges[i][j][k] != emptyedge)
182 delete edges[i][j][k];
183 delete [] edges[i][j];
184 }
185 delete [] edges[i];
186 }
187 delete [] wfst;
188 delete [] edges;
189 delete emptyedge;
190
191 wfst = 0;
192 edges = 0;
193
194 }
195
196 void EST_SCFG_Chart::setup_edge_table()
197 {
198 int i,j,k;
199 int nt = grammar->num_nonterminals();
200
201 wfst = new EST_SCFG_Chart_Edge*[n_vertices];
202 edges = new EST_SCFG_Chart_Edge ***[n_vertices];
203 emptyedge = new EST_SCFG_Chart_Edge(0,0,0,0);
204
205 for (i=0; i < n_vertices; i++)
206 {
207 wfst[i] = 0;
208 edges[i] = new EST_SCFG_Chart_Edge **[n_vertices];
209 for (j=0; j < n_vertices; j++)
210 {
211 edges[i][j] = new EST_SCFG_Chart_Edge *[nt];
212 for (k=0; k < nt; k++)
213 edges[i][j][k] = 0;
214 }
215 }
216 }
217
218 double EST_SCFG_Chart::find_best_tree_cal(int start,int end,int p)
219 {
220 // Find the best parse for non/terminal p between start and end
221 int best_j = -1;
222 int best_q = -1, best_r = -1;
223 double best_prob = 0;
224
225 if (end-1 == start)
226 {
227 best_prob = grammar->prob_U(p,wfst[start]->d1());
228 if (best_prob > 0)
229 edges[start][end][p] =
230 new EST_SCFG_Chart_Edge(best_prob*wfst[start]->prob(),
231 wfst[start]->d1(),0,-1);
232 else
233 edges[start][end][p] = emptyedge;
234 return best_prob;
235 }
236 else
237 {
238 // for all rules expanding p find total and best prob
239 double s=0,t_prob,left,right;
240 int j,q,r;
241 int nt = grammar->num_nonterminals();
242
243 for (q=0; q < nt; q++)
244 for (r=0; r < nt; r++)
245 {
246 double pBpqr = grammar->prob_B(p,q,r);
247 if (pBpqr > 0)
248 {
249 for (j=start+1; j < end; j++)
250 {
251 left=find_best_tree(start,j,q);
252 if (left > 0)
253 {
254 right = find_best_tree(j,end,r);
255 t_prob = pBpqr * left * right;
256 if (t_prob > best_prob)
257 {
258 best_prob = t_prob;
259 best_q = q;
260 best_r = r;
261 best_j = j;
262 }
263 s += t_prob;
264 }
265 }
266 }
267 }
268
269 if (best_prob > 0)
270 edges[start][end][p] =
271 new EST_SCFG_Chart_Edge(s,best_q,best_r,best_j);
272 else
273 edges[start][end][p] = emptyedge;
274 return s;
275 }
276 }
277
278 void EST_SCFG_Chart::parse(void)
279 {
280 // do the parsing, find best parse only
281 if (n_vertices - 1 > 0)
282 find_best_tree(0,n_vertices-1,grammar->distinguished_symbol());
283
284 }
285
286 LISP EST_SCFG_Chart::find_parse()
287 {
288 // Extract the parse from the edge table
289 EST_SCFG_Chart_Edge *top;
290
291 top = edges[0][n_vertices-1][grammar->distinguished_symbol()];
292
293 if (top == 0)
294 return NIL; // no parse
295 else
296 return print_edge(0,n_vertices-1,grammar->distinguished_symbol(),top);
297 }
298
299 void EST_SCFG_Chart::extract_parse(EST_Relation *syn,
300 EST_Relation *word,
301 int force)
302 {
303 // Build a tree stream in Syn linking the leafs of Syn to those
304 // in word, force guarantees a parse is necessary (though probably
305 // a random one)
306
307 extract_parse(syn,word->head(),0,force);
308 }
309
310 void EST_SCFG_Chart::extract_parse(EST_Relation *syn,
311 EST_Item *s, EST_Item *e, int force)
312 {
313 // Build a tree stream in Syn linking the leafs of Syn to those
314 // in word
315 EST_Item *p;
316 int num_words;
317
318 for (num_words=0,p=s; p != e; p=inext(p))
319 num_words++;
320
321 if (num_words != (n_vertices-1))
322 {
323 cerr << "SCFG_Chart: extract_parse, number of items in link stream " <<
324 " different from those in parse tree" << endl;
325 return;
326 }
327
328 EST_SCFG_Chart_Edge *top;
329 EST_Item *w = s;
330
331 top = edges[0][n_vertices-1][grammar->distinguished_symbol()];
332
333 if (top == 0)
334 return; // failed to parse so no parse tree
335 else
336 {
337 EST_Item *ss = syn->append();
338
339 extract_edge(0,n_vertices-1,grammar->distinguished_symbol(),top,
340 ss,&w);
341
342 if ((force) && (!daughter1(ss))) // no parse found but *need* one
343 extract_forced_parse(0,n_vertices-1,ss,w);
344 return;
345 }
346 }
347
348 void EST_SCFG_Chart::extract_forced_parse(int start, int end,
349 EST_Item *s, EST_Item *w)
350 {
351 // Extract a parse even though one wasn't found (often happens
352 // with single word or dual word sentences.
353
354 if (start+1 == end)
355 {
356 s->append_daughter(w);
357 s->set_name(grammar->nonterminal(grammar->distinguished_symbol()));
358 s->set("prob",0.0); // maybe should be epsilon
359 }
360 else
361 {
362 extract_forced_parse(start,start+1,s->append_daughter(),w);
363 EST_Item *st = s->append_daughter();
364 st->set_name(grammar->nonterminal(grammar->distinguished_symbol()));
365 st->set("prob",0.0); // maybe should be epsilon
366 EST_Item *nw = inext(w);
367 extract_forced_parse(start+1,end,st,nw);
368 }
369 }
370
371 void EST_SCFG_Chart::extract_edge(int start, int end, int p,
372 EST_SCFG_Chart_Edge *e,
373 EST_Item *s,
374 EST_Item **word)
375 {
376 // Build the node for this edge, and all of its daughters
377
378 if (e->prob() == 0)
379 {
380 return; // failed
381 }
382 else if (start+1 == end)
383 {
384 // unary rule, preterminal
385 s->append_daughter((*word));
386 s->set_name(grammar->nonterminal(p));
387 s->set("prob",(float)e->prob());
388 *word = inext(*word); // increment along "word" stream
389 return;
390 }
391 else
392 {
393 //name prob start end daughters
394 EST_SCFG_Chart_Edge *d1, *d2;
395
396 d1 = edges[start][e->pos()][e->d1()];
397 d2 = edges[e->pos()][end][e->d2()];
398
399 // Inserts the new nodes in the tree (and creates new si nodes)
400 s->append_daughter();
401 s->append_daughter();
402
403 extract_edge(start,e->pos(),e->d1(),d1,daughter1(s),word);
404 extract_edge(e->pos(),end,e->d2(),d2,daughter2(s),word);
405
406 s->set_name(grammar->nonterminal(p));
407 s->set("prob",(float)e->prob());
408
409 return;
410 }
411 }
412
413 void EST_SCFG_chart_load_relation(EST_Relation &s,LISP sent)
414 {
415 // Set up well formed substring table form lisp list
416 // Setup a relation and call the standard method of set up
417 LISP w,f;
418
419 for (w=sent; w != NIL; w=cdr(w))
420 {
421 EST_Item *word = s.append();
422
423 if (consp(car(w)))
424 { // a word with other feature info
425 word->set_name(get_c_string(car(car(w))));
426 if (consp(car(cdr(car(w)))))
427 for (f=car(cdr(car(w))); f != NIL; f=cdr(f))
428 {
429 if (FLONUMP(car(cdr(car(f)))))
430 word->set(get_c_string(car(car(f))),
431 get_c_float(car(cdr(car(f)))));
432 else
433 word->set(get_c_string(car(car(f))),
434 get_c_string(car(cdr(car(f)))));
435 }
436 else // we assume its a POS value, cause they didn't say
437 word->set("name",get_c_string(car(cdr(car(w)))));
438 }
439 else // for easy we set the pos field to the be the name
440 word->set("name",get_c_string(car(w)));
441 }
442 }
443
444 void scfg_parse(EST_Relation *Word, const EST_String &name,
445 EST_Relation *Syntax, EST_SCFG &grammar)
446 {
447 // Parse feature name in Word to build Syntax relation
448 // The relations names above are *not* the names of the relations
449 // just named to reflect there conceptual usage
450 EST_SCFG_Chart chart;
451
452 chart.set_grammar_rules(grammar);
453 chart.setup_wfst(Word,name);
454 chart.parse();
455 chart.extract_parse(Syntax,Word,TRUE);
456
457 return;
458 }
459
460 LISP scfg_parse(LISP string, LISP grammar)
461 {
462 // Parse and return full parse
463 EST_SCFG_Chart chart;
464 EST_Relation words;
465 LISP parse;
466
467 chart.set_grammar_rules(grammar);
468
469 EST_SCFG_chart_load_relation(words,string);
470 chart.setup_wfst(&words,"name");
471 chart.parse();
472 parse = chart.find_parse();
473
474 return parse;
475 }
476
477 LISP scfg_parse(LISP string, EST_SCFG &grammar)
478 {
479 // Parse and return full parse
480 EST_SCFG_Chart chart;
481 EST_Relation words;
482 LISP parse;
483
484 chart.set_grammar_rules(grammar);
485
486 EST_SCFG_chart_load_relation(words,string);
487 chart.setup_wfst(&words,"name");
488 chart.parse();
489 parse = chart.find_parse();
490
491 return parse;
492 }
493
494 LISP scfg_bracketing_only(LISP parse)
495 {
496 if (consp(siod_nth(4,parse)))
497 {
498 LISP d,ds;
499
500 for (d=cdr(cdr(cdr(cdr(parse)))),ds=NIL; d != NIL; d=cdr(d))
501 ds = cons(scfg_bracketing_only(car(d)),ds);
502 return reverse(ds);
503 }
504 else
505 return siod_nth(4,parse);
506
507 }
508
509 void EST_SCFG_traintest::test_crossbrackets()
510 {
511 // Compare bracketing of best parse to bracketing on original
512 // For each sentence parse it (unbracketed) and then
513 // find the percentage of valid brackets in parsed version that
514 // are valid in the original one.
515 int c;
516 LISP parse;
517 EST_SuffStats cb;
518 int failed = 0;
519 int fully_contained=0;
520
521 for (c=0; c < corpus.length(); c++)
522 {
523 LISP flat = siod_flatten(corpus.a_no_check(c).string());
524
525 parse = scfg_parse(flat,*this);
526 if (parse == NIL)
527 {
528 failed++;
529 continue;
530 }
531
532 EST_bracketed_string parsed(scfg_bracketing_only(parse));
533 EST_SuffStats vs;
534
535 count_bracket_crossing(corpus.a_no_check(c),parsed,vs);
536
537 if (vs.mean() == 1)
538 fully_contained++;
539 cb += vs.mean();
540 }
541
542 cout << "cross bracketing " << cb.mean()*100 << " (" << failed <<
543 " failed " << (float)(100.0*fully_contained)/corpus.length() <<
544 "% fully consistent from " << corpus.length()
545 << " sentences)" << endl;
546
547 }
548
549 void count_bracket_crossing(const EST_bracketed_string &ref,
550 const EST_bracketed_string &test,
551 EST_SuffStats &vs)
552 {
553 int i,j;
554
555 if (ref.length() != test.length())
556 {
557 EST_error("bracket_crossing: sentences of different lengths");
558 }
559
560 for (i=0; i < ref.length(); i++)
561 for (j=i+1; j <= ref.length(); j++)
562 if (test.valid(i,j) == 1)
563 {
564 if (ref.valid(i,j) == 0)
565 vs += 0;
566 else
567 vs += 1;
568 }
569 }