Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
EST_SCFG_Chart.cc
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : June 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* A SCFG chart parser */
38 /* */
39 /*=======================================================================*/
40 #include <cstdlib>
41 #include "siod.h"
42 #include "EST_math.h"
43 #include "EST_SCFG.h"
44 #include "EST_SCFG_Chart.h"
45 
46 EST_SCFG_Chart_Edge::EST_SCFG_Chart_Edge()
47 {
48 }
49 
50 EST_SCFG_Chart_Edge::EST_SCFG_Chart_Edge(double prob,
51  int d1, int d2,
52  int pos)
53 {
54  p_d1 = d1;
55  p_d2 = d2;
56  p_pos = pos;
57  p_prob = prob;
58 }
59 
60 EST_SCFG_Chart_Edge::~EST_SCFG_Chart_Edge()
61 {
62 }
63 
64 LISP EST_SCFG_Chart::print_edge(int start, int end, int p,
66 {
67  // Return a lisp representation of the edge
68 
69  if (e->prob() == 0)
70  {
71  return NIL; // failed
72  }
73  else if (start+1 == end)
74  {
75  // unary rule, preterminal
76  LISP r = cons(rintern(grammar->nonterminal(p)),
77  cons(flocons(e->prob()),
78  cons(flocons(start),
79  cons(flocons(end),
80  cons(rintern(grammar->terminal(e->d1())),
81  NIL)))));
82  return r;
83  }
84  else
85  {
86  //name prob start end daughters
87  EST_SCFG_Chart_Edge *d1, *d2;
88 
89  d1 = edges[start][e->pos()][e->d1()];
90  d2 = edges[e->pos()][end][e->d2()];
91 
92  LISP daughters =
93  cons(print_edge(start,e->pos(),e->d1(),d1),
94  cons(print_edge(e->pos(),end,e->d2(),d2),
95  NIL));
96  LISP r = cons(rintern(grammar->nonterminal(p)),
97  cons(flocons(e->prob()),
98  cons(flocons(start),
99  cons(flocons(end),
100  daughters))));
101  return r;
102  }
103 }
104 
105 EST_SCFG_Chart::EST_SCFG_Chart()
106 {
107  n_vertices = 0;
108  edges = 0;
109  wfst = 0;
110  emptyedge = 0;
111  grammar_local = TRUE;
112  grammar = new EST_SCFG;
113 }
114 
115 EST_SCFG_Chart::~EST_SCFG_Chart()
116 {
117  // delete all the vertices
118 
119  delete_edge_table();
120  if (grammar_local)
121  delete grammar;
122 }
123 
125 {
126  if (grammar_local)
127  delete grammar;
128  grammar_local = FALSE;
129  grammar = &imported_grammar;
130 }
131 
133 {
134  grammar->set_rules(r);
135 }
136 
138 {
139  // Set up well formed substring table from feature name in each
140  // stream item in s
141  setup_wfst(s->head(),0,name);
142 }
143 
145 {
146  // Set up well formed substring table from feature name in each
147  // stream item in s
148  EST_Item *p;
149  int n;
150 
151  delete_edge_table();
152  for (n_vertices=1,p=s; p != e; p=p->next())
153  n_vertices++;
154  setup_edge_table();
155 
156  for (n=0,p=s; p != e; p=p->next(),n++)
157  {
158  int term = grammar->terminal(p->f(name).string());
159  if (term == -1)
160  {
161  cerr << "SCFG_Chart: unknown terminal symbol \"" <<
162  p->f(name).string() << "\"" << endl;
163  term = 0; // to avoid crashing
164  }
165  wfst[n] = new EST_SCFG_Chart_Edge(1.0,term,0,-1);
166  }
167 }
168 
169 void EST_SCFG_Chart::delete_edge_table()
170 {
171  int i,j,k;
172 
173  if (wfst == 0) return;
174 
175  for (i=0; i < n_vertices; i++)
176  {
177  delete wfst[i];
178  for (j=0; j < n_vertices; j++)
179  {
180  for (k=0; k < grammar->num_nonterminals(); k++)
181  if (edges[i][j][k] != emptyedge)
182  delete edges[i][j][k];
183  delete [] edges[i][j];
184  }
185  delete [] edges[i];
186  }
187  delete [] wfst;
188  delete [] edges;
189  delete emptyedge;
190 
191  wfst = 0;
192  edges = 0;
193 
194 }
195 
196 void EST_SCFG_Chart::setup_edge_table()
197 {
198  int i,j,k;
199  int nt = grammar->num_nonterminals();
200 
201  wfst = new EST_SCFG_Chart_Edge*[n_vertices];
202  edges = new EST_SCFG_Chart_Edge ***[n_vertices];
203  emptyedge = new EST_SCFG_Chart_Edge(0,0,0,0);
204 
205  for (i=0; i < n_vertices; i++)
206  {
207  wfst[i] = 0;
208  edges[i] = new EST_SCFG_Chart_Edge **[n_vertices];
209  for (j=0; j < n_vertices; j++)
210  {
211  edges[i][j] = new EST_SCFG_Chart_Edge *[nt];
212  for (k=0; k < nt; k++)
213  edges[i][j][k] = 0;
214  }
215  }
216 }
217 
218 double EST_SCFG_Chart::find_best_tree_cal(int start,int end,int p)
219 {
220  // Find the best parse for non/terminal p between start and end
221  int best_j = -1;
222  int best_q = -1, best_r = -1;
223  double best_prob = 0;
224 
225  if (end-1 == start)
226  {
227  best_prob = grammar->prob_U(p,wfst[start]->d1());
228  if (best_prob > 0)
229  edges[start][end][p] =
230  new EST_SCFG_Chart_Edge(best_prob*wfst[start]->prob(),
231  wfst[start]->d1(),0,-1);
232  else
233  edges[start][end][p] = emptyedge;
234  return best_prob;
235  }
236  else
237  {
238  // for all rules expanding p find total and best prob
239  double s=0,t_prob,left,right;
240  int j,q,r;
241  int nt = grammar->num_nonterminals();
242 
243  for (q=0; q < nt; q++)
244  for (r=0; r < nt; r++)
245  {
246  double pBpqr = grammar->prob_B(p,q,r);
247  if (pBpqr > 0)
248  {
249  for (j=start+1; j < end; j++)
250  {
251  left=find_best_tree(start,j,q);
252  if (left > 0)
253  {
254  right = find_best_tree(j,end,r);
255  t_prob = pBpqr * left * right;
256  if (t_prob > best_prob)
257  {
258  best_prob = t_prob;
259  best_q = q;
260  best_r = r;
261  best_j = j;
262  }
263  s += t_prob;
264  }
265  }
266  }
267  }
268 
269  if (best_prob > 0)
270  edges[start][end][p] =
271  new EST_SCFG_Chart_Edge(s,best_q,best_r,best_j);
272  else
273  edges[start][end][p] = emptyedge;
274  return s;
275  }
276 }
277 
279 {
280  // do the parsing, find best parse only
281  if (n_vertices - 1 > 0)
282  find_best_tree(0,n_vertices-1,grammar->distinguished_symbol());
283 
284 }
285 
287 {
288  // Extract the parse from the edge table
289  EST_SCFG_Chart_Edge *top;
290 
291  top = edges[0][n_vertices-1][grammar->distinguished_symbol()];
292 
293  if (top == 0)
294  return NIL; // no parse
295  else
296  return print_edge(0,n_vertices-1,grammar->distinguished_symbol(),top);
297 }
298 
300  EST_Relation *word,
301  int force)
302 {
303  // Build a tree stream in Syn linking the leafs of Syn to those
304  // in word, force guarantees a parse is necessary (though probably
305  // a random one)
306 
307  extract_parse(syn,word->head(),0,force);
308 }
309 
311  EST_Item *s, EST_Item *e, int force)
312 {
313  // Build a tree stream in Syn linking the leafs of Syn to those
314  // in word
315  EST_Item *p;
316  int num_words;
317 
318  for (num_words=0,p=s; p != e; p=p->next())
319  num_words++;
320 
321  if (num_words != (n_vertices-1))
322  {
323  cerr << "SCFG_Chart: extract_parse, number of items in link stream " <<
324  " different from those in parse tree" << endl;
325  return;
326  }
327 
328  EST_SCFG_Chart_Edge *top;
329  EST_Item *w = s;
330 
331  top = edges[0][n_vertices-1][grammar->distinguished_symbol()];
332 
333  if (top == 0)
334  return; // failed to parse so no parse tree
335  else
336  {
337  EST_Item *ss = syn->append();
338 
339  extract_edge(0,n_vertices-1,grammar->distinguished_symbol(),top,
340  ss,&w);
341 
342  if ((force) && (!daughter1(ss))) // no parse found but *need* one
343  extract_forced_parse(0,n_vertices-1,ss,w);
344  return;
345  }
346 }
347 
348 void EST_SCFG_Chart::extract_forced_parse(int start, int end,
349  EST_Item *s, EST_Item *w)
350 {
351  // Extract a parse even though one wasn't found (often happens
352  // with single word or dual word sentences.
353 
354  if (start+1 == end)
355  {
356  s->append_daughter(w);
357  s->set_name(grammar->nonterminal(grammar->distinguished_symbol()));
358  s->set("prob",0.0); // maybe should be epsilon
359  }
360  else
361  {
362  extract_forced_parse(start,start+1,s->append_daughter(),w);
363  EST_Item *st = s->append_daughter();
364  st->set_name(grammar->nonterminal(grammar->distinguished_symbol()));
365  st->set("prob",0.0); // maybe should be epsilon
366  EST_Item *nw = w->next();
367  extract_forced_parse(start+1,end,st,nw);
368  }
369 }
370 
371 void EST_SCFG_Chart::extract_edge(int start, int end, int p,
373  EST_Item *s,
374  EST_Item **word)
375 {
376  // Build the node for this edge, and all of its daughters
377 
378  if (e->prob() == 0)
379  {
380  return; // failed
381  }
382  else if (start+1 == end)
383  {
384  // unary rule, preterminal
385  s->append_daughter((*word));
386  s->set_name(grammar->nonterminal(p));
387  s->set("prob",(float)e->prob());
388  *word = (*word)->next(); // increment along "word" stream
389  return;
390  }
391  else
392  {
393  //name prob start end daughters
394  EST_SCFG_Chart_Edge *d1, *d2;
395 
396  d1 = edges[start][e->pos()][e->d1()];
397  d2 = edges[e->pos()][end][e->d2()];
398 
399  // Inserts the new nodes in the tree (and creates new si nodes)
400  s->append_daughter();
401  s->append_daughter();
402 
403  extract_edge(start,e->pos(),e->d1(),d1,daughter1(s),word);
404  extract_edge(e->pos(),end,e->d2(),d2,daughter2(s),word);
405 
406  s->set_name(grammar->nonterminal(p));
407  s->set("prob",(float)e->prob());
408 
409  return;
410  }
411 }
412 
413 void EST_SCFG_chart_load_relation(EST_Relation &s,LISP sent)
414 {
415  // Set up well formed substring table form lisp list
416  // Setup a relation and call the standard method of set up
417  LISP w,f;
418 
419  for (w=sent; w != NIL; w=cdr(w))
420  {
421  EST_Item *word = s.append();
422 
423  if (consp(car(w)))
424  { // a word with other feature info
425  word->set_name(get_c_string(car(car(w))));
426  if (consp(car(cdr(car(w)))))
427  for (f=car(cdr(car(w))); f != NIL; f=cdr(f))
428  {
429  if (FLONUMP(car(cdr(car(f)))))
430  word->set(get_c_string(car(car(f))),
431  get_c_float(car(cdr(car(f)))));
432  else
433  word->set(get_c_string(car(car(f))),
434  get_c_string(car(cdr(car(f)))));
435  }
436  else // we assume its a POS value, cause they didn't say
437  word->set("name",get_c_string(car(cdr(car(w)))));
438  }
439  else // for easy we set the pos field to the be the name
440  word->set("name",get_c_string(car(w)));
441  }
442 }
443 
444 void scfg_parse(EST_Relation *Word, const EST_String &name,
445  EST_Relation *Syntax, EST_SCFG &grammar)
446 {
447  // Parse feature name in Word to build Syntax relation
448  // The relations names above are *not* the names of the relations
449  // just named to reflect there conceptual usage
450  EST_SCFG_Chart chart;
451 
452  chart.set_grammar_rules(grammar);
453  chart.setup_wfst(Word,name);
454  chart.parse();
455  chart.extract_parse(Syntax,Word,TRUE);
456 
457  return;
458 }
459 
460 LISP scfg_parse(LISP string, LISP grammar)
461 {
462  // Parse and return full parse
463  EST_SCFG_Chart chart;
464  EST_Relation words;
465  LISP parse;
466 
467  chart.set_grammar_rules(grammar);
468 
469  EST_SCFG_chart_load_relation(words,string);
470  chart.setup_wfst(&words,"name");
471  chart.parse();
472  parse = chart.find_parse();
473 
474  return parse;
475 }
476 
477 LISP scfg_parse(LISP string, EST_SCFG &grammar)
478 {
479  // Parse and return full parse
480  EST_SCFG_Chart chart;
481  EST_Relation words;
482  LISP parse;
483 
484  chart.set_grammar_rules(grammar);
485 
486  EST_SCFG_chart_load_relation(words,string);
487  chart.setup_wfst(&words,"name");
488  chart.parse();
489  parse = chart.find_parse();
490 
491  return parse;
492 }
493 
494 LISP scfg_bracketing_only(LISP parse)
495 {
496  if (consp(siod_nth(4,parse)))
497  {
498  LISP d,ds;
499 
500  for (d=cdr(cdr(cdr(cdr(parse)))),ds=NIL; d != NIL; d=cdr(d))
501  ds = cons(scfg_bracketing_only(car(d)),ds);
502  return reverse(ds);
503  }
504  else
505  return siod_nth(4,parse);
506 
507 }
508 
510 {
511  // Compare bracketing of best parse to bracketing on original
512  // For each sentence parse it (unbracketed) and then
513  // find the percentage of valid brackets in parsed version that
514  // are valid in the original one.
515  int c;
516  LISP parse;
517  EST_SuffStats cb;
518  int failed = 0;
519  int fully_contained=0;
520 
521  for (c=0; c < corpus.length(); c++)
522  {
523  LISP flat = siod_flatten(corpus.a_no_check(c).string());
524 
525  parse = scfg_parse(flat,*this);
526  if (parse == NIL)
527  {
528  failed++;
529  continue;
530  }
531 
532  EST_bracketed_string parsed(scfg_bracketing_only(parse));
533  EST_SuffStats vs;
534 
535  count_bracket_crossing(corpus.a_no_check(c),parsed,vs);
536 
537  if (vs.mean() == 1)
538  fully_contained++;
539  cb += vs.mean();
540  }
541 
542  cout << "cross bracketing " << cb.mean()*100 << " (" << failed <<
543  " failed " << (float)(100.0*fully_contained)/corpus.length() <<
544  "% fully consistent from " << corpus.length()
545  << " sentences)" << endl;
546 
547 }
548 
549 void count_bracket_crossing(const EST_bracketed_string &ref,
550  const EST_bracketed_string &test,
551  EST_SuffStats &vs)
552 {
553  int i,j;
554 
555  if (ref.length() != test.length())
556  {
557  EST_error("bracket_crossing: sentences of different lengths");
558  }
559 
560  for (i=0; i < ref.length(); i++)
561  for (j=i+1; j <= ref.length(); j++)
562  if (test.valid(i,j) == 1)
563  {
564  if (ref.valid(i,j) == 0)
565  vs += 0;
566  else
567  vs += 1;
568  }
569 }