Edinburgh Speech Tools  2.4-release
EST_Ngrammar.cc
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Simon King & Alan W Black */
34 /* Date : February 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* An EST_Ngram class for building and manipulating bigrams trigrams etc */
38 /* */
39 /*=======================================================================*/
40 #include <iostream>
41 #include <fstream>
42 #include <cstring>
43 #include <cmath>
44 #include <climits>
45 #include <cfloat>
46 
47 using namespace std;
48 
49 #include "EST_Ngrammar.h"
50 #include "EST_Pathname.h"
51 #include "EST_Token.h"
52 #include "EST_io_aux.h"
53 
54 
55 const EST_DiscreteProbDistribution PSTnullProbDistribution;
56 static EST_String NOVOCAB("NOVOCAB");
57 
58 // **********************************************************************
59 
60 EST_NgrammarState::EST_NgrammarState(const EST_NgrammarState &s)
61 {
62  clear();
63  init(s.id(),s.pdf_const());
64 }
65 
66 EST_NgrammarState::EST_NgrammarState(const EST_NgrammarState *const s)
67 {
68  clear();
69  init(s->id(),s->pdf_const()); // copy
70 }
71 
72 EST_NgrammarState::~EST_NgrammarState()
73 {
74  clear();
75 }
76 
77 void EST_NgrammarState::clear()
78 {
79  p_id = -1;
80  p_pdf.clear();
81 }
82 
83 void EST_NgrammarState::init()
84 {
85  p_id=-1;
86  p_pdf.init();
87 }
88 
89 void EST_NgrammarState::init(int id,EST_Discrete *d)
90 {
91  p_id = id;
92  p_pdf.init(d);
93 }
94 
95 
96 void EST_NgrammarState::init(int id, const EST_DiscreteProbDistribution &pdf)
97 {
98  p_id = id;
99  p_pdf = pdf; // copy it
100 }
101 
102 
103 ostream& operator<<(ostream& s, const EST_NgrammarState &a)
104 {
105  s << "(" << a.id() << ": " << a.pdf_const() << " )";
106  return s;
107 }
108 
109 // **********************************************************************
110 
111 EST_BackoffNgrammarState::~EST_BackoffNgrammarState()
112 {
113  p_pdf.clear();
114  children.clear();
115 }
116 
117 
118 void EST_BackoffNgrammarState::clear()
119 {
120  backoff_weight=0;
121  p_pdf.clear();
122 }
123 
124 void EST_BackoffNgrammarState::init()
125 {
126  backoff_weight=0;
127  p_pdf.init();
128 }
129 
130 void EST_BackoffNgrammarState::init(const EST_Discrete *d,int level)
131 {
132  backoff_weight=0;
133  p_level = level;
134  p_pdf.init(d);
135 }
136 
137 void EST_BackoffNgrammarState::init(const EST_DiscreteProbDistribution &pdf, int level)
138 {
139  backoff_weight=0;
140  p_level = level;
141  p_pdf = pdf; // copy it
142 }
143 
144 bool EST_BackoffNgrammarState::accumulate(const EST_StrVector &words,
145  const double count)
146 {
147 
148 // int i;
149 // cerr << "accumulate ";
150 // for(i=0;i<words.n();i++)
151 // {
152 // if(words.n()-1-p_level == i)
153 // cerr <<"*";
154 // cerr << words(i);
155 // if(words.n()-1-p_level == i)
156 // cerr <<"*";
157 // }
158 // cerr << endl;
159 
160  // update pdf at this node
161  p_pdf.cumulate(words(words.n()-1-p_level),count);
162 
163  // then go down
164  if (words.n()-1-p_level > 0) // not at bottom of tree
165  {
166 
167  EST_BackoffNgrammarState *s;
168 
169  // have we got the child
170  s = get_child(words(words.n()-1-p_level));
171  if (s==NULL)
172  // have to extend the tree
173  s = add_child(p_pdf.get_discrete(),words);
174  return s->accumulate(words,count);
175 
176  }
177  else
178  {
179  return true;
180  }
181 }
182 
183 bool EST_BackoffNgrammarState::accumulate(const EST_IVector &words,
184  const double count)
185 {
186 
187 // int i;
188 // cerr << "accumulate level " << p_level << " : ";
189 // for(i=0;i<words.n();i++)
190 // {
191 // if(words.n()-1-p_level == i)
192 // cerr <<"*";
193 // cerr << p_pdf.item_name(words(i));
194 // if(words.n()-1-p_level == i)
195 // cerr <<"*";
196 // cerr << " ";
197 // }
198 // cerr << endl;
199 
200  // update pdf at this node
201  p_pdf.cumulate(words(words.n()-1-p_level),count);
202 
203  //cerr << *this << endl;
204  //cerr << endl;
205 
206  // then go down
207  if (words.n()-1-p_level > 0) // not at bottom of tree
208  {
209 
210  EST_BackoffNgrammarState *s;
211 
212  // have we got the child
213  s = get_child(words(words.n()-1-p_level));
214  if (s==NULL)
215  // have to extend the tree
216  s = add_child(p_pdf.get_discrete(),words);
217 
218  // get pointer again in case we built more than one level
219  s = get_child(words(words.n()-1-p_level));
220  if (s==NULL)
221  {
222  cerr << "Failed to extend tree - unknown reason !" << endl;
223  return false;
224  }
225  return s->accumulate(words,count);
226  }
227  else
228  {
229  return true;
230  }
231 }
232 
233 
234 EST_BackoffNgrammarState*
235 EST_BackoffNgrammarState::add_child(const EST_Discrete *d,
236  const EST_StrVector &words)
237 {
238  EST_BackoffNgrammarState *s;
239 
240  if (words.n()-1-p_level > 0) // more history still to go
241  {
242  // see if we can go down the tree
243  s = get_child(words(words.n()-1-p_level));
244  if (s != NULL)
245  return s->add_child(d,words);
246  else
247  {
248  // construct tree as we go
249  EST_BackoffNgrammarState *new_child = new EST_BackoffNgrammarState;
250  new_child->init(d,p_level+1);
251  children.add(words(words.n()-1-p_level), (void*)new_child);
252  return new_child->add_child(d,words);
253  }
254  }
255  else
256  {
257  // this is the node we are trying to add !
258  return this;
259  }
260 }
261 
262 
263 
264 EST_BackoffNgrammarState*
265 EST_BackoffNgrammarState::add_child(const EST_Discrete *d,
266  const EST_IVector &words)
267 {
268  EST_BackoffNgrammarState *s;
269 
270  if (words.n()-1-p_level > 0) // more history still to go
271  {
272  // see if we can go down the tree
273  s = get_child(words(words.n()-1-p_level));
274  if (s != NULL)
275  return s->add_child(d,words);
276  else
277  {
278  // construct tree as we go
279  EST_BackoffNgrammarState *new_child = new EST_BackoffNgrammarState;
280  new_child->init(d,p_level+1);
281  children.add(p_pdf.get_discrete()->name(words(words.n()-1-p_level)), (void*)new_child);
282  return new_child->add_child(d,words);
283  }
284  }
285  else
286  {
287  // this is the node we are trying to add !
288  return this;
289  }
290 }
291 
292 void EST_BackoffNgrammarState::remove_child(EST_BackoffNgrammarState *child,
293  const EST_String &name)
294 {
295 
296  child->zap();
297  // can't remove from StringTrie, but can set pointer to NULL
298  children.add(name,NULL);
299  delete child;
300 }
301 
302 void EST_BackoffNgrammarState::print_freqs(ostream &os,
303  const int order,
304  EST_String followers)
305 {
306  // not right - just print out, then recurse through children
307  // change to use 'backoff_traverse'
308 
309  EST_Litem *k;
310  double freq;
311  EST_String name;
312  for (k=p_pdf.item_start();
313  !p_pdf.item_end(k);
314  k = p_pdf.item_next(k))
315  {
316  p_pdf.item_freq(k,name,freq);
317  EST_BackoffNgrammarState *s = ((EST_BackoffNgrammarState*)(children.lookup(name)));
318  if (p_level==order-1)
319  {
320  if(freq>0)
321  os << name << " " << followers
322  << ": " << freq << endl;
323  }
324  else if (s!=NULL)
325  s->print_freqs(os,order,name+" "+followers);
326 
327  }
328 }
329 
330 bool EST_BackoffNgrammarState::ngram_exists(const EST_StrVector &words,
331  const double threshold) const
332 {
333  const EST_BackoffNgrammarState *s;
334  s = get_state(words);
335  if (s != NULL)
336  {
337  // return true for unigrams (state level 0)
338  // even if freq < threshold
339  return (bool)((s->level()==0) ||
340  ( s->frequency(words(0)) > threshold ));
341  }
342  else
343  return false;
344 }
345 
346 const EST_BackoffNgrammarState *const
347 EST_BackoffNgrammarState::get_state(const EST_StrVector &words) const
348 {
349  const EST_BackoffNgrammarState *s;
350  if (words.n()-1-p_level > 0) // more history still to go
351  {
352  s = get_child(words(words.n()-1-p_level));
353  if (s != NULL)
354  {
355  //cerr << "traversing from " << *this << endl;
356  //cerr << " to " << *s << endl << endl;
357  return s->get_state(words);
358  }
359  else
360  {
361  //cerr << "got NULL" << endl;
362  return NULL;
363  }
364  }
365  else
366  {
367  //cerr << "got " << *this << endl;
368  return this;
369  }
370 }
371 
372 void EST_BackoffNgrammarState::zap()
373 {
374 
375  // recursively delete this state and all its children
376  EST_Litem *k;
377  double freq;
378  EST_String name;
379  for (k=p_pdf.item_start();
380  !p_pdf.item_end(k);
381  k = p_pdf.item_next(k))
382  {
383  p_pdf.item_freq(k,name,freq);
384  EST_BackoffNgrammarState *child = get_child(name);
385  if (child!=NULL)
386  remove_child(child,name);
387  }
388 
389  children.clear();
390  p_pdf.clear();
391 
392 }
393 
394 
395 const double EST_BackoffNgrammarState::get_backoff_weight(const EST_StrVector &words) const
396 {
397  const EST_BackoffNgrammarState *s;
398  if (words.n()-1-p_level >= 0)
399  {
400  s = get_child(words(words.n()-1-p_level));
401  if (s != NULL)
402  return s->get_backoff_weight(words);
403  else
404  {
405  // if there is no node, the weight would have been 1 anyway
406  //cerr << "Couldn't get weight for " << words << endl;
407  return 1; // default
408  }
409  }
410 
411  else
412  {
413  // reached node
414 
415 /*
416  cerr << "got bw for ";
417  for(int i=0;i<words.n();i++)
418  cerr << words(i) << " ";
419  cerr << " at level " << p_level << " = " << backoff_weight << endl;
420 */
421  return backoff_weight;
422  }
423 }
424 
425 
426 bool EST_BackoffNgrammarState::set_backoff_weight(const EST_StrVector &words, const double w)
427 {
428  EST_BackoffNgrammarState *s;
429  if (words.n()-1-p_level >= 0)
430  {
431  s = get_child(words(words.n()-1-p_level));
432  if (s != NULL)
433  return s->set_backoff_weight(words,w);
434  else
435  {
436  // we can't set the weight because the node
437  // doesn't exist - this is an error if the weight
438  // is not 1
439  if (w != 1)
440  {
441  cerr << "Couldn't set weight for " << words
442  << " to " << w << endl;
443  return false;
444  }
445  else
446  return true; // a weight of 1 does not need storing
447  }
448  }
449  else
450  {
451  // reached node
452  backoff_weight = w;
453  return true;
454  }
455 }
456 
457 void EST_BackoffNgrammarState::frequency_of_frequencies(EST_DVector &ff)
458 {
459  int max=ff.n();
460  EST_Litem *k;
461  double freq;
462  EST_String name;
463  for (k=p_pdf.item_start();
464  !p_pdf.item_end(k);
465  k = p_pdf.item_next(k))
466  {
467  p_pdf.item_freq(k,name,freq);
468  if(freq < max)
469  ff[(int)(freq+0.5)] += 1;
470  }
471 }
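// Illustrative note (not in the original source): ff is indexed by
// rounded frequency, so after this call ff[3] holds the number of
// distinct items at this node that were seen about 3 times.
// Good_Turing_discount(), called from compute_backoff_weights() below,
// consumes a frequency-of-frequencies vector of exactly this shape.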
472 
473 ostream& operator<<(ostream& s, const EST_BackoffNgrammarState &a)
474 {
475  s << "(backoff level:" << a.p_level
476  << " weight:" << a.backoff_weight << " " << a.pdf_const() << " )";
477 
478  return s;
479 }
480 
481 // **********************************************************************
482 
483 void EST_Ngrammar::default_values()
484 {
485  p_representation = EST_Ngrammar::sparse;
486  p_entry_type = EST_Ngrammar::frequencies; // by default
487  sparse_representation.clear();
488  allow_oov=false;
489  p_num_samples = 0;
490  p_num_states = 0;
491  p_states = 0;
492  vocab = 0;
493  pred_vocab = 0;
494  backoff_threshold = 1.0;
495  backoff_unigram_floor_freq = 0.0;
496 }
497 
498 EST_Ngrammar::~EST_Ngrammar()
499 {
500  delete [] p_states;
501 }
502 
503 void EST_Ngrammar::clear()
504 {
505  // to do
506 }
507 
508 bool EST_Ngrammar::init(int o, EST_Ngrammar::representation_t r,
509  const EST_StrList &wordlist)
510 {
511  return (bool)(init_vocab(wordlist) && p_init(o,r));
512 }
513 
514 bool EST_Ngrammar::init(int o, EST_Ngrammar::representation_t r,
515  const EST_StrList &wordlist,
516  const EST_StrList &predlist)
517 {
518  return (bool)(init_vocab(wordlist,predlist) && p_init(o,r));
519 }
520 
521 bool EST_Ngrammar::init(int o, EST_Ngrammar::representation_t r,
522  EST_Discrete &v)
523 {
524  vocab = &v;
525  pred_vocab = vocab;
526  vocab_pdf.init(pred_vocab);
527  return p_init(o,r);
528 }
529 
530 bool EST_Ngrammar::init(int o, EST_Ngrammar::representation_t r,
531  EST_Discrete &v, EST_Discrete &pv)
532 {
533  vocab = &v;
534  pred_vocab = &pv;
535  vocab_pdf.init(pred_vocab);
536  return p_init(o,r);
537 }
538 
539 bool EST_Ngrammar::p_init(int o, EST_Ngrammar::representation_t r)
540 {
541  if (o <= 0)
542  {
543  cerr << "EST_Ngrammar order must be > 0" << endl;
544  return false;
545  }
546 
547  p_order = o;
548  p_representation = r;
549  p_number_of_sentences = 0;
550 
551  switch(p_representation)
552  {
553 
554  case EST_Ngrammar::sparse:
555  sparse_representation.init(p_order);
556  return true;
557  break;
558 
559  case EST_Ngrammar::dense:
560  return init_dense_representation();
561  break;
562 
563  case EST_Ngrammar::backoff:
564  return init_backoff_representation();
565  break;
566 
567  default:
568  cerr << "Unknown internal representation requested for EST_Ngrammar"
569  << endl;
570  return false;
571  break;
572  }
573 }
574 
575 bool EST_Ngrammar::init_dense_representation()
576 {
577  // allocate a flattened N-dimensional matrix of states
578  int i;
579 
580  if (vocab->length() <= 0)
581  {
582  cerr << "EST_Ngrammar: dense_representation requires explicit vocab"
583  << endl;
584  return false;
585  }
586 
587  p_num_states = (int)pow(float(vocab->length()),float(p_order-1));
588  p_states = new EST_NgrammarState[p_num_states];
589  for (i=0; i < p_num_states; i++)
590  p_states[i].init(i,pred_vocab);
591 
592  return true;
593 }
594 
595 bool EST_Ngrammar::init_sparse_representation()
596 {
597 
598  if (vocab->length() <= 0)
599  {
600  cerr << "EST_Ngrammar: sparse_representation requires explicit vocab"
601  << endl;
602  return false;
603  }
604 
605  p_num_states = (int)pow(float(vocab->length()),float(p_order-1));
606  p_states = new EST_NgrammarState[p_num_states];
607 
608  return (bool)(p_states != NULL);
609 }
610 
611 bool EST_Ngrammar::init_backoff_representation()
612 {
613 
614  // nothing much to do
615  backoff_representation = new EST_BackoffNgrammarState;
616  backoff_representation->init(vocab,0);
617  return true;
618 }
619 
620 
621 const EST_StrVector &EST_Ngrammar::make_ngram_from_index(const int index) const
622 {
623  int i,ind=index;
624  EST_StrVector *ngram = new EST_StrVector;
625  ngram->resize(p_order-1); // exclude last word
626 
627  // matches 'find_dense_state_index' so first word is most significant
628  // also, cannot fill in last word
629 
630  for(i=p_order-2;i>=0;i--)
631  {
632 #if defined(sun) && ! defined(__svr4__)
633 /* SunOS */
634  int rem = ind%vocab->length();
635  int quot = ind/vocab->length();
636  (*ngram)[i] = wordlist_index(rem);
637  ind = quot;
638 #else
639  div_t d = div(ind,vocab->length());
640  (*ngram)[i] = wordlist_index(d.rem);
641  ind = d.quot;
642 #endif
643  }
644 
645  return *ngram;
646 }
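// Worked example (illustrative, not in the original source): with
// vocab->length()==10 and p_order==3, index 47 decodes to the history
// [word 4, word 7], since 47 = 4*10 + 7 with the first word most
// significant; find_dense_state_index() below is the inverse mapping.
// Note the EST_StrVector is allocated with new and returned by
// reference, so this function never frees it.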
647 
648 
649 
650 
651 bool EST_Ngrammar::init_vocab(const EST_StrList &word_list)
652 {
653  vocab = new EST_Discrete();
654  if(!vocab->init(word_list))
655  return false;
656 
657  pred_vocab = vocab; // same thing in this case
658  vocab_pdf.init(pred_vocab);
659 
660  return (bool)(vocab != NULL);
661 }
662 
663 bool EST_Ngrammar::init_vocab(const EST_StrList &word_list,
664  const EST_StrList &pred_list)
665 {
666  vocab = new EST_Discrete();
667  if(!vocab->init(word_list))
668  return false;
669  pred_vocab = new EST_Discrete();
670  if(!pred_vocab->init(pred_list))
671  return false;
672  vocab_pdf.init(pred_vocab);
673 
674  return (bool)(vocab != NULL);
675 }
676 
677 bool EST_Ngrammar::check_vocab(const EST_StrList &word_list)
678 {
679  EST_Discrete *comp_vocab = new EST_Discrete();
680 
681  if(!comp_vocab->init(word_list))
682  {
683  delete comp_vocab;
684  return false;
685  }
686 
687  if(*vocab != *comp_vocab)
688  {
689  delete comp_vocab;
690  return false;
691  }
692 
693  delete comp_vocab;
694  return true;
695 }
696 
697 const EST_String & EST_Ngrammar::wordlist_index(int i) const
698 {
699  return vocab->name(i);
700 }
701 
702 int EST_Ngrammar::predlist_index(const EST_String &word) const
703 {
704 
705  if (word=="") // can't lookup
706  return -1;
707 
708  int i;
709  i = pred_vocab->index(word);
710  if(i >= 0 )
711  return i;
712 
713  cerr << "Word \"" << word << "\" is not in the predictee word list" << endl;
714 
715  if (allow_oov)
716  {
717  i = pred_vocab->index(OOV_MARKER);
718  if(i >= 0)
719  return i;
720 
721  cerr << "Even " << OOV_MARKER << " is not in the predictee word list !" << endl;
722  }
723 
724  return -1;
725 }
726 
727 
728 const EST_String & EST_Ngrammar::predlist_index(int i) const
729 {
730  return pred_vocab->name(i);
731 }
732 
733 int EST_Ngrammar::wordlist_index(const EST_String &word, const bool report) const
734 {
735 
736  if (word=="") // can't lookup
737  return -1;
738 
739  int i;
740  i = vocab->index(word);
741  if(i >= 0 )
742  return i;
743 
744  if(report)
745  cerr << "Word \"" << word << "\" is not in the word list" << endl;
746 
747  if (allow_oov)
748  {
749  i = vocab->index(OOV_MARKER);
750  if(i >= 0)
751  return i;
752  if(report)
753  cerr << "Even " << OOV_MARKER << " is not in the word list !" << endl;
754  }
755 
756  return -1;
757 }
758 
759 bool EST_Ngrammar::build(const EST_StrList &filenames,
760  const EST_String &prev,
761  const EST_String &prev_prev,
762  const EST_String &last,
763  const EST_String &input_format,
764  const EST_String &oov_mode,
765  const int mincount,
766  const int maxcount)
767 {
768 
769  p_sentence_start_marker = prev;
770  p_sentence_end_marker = last;
771 
772 
773  // when backing off, safest modes ...
774  if( (p_representation == EST_Ngrammar::backoff) &&
775  (oov_mode != "skip_file") &&
776  (oov_mode != "skip_sentence"))
777  cerr << "Warning : building a backoff grammar" << endl
778  << " with oov_mode '" << oov_mode
779  << "' is not recommended !" << endl;
780 
781  if( (oov_mode != "skip_ngram") &&
782  (oov_mode != "skip_sentence") &&
783  (oov_mode != "skip_file") &&
784  (oov_mode != "use_oov_marker") )
785  {
786  cerr << "Unknown oov_mode '" << oov_mode << "'" << endl;
787  return false;
788  }
789 
790  if( (oov_mode == "skip_sentence") &&
791  (input_format == "ngram_per_line"))
792  {
793  cerr << "Sorry, with input format 'ngram_per_line' you cannot " << endl
794  << " select oov_mode 'skip_sentence'" << endl;
795  return false;
796  }
797 
798  if(oov_mode == "use_oov_marker")
799  allow_oov = true;
800  else
801  allow_oov = false;
802 
803  bool skip_this;
804  EST_String new_filename;
805  EST_Litem *p;
806  for (p = filenames.head(); p; p = p->next())
807  {
808  cerr << "Building from " << filenames(p) << endl;
809 
810  skip_this=false;
811  if( ((oov_mode == "skip_sentence") &&
812  (input_format == "sentence_per_file")) ||
813  (oov_mode == "skip_file") )
814  skip_this = oov_preprocess(filenames(p),new_filename,
815  "skip if found");
816 
817  else if( ((oov_mode == "skip_sentence") &&
818  (input_format == "sentence_per_line")) ||
819  ((oov_mode == "skip_ngram") &&
820  (input_format == "ngram_per_line")) )
821  oov_preprocess(filenames(p),new_filename,"eliminate lines");
822 
823  else
824  new_filename = filenames(p);
825 
826 
827  if(skip_this)
828  {
829  cerr << "Skipping " << filenames(p)
830  << " (out of vocabulary words found)" << endl;
831  }
832  else
833  {
834 
835  switch(p_representation)
836  {
837  case EST_Ngrammar::sparse:
838  {
839  if (input_format != "")
840  {
841  cerr << "Can't build sparse ngram from '" << input_format;
842  cerr << "' data" << endl;
843  return false;
844  }
845  else if (!build_sparse(new_filename,prev,prev_prev,last))
846  return false;
847  }
848  break;
849 
850  case EST_Ngrammar::dense:
851  if (!build_ngram(new_filename,prev,prev_prev,last,input_format))
852  return false;
853  break;
854 
855  case EST_Ngrammar::backoff:
856  if (!build_ngram(new_filename,prev,prev_prev,last,input_format))
857  return false;
858  break;
859 
860  default:
861  cerr << "Unknown internal representation set for EST_Ngrammar"
862  << endl;
863  return false;
864  break;
865  }
866  }
867 
868  if((new_filename != filenames(p)) &&
869  (new_filename != "") &&
870  !delete_file(new_filename) )
871  {
872  cerr << "Warning : couldn't remove temporary file : "
873  << new_filename << endl;
874  }
875 
876  } // loop round files
877 
878  if (p_representation == EST_Ngrammar::backoff)
879  return compute_backoff_weights(mincount,maxcount);
880 
881  return true;
882 }
883 
884 void EST_Ngrammar::accumulate(const EST_StrVector &words,
885  const double count)
886 {
887  if (words.n() < p_order)
888  cerr << "EST_Ngrammar::accumulate - window is too small" << endl;
889  else
890  {
891  p_num_samples++;
892  const EST_String &w = lastword(words);
893  vocab_pdf.cumulate(w,count);
894 
895  switch(p_representation)
896  {
897  case EST_Ngrammar::dense:
898  case EST_Ngrammar::sparse:
899  find_state(words).cumulate(w,count);
900  break;
901 
902  case EST_Ngrammar::backoff:
903  backoff_representation->accumulate(words,count);
904  break;
905 
906  default:
907  cerr << "EST_Ngrammar::accumulate : invalid representation !"
908  << endl;
909  break;
910  }
911  } // switch
912 }
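// Illustrative note (not in the original source): the window passed in
// must hold at least p_order words with the predictee last; the history
// selects the state and the final word's count is cumulated in that
// state's distribution (or in the backoff tree).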
913 
914 void EST_Ngrammar::accumulate(const EST_IVector &words,
915  const double count)
916 {
917 
918  /*
919  int i;
920  for(i=0;i<words.n();i++)
921  {
922  cerr << vocab_pdf.item_name(words(i));
923  cerr << " ";
924  }
925  cerr << endl;
926  */
927 
928  if (words.n() < p_order)
929  cerr << "EST_Ngrammar::accumulate - window is too small" << endl;
930  else
931  {
932  p_num_samples++;
933  vocab_pdf.cumulate(words(p_order-1),count);
934 
935  switch(p_representation)
936  {
937 
938  case EST_Ngrammar::dense:
939  case EST_Ngrammar::sparse:
940  find_state(words).cumulate(words(p_order-1),count);
941  break;
942 
943  case EST_Ngrammar::backoff:
944  backoff_representation->accumulate(words,count);
945  break;
946 
947  default:
948  cerr << "EST_Ngrammar::accumulate : invalid representation !"
949  << endl;
950  break;
951  } // switch
952  }
953 }
954 
955 
956 bool EST_Ngrammar::ngram_exists(const EST_StrVector &words) const
957 {
958  switch(p_representation)
959  {
960  case EST_Ngrammar::sparse:
961  return false;
962  break;
963 
964  case EST_Ngrammar::dense:
965  return true; // always exists !
966  break;
967 
968  case EST_Ngrammar::backoff:
969  {
970  if(words.n()==1)
971  return backoff_representation->ngram_exists(words,0);
972  else
973  return backoff_representation->ngram_exists(words,backoff_threshold);
974  }
975  break;
976 
977  default:
978  cerr << "ngram_exists: unknown ngrammar representation" << endl;
979  break;
980  }
981  return false;
982 }
983 
984 bool EST_Ngrammar::ngram_exists(const EST_StrVector &words, const double threshold) const
985 {
986  if (p_representation != EST_Ngrammar::backoff)
987  {
988  cerr << "Not a backoff grammar !" << endl;
989  return false;
990  }
991 
992  return backoff_representation->ngram_exists(words,threshold);
993 
994 }
995 
996 
997 const double EST_Ngrammar::get_backoff_weight(const EST_StrVector &words) const
998 {
999  if(p_representation == EST_Ngrammar::backoff)
1000  return backoff_representation->get_backoff_weight(words);
1001  else
1002  {
1003  cerr << "Can't get backoff weight - not a backed off ngrammar !" << endl;
1004  return 0;
1005  }
1006 }
1007 
1008 bool EST_Ngrammar::set_backoff_weight(const EST_StrVector &words, const double w)
1009 {
1010  if(p_representation == EST_Ngrammar::backoff)
1011  return backoff_representation->set_backoff_weight(words,w);
1012  else
1013  {
1014  cerr << "Can't set backoff weight - not a backed off ngrammar !" << endl;
1015  return false;
1016  }
1017 }
1018 
1019 
1020 bool EST_Ngrammar::build_sparse(const EST_String &filename,
1021  const EST_String &prev,
1022  const EST_String &prev_prev,
1023  const EST_String &last)
1024 {
1025  sparse_representation.build(filename,prev,prev_prev,last);
1026  return true;
1027 }
1028 
1029 
1030 void EST_Ngrammar::fill_window_start(EST_IVector &window,
1031  const EST_String &prev,
1032  const EST_String &prev_prev) const
1033 {
1034  int i;
1035 
1036  for (i=0; i<window.n()-1; i++)
1037  window[i] = wordlist_index(prev_prev);
1038  window[i++] = wordlist_index(prev);
1039 }
1040 
1041 void EST_Ngrammar::fill_window_start(EST_StrVector &window,
1042  const EST_String &prev,
1043  const EST_String &prev_prev) const
1044 {
1045  int i;
1046 
1047  for (i=0; i<window.n()-1; i++)
1048  window[i] = prev_prev;
1049  window[i++] = prev;
1050 }
1051 
1052 bool EST_Ngrammar::oov_preprocess(const EST_String &filename,
1053  EST_String &new_filename,
1054  const EST_String &what)
1055 {
1056  ostream *ost=0;
1057  EST_TokenStream ts;
1058  new_filename="";
1059  int bad_line_count=0;
1060  int good_line_count=0;
1061 
1062  // if filename is stdin we have to make a temporary file
1063  // if we are eliminating lines we also need a temporary file
1064 
1065  // what = "skip if found" | "eliminate lines"
1066 
1067  bool write_out = false;
1068  if( (what == "eliminate lines") || (filename == "-") )
1069  write_out = true;
1070 
1071  // open the original files for reading
1072  if (filename == "-")
1073  {
1074  if( ts.open(stdin, FALSE) == -1)
1075  {
1076  cerr << "EST_Ngrammar:: failed to open stdin";
1077  cerr << " for reading" << endl;
1078  return false;
1079  }
1080  }
1081  else if (ts.open(filename) == -1){
1082  cerr << "EST_Ngrammar: failed to open file \"" << filename
1083  << "\" for reading" << endl;
1084  return false; // hmmm ?
1085  }
1086 
1087  // open the output file if necessary
1088  if(write_out)
1089  {
1090  new_filename = make_tmp_filename();
1091  ost = new ofstream(new_filename);
1092 
1093  if(!(*ost))
1094  {
1095  cerr << "Ngrammar: couldn't create temporary file \""
1096  << new_filename << "\"" << endl;
1097  new_filename="";
1098  return false;
1099  }
1100  }
1101  else
1102  new_filename = filename;
1103 
1104  EST_String s,this_line;
1105  bool bad_line=false;
1106  while (!ts.eof())
1107  {
1108  s=ts.get().string();
1109 
1110  if (!bad_line && (s != ""))
1111  {
1112  if(wordlist_index(s,false) < 0)
1113  {
1114 
1115  if(what == "eliminate lines")
1116  {
1117  bad_line=true;
1118  }
1119  else // skip file
1120  {
1121  if(write_out)
1122  {
1123  // input was stdin, but we are now discarding it
1124  delete ost;
1125  if(!delete_file(new_filename))
1126  cerr << "Warning : couldn't delete temporary file '"
1127  << new_filename << "'" << endl;
1128  }
1129  new_filename="";
1130  return false;
1131  }
1132 
1133  }
1134  else
1135  this_line += s + " ";
1136  }
1137 
1138  // write out
1139  if(ts.eoln())
1140  {
1141  if(bad_line)
1142  {
1143  bad_line_count++;
1144  }
1145  else
1146  {
1147  if(write_out)
1148  {
1149  // this_line
1150  *ost << this_line << endl;
1151  good_line_count++;
1152  }
1153  }
1154  bad_line=false;
1155  this_line = "";
1156  }
1157 
1158  }
1159  cerr << "skipped " << bad_line_count << " and kept "
1160  << good_line_count << " lines from file " << filename << endl;
1161  return true;
1162 }
1163 
1164 bool EST_Ngrammar::build_ngram(const EST_String &filename,
1165  const EST_String &prev,
1166  const EST_String &prev_prev,
1167  const EST_String &last,
1168  const EST_String &input_format)
1169 {
1170  p_entry_type = EST_Ngrammar::frequencies;
1171  int bad_word=0;
1172  EST_String s;
1173  EST_TokenStream ts;
1174  int eoln_is_eos = FALSE;
1175  int sliding_window = TRUE;
1176  int count=0;
1177  clear();
1178 
1179  if ( (input_format == "") || (input_format == "sentence_per_line") )
1180  {
1181  // may do something here later
1182  eoln_is_eos = TRUE;
1183  sliding_window = TRUE;
1184  }
1185  else if (input_format == "sentence_per_file")
1186  {
1187  eoln_is_eos = FALSE;
1188  sliding_window = TRUE;
1189  p_number_of_sentences = 1;
1190  }
1191  else if(input_format == "ngram_per_line")
1192  {
1193  eoln_is_eos = FALSE;
1194  sliding_window = FALSE;
1195  p_number_of_sentences = 1;
1196  }
1197  else
1198  {
1199  cerr << "Can't build from '" << input_format << "' data" << endl;
1200  return false;
1201  }
1202 
1203  EST_IVector window(p_order);
1204 
1205  if (filename == "-")
1206  {
1207  if( ts.open(stdin, FALSE) == -1)
1208  {
1209  cerr << "EST_Ngrammar:: failed to open stdin";
1210  cerr << " for reading" << endl;
1211  return false;
1212  }
1213  }
1214  else if (ts.open(filename) == -1){
1215  cerr << "EST_Ngrammar: failed to open \"" << filename
1216  << "\" for reading" << endl;
1217  return false;
1218  }
1219 
1220  // default input format is one sentence per line
1221  // full stops and commas etc. are taken literally
1222  // i.e. the user must do the preprocessing
1223 
1224  // we assume that all of prev,prev_prev,last are either
1225  // null or set, not a mixture of the two
1226 
1227  // at start window contains
1228  // [prev_prev, prev_prev, ....., prev_prev, prev]
1229  // This is not added to the ngram model though
1230  if(sliding_window)
1231  {
1232  fill_window_start(window,prev,prev_prev);
1233  if (window(p_order-1) == -1)
1234  bad_word = p_order;
1235  else if( (p_order>1) && (window(p_order-2) == -1))
1236  bad_word = p_order-1;
1237  else
1238  bad_word=0;
1239 
1240  if(bad_word > 0)
1241  cerr << "at start : bad word at " << bad_word << endl;
1242 
1243  }
1244  while (!ts.eof())
1245  {
1246  s=ts.get().string();
1247 
1248  if (s != "")
1249  {
1250  if(sliding_window)
1251  {
1252  slide(window,-1);
1253  if (bad_word > 0)
1254  bad_word--;
1255 
1256  window[p_order-1] = wordlist_index(s);
1257  if (window(p_order-1) < 0)
1258  {
1259  cerr << "EST_Ngrammar::build_ngram " <<
1260  " word \"" << s << "\" is not in vocabulary, skipping"
1261  << endl;
1262  bad_word = p_order;
1263  }
1264  if (bad_word == 0)
1265  accumulate(window);
1266  else
1267  {
1268  cerr << "not accumulating : bad word at " << bad_word;
1269  cerr << " window=" << window; // l<< endl;
1270  bad_word--;
1271  }
1272  }
1273  else
1274  {
1275  // not a sliding window - wait for end of line and accumulate
1276  if(count < p_order)
1277  {
1278  if(count == p_order-1) // last thing in window (predictee)
1279  window[count++] = predlist_index(s);
1280  else // not last thing (predictor)
1281  window[count++] = wordlist_index(s);
1282 
1283  if (window(count-1) < 0)
1284  {
1285  cerr << "EST_Ngrammar::build_ngram " <<
1286  " word \"" << s << "\" is not in vocabulary, skipping"
1287  << endl;
1288  bad_word = 1;
1289  }
1290  }
1291  else
1292  cerr << "Too many items on line - ignoring trailing ones !" << endl;
1293  }
1294  }
1295 
1296  // end of sentence ?
1297  if(ts.eoln())
1298  {
1299 
1300  if(!sliding_window)
1301  {
1302  if((count == p_order) && bad_word == 0)
1303  accumulate(window);
1304  count = 0;
1305  bad_word = 0;
1306  }
1307  else if (eoln_is_eos)
1308  {
1309  // have there been any accumulates since the last increment
1310  if (window(p_order-1) != wordlist_index(last))
1311  p_number_of_sentences += 1;
1312 
1313  slide(window,-1);
1314  window[p_order-1] = wordlist_index(last);
1315 
1316  if(window(p_order-1) == -1)
1317  {
1318  //cerr << "WARNING : skipping over bad word '"
1319  //<< last << "'" << endl;
1320  bad_word = p_order;
1321  }
1322 
1323  if (bad_word == 0)
1324  accumulate(window);
1325 
1326  fill_window_start(window,prev,prev_prev);
1327 
1328  // check for bad tags
1329  if (window(p_order-1) == -1)
1330  bad_word = p_order;
1331  else if( (p_order>1) && (window(p_order-2) == -1) )
1332  bad_word = p_order-1;
1333  else
1334  bad_word=0;
1335  }
1336  }
1337  }
1338 
1339  // if last accumulate was at end of sentence
1340  // we don't need to do the extra accumulate
1341  if ( sliding_window && (window(p_order-1) != wordlist_index(prev)))
1342  {
1343 
1344  // and finish with the ngram [.....last few words,last]
1345  slide(window,-1);
1346  window[p_order-1] = wordlist_index(last);
1347 
1348  if (window(p_order-1) == -1)
1349  bad_word=p_order;
1350 
1351  if (bad_word == 0)
1352  {
1353  accumulate(window);
1354  p_number_of_sentences += 1;
1355  }
1356  }
1357 
1358  ts.close();
1359 
1360  cerr << "Accumulated " << p_number_of_sentences << " sentences." << endl;
1361  return true;
1362 }
1363 
1364 
1365 void compute_backoff_weight(EST_Ngrammar *n, EST_StrVector &ngram, void*)
1366 {
1367  int i,j;
1368  double sum1=0,sum2=0;
1369 
1370 
1371  EST_StrVector new_ngram;
1372  new_ngram.resize(ngram.n()-1);
1373  for(i=0;i<new_ngram.n();i++)
1374  new_ngram[i] = ngram(i);
1375 
1376 
1377  cerr << "computing backoff w for ";
1378  for(i=0;i<new_ngram.n();i++)
1379  cerr << new_ngram(i) << " ";
1380  cerr << " \r";
1381 
1382  cerr << endl;
1383 
1384  // only have backoff weights if the associated n-1 gram exists
1385  if (!n->ngram_exists(new_ngram))
1386  {
1387  cerr << " NONE";
1388 
1389  // if ngram really exists, but was below threshold
1390  // set backoff weight to 1 (always back off)
1391  if (n->ngram_exists(new_ngram,0))
1392  {
1393  if(!n->set_backoff_weight(new_ngram,1))
1394  cerr << "WARNING : couldn't set weight !" << endl;
1395  cerr << " = 1";
1396  }
1397  cerr << endl;
1398  return;
1399  }
1400 
1401  // save
1402  EST_String tmp = ngram(ngram.n()-1);
1403 
1404  // for each possible word in the last position
1405  for(j=0;j<n->get_pred_vocab_length();j++)
1406  {
1407  ngram[ngram.n()-1] = n->get_pred_vocab_word(j);
1408 
1409  for(i=0;i<ngram.n();i++)
1410  cerr << ngram(i) << " ";
1411 
1412  if (n->ngram_exists(ngram))
1413  {
1414  cerr << n->probability(ngram) << " exists " << endl;
1415  // if we have the complete ngram, add it to sum1
1416  sum1 += n->probability(ngram);
1417  }
1418  else
1419  {
1420 
1421  // make this faster - take out of loop
1422 
1423  // or add the n-1 gram, excluding the first word to sum2
1424  EST_StrVector tmp_ngram;
1425  tmp_ngram.resize(ngram.n()-1);
1426  for(i=0;i<tmp_ngram.n();i++)
1427  tmp_ngram[i] = ngram(i+1);
1428 
1429  cerr << " unseen, P(";
1430  for(i=0;i<tmp_ngram.n();i++)
1431  cerr << tmp_ngram(i) << " ";
1432 
1433  cerr << ") = " << n->probability(tmp_ngram) << " " << endl;
1434  sum2 += n->probability(tmp_ngram);
1435  }
1436  }
1437 
1438  // and fill in the backoff weight
1439 
1440  if (sum2 == 0) // no unseen ngrams, no backing off
1441  {
1442  if(!n->set_backoff_weight(new_ngram,1))
1443  cerr << "WARNING : couldn't set weight !" << endl;
1444  cerr << 0 << endl;
1445  }
1446  else
1447  {
1448  if (sum1 > 1)
1449  {
1450  cerr << "NEGATIVE WEIGHT for ";
1451  for(i=0;i<new_ngram.n();i++)
1452  cerr << new_ngram(i) << " ";
1453  cerr << endl;
1454 
1455  cerr << "sum1=" << sum1 << " sum2=" << sum2;
1456  cerr << " " << (1 - sum1) / sum2 << endl;
1457 
1458  for(j=0;j<n->get_pred_vocab_length();j++)
1459  {
1460  ngram[ngram.n()-1] = n->get_pred_vocab_word(j);
1461 
1462 
1463  if (n->ngram_exists(ngram))
1464  {
1465 
1466  for(i=0;i<ngram.n();i++)
1467  cerr << ngram(i) << " ";
1468  cerr << " exists, prob = ";
1469  cerr << n->probability(ngram,false,true) << endl;
1470  }
1471 
1472  }
1473  sum1 = 1;
1474  sum2 = 1; // to get a weight of 0
1475  }
1476 
1477  if(!n->set_backoff_weight(new_ngram, (1 - sum1) / sum2 ))
1478  cerr << "WARNING : couldn't set weight !" << endl;
1479 
1480  cerr << "sum1=" << sum1 << " sum2=" << sum2;
1481  cerr << " weight=" << (1 - sum1) / sum2 << endl;
1482  }
1483 
1484  // restore
1485  ngram[ngram.n()-1] = tmp;
1486 
1487 }
1488 
1489 bool EST_Ngrammar::compute_backoff_weights(const int mincount,
1490  const int maxcount)
1491 {
1492 
1493 
1494  backoff_threshold = mincount;
1495  backoff_discount = new EST_DVector[p_order];
1496 
1497  int i,o;
1498 
1499  // but we must have children from the root node
1500  // for every unigram, since these can not be backed off
1501  // (must therefore be present, even if zero)
1502  // smoothing will fill in a floor for any zero frequencies
1503 
1504  backoff_restore_unigram_states();
1505 
1506  Good_Turing_discount(*this,maxcount,0.5);
1507 
1508  // and since some frequencies will have been set to zero
1509  // we have to prune away those branches of the tree
1510  //prune_backoff_representation();
1511 
1512 
1513  // now compute backoff weights, to satisfy the
1514  // 'probs sum to 1' condition
1515 
1516  // condition is (trigram case):
1517  // sum( p(w3|w1,w2) ) = 1, over all w1,w2
1518 
1519  // p(wd3|wd1,wd2)=
1520  // if(trigram exists) p_3(wd1,wd2,wd3)
1521  // else if(bigram w1,w2 exists) bo_wt_2(w1,w2)*p(wd3|wd2)
1522  // else p(wd3|w2)
1523  // and for a given wd3 they all sum to 1
1524 
1525  // so compute three sums :
1526  // sum(p_3(wd1,wd2,wd3)), for all w1,w2 where we have the trigram
1527  // sum(p(wd3|wd2)), for all w1,w2 where we have the bigram(w1,w2)
1528  // but not the trigram
1529  // sum(p(wd3|wd2)), for all w1,w2 where we don't have either
1530 
1531  // could probably do this recursively and more elegantly
1532  // but it makes my head hurt
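// In symbols (an illustrative restatement, not in the original source):
// for each history (w1,w2) the loop below, via compute_backoff_weight(),
// computes
//     bo_wt_2(w1,w2) = (1 - sum1) / sum2
// where sum1 sums p(w3|w1,w2) over every w3 whose trigram exists, and
// sum2 sums p(w3|w2) over the remaining w3.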
1533 
1534  for (o=2;o<=order();o++) // for (o=1 .. .????
1535  {
1536 
1537  cerr << "Backing off order " << o << endl;
1538  //cerr << "=======================" << endl;
1539 
1540  EST_StrVector words;
1541  words.resize(o);
1542 
1543  // for each possible history (ngram, excluding last word)
1544  // compute the backoff weight
1545  for(i=0;i<o-1;i++)
1546  words[i]="";
1547  words[o-1] = "!FILLED!";
1548  iterate(words,&compute_backoff_weight,NULL);
1549 
1550  //cerr << "=========done==========" << endl;
1551 
1552  }
1553 
1554  return true;
1555 }
1556 
1557 
1558 void EST_Ngrammar::backoff_restore_unigram_states()
1559 {
1560  // for every item in the root pdf, look for a child
1561  // and make it if not present
1562 
1563  // short cut is to cumulate some zero freq bigrams
1564  // to force construction of states where necessary
1565  EST_StrVector words;
1566  int j;
1567  words.resize(2);
1568  words[0] = "wibble"; // doesn't matter what this is, count is 0
1569  for(j=0;j<get_pred_vocab_length();j++)
1570  {
1571  words[1] = get_pred_vocab_word(j);
1572  backoff_representation->accumulate(words,0);
1573  }
1574 
1575 }
1576 
1577 
1578 void EST_Ngrammar::prune_backoff_representation(EST_BackoffNgrammarState *start_state)
1579 {
1580 
1581  if (start_state == NULL)
1582  start_state = backoff_representation;
1583 
1584  //cerr << "pruning state " << start_state << endl << *start_state << endl;
1585 
1586  // remove any branches with zero frequency count
1587 
1588  // find children of this state with zero freq and zap them
1589  EST_Litem *k;
1590  double freq;
1591  EST_String name;
1592  for (k=start_state->pdf_const().item_start();
1593  !start_state->pdf_const().item_end(k);
1594  k = start_state->pdf_const().item_next(k))
1595  {
1596  start_state->pdf_const().item_freq(k,name,freq);
1597  if (freq < TINY_FREQ)
1598  {
1599  EST_BackoffNgrammarState *child = start_state->get_child(name);
1600 
1601  if (child!=NULL)
1602  {
1603  //cerr << "Zapping " << name << " : " << child->level()
1604  //<< " " << child<< endl;
1605  start_state->remove_child(child,name);
1606  }
1607  }
1608 
1609  }
1610 
1611  // then recurse through remaining children
1612  for (k=start_state->pdf_const().item_start();
1613  !start_state->pdf_const().item_end(k);
1614  k = start_state->pdf_const().item_next(k))
1615  {
1616  start_state->pdf_const().item_freq(k,name,freq);
1617  EST_BackoffNgrammarState *child = start_state->get_child(name);
1618  if (child!=NULL)
1619  {
1620  //cerr << "recursing to " << name << " : " << child->level() << endl;
1621  //if((child!=NULL) && (child->level() == 3))
1622  //cerr << *child << endl;
1623  prune_backoff_representation(child);
1624  }
1625  }
1626 }
1627 
1628 
1629 ostream& operator<<(ostream& s, EST_Ngrammar &n)
1630 {
1631  switch(n.p_representation)
1632  {
1633  case EST_Ngrammar::sparse:
1634  n.sparse_representation.print_freqs(s);
1635  break;
1636 
1637  case EST_Ngrammar::dense:
1638  s << "Dense" << endl;
1639  // s << n.dense_representation << endl;
1640  break;
1641 
1642  case EST_Ngrammar::backoff:
1643  s << "Backoff" << endl;
1644  s << *(n.backoff_representation) << endl;
1645  break;
1646 
1647  default:
1648  cerr << "Unknown internal representation of EST_Ngrammar : can't print"
1649  << endl;
1650  break;
1651  }
1652 
1653  return s;
1654 }
1655 
1656 bool
1657 EST_Ngrammar::set_entry_type(EST_Ngrammar::entry_t new_type)
1658 {
1659  if (new_type == p_entry_type)
1660  return true;
1661 
1662  // entry type conversion --- hmmmmm
1663 
1664  cerr << "Couldn't do entry type conversion !" << endl;
1665  return false;
1666 }
1667 
1668 bool EST_Ngrammar::sparse_to_dense()
1669 {
1670  cerr << "EST_Ngrammar::sparse_to_dense() "
1671  <<" not implemented" << endl;
1672  return false;
1673 }
1674 
1675 bool EST_Ngrammar::dense_to_sparse()
1676 {
1677  cerr << "EST_Ngrammar::dense_to_sparse()"
1678  <<" not implemented" << endl;
1679  return false;
1680 }
1681 
1682 int EST_Ngrammar::find_dense_state_index(const EST_IVector &words,
1683  int index) const
1684 {
1685  int i,ind=0;
1686  for(i=0;i<p_order-1;i++)
1687  ind = ind*vocab->length() + words.a_no_check(i+index);
1688 
1689  return ind;
1690 }
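// Illustrative note (not in the original source): this is base-|vocab|
// encoding of the history; e.g. with vocab->length()==10 and p_order==3,
// word indices [4,7] map to dense state 4*10 + 7 = 47, the inverse of
// make_ngram_from_index() above.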
1691 
1692 int EST_Ngrammar::find_next_state_id(int state, int word) const
1693 {
1694  // Find a new state index from the current after moving with word
1695  int i,f;
1696 
1697  if (p_order == 1)
1698  return 0;
1699  for (f=1,i=0; i<p_order-2; i++)
1700  f*=vocab->length();
1701  return ((state%f)*vocab->length())+word;
1702 }
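// Illustrative note (not in the original source): dropping the oldest
// word is a modulo shift; e.g. in a trigram model with
// vocab->length()==10, state 47 (history [4,7]) followed by word 2 moves
// to ((47%10)*10)+2 = 72, i.e. history [7,2].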
1703 
1704 EST_NgrammarState &EST_Ngrammar::find_state(const EST_StrVector &words)
1705 {
1706  switch(p_representation)
1707  {
1708  case EST_Ngrammar::sparse:
1709  // return p_states[sparse_representation.find_state(words)];
1710  return p_states[0];
1711  break;
1712 
1713  case EST_Ngrammar::dense:
1714  {
1715  EST_IVector tmp(words.n());
1716  int i;
1717  for(i=0;i<p_order-1;i++)
1718  {
1719  tmp[i] = wordlist_index(words(i));
1720  if (tmp(i) == -1) break;
1721  }
1722  tmp[i] = pred_vocab->index(words(i));
1723  if (tmp(i) == -1) break;
1724  return p_states[find_dense_state_index(tmp)];
1725  }
1726  break;
1727 
1728  case EST_Ngrammar::backoff:
1729  cerr << "find_state: not valid in backoff mode !" << endl;
1730  break;
1731 
1732  default:
1733  cerr << "find_state: unknown ngrammar representation" << endl;
1734  break;
1735  }
1736 
1737  return p_states[0];
1738 }
1739 
1740 
1741 const EST_NgrammarState &
1742 EST_Ngrammar::find_state_const(const EST_StrVector &words) const
1743 {
1744  switch(p_representation)
1745  {
1746  case EST_Ngrammar::sparse:
1747  // return p_states[sparse_representation.find_state(words)];
1748  return p_states[0];
1749  break;
1750 
1751  case EST_Ngrammar::dense:
1752  {
1753  EST_IVector tmp(words.n());
1754  int i;
1755  for(i=0;i<p_order-1;i++)
1756  {
1757  tmp[i] = wordlist_index(words(i));
1758  if (tmp(i) == -1) break;
1759  }
1760  tmp[i] = pred_vocab->index(words(i));
1761  if (tmp(i) == -1) break;
1762  return p_states[find_dense_state_index(tmp)];
1763  }
1764  break;
1765 
1766  case EST_Ngrammar::backoff:
1767  cerr << "find_state_const: not valid in backoff mode !" << endl;
1768  break;
1769 
1770  default:
1771  cerr << "find_state: unknown ngrammar representation" << endl;
1772  break;
1773  }
1774  return p_states[0];
1775 }
1776 
1777 
1778 EST_NgrammarState &EST_Ngrammar::find_state(const EST_IVector &words)
1779 {
1780  switch(p_representation)
1781  {
1782  case EST_Ngrammar::sparse:
1783  // return p_states[sparse_representation.find_state(words)];
1784  return p_states[0];
1785  break;
1786 
1787  case EST_Ngrammar::dense:
1788  return p_states[find_dense_state_index(words)];
1789  break;
1790 
1791  case EST_Ngrammar::backoff:
1792  cerr << "find_state: not valid in backoff mode !" << endl;
1793  break;
1794 
1795  default:
1796  cerr << "find_state: unknown ngrammar representation" << endl;
1797  break;
1798  }
1799  return p_states[0];
1800 }
1801 
1802 
1803 const EST_NgrammarState &
1804 EST_Ngrammar::find_state_const(const EST_IVector &words) const
1805 {
1806  switch(p_representation)
1807  {
1808  case EST_Ngrammar::sparse:
1809  // return p_states[sparse_representation.find_state(words)];
1810  return p_states[0];
1811  break;
1812  case EST_Ngrammar::dense:
1813  return p_states[find_dense_state_index(words)];
1814  break;
1815 
1816  case EST_Ngrammar::backoff:
1817  cerr << "find_state_const: not valid in backoff mode !" << endl;
1818  break;
1819 
1820  default:
1821  cerr << "find_state: unknown ngrammar representation" << endl;
1822  break;
1823  }
1824 
1825  return p_states[0];
1826 }
1827 
1828 
1829 bool EST_Ngrammar::set_representation(EST_Ngrammar::representation_t new_representation)
1830 {
1831 
1832  if (new_representation == p_representation)
1833  return true;
1834 
1835  if (new_representation == EST_Ngrammar::sparse)
1836  return sparse_to_dense();
1837  else if (new_representation == EST_Ngrammar::dense)
1838  return dense_to_sparse();
1839  else
1840  {
1841  cerr << "set_representation: unknown ngrammar representation" << endl;
1842  return FALSE;
1843  }
1844 }
1845 
1846 double EST_Ngrammar::probability(const EST_StrVector &words, bool force, const bool trace) const
1847 {
1848  // Probability of this ngram
1849  (void)force;
1850 
1851  switch(p_representation)
1852  {
1853  case EST_Ngrammar::sparse:
1854  case EST_Ngrammar::dense:
1855  return find_state_const(words).probability(lastword(words));
1856  break;
1857 
1858  case EST_Ngrammar::backoff:
1859  return backoff_probability(words,trace);
1860  break;
1861 
1862  default:
1863  cerr << "probability: unknown ngrammar representation" << endl;
1864  return -1;
1865  break;
1866  }
1867 }
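// Usage sketch (illustrative, not in the original source; assumes a
// trained trigram "ngram" whose vocabulary contains these words):
//     EST_StrVector w(3);
//     w[0] = "the"; w[1] = "quick"; w[2] = "fox";
//     double p = ngram.probability(w);  // p("fox" | "the","quick")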
1868 
1869 double EST_Ngrammar::frequency(const EST_StrVector &words, bool force, const bool trace) const
1870 {
1871  // Frequency of this ngram
1872  (void)force;
1873 
1874  switch(p_representation)
1875  {
1876  case EST_Ngrammar::sparse:
1877  case EST_Ngrammar::dense:
1878  return find_state_const(words).frequency(lastword(words));
1879  break;
1880 
1881  case EST_Ngrammar::backoff:
1882  return backoff_probability(words,trace); // can't do freqs
1883  break;
1884 
1885  default:
1886  cerr << "probability: unknown ngrammar representation" << endl;
1887  return -1;
1888  break;
1889  }
1890 }
1891 
1892 const EST_String &EST_Ngrammar::predict(const EST_StrVector &words,
1893  double *prob,int *state) const
1894 {
1895  // What's the most probable word given this list of words
1896 
1897  switch(p_representation)
1898  {
1899  case EST_Ngrammar::sparse:
1900  case EST_Ngrammar::dense:
1901  {
1902  const EST_NgrammarState &s = find_state_const(words);
1903  *state = s.id();
1904  return s.most_probable(prob);
1905  }
1906  break;
1907 
1908  case EST_Ngrammar::backoff:
1909  state=NULL; // there are no states per se
1910  return backoff_most_probable(words,prob);
1911  break;
1912 
1913  default:
1914  cerr << "probability: unknown ngrammar representation" << endl;
1915  return EST_String::Empty;
1916  break;
1917  }
1918 }
1919 
1920 const EST_String &EST_Ngrammar::predict(const EST_IVector &words,
1921  double *prob,int *state) const
1922 {
1923  // What's the most probable word given this list of words
1924 
1925  switch(p_representation)
1926  {
1927  case EST_Ngrammar::sparse:
1928  case EST_Ngrammar::dense:
1929  {
1930  const EST_NgrammarState &s = find_state_const(words);
1931  *state = s.id();
1932  return s.most_probable(prob);
1933  }
1934  break;
1935 
1936  case EST_Ngrammar::backoff:
1937  cerr << "probability: IVector access to backoff not supported" << endl;
1938  return EST_String::Empty;
1939  break;
1940 
1941  default:
1942  cerr << "probability: unknown ngrammar representation" << endl;
1943  return EST_String::Empty;
1944  break;
1945  }
1946 }
1947 
1948 int EST_Ngrammar::find_state_id(const EST_StrVector &words) const
1949 {
1950  switch(p_representation)
1951  {
1952  case EST_Ngrammar::sparse:
1953  case EST_Ngrammar::dense:
1954  {
1955  const EST_NgrammarState &s = find_state_const(words);
1956  return s.id();
1957  }
1958  default:
1959  cerr << "Ngrammar: representation doesn't support states" << endl;
1960  return 0;
1961  break;
1962  }
1963 }
1964 
1965 int EST_Ngrammar::find_state_id(const EST_IVector &words) const
1966 {
1967  switch(p_representation)
1968  {
1969  case EST_Ngrammar::sparse:
1970  case EST_Ngrammar::dense:
1971  {
1972  const EST_NgrammarState &s = find_state_const(words);
1973  return s.id();
1974  }
1975  default:
1976  cerr << "Ngrammar: representation doesn't support states" << endl;
1977  return 0;
1978  break;
1979  }
1980 }
1981 
1982 EST_String EST_Ngrammar::get_vocab_word(int i) const
1983 {
1984  if (vocab)
1985  return vocab->name(i);
1986  else
1987  return NOVOCAB;
1988 }
1989 
1990 int EST_Ngrammar::get_vocab_word(const EST_String &s) const
1991 {
1992  int index = vocab->name(s);
1993  return index;
1994 }
1995 
1996 double EST_Ngrammar::reverse_probability(const EST_StrVector &words,
1997  bool force) const
1998 {
1999  // What's the probability of this (n-1)-gram given the last word in the ngram
2000  (void)force;
2001 
2002  switch(p_representation)
2003  {
2004  case EST_Ngrammar::sparse:
2005  case EST_Ngrammar::dense:
2006  {
2007  const EST_NgrammarState &s = find_state_const(words);
2008  // need number of occurrences of words[p_order-1]
2009  return s.frequency(lastword(words))/
2010  vocab_pdf.frequency(lastword(words));
2011  }
2012  break;
2013 
2014  case EST_Ngrammar::backoff:
2015  return backoff_reverse_probability(words);
2016  break;
2017 
2018  default:
2019  cerr << "probability: unknown ngrammar representation" << endl;
2020  return -1;
2021  break;
2022  }
2023 }
2024 
2025 double EST_Ngrammar::reverse_probability(const EST_IVector &words,
2026  bool force) const
2027 {
2028  // What's the probability of this (n-1)-gram given the last word in the ngram
2029  (void)force;
2030 
2031  switch(p_representation)
2032  {
2033  case EST_Ngrammar::sparse:
2034  case EST_Ngrammar::dense:
2035  {
2036  const EST_NgrammarState &s = find_state_const(words);
2037  // need number of occurrences of words[p_order-1]
2038  return s.frequency(lastword(words))/
2039  vocab_pdf.frequency(lastword(words));
2040  }
2041  break;
2042 
2043  case EST_Ngrammar::backoff:
2044  cerr << "probability: reverse prob unavailable for backoff ngram"
2045  << endl;
2046  return -1;
2047  break;
2048 
2049  default:
2050  cerr << "probability: unknown ngrammar representation" << endl;
2051  return -1;
2052  break;
2053  }
2054 }
2055 
2056 const EST_DiscreteProbDistribution &
2057 EST_Ngrammar::prob_dist(int state) const
2058 {
2059  return p_states[state].pdf_const();
2060 }
2061 
2062 const EST_DiscreteProbDistribution &
2063 EST_Ngrammar::prob_dist(const EST_StrVector &words) const
2064 {
2065 
2066  switch(p_representation)
2067  {
2068  case EST_Ngrammar::sparse:
2069  case EST_Ngrammar::dense:
2070  {
2071  const EST_NgrammarState &s = find_state_const(words);
2072  return s.pdf_const();
2073  }
2074  break;
2075 
2076  case EST_Ngrammar::backoff:
2077  return backoff_prob_dist(words);
2078  break;
2079 
2080  default:
2081  cerr << "probability: unknown ngrammar representation" << endl;
2082  return PSTnullProbDistribution;
2083  break;
2084  }
2085 }
2086 
2087 const EST_DiscreteProbDistribution &
2088 EST_Ngrammar::prob_dist(const EST_IVector &words) const
2089 {
2090 
2091  switch(p_representation)
2092  {
2093  case EST_Ngrammar::sparse:
2094  case EST_Ngrammar::dense:
2095  {
2096  const EST_NgrammarState &s = find_state_const(words);
2097  return s.pdf_const();
2098  }
2099  break;
2100 
2101  case EST_Ngrammar::backoff:
2102  cerr << "probability: unsupported IVector access of backoff ngram" << endl;
2103  return PSTnullProbDistribution;
2104  break;
2105 
2106  default:
2107  cerr << "probability: unknown ngrammar representation" << endl;
2108  return PSTnullProbDistribution;
2109  break;
2110  }
2111 }
2112 
2113 EST_read_status
2114 EST_Ngrammar::load(const EST_String &filename)
2115 {
2116 
2117  EST_read_status r_val;
2118 
2119  //if ((r_val = load_ngram_htk_ascii(filename, *this)) != wrong_format)
2120  //return r_val;
2121  //if ((r_val = load_ngram_htk_binary(filename, *this)) != wrong_format)
2122  //return r_val;
2123  if ((r_val = load_ngram_cstr_ascii(filename, *this)) != wrong_format)
2124  return r_val;
2125  if ((r_val = load_ngram_cstr_bin(filename, *this)) != wrong_format)
2126  return r_val;
2127 
2128  // maybe the file is compressed ?
2129  EST_Pathname fname(filename);
2130  EST_String tmp_fname("");
2131 
2132  // crude but effective
2133  if (fname.extension() == GZIP_FILENAME_EXTENSION)
2134  tmp_fname = uncompress_file_to_temporary(filename,
2135  "gzip --decompress --stdout");
2136  else if (fname.extension() == COMPRESS_FILENAME_EXTENSION)
2137  tmp_fname = uncompress_file_to_temporary(filename,"uncompress -c");
2138 
2139  if(tmp_fname != "")
2140  {
2141  r_val = load(tmp_fname);
2142  delete_file(tmp_fname);
2143  return r_val;
2144  }
2145  else
2146  return misc_read_error;
2147 
2148  cerr << "EST_Ngrammar::load can't determine ngrammar file type for input file " << filename << endl;
2149  return wrong_format;
2150 }
2151 
2152 EST_read_status
2153 EST_Ngrammar::load(const EST_String &filename, const EST_StrList &wordlist)
2154 {
2155 
2156  // for backed off grammars
2157  // need a wordlist to load ARPA format
2158 
2159  EST_read_status r_val;
2160 
2161  if ((r_val = load_ngram_arpa(filename, *this, wordlist)) != wrong_format)
2162  return r_val;
2163 
2164  // try other types, then check wordlist fits
2165  //if ((r_val = load_ngram_htk_ascii(filename, *this)) != wrong_format)
2166  //return r_val;
2167  //if ((r_val = load_ngram_htk_binary(filename, *this)) != wrong_format)
2168  //return r_val;
2169  if ((r_val = load_ngram_cstr_ascii(filename, *this)) != wrong_format)
2170  {
2171  if ((r_val == format_ok) && check_vocab(wordlist))
2172  return r_val;
2173  else
2174  {
2175  cerr << "Wordlist file does not match grammar wordlist !" << endl;
2176  return misc_read_error;
2177  }
2178  }
2179 
2180  if ((r_val = load_ngram_cstr_bin(filename, *this)) != wrong_format)
2181  {
2182  if ((r_val == format_ok) && check_vocab(wordlist))
2183  return r_val;
2184  else
2185  {
2186  cerr << "Wordlist does not match grammar !" << endl;
2187  return misc_read_error;
2188  }
2189  }
2190 
2191 
2192  cerr << "EST_Ngrammar::load can't determine ngrammar file type for input file " << filename << endl;
2193  return wrong_format;
2194 }
2195 
2196 
2197 void
2198 EST_Ngrammar::make_htk_compatible()
2199 {
2200 
2201  cerr << "EST_Ngrammar::make_htk_compatible() not written yet." << endl;
2202  return;
2203 }
2204 
2205 EST_write_status
2206 EST_Ngrammar::save(const EST_String &filename, const EST_String type,
2207  const bool trace,double floor)
2208 {
2209 
2210  if (type == "")
2211  return save(filename,"cstr_ascii",false,floor); // choose default type
2212  if (type == "htk_ascii")
2213  return save_ngram_htk_ascii(filename, *this, floor);
2214  //if (type == "htk_binary")
2215  //return save_htk_binary(filename, *this);
2216  if (type == "arpa")
2217  return save_ngram_arpa(filename, *this);
2218  if (type == "cstr_ascii")
2219  return save_ngram_cstr_ascii(filename, *this,trace,floor);
2220  if (type == "cstr_bin")
2221  return save_ngram_cstr_bin(filename, *this, trace,floor);
2222  if (type == "wfst")
2223  return save_ngram_wfst(filename, *this);
2224 
2225  cerr << "EST_Ngrammar::save unknown output file type " << type << endl;
2226  return write_fail;
2227 }
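// Usage sketch (grammar and file names hypothetical); an empty type
// string falls through to the "cstr_ascii" default.
//
//   ng.save("mygrammar.arpa", "arpa", false, 0.0);
//   ng.save("mygrammar.ngrambin", "cstr_bin", false, 0.0);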
2228 
2229 void EST_Ngrammar::iterate(EST_StrVector &words,
2230  void (*function)(EST_Ngrammar *n,
2231  EST_StrVector &words,
2232  void *params),
2233  void *params)
2234 {
2235  int i,j=-1;
2236  EST_String tmp;
2237 
2238  // find next item in ngram to fill in
2239  for(i=0;i<words.n();i++)
2240  if (words[i] == "")
2241  {
2242  j=i;
2243  break;
2244  }
2245 
2246  if (j==-1)
2247  {
2248  // all filled in, so do the function
2249  (*function)(this,words,params);
2250 
2251  }
2252  else
2253  {
2254 
2255  tmp = words(j);
2256  if (j==p_order-1) // final position - use pred vocab
2257  {
2258  for(i=0;i<pred_vocab->length();i++){
2259  words[j] = pred_vocab->name(i);
2260  iterate(words,function,params);
2261  }
2262 
2263  }
2264  else
2265  {
2266  for(i=0;i<vocab->length();i++){
2267  words[j] = vocab->name(i);
2268  iterate(words,function,params);
2269  }
2270  }
2271  words[j] = tmp;
2272  }
2273 }
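// Example callback (hypothetical): iterate fills every "" slot with
// each vocabulary word in turn, so a vector of order() empty strings
// enumerates the whole ngram space. Here we simply count the ngrams.
//
//   static void count_ngram(EST_Ngrammar *n, EST_StrVector &words, void *params)
//   {
//       (void)n; (void)words;
//       ++*(int *)params;
//   }
//   ...
//   EST_StrVector words(ng.order());
//   for (int i = 0; i < words.n(); i++)
//       words[i] = "";
//   int total = 0;
//   ng.iterate(words, &count_ngram, (void *)&total);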
2274 
2275 void EST_Ngrammar::const_iterate(EST_StrVector &words,
2276  void (*function)(const EST_Ngrammar *const n,
2277  EST_StrVector &words,
2278  void *params),
2279  void *params) const
2280 {
2281  int i,j=-1;
2282  EST_String tmp;
2283 
2284  // find next item in ngram to fill in
2285  for(i=0;i<words.n();i++)
2286  if (words[i] == "")
2287  {
2288  j=i;
2289  break;
2290  }
2291 
2292  if (j==-1)
2293  {
2294  // all filled in, so do the function
2295  (*function)(this,words,params);
2296 
2297  }
2298  else
2299  {
2300 
2301  tmp = words(j);
2302  if (j==p_order-1) // final position - use pred vocab
2303  {
2304  for(i=0;i<pred_vocab->length();i++){
2305  words[j] = pred_vocab->name(i);
2306  const_iterate(words,function,params);
2307  }
2308 
2309  }
2310  else
2311  {
2312  for(i=0;i<vocab->length();i++){
2313  words[j] = vocab->name(i);
2314  const_iterate(words,function,params);
2315  }
2316  }
2317  words[j] = tmp;
2318  }
2319 }
2320 
2321 void EST_Ngrammar::print_freqs(ostream &os,double floor)
2322 {
2323 
2324  if (p_representation == EST_Ngrammar::backoff)
2325  backoff_representation->print_freqs(os,p_order);
2326  else
2327  {
2328  int i,j;
2329  EST_Litem *k;
2330  EST_IVector window(p_order-1);
2331 
2332  for (i=0; i < p_num_states; i++)
2333  {
2334  // print out each ngram : freq
2335  for (k=p_states[i].pdf().item_start();
2336  !p_states[i].pdf().item_end(k);
2337  k = p_states[i].pdf().item_next(k))
2338  {
2339  double freq;
2340  EST_String name;
2341  int ind = i;
2342  p_states[i].pdf().item_freq(k,name,freq);
2343  if (freq == 0)
2344  freq = floor;
2345  if (freq > 0)
2346  {
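 // decode the state index into its n-1 history words by treating
 // "ind" as an (n-1)-digit number in base vocab->length(); e.g. for a
 // trigram over a 10-word vocabulary, state 57 decodes to history (5,7)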
2347  for (j = p_order-2; j >= 0; j--)
2348  {
2349  window[j] = ind%vocab->length();
2350  ind /= vocab->length();
2351  }
2352  for (j = 0; j < p_order-1; j++)
2353  os << wordlist_index(window(j)) << " ";
2354  os << name << " : " << freq << endl;
2355  }
2356  }
2357  }
2358  }
2359 }
2360 
2361 const EST_DiscreteProbDistribution &
2362 EST_Ngrammar::backoff_prob_dist(const EST_StrVector &words) const
2363 {
2364  // need to make this on the fly
2365  // by looking at all possible words in the final
2366  // position
2367 
2368  int i,j;
2369  EST_StrVector ngram;
2370  ngram.resize(words.n()+1);
2371  for(i=0;i<words.n();i++)
2372  ngram[i] = words(i);
2373 
2374  EST_DiscreteProbDistribution *p = new EST_DiscreteProbDistribution(pred_vocab);
2375 
2376  for(j=0;j<get_pred_vocab_length();j++)
2377  {
2378  ngram[ngram.n()-1] = get_pred_vocab_word(j);
2379  double tmp = backoff_probability(ngram,false);
2380  p->set_frequency(j,tmp); // actually probability
2381  }
2382  p->set_num_samples(1.0); // we can't do it in frequencies
2383 
2384  return *p;
2385 }
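// Note: the distribution above is allocated with new and handed back by
// const reference, so each call leaks one EST_DiscreteProbDistribution
// unless the caller takes ownership. A typical (hypothetical) use, via
// backoff_most_probable below, for a backoff trigram "ng":
//
//   double prob;
//   EST_StrVector context(2);
//   context[0] = "the"; context[1] = "quick";
//   const EST_String &w = ng.backoff_most_probable(context, &prob);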
2386 
2387 const double EST_Ngrammar::get_backoff_discount(const int order, const double freq) const
2388 {
2389  if(order > p_order)
2390  {
2391  cerr << "order too great in EST_Ngrammar::get_backoff_discount" << endl;
2392  return 0;
2393  }
2394 
2395  else if( (int)freq < backoff_discount[order-1].n())
2396  return backoff_discount[order-1]((int)freq);
2397 
2398  else
2399  return 0;
2400 }
2401 
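// backoff_probability implements the usual Katz-style recursion: for an
// ngram (h,w) with history h,
//
//   P(w|h) = ( f(h,w) - D(f(h,w)) ) / f(h)    if f(h,w) > backoff_threshold
//          = bo_wt(h) * P(w|h')               otherwise,
//
// where h' is h shortened by one word and D is the discount looked up
// by get_backoff_discount above.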
2402 const double EST_Ngrammar::backoff_probability(const EST_StrVector &words,
2403  const bool trace) const
2404 {
2405  const EST_BackoffNgrammarState *state;
2406  int i;
2407  EST_StrVector new_ngram;
2408  double f=0,f2=0;
2409 
2410 
2411  if (trace)
2412  {
2413  cerr << "backoff_probability( ";
2414  for(i=0;i<words.n();i++)
2415  cerr << words(i) << " ";
2416  cerr << ") ";
2417  }
2418 
2419  // are we down to the unigram ?
2420  if (words.n() == 1)
2421  {
2422  if (trace)
2423  cerr << "unigram " << backoff_representation->probability(words(0))
2424  << endl;
2425 
2426  f=backoff_representation->frequency(words(0));
2427 
2428  // it seems outrageous, but use a floor because unigram
2429  // probs of zero mess up backing off
2430  if(f>0)
2431  return f / backoff_representation->pdf_const().samples();
2432  else
2433  return backoff_unigram_floor_freq / backoff_representation->pdf_const().samples();
2434  }
2435 
2436  // the first n-1 words in the ngram -- deeply inefficient at the moment !
2437  // should pass separately
2438  new_ngram.resize(words.n()-1);
2439  for(i=0;i<new_ngram.n();i++)
2440  new_ngram[i] = words(i);
2441 
2442  // see if we have the ngram
2443  state=backoff_representation->get_state(words);
2444 
2445  if( (state != NULL) &&
2446  ((f=state->frequency(words(0))) > backoff_threshold) )
2447  {
2448 
2449  //if (trace)
2450  //cerr << "in state " << state << " = " << *state << endl << endl;
2451 
2452 
2453  // because f>0, the freq of new_ngram must be non-zero too
2454 
2455  // special case - we don't have a freq for !ENTER (or whatever it's called)
2456  // so use the no. of sentences used to build this grammar
2457  if((new_ngram(0) == p_sentence_start_marker) ||
2458  (new_ngram(0) == p_sentence_end_marker) )
2459  {
2460  f2 = p_number_of_sentences;
2461  if (trace)
2462  cerr << "special freq used : " << f2 << endl;
2463  }
2464  else
2465  {
2466  state=backoff_representation->get_state(new_ngram);
2467  if (state == NULL)
2468  {
2469  cerr << "Something went horribly wrong !" << endl;
2470  return -1;
2471  }
2472  // state->frequency(new_ngram(0)) is the number of times
2473  // we saw new_ngram
2474 
2475  f2=state->frequency(new_ngram(0));
2476 
2477  if (trace)
2478  {
2479  //cerr << "n-1 state is " << *state << endl;
2480  cerr << " using freq for " << new_ngram(0) << " of " << f2 << endl;
2481  }
2482  }
2483 
2484  if (trace)
2485  {
2486 
2487  cerr << " ..... got (" << f << " - "
2488  << get_backoff_discount(state->level()+1,f)
2489  << ")/" << f2 << " = "
2490  << (f - get_backoff_discount(state->level()+1,f) ) / f2
2491  << endl;
2492  }
2493  return (f - get_backoff_discount(state->level()+1,f) ) / f2;
2494  }
2495 
2496  else // back off
2497  {
2498 
2499  double bo_wt = get_backoff_weight(new_ngram);
2500 
2501  for(i=0;i<new_ngram.n();i++)
2502  new_ngram[i] = words(i+1);
2503 
2504  if (trace)
2505  {
2506  cerr << "backed off(" << bo_wt << ") to (";
2507  for(i=0;i<new_ngram.n();i++)
2508  cerr << new_ngram(i) << " ";
2509  cerr << ") ";
2510  }
2511 
2512  return bo_wt * backoff_probability(new_ngram,trace);
2513  }
2514 
2515  // should never reach here !
2516  // but gcc thinks it does
2517  cerr << "oops !";
2518  return -1;
2519 
2520 }
2521 
2522 
2523 const double
2524 EST_Ngrammar::backoff_reverse_probability_sub(const EST_StrVector &words,
2525  const EST_BackoffNgrammarState *root) const
2526 {
2527 
2528  // so similar to backoff prob - should combine
2529  // to do ......
2530 
2531  const EST_BackoffNgrammarState *state;
2532  int i;
2533  EST_StrVector new_ngram;
2534  double f=0;
2535 
2536  /*
2537  cerr << "backoff_rev_probability_sub( ";
2538  for(i=0;i<words.n();i++)
2539  cerr << words(i) << " ";
2540  cerr << ") ";
2541  */
2542  // are we down to the unigram ?
2543  if (words.n() == 1)
2544  {
2545  //cerr << "unigram " << root->probability(words(0))
2546  //<< endl;
2547  return root->probability(words(0));
2548  }
2549 
2550  // the first n-1 words in the ngram -- deeply inefficient at the moment !
2551  // should pass separately
2552  new_ngram.resize(words.n()-1);
2553  for(i=0;i<new_ngram.n();i++)
2554  new_ngram[i] = words(i);
2555 
2556  // see if we have the ngram
2557  state=root->get_state(words);
2558 
2559 
2560  if( (state != NULL) &&
2561  ((f=state->frequency(words(0))) > 0) )
2562  {
2563  // because f>0, the freq of new_ngram must be non-zero too
2564  state=root->get_state(new_ngram);
2565  if (state == NULL)
2566  {
2567  cerr << "Something went horribly wrong !" << endl;
2568  return -1;
2569  }
2570  // state->frequency(new_ngram(0)) is the number of times
2571  // we saw new_ngram
2572  //cerr << "got " << f << "/" << state->frequency(new_ngram(0)) << endl;
2573  return f / state->frequency(new_ngram(0));
2574  }
2575 
2576  else // back off
2577  {
2578 
2579  double bo_wt = root->get_backoff_weight(new_ngram);
2580  //double bo_wt = root->get_backoff_weight(words);
2581 
2582  for(i=0;i<new_ngram.n();i++)
2583  new_ngram[i] = words(i+1);
2584 
2585  //cerr << "backed off(" << bo_wt << ") ";
2586  return bo_wt * backoff_reverse_probability_sub(new_ngram,root);
2587  }
2588 
2589  // should never reach here !
2590  // but gcc thinks it does
2591  cerr << "oops !";
2592  return -1;
2593 
2594 }
2595 
2596 const double
2597 EST_Ngrammar::backoff_reverse_probability(const EST_StrVector &words) const
2598 {
2599 
2600  // probability of words 1...n-1 , given the last word
2601  // easier to compute from the backoff tree structure than
2602  // 'normal' probability
2603 
2604  // find level 1 child corresponding to history before last word
2605  EST_BackoffNgrammarState *state;
2606  state = backoff_representation->get_child(words(words.n()-1));
2607 
2608  if(state == NULL)
2609  {
2610  // the predictee was never seen at all, so the reverse probability is zero
2611  return 0;
2612  }
2613 
2614  // now find backoff probability of words 0...n-2
2615  // starting from this state
2616  return backoff_reverse_probability_sub(words,state);
2617 
2618 }
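// Usage sketch (hypothetical backoff trigram "ng"): the probability of
// the history "a b" given that the final word is "c":
//
//   EST_StrVector w(3);
//   w[0] = "a"; w[1] = "b"; w[2] = "c";
//   double rp = ng.backoff_reverse_probability(w);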
2619 
2620 const EST_String &
2621 EST_Ngrammar::backoff_most_probable(const EST_StrVector &words,
2622  double *prob) const
2623 {
2624  return backoff_prob_dist(words).most_probable(prob);
2625 }
2626 
2627 void slide(EST_IVector &v, const int l)
2628 {
2629  int i;
2630 
2631  // slide elements by 'l' without wraparound
2632 
2633  if (l==0)
2634  return;
2635 
2636  else if (l<0)
2637  {
2638  // slide left
2639 
2640  for(i=0;i<v.n()+l;i++)
2641  v[i] = v(i-l);
2642 
2643  for(;i<v.n();i++)
2644  v[i] = 0;
2645 
2646  }
2647  else
2648  {
2649  // slide right
2650 
2651  for(i=v.n()-1;i>=l;i--)
2652  v[i] = v(i-l);
2653 
2654  for(;i>=0;i--)
2655  v[i] = 0;
2656  }
2657 }
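// Worked example: for v = (1,2,3,4), slide(v,-1) shifts left to give
// (2,3,4,0), while slide(v,1) shifts right to give (0,1,2,3); vacated
// positions are zero-filled rather than wrapped around.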
2658 
2659 void
2660 EST_Ngrammar::backoff_traverse(EST_BackoffNgrammarState *start_state,
2661  void (*function)(EST_BackoffNgrammarState *s,
2662  void *params),
2663  void *params)
2664 {
2665 
2666  // apply the function to this node
2667  function(start_state,params);
2668 
2669  // and recurse down the tree
2670  EST_Litem *k;
2671  double freq;
2672  EST_String name;
2673  for (k=start_state->pdf_const().item_start();
2674  !start_state->pdf_const().item_end(k);
2675  k = start_state->pdf_const().item_next(k))
2676  {
2677  start_state->pdf_const().item_freq(k,name,freq);
2678  EST_BackoffNgrammarState *child = start_state->get_child(name);
2679  if (child!=NULL)
2680  backoff_traverse(child,function,params);
2681 
2682  }
2683 }
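// Example traversal callback (hypothetical): report each state's level
// and how much probability mass it has accumulated. backoff_traverse
// applies the callback to the given state, then recurses into every
// child that appears in its distribution.
//
//   static void show_state(EST_BackoffNgrammarState *s, void *params)
//   {
//       (void)params;
//       cerr << "level " << s->level() << " : "
//            << s->pdf_const().samples() << " samples" << endl;
//   }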
2684 
2685 void
2686 EST_Ngrammar::backoff_traverse(EST_BackoffNgrammarState *start_state,
2687  void (*function)(EST_BackoffNgrammarState *s,
2688  void *params),
2689  void *params,
2690  const int level)
2691 {
2692  // apply the function to this node, if level is correct
2693  if (start_state->level() == level)
2694  {
2695  function(start_state,params);
2696  }
2697  else if (start_state->level() < level)
2698  {
2699  // and recurse down the tree if we haven't
2700  // reached the level yet
2701  EST_Litem *k;
2702  double freq;
2703  EST_String name;
2704 
2705  for (k=start_state->pdf_const().item_start();
2706  !start_state->pdf_const().item_end(k);
2707  k = start_state->pdf_const().item_next(k))
2708  {
2709  start_state->pdf_const().item_freq(k,name,freq);
2710  EST_BackoffNgrammarState *child = start_state->get_child(name);
2711  if (child!=NULL)
2712  backoff_traverse(child,function,params,level);
2713 
2714  }
2715 
2716 
2717  }
2718 }
2719 void
2720 merge_other_grammar(EST_Ngrammar *n, EST_StrVector &ngram, void *params)
2721 {
2722 
2723  EST_Ngrammar *other_n = (EST_Ngrammar*)((void**)params)[0];
2724  float *weight = (float*)((void**)params)[1];
2725 
2726  if(other_n->ngram_exists(ngram))
2727  n->accumulate(ngram,*weight * other_n->frequency(ngram));
2728 
2729 }
2730 
2731 bool
2732 EST_Ngrammar::merge(EST_Ngrammar &n,float weight)
2733 {
2734  EST_StrVector words;
2735  words.resize(p_order);
2736 
2737  void **params = new void*[2];
2738  params[0] = (void*)&n;
2739  params[1] = (void*)&weight;
2740 
2741  iterate(words,&merge_other_grammar,(void*)params);
2742 
2743  delete [] params;
2744  return true;
2745 }
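// Usage sketch (hypothetical): fold another grammar's counts into this
// one, scaled by 0.5; only ngrams that exist in the other grammar
// contribute.
//
//   ng.merge(other_ng, 0.5);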
2746 
2747 
2748 void slide(EST_StrVector &v, const int l)
2749 {
2750  // slide elements by 'l' without wraparound
2751  int i;
2752 
2753  if (l==0)
2754  return;
2755 
2756  else if (l<0)
2757  {
2758  // slide left
2759 
2760  for(i=0;i<v.n()+l;i++)
2761  v[i] = v(i-l);
2762 
2763  for(;i<v.n();i++)
2764  v[i] = "";
2765 
2766  }
2767  else
2768  {
2769  // slide right
2770 
2771  for(i=v.n()-1;i>=l;i--)
2772  v[i] = v(i-l);
2773 
2774  for(;i>=0;i--)
2775  v[i] = "";
2776 
2777  }
2778 }
2779