49 #include "EST_Ngrammar.h"
50 #include "EST_Pathname.h"
51 #include "EST_Token.h"
52 #include "EST_io_aux.h"
63 init(s.id(),s.pdf_const());
69 init(s->id(),s->pdf_const());
72 EST_NgrammarState::~EST_NgrammarState()
77 void EST_NgrammarState::clear()
83 void EST_NgrammarState::init()
105 s <<
"(" << a.id() <<
": " << a.pdf_const() <<
" )";
111 EST_BackoffNgrammarState::~EST_BackoffNgrammarState()
118 void EST_BackoffNgrammarState::clear()
124 void EST_BackoffNgrammarState::init()
130 void EST_BackoffNgrammarState::init(
const EST_Discrete *d,
int level)
144 bool EST_BackoffNgrammarState::accumulate(
const EST_StrVector &words,
161 p_pdf.cumulate(words(words.
n()-1-p_level),count);
164 if (words.
n()-1-p_level > 0)
170 s = get_child(words(words.
n()-1-p_level));
173 s = add_child(p_pdf.get_discrete(),words);
174 return s->accumulate(words,count);
183 bool EST_BackoffNgrammarState::accumulate(
const EST_IVector &words,
201 p_pdf.cumulate(words(words.
n()-1-p_level),count);
207 if (words.
n()-1-p_level > 0)
213 s = get_child(words(words.
n()-1-p_level));
216 s = add_child(p_pdf.get_discrete(),words);
219 s = get_child(words(words.
n()-1-p_level));
222 cerr <<
"Failed to extend tree - unknown reason !" << endl;
225 return s->accumulate(words,count);
235 EST_BackoffNgrammarState::add_child(
const EST_Discrete *d,
240 if (words.
n()-1-p_level > 0)
243 s = get_child(words(words.
n()-1-p_level));
245 return s->add_child(d,words);
250 new_child->init(d,p_level+1);
251 children.add(words(words.
n()-1-p_level), (
void*)new_child);
252 return new_child->add_child(d,words);
265 EST_BackoffNgrammarState::add_child(
const EST_Discrete *d,
270 if (words.
n()-1-p_level > 0)
273 s = get_child(words(words.
n()-1-p_level));
275 return s->add_child(d,words);
280 new_child->init(d,p_level+1);
281 children.add(p_pdf.get_discrete()->name(words(words.
n()-1-p_level)), (
void*)new_child);
282 return new_child->add_child(d,words);
298 children.add(name,NULL);
302 void EST_BackoffNgrammarState::print_freqs(ostream &os,
312 for (k=p_pdf.item_start();
314 k = p_pdf.item_next(k))
316 p_pdf.item_freq(k,name,freq);
318 if (p_level==order-1)
321 os << name <<
" " << followers
322 <<
": " << freq << endl;
325 s->print_freqs(os,order,name+
" "+followers);
330 bool EST_BackoffNgrammarState::ngram_exists(
const EST_StrVector &words,
331 const double threshold)
const
334 s = get_state(words);
339 return (
bool)((s->level()==0) ||
340 ( s->frequency(words(0)) > threshold ));
347 EST_BackoffNgrammarState::get_state(
const EST_StrVector &words)
const
350 if (words.
n()-1-p_level > 0)
352 s = get_child(words(words.
n()-1-p_level));
357 return s->get_state(words);
372 void EST_BackoffNgrammarState::zap()
379 for (k=p_pdf.item_start();
381 k = p_pdf.item_next(k))
383 p_pdf.item_freq(k,name,freq);
386 remove_child(child,name);
395 const double EST_BackoffNgrammarState::get_backoff_weight(
const EST_StrVector &words)
const
398 if (words.
n()-1-p_level >= 0)
400 s = get_child(words(words.
n()-1-p_level));
402 return s->get_backoff_weight(words);
421 return backoff_weight;
426 bool EST_BackoffNgrammarState::set_backoff_weight(
const EST_StrVector &words,
const double w)
429 if (words.
n()-1-p_level >= 0)
431 s = get_child(words(words.
n()-1-p_level));
433 return s->set_backoff_weight(words,w);
441 cerr <<
"Couldn't set weight for " << words
442 <<
" to " << w << endl;
457 void EST_BackoffNgrammarState::frequency_of_frequencies(
EST_DVector &ff)
463 for (k=p_pdf.item_start();
465 k = p_pdf.item_next(k))
467 p_pdf.item_freq(k,name,freq);
469 ff[(int)(freq+0.5)] += 1;
475 s <<
"(backoff level:" << a.p_level
476 <<
" weight:" << a.backoff_weight <<
" " << a.pdf_const() <<
" )";
483 void EST_Ngrammar::default_values()
485 p_representation = EST_Ngrammar::sparse;
486 p_entry_type = EST_Ngrammar::frequencies;
487 sparse_representation.clear();
494 backoff_threshold = 1.0;
495 backoff_unigram_floor_freq = 0.0;
498 EST_Ngrammar::~EST_Ngrammar()
503 void EST_Ngrammar::clear()
508 bool EST_Ngrammar::init(
int o, EST_Ngrammar::representation_t r,
511 return (
bool)(init_vocab(wordlist) && p_init(o,r));
514 bool EST_Ngrammar::init(
int o, EST_Ngrammar::representation_t r,
518 return (
bool)(init_vocab(wordlist,predlist) && p_init(o,r));
521 bool EST_Ngrammar::init(
int o, EST_Ngrammar::representation_t r,
526 vocab_pdf.init(pred_vocab);
530 bool EST_Ngrammar::init(
int o, EST_Ngrammar::representation_t r,
535 vocab_pdf.
init(pred_vocab);
539 bool EST_Ngrammar::p_init(
int o, EST_Ngrammar::representation_t r)
543 cerr <<
"EST_Ngrammar order must be > 0" << endl;
548 p_representation = r;
549 p_number_of_sentences = 0;
551 switch(p_representation)
554 case EST_Ngrammar::sparse:
555 sparse_representation.init(p_order);
559 case EST_Ngrammar::dense:
560 return init_dense_representation();
563 case EST_Ngrammar::backoff:
564 return init_backoff_representation();
568 cerr <<
"Unknown internal representation requested for EST_Ngrammar"
575 bool EST_Ngrammar::init_dense_representation()
582 cerr <<
"EST_Ngrammar: dense_representation requires explicit vocab"
587 p_num_states = (int)pow(
float(vocab->
length()),
float(p_order-1));
589 for (i=0; i < p_num_states; i++)
590 p_states[i].init(i,pred_vocab);
595 bool EST_Ngrammar::init_sparse_representation()
600 cerr <<
"EST_Ngrammar: dense_representation requires explicit vocab"
605 p_num_states = (int)pow(
float(vocab->
length()),
float(p_order-1));
608 return (
bool)(p_states != NULL);
611 bool EST_Ngrammar::init_backoff_representation()
616 backoff_representation->init(vocab,0);
621 const EST_StrVector &EST_Ngrammar::make_ngram_from_index(
const int index)
const
630 for(i=p_order-2;i>=0;i--)
632 #if defined(sun) && ! defined(__svr4__)
634 int rem = ind%vocab->
length();
635 int quot = ind/vocab->
length();
636 (*ngram)[i] = wordlist_index(rem);
639 div_t d = div(ind,vocab->
length());
640 (*ngram)[i] = wordlist_index(d.rem);
651 bool EST_Ngrammar::init_vocab(
const EST_StrList &word_list)
654 if(!vocab->init(word_list))
658 vocab_pdf.init(pred_vocab);
660 return (
bool)(vocab != NULL);
663 bool EST_Ngrammar::init_vocab(
const EST_StrList &word_list,
667 if(!vocab->init(word_list))
670 if(!pred_vocab->init(pred_list))
672 vocab_pdf.init(pred_vocab);
674 return (
bool)(vocab != NULL);
677 bool EST_Ngrammar::check_vocab(
const EST_StrList &word_list)
681 if(!comp_vocab->
init(word_list))
687 if(*vocab != *comp_vocab)
697 const EST_String & EST_Ngrammar::wordlist_index(
int i)
const
699 return vocab->name(i);
702 int EST_Ngrammar::predlist_index(
const EST_String &word)
const
709 i = pred_vocab->index(word);
713 cerr <<
"Word \"" << word <<
"\" is not in the predictee word list" << endl;
717 i = pred_vocab->
index(OOV_MARKER);
721 cerr <<
"Even " << OOV_MARKER <<
" is not in the predictee word list !" << endl;
728 const EST_String & EST_Ngrammar::predlist_index(
int i)
const
730 return pred_vocab->name(i);
733 int EST_Ngrammar::wordlist_index(
const EST_String &word,
const bool report)
const
740 i = vocab->index(word);
745 cerr <<
"Word \"" << word <<
"\" is not in the word list" << endl;
749 i = vocab->index(OOV_MARKER);
753 cerr <<
"Even " << OOV_MARKER <<
" is not in the word list !" << endl;
759 bool EST_Ngrammar::build(
const EST_StrList &filenames,
769 p_sentence_start_marker = prev;
770 p_sentence_end_marker = last;
774 if( (p_representation == EST_Ngrammar::backoff) &&
775 (oov_mode !=
"skip_file") &&
776 (oov_mode !=
"skip_sentence"))
777 cerr <<
"Warning : building a backoff grammar" << endl
778 <<
" with oov_mode '" << oov_mode
779 <<
"' is not recommended !" << endl;
781 if( (oov_mode !=
"skip_ngram") &&
782 (oov_mode !=
"skip_sentence") &&
783 (oov_mode !=
"skip_file") &&
784 (oov_mode !=
"use_oov_marker") )
786 cerr <<
"Unknown oov_mode '" << oov_mode <<
"'" << endl;
790 if( (oov_mode ==
"skip_sentence") &&
791 (input_format ==
"ngram_per_line"))
793 cerr <<
"Sorry, with input format 'ngram_per_line' you cannot " << endl
794 <<
" select oov_mode 'skip_sentence'" << endl;
798 if(oov_mode ==
"use_oov_marker")
806 for (p = filenames.head(); p; p = p->next())
808 cerr <<
"Building from " << filenames(p) << endl;
811 if( ((oov_mode ==
"skip_sentence") &&
812 (input_format ==
"sentence_per_file")) ||
813 (oov_mode ==
"skip_file") )
814 skip_this = oov_preprocess(filenames(p),new_filename,
817 else if( ((oov_mode ==
"skip_sentence") &&
818 (input_format ==
"sentence_per_line")) ||
819 ((oov_mode ==
"skip_ngram") &&
820 (input_format ==
"ngram_per_line")) )
821 oov_preprocess(filenames(p),new_filename,
"eliminate lines");
824 new_filename = filenames(p);
829 cerr <<
"Skipping " << filenames(p)
830 <<
" (out of vocabulary words found)" << endl;
835 switch(p_representation)
837 case EST_Ngrammar::sparse:
839 if (input_format !=
"")
841 cerr <<
"Can't build sparse ngram from '" << input_format;
842 cerr <<
"' data" << endl;
845 else if (!build_sparse(new_filename,prev,prev_prev,last))
850 case EST_Ngrammar::dense:
851 if (!build_ngram(new_filename,prev,prev_prev,last,input_format))
855 case EST_Ngrammar::backoff:
856 if (!build_ngram(new_filename,prev,prev_prev,last,input_format))
861 cerr <<
"Unknown internal representation set for EST_Ngrammar"
868 if((new_filename != filenames(p)) &&
869 (new_filename !=
"") &&
870 !delete_file(new_filename) )
872 cerr <<
"Warning : couldn't remove temporary file : "
873 << new_filename << endl;
878 if (p_representation == EST_Ngrammar::backoff)
879 return compute_backoff_weights(mincount,maxcount);
887 if (words.
n() < p_order)
888 cerr <<
"EST_Ngrammar::accumulate - window is too small" << endl;
893 vocab_pdf.cumulate(w,count);
895 switch(p_representation)
897 case EST_Ngrammar::dense:
898 case EST_Ngrammar::sparse:
899 find_state(words).cumulate(w,count);
902 case EST_Ngrammar::backoff:
903 backoff_representation->accumulate(words,count);
907 cerr <<
"EST_Ngrammar::accumulate : invalid representation !"
914 void EST_Ngrammar::accumulate(
const EST_IVector &words,
928 if (words.
n() < p_order)
929 cerr <<
"EST_Ngrammar::accumulate - window is too small" << endl;
933 vocab_pdf.cumulate(words(p_order-1),count);
935 switch(p_representation)
938 case EST_Ngrammar::dense:
939 case EST_Ngrammar::sparse:
940 find_state(words).cumulate(words(p_order-1),count);
943 case EST_Ngrammar::backoff:
944 backoff_representation->accumulate(words,count);
948 cerr <<
"EST_Ngrammar::accumulate : invalid representation !"
956 bool EST_Ngrammar::ngram_exists(
const EST_StrVector &words)
const
958 switch(p_representation)
960 case EST_Ngrammar::sparse:
964 case EST_Ngrammar::dense:
968 case EST_Ngrammar::backoff:
971 return backoff_representation->ngram_exists(words,0);
973 return backoff_representation->ngram_exists(words,backoff_threshold);
978 cerr <<
"ngram_exists: unknown ngrammar representation" << endl;
984 bool EST_Ngrammar::ngram_exists(
const EST_StrVector &words,
const double threshold)
const
986 if (p_representation != EST_Ngrammar::backoff)
988 cerr <<
"Not a backoff grammar !" << endl;
992 return backoff_representation->ngram_exists(words,threshold);
997 const double EST_Ngrammar::get_backoff_weight(
const EST_StrVector &words)
const
999 if(p_representation == EST_Ngrammar::backoff)
1000 return backoff_representation->get_backoff_weight(words);
1003 cerr <<
"Can't get backoff weight - not a backed off ngrammar !" << endl;
1008 bool EST_Ngrammar::set_backoff_weight(
const EST_StrVector &words,
const double w)
1010 if(p_representation == EST_Ngrammar::backoff)
1011 return backoff_representation->set_backoff_weight(words,w);
1014 cerr <<
"Can't set backoff weight - not a backed off ngrammar !" << endl;
1020 bool EST_Ngrammar::build_sparse(
const EST_String &filename,
1025 sparse_representation.build(filename,prev,prev_prev,last);
1030 void EST_Ngrammar::fill_window_start(
EST_IVector &window,
1036 for (i=0; i<window.
n()-1; i++)
1037 window[i] = wordlist_index(prev_prev);
1038 window[i++] = wordlist_index(prev);
1047 for (i=0; i<window.
n()-1; i++)
1048 window[i] = prev_prev;
1052 bool EST_Ngrammar::oov_preprocess(
const EST_String &filename,
1059 int bad_line_count=0;
1060 int good_line_count=0;
1067 bool write_out =
false;
1068 if( (what ==
"eliminate lines") || (filename ==
"-") )
1072 if (filename ==
"-")
1074 if( ts.
open(stdin, FALSE) == -1)
1076 cerr <<
"EST_Ngrammar:: failed to open stdin";
1077 cerr <<
" for reading" << endl;
1081 else if (ts.
open(filename) == -1){
1082 cerr <<
"EST_Ngrammar: failed to open file \"" << filename
1083 <<
"\" for reading" << endl;
1090 new_filename = make_tmp_filename();
1091 ost =
new ofstream(new_filename);
1095 cerr <<
"Ngrammar: couldn't create temporary file \""
1096 << new_filename <<
"\"" << endl;
1102 new_filename = filename;
1105 bool bad_line=
false;
1108 s=ts.
get().string();
1110 if (!bad_line && (s !=
""))
1112 if(wordlist_index(s,
false) < 0)
1115 if(what ==
"eliminate lines")
1125 if(!delete_file(new_filename))
1126 cerr <<
"Warning : couldn't delete temporary file '"
1127 << new_filename <<
"'" << endl;
1135 this_line += s +
" ";
1150 *ost << this_line << endl;
1159 cerr <<
"skipped " << bad_line_count <<
" and kept "
1160 << good_line_count <<
" lines from file " << filename << endl;
1164 bool EST_Ngrammar::build_ngram(
const EST_String &filename,
1170 p_entry_type = EST_Ngrammar::frequencies;
1174 int eoln_is_eos = FALSE;
1175 int sliding_window = TRUE;
1179 if ( (input_format ==
"") || (input_format ==
"sentence_per_line") )
1183 sliding_window = TRUE;
1185 else if (input_format ==
"sentence_per_file")
1187 eoln_is_eos = FALSE;
1188 sliding_window = TRUE;
1189 p_number_of_sentences = 1;
1191 else if(input_format ==
"ngram_per_line")
1193 eoln_is_eos = FALSE;
1194 sliding_window = FALSE;
1195 p_number_of_sentences = 1;
1199 cerr <<
"Can't build from '" << input_format <<
"' data" << endl;
1205 if (filename ==
"-")
1207 if( ts.
open(stdin, FALSE) == -1)
1209 cerr <<
"EST_Ngrammar:: failed to open stdin";
1210 cerr <<
" for reading" << endl;
1214 else if (ts.
open(filename) == -1){
1215 cerr <<
"EST_Ngrammar: failed to open \"" << filename
1216 <<
"\" for reading" << endl;
1232 fill_window_start(window,prev,prev_prev);
1233 if (window(p_order-1) == -1)
1235 else if( (p_order>1) && (window(p_order-2) == -1))
1236 bad_word = p_order-1;
1241 cerr <<
"at start : bad word at " << bad_word << endl;
1246 s=ts.
get().string();
1256 window[p_order-1] = wordlist_index(s);
1257 if (window(p_order-1) < 0)
1259 cerr <<
"EST_Ngrammar::build_ngram " <<
1260 " word \"" << s <<
"\" is not in vocabulary, skipping"
1268 cerr <<
"not accumulating : bad word at " << bad_word;
1269 cerr <<
" window=" << window;
1278 if(count == p_order-1)
1279 window[count++] = predlist_index(s);
1281 window[count++] = wordlist_index(s);
1283 if (window(count-1) < 0)
1285 cerr <<
"EST_Ngrammar::build_ngram " <<
1286 " word \"" << s <<
"\" is not in vocabulary, skipping"
1292 cerr <<
"Too many items on line - ignoring trailing ones !" << endl;
1302 if((count == p_order) && bad_word == 0)
1307 else if (eoln_is_eos)
1310 if (window(p_order-1) != wordlist_index(last))
1311 p_number_of_sentences += 1;
1314 window[p_order-1] = wordlist_index(last);
1316 if(window(p_order-1) == -1)
1326 fill_window_start(window,prev,prev_prev);
1329 if (window(p_order-1) == -1)
1331 else if( (p_order>1) && (window(p_order-2) == -1) )
1332 bad_word = p_order-1;
1341 if ( sliding_window && (window(p_order-1) != wordlist_index(prev)))
1346 window[p_order-1] = wordlist_index(last);
1348 if (window(p_order-1) == -1)
1354 p_number_of_sentences += 1;
1360 cerr <<
"Accumulated " << p_number_of_sentences <<
" sentences." << endl;
1368 double sum1=0,sum2=0;
1372 new_ngram.
resize(ngram.
n()-1);
1373 for(i=0;i<new_ngram.
n();i++)
1374 new_ngram[i] = ngram(i);
1377 cerr <<
"computing backoff w for ";
1378 for(i=0;i<new_ngram.
n();i++)
1379 cerr << new_ngram(i) <<
" ";
1385 if (!n->ngram_exists(new_ngram))
1391 if (n->ngram_exists(new_ngram,0))
1393 if(!n->set_backoff_weight(new_ngram,1))
1394 cerr <<
"WARNING : couldn't set weight !" << endl;
1405 for(j=0;j<n->get_pred_vocab_length();j++)
1407 ngram[ngram.
n()-1] = n->get_pred_vocab_word(j);
1409 for(i=0;i<ngram.
n();i++)
1410 cerr << ngram(i) <<
" ";
1412 if (n->ngram_exists(ngram))
1414 cerr << n->probability(ngram) <<
" exists " << endl;
1416 sum1 += n->probability(ngram);
1425 tmp_ngram.
resize(ngram.
n()-1);
1426 for(i=0;i<tmp_ngram.
n();i++)
1427 tmp_ngram[i] = ngram(i+1);
1429 cerr <<
" unseen, P(";
1430 for(i=0;i<tmp_ngram.
n();i++)
1431 cerr << tmp_ngram(i) <<
" ";
1433 cerr <<
") = " << n->probability(tmp_ngram) <<
" " << endl;
1434 sum2 += n->probability(tmp_ngram);
1442 if(!n->set_backoff_weight(new_ngram,1))
1443 cerr <<
"WARNING : couldn't set weight !" << endl;
1450 cerr <<
"NEGATIVE WEIGHT for ";
1451 for(i=0;i<new_ngram.
n();i++)
1452 cerr << new_ngram(i) <<
" ";
1455 cerr <<
"sum1=" << sum1 <<
" sum2=" << sum2;
1456 cerr <<
" " << (1 - sum1) / sum2 << endl;
1458 for(j=0;j<n->get_pred_vocab_length();j++)
1460 ngram[ngram.
n()-1] = n->get_pred_vocab_word(j);
1463 if (n->ngram_exists(ngram))
1466 for(i=0;i<ngram.
n();i++)
1467 cerr << ngram(i) <<
" ";
1468 cerr <<
" exists, prob = ";
1469 cerr << n->probability(ngram,
false,
true) << endl;
1477 if(!n->set_backoff_weight(new_ngram, (1 - sum1) / sum2 ))
1478 cerr <<
"WARNING : couldn't set weight !" << endl;
1480 cerr <<
"sum1=" << sum1 <<
" sum2=" << sum2;
1481 cerr <<
" weight=" << (1 - sum1) / sum2 << endl;
1485 ngram[ngram.
n()-1] = tmp;
1489 bool EST_Ngrammar::compute_backoff_weights(
const int mincount,
1494 backoff_threshold = mincount;
1504 backoff_restore_unigram_states();
1506 Good_Turing_discount(*
this,maxcount,0.5);
1534 for (o=2;o<=order();o++)
1537 cerr <<
"Backing off order " << o << endl;
1547 words[o-1] =
"!FILLED!";
1548 iterate(words,&compute_backoff_weight,NULL);
1558 void EST_Ngrammar::backoff_restore_unigram_states()
1568 words[0] =
"wibble";
1569 for(j=0;j<get_pred_vocab_length();j++)
1571 words[1] = get_pred_vocab_word(j);
1572 backoff_representation->accumulate(words,0);
1581 if (start_state == NULL)
1582 start_state = backoff_representation;
1592 for (k=start_state->pdf_const().
item_start();
1593 !start_state->pdf_const().
item_end(k);
1594 k = start_state->pdf_const().
item_next(k))
1596 start_state->pdf_const().
item_freq(k,name,freq);
1597 if (freq < TINY_FREQ)
1605 start_state->remove_child(child,name);
1612 for (k=start_state->pdf_const().
item_start();
1613 !start_state->pdf_const().
item_end(k);
1614 k = start_state->pdf_const().
item_next(k))
1616 start_state->pdf_const().
item_freq(k,name,freq);
1623 prune_backoff_representation(child);
1631 switch(n.p_representation)
1633 case EST_Ngrammar::sparse:
1634 n.sparse_representation.print_freqs(s);
1637 case EST_Ngrammar::dense:
1638 s <<
"Dense" << endl;
1642 case EST_Ngrammar::backoff:
1643 s <<
"Backoff" << endl;
1644 s << *(n.backoff_representation) << endl;
1648 cerr <<
"Unknown internal representation of EST_Ngrammar : can't print"
1657 EST_Ngrammar::set_entry_type(EST_Ngrammar::entry_t new_type)
1659 if (new_type == p_entry_type)
1664 cerr <<
"Couldn't do entry type conversion !" << endl;
1668 bool EST_Ngrammar::sparse_to_dense()
1670 cerr <<
"EST_Ngrammar::sparse_to_dense() "
1671 <<
" not implemented" << endl;
1675 bool EST_Ngrammar::dense_to_sparse()
1677 cerr <<
"EST_Ngrammar::dense_to_sparse()"
1678 <<
" not implemented" << endl;
1682 int EST_Ngrammar::find_dense_state_index(
const EST_IVector &words,
1686 for(i=0;i<p_order-1;i++)
1692 int EST_Ngrammar::find_next_state_id(
int state,
int word)
const
1699 for (f=1,i=0; i<p_order-2; i++)
1701 return ((state%f)*vocab->
length())+word;
1706 switch(p_representation)
1708 case EST_Ngrammar::sparse:
1713 case EST_Ngrammar::dense:
1717 for(i=0;i<p_order-1;i++)
1719 tmp[i] = wordlist_index(words(i));
1720 if (tmp(i) == -1)
break;
1722 tmp[i] = pred_vocab->
index(words(i));
1723 if (tmp(i) == -1)
break;
1724 return p_states[find_dense_state_index(tmp)];
1728 case EST_Ngrammar::backoff:
1729 cerr <<
"find_state: not valid in backoff mode !" << endl;
1733 cerr <<
"find_state: unknown ngrammar representation" << endl;
1742 EST_Ngrammar::find_state_const(
const EST_StrVector &words)
const
1744 switch(p_representation)
1746 case EST_Ngrammar::sparse:
1751 case EST_Ngrammar::dense:
1755 for(i=0;i<p_order-1;i++)
1757 tmp[i] = wordlist_index(words(i));
1758 if (tmp(i) == -1)
break;
1760 tmp[i] = pred_vocab->
index(words(i));
1761 if (tmp(i) == -1)
break;
1762 return p_states[find_dense_state_index(tmp)];
1766 case EST_Ngrammar::backoff:
1767 cerr <<
"find_state_const: not valid in backoff mode !" << endl;
1771 cerr <<
"find_state: unknown ngrammar representation" << endl;
1780 switch(p_representation)
1782 case EST_Ngrammar::sparse:
1787 case EST_Ngrammar::dense:
1788 return p_states[find_dense_state_index(words)];
1791 case EST_Ngrammar::backoff:
1792 cerr <<
"find_state: not valid in backoff mode !" << endl;
1796 cerr <<
"find_state: unknown ngrammar representation" << endl;
1804 EST_Ngrammar::find_state_const(
const EST_IVector &words)
const
1806 switch(p_representation)
1808 case EST_Ngrammar::sparse:
1812 case EST_Ngrammar::dense:
1813 return p_states[find_dense_state_index(words)];
1816 case EST_Ngrammar::backoff:
1817 cerr <<
"find_state_const: not valid in backoff mode !" << endl;
1821 cerr <<
"find_state: unknown ngrammar representation" << endl;
1829 bool EST_Ngrammar::set_representation(EST_Ngrammar::representation_t new_representation)
1832 if (new_representation == p_representation)
1835 if (new_representation == EST_Ngrammar::sparse)
1836 return sparse_to_dense();
1837 else if (new_representation == EST_Ngrammar::dense)
1838 return dense_to_sparse();
1841 cerr <<
"set_representation: unknown ngrammar representation" << endl;
1846 double EST_Ngrammar::probability(
const EST_StrVector &words,
bool force,
const bool trace)
const
1851 switch(p_representation)
1853 case EST_Ngrammar::sparse:
1854 case EST_Ngrammar::dense:
1855 return find_state_const(words).probability(lastword(words));
1858 case EST_Ngrammar::backoff:
1859 return backoff_probability(words,trace);
1863 cerr <<
"probability: unknown ngrammar representation" << endl;
1869 double EST_Ngrammar::frequency(
const EST_StrVector &words,
bool force,
const bool trace)
const
1874 switch(p_representation)
1876 case EST_Ngrammar::sparse:
1877 case EST_Ngrammar::dense:
1878 return find_state_const(words).frequency(lastword(words));
1881 case EST_Ngrammar::backoff:
1882 return backoff_probability(words,trace);
1886 cerr <<
"probability: unknown ngrammar representation" << endl;
1893 double *prob,
int *state)
const
1897 switch(p_representation)
1899 case EST_Ngrammar::sparse:
1900 case EST_Ngrammar::dense:
1904 return s.most_probable(prob);
1908 case EST_Ngrammar::backoff:
1910 return backoff_most_probable(words,prob);
1914 cerr <<
"probability: unknown ngrammar representation" << endl;
1921 double *prob,
int *state)
const
1925 switch(p_representation)
1927 case EST_Ngrammar::sparse:
1928 case EST_Ngrammar::dense:
1932 return s.most_probable(prob);
1936 case EST_Ngrammar::backoff:
1937 cerr <<
"probability: IVector access to backoff not supported" << endl;
1942 cerr <<
"probability: unknown ngrammar representation" << endl;
1948 int EST_Ngrammar::find_state_id(
const EST_StrVector &words)
const
1950 switch(p_representation)
1952 case EST_Ngrammar::sparse:
1953 case EST_Ngrammar::dense:
1959 cerr <<
"Ngrammar: representation doesn't support states" << endl;
1965 int EST_Ngrammar::find_state_id(
const EST_IVector &words)
const
1967 switch(p_representation)
1969 case EST_Ngrammar::sparse:
1970 case EST_Ngrammar::dense:
1976 cerr <<
"Ngrammar: representation doesn't support states" << endl;
1982 EST_String EST_Ngrammar::get_vocab_word(
int i)
const
1985 return vocab->name(i);
1990 int EST_Ngrammar::get_vocab_word(
const EST_String &s)
const
1992 int index = vocab->name(s);
1996 double EST_Ngrammar::reverse_probability(
const EST_StrVector &words,
2002 switch(p_representation)
2004 case EST_Ngrammar::sparse:
2005 case EST_Ngrammar::dense:
2009 return s.frequency(lastword(words))/
2010 vocab_pdf.frequency(lastword(words));
2014 case EST_Ngrammar::backoff:
2015 return backoff_reverse_probability(words);
2019 cerr <<
"probability: unknown ngrammar representation" << endl;
2025 double EST_Ngrammar::reverse_probability(
const EST_IVector &words,
2031 switch(p_representation)
2033 case EST_Ngrammar::sparse:
2034 case EST_Ngrammar::dense:
2038 return s.frequency(lastword(words))/
2039 vocab_pdf.frequency(lastword(words));
2043 case EST_Ngrammar::backoff:
2044 cerr <<
"probability: reverse prob unavailable for backoff ngram"
2050 cerr <<
"probability: unknown ngrammar representation" << endl;
2057 EST_Ngrammar::prob_dist(
int state)
const
2059 return p_states[state].pdf_const();
2066 switch(p_representation)
2068 case EST_Ngrammar::sparse:
2069 case EST_Ngrammar::dense:
2072 return s.pdf_const();
2076 case EST_Ngrammar::backoff:
2077 return backoff_prob_dist(words);
2081 cerr <<
"probability: unknown ngrammar representation" << endl;
2082 return PSTnullProbDistribution;
2088 EST_Ngrammar::prob_dist(
const EST_IVector &words)
const
2091 switch(p_representation)
2093 case EST_Ngrammar::sparse:
2094 case EST_Ngrammar::dense:
2097 return s.pdf_const();
2101 case EST_Ngrammar::backoff:
2102 cerr <<
"probability: unsupport IVector access of backoff ngram" << endl;
2103 return PSTnullProbDistribution;
2107 cerr <<
"probability: unknown ngrammar representation" << endl;
2108 return PSTnullProbDistribution;
2114 EST_Ngrammar::load(
const EST_String &filename)
2117 EST_read_status r_val;
2123 if ((r_val = load_ngram_cstr_ascii(filename, *
this)) != wrong_format)
2125 if ((r_val = load_ngram_cstr_bin(filename, *
this)) != wrong_format)
2133 if (fname.extension() == GZIP_FILENAME_EXTENSION)
2134 tmp_fname = uncompress_file_to_temporary(filename,
2135 "gzip --decompress --stdout");
2136 else if (fname.extension() == COMPRESS_FILENAME_EXTENSION)
2137 tmp_fname = uncompress_file_to_temporary(filename,
"uncompress -c");
2141 r_val = load(tmp_fname);
2142 delete_file(tmp_fname);
2146 return misc_read_error;
2148 cerr <<
"EST_Ngrammar::load can't determine ngrammar file type for input file " << filename << endl;
2149 return wrong_format;
2159 EST_read_status r_val;
2161 if ((r_val = load_ngram_arpa(filename, *
this, wordlist)) != wrong_format)
2169 if ((r_val = load_ngram_cstr_ascii(filename, *
this)) != wrong_format)
2171 if ((r_val == format_ok) && check_vocab(wordlist))
2175 cerr <<
"Wordlist file does not match grammar wordlist !" << endl;
2176 return misc_read_error;
2180 if ((r_val = load_ngram_cstr_bin(filename, *
this)) != wrong_format)
2182 if ((r_val == format_ok) && check_vocab(wordlist))
2186 cerr <<
"Wordlist does not match grammar !" << endl;
2187 return misc_read_error;
2192 cerr <<
"EST_Ngrammar::load can't determine ngrammar file type for input file " << filename << endl;
2193 return wrong_format;
2198 EST_Ngrammar::make_htk_compatible()
2201 cerr <<
"EST_Ngrammar::make_htk_compatible() not written yet." << endl;
2207 const bool trace,
double floor)
2211 return save(filename,
"cstr_ascii",
false,floor);
2212 if (type ==
"htk_ascii")
2213 return save_ngram_htk_ascii(filename, *
this, floor);
2217 return save_ngram_arpa(filename, *
this);
2218 if (type ==
"cstr_ascii")
2219 return save_ngram_cstr_ascii(filename, *
this,trace,floor);
2220 if (type ==
"cstr_bin")
2221 return save_ngram_cstr_bin(filename, *
this, trace,floor);
2223 return save_ngram_wfst(filename, *
this);
2225 cerr <<
"EST_Ngrammar::save unknown output file type " << type << endl;
2239 for(i=0;i<words.
n();i++)
2249 (*function)(
this,words,params);
2258 for(i=0;i<pred_vocab->length();i++){
2259 words[j] = pred_vocab->name(i);
2260 iterate(words,
function,params);
2266 for(i=0;i<vocab->
length();i++){
2267 words[j] = vocab->name(i);
2268 iterate(words,
function,params);
2285 for(i=0;i<words.
n();i++)
2295 (*function)(
this,words,params);
2304 for(i=0;i<pred_vocab->length();i++){
2305 words[j] = pred_vocab->name(i);
2306 const_iterate(words,
function,params);
2312 for(i=0;i<vocab->
length();i++){
2313 words[j] = vocab->name(i);
2314 const_iterate(words,
function,params);
2321 void EST_Ngrammar::print_freqs(ostream &os,
double floor)
2324 if (p_representation == EST_Ngrammar::backoff)
2325 backoff_representation->print_freqs(os,p_order);
2332 for (i=0; i < p_num_states; i++)
2335 for (k=p_states[i].pdf().item_start();
2336 !p_states[i].pdf().item_end(k);
2337 k = p_states[i].pdf().item_next(k))
2342 p_states[i].pdf().item_freq(k,name,freq);
2347 for (j = p_order-2; j >= 0; j--)
2349 window[j] = ind%vocab->
length();
2352 for (j = 0; j < p_order-1; j++)
2353 os << wordlist_index(window(j)) <<
" ";
2354 os << name <<
" : " << freq << endl;
2362 EST_Ngrammar::backoff_prob_dist(
const EST_StrVector &words)
const
2371 for(i=0;i<words.
n();i++)
2372 ngram[i] = words(i);
2376 for(j=0;j<get_pred_vocab_length();j++)
2378 ngram[ngram.
n()-1] = get_pred_vocab_word(j);
2379 double tmp = backoff_probability(ngram,
false);
2387 const double EST_Ngrammar::get_backoff_discount(
const int order,
const double freq)
const
2391 cerr <<
"order too great in EST_Ngrammar::get_backoff_discount" << endl;
2395 else if( (
int)freq < backoff_discount[order-1].n())
2396 return backoff_discount[order-1]((int)freq);
2402 const double EST_Ngrammar::backoff_probability(
const EST_StrVector &words,
2403 const bool trace)
const
2413 cerr <<
"backoff_probability( ";
2414 for(i=0;i<words.
n();i++)
2415 cerr << words(i) <<
" ";
2423 cerr <<
"unigram " << backoff_representation->probability(words(0))
2426 f=backoff_representation->frequency(words(0));
2431 return f / backoff_representation->pdf_const().samples();
2433 return backoff_unigram_floor_freq / backoff_representation->pdf_const().samples();
2438 new_ngram.
resize(words.
n()-1);
2439 for(i=0;i<new_ngram.
n();i++)
2440 new_ngram[i] = words(i);
2443 state=backoff_representation->get_state(words);
2445 if( (state != NULL) &&
2446 ((f=state->frequency(words(0))) > backoff_threshold) )
2457 if((new_ngram(0) == p_sentence_start_marker) ||
2458 (new_ngram(0) == p_sentence_end_marker) )
2460 f2 = p_number_of_sentences;
2462 cerr <<
"special freq used : " << f2 << endl;
2466 state=backoff_representation->get_state(new_ngram);
2469 cerr <<
"Something went horribly wrong !" << endl;
2475 f2=state->frequency(new_ngram(0));
2480 cerr <<
" using freq for " << new_ngram(0) <<
" of " << f2 << endl;
2487 cerr <<
" ..... got (" << f <<
" - "
2488 << get_backoff_discount(state->level()+1,f)
2489 <<
")/" << f2 <<
" = "
2490 << (f - get_backoff_discount(state->level()+1,f) ) / f2
2493 return (f - get_backoff_discount(state->level()+1,f) ) / f2;
2499 double bo_wt = get_backoff_weight(new_ngram);
2501 for(i=0;i<new_ngram.
n();i++)
2502 new_ngram[i] = words(i+1);
2506 cerr <<
"backed off(" << bo_wt <<
") to (";
2507 for(i=0;i<new_ngram.
n();i++)
2508 cerr << new_ngram(i) <<
" ";
2512 return bo_wt * backoff_probability(new_ngram,trace);
2524 EST_Ngrammar::backoff_reverse_probability_sub(
const EST_StrVector &words,
2547 return root->probability(words(0));
2552 new_ngram.
resize(words.
n()-1);
2553 for(i=0;i<new_ngram.
n();i++)
2554 new_ngram[i] = words(i);
2557 state=root->get_state(words);
2560 if( (state != NULL) &&
2561 ((f=state->frequency(words(0))) > 0) )
2564 state=root->get_state(new_ngram);
2567 cerr <<
"Something went horribly wrong !" << endl;
2573 return f / state->frequency(new_ngram(0));
2579 double bo_wt = root->get_backoff_weight(new_ngram);
2582 for(i=0;i<new_ngram.
n();i++)
2583 new_ngram[i] = words(i+1);
2586 return bo_wt * backoff_reverse_probability_sub(new_ngram,root);
2597 EST_Ngrammar::backoff_reverse_probability(
const EST_StrVector &words)
const
2606 state = backoff_representation->get_child(words(words.
n()-1));
2616 return backoff_reverse_probability_sub(words,state);
2621 EST_Ngrammar::backoff_most_probable(
const EST_StrVector &words,
2624 return backoff_prob_dist(words).most_probable(prob);
2640 for(i=0;i<v.
n()+l;i++)
2651 for(i=v.
n()-1;i>=l;i--)
2667 function(start_state,params);
2673 for (k=start_state->pdf_const().
item_start();
2674 !start_state->pdf_const().
item_end(k);
2675 k = start_state->pdf_const().
item_next(k))
2677 start_state->pdf_const().
item_freq(k,name,freq);
2680 backoff_traverse(child,
function,params);
2693 if (start_state->level() == level)
2695 function(start_state,params);
2697 else if (start_state->level() < level)
2705 for (k=start_state->pdf_const().
item_start();
2706 !start_state->pdf_const().
item_end(k);
2707 k = start_state->pdf_const().
item_next(k))
2709 start_state->pdf_const().
item_freq(k,name,freq);
2712 backoff_traverse(child,
function,params,level);
2724 float *weight = (
float*)((
void**)params)[1];
2726 if(other_n->ngram_exists(ngram))
2727 n->accumulate(ngram,*weight * other_n->frequency(ngram));
2737 void **params =
new void*[2];
2738 params[0] = (
void*)&n;
2739 params[1] = (
void*)&weight;
2741 iterate(words,&merge_other_grammar,(
void*)params);
2760 for(i=0;i<v.
n()+l;i++)
2771 for(i=v.
n()-1;i>=l;i--)