40 #ifndef __EST_NGRAMMAR_H__
41 #define __EST_NGRAMMAR_H__
48 #include "EST_String.h"
50 #include "EST_rw_status.h"
51 #include "EST_types.h"
52 #include "EST_FMatrix.h"
53 #include "EST_TList.h"
54 #include "EST_StringTrie.h"
55 #include "EST_simplestats.h"
57 #include "EST_string_aux.h"
// Marker strings. SENTENCE_START_MARKER and SENTENCE_END_MARKER are used
// further down in this header as default values for `prev`, `prev_prev` and
// `last` parameters. OOV_MARKER is presumably the token substituted for
// out-of-vocabulary words -- confirm against the implementation.
61 #define SENTENCE_START_MARKER "!ENTER"
62 #define SENTENCE_END_MARKER "!EXIT"
63 #define OOV_MARKER "!OOV"
// 1315402337 == 0x4E677261, i.e. the ASCII codes of 'N','g','r','a'.
// Presumably written at the start of binary n-gram files -- confirm.
65 #define EST_NGRAMBIN_MAGIC 1315402337
// File-name extensions (without the leading dot). Presumably used to spot
// gzip / compress'ed grammar files -- usage is not visible in this header.
68 #define GZIP_FILENAME_EXTENSION "gz"
69 #define COMPRESS_FILENAME_EXTENSION "Z"
// Very small frequency constant; where it is applied is not visible here.
72 #define TINY_FREQ 1.0e-10
95 {clear();init(
id,pdf);};
109 void cumulate(
const int index,
const double count=1)
110 {p_pdf.cumulate(index,count);};
111 void cumulate(
const EST_String &word,
const double count=1)
112 {p_pdf.cumulate(word,count);};
115 int id()
const {
return p_id; };
119 {
return p_pdf.probability(w);}
120 double probability(
int w)
const {
return p_pdf.probability(w);}
122 {
return p_pdf.frequency(w);}
123 double frequency(
int w)
const {
return p_pdf.frequency(w);}
124 const EST_String &most_probable(
double *prob = NULL)
const
125 {
return p_pdf.most_probable(prob);}
// Back-off weight held by this object. A get_backoff_weight() accessor later
// in this header returns a member of this name unchanged -- presumably this
// one; confirm they belong to the same class.
137 double backoff_weight;
149 {clear();init(d,level);};
151 {clear();init(pdf,level);};
164 const double count=1);
166 const double count=1);
171 {
return p_pdf.probability(w);}
173 {
return p_pdf.frequency(w);}
174 const EST_String &most_probable(
double *prob = NULL)
const
175 {
return p_pdf.most_probable(prob);}
177 const int level()
const {
return p_level;}
197 const double threshold)
const;
198 const double get_backoff_weight()
const {
return backoff_weight; }
// Declaration only (defined elsewhere). Overload of get_backoff_weight()
// that takes a word sequence; presumably looks up the weight stored for
// that sequence -- confirm against the definition.
// NOTE(review): the `const` on the by-value return type has no effect, but
// it must stay in step with the out-of-line definition.
199 const double get_backoff_weight(
const EST_StrVector &words)
const;
// Declaration only. Presumably stores weight `w` for the sequence `words`;
// the meaning of the bool result is not visible here -- confirm.
200 bool set_backoff_weight(
const EST_StrVector &words,
const double w);
// Declaration only. Writes to `os`; `followers` defaults to the empty
// string. How `order` and `followers` are interpreted is not visible in
// this header.
203 void print_freqs(ostream &os,
const int order,
EST_String followers=
"");
// Storage layouts a grammar can be asked to use (see the `representation_t`
// parameters and the representation() accessor in this header).
// Enumerator values are the implicit 0, 1, 2.
enum representation_t {
    sparse,
    dense,
    backoff
};
// Kind of value held in grammar entries (see entry_type() and
// set_entry_type() in this header). Enumerator values are the implicit 0, 1.
enum entry_t {
    frequencies,
    log_frequencies
};
// Sentence counter kept as a double; where it is updated is not visible in
// this header.
228 double p_number_of_sentences;
// Current storage layout and entry kind. The representation() and
// entry_type() accessors in this header return these members.
235 representation_t p_representation;
236 entry_t p_entry_type;
// Declarations only. Presumably set up storage for the sparse / dense
// layouts and report success via the bool result -- confirm.
241 bool init_sparse_representation();
244 bool init_dense_representation();
// Tuning values for the backoff representation; their exact use is defined
// elsewhere.
252 double backoff_threshold;
255 double backoff_unigram_floor_freq;
// Declaration only. Presumably returns the discount applied to an n-gram of
// the given `order` with frequency `freq` -- confirm against the definition.
// NOTE(review): `const` on the by-value return type has no effect; it must
// match the out-of-line definition if ever changed.
263 const double get_backoff_discount(
const int order,
const double freq)
const;
// Declaration only. Presumably the backoff counterpart of the sparse/dense
// initialisers -- confirm.
265 bool init_backoff_representation();
// Declaration only; behaviour is defined elsewhere.
267 void backoff_restore_unigram_states();
// Declaration only. Takes a vector of integer word ids and an optional
// starting `index` (default 0); presumably maps them to a dense state
// index -- confirm.
270 int find_dense_state_index(
const EST_IVector &words,
int index=0)
const;
// Declaration only. Returns a reference to a string vector for index `i`.
// NOTE(review): lifetime of the referenced vector is not visible here --
// confirm before holding on to the result.
273 const EST_StrVector &make_ngram_from_index(
const int i)
const;
288 {
return words(p_order-1); }
290 {
return words(p_order-1); }
// Declarations only. Presumably convert the stored grammar between the
// sparse and dense layouts, returning success -- confirm.
294 bool sparse_to_dense();
295 bool dense_to_sparse();
// Declaration only. Presumably converts stored frequencies to
// probabilities in place -- confirm.
300 void freqs_to_probs();
// Declaration only. Takes an order `o` and a representation `r`;
// presumably the shared back end of the public init() overloads -- confirm.
327 bool p_init(
int o, representation_t r);
331 bool oov_preprocess(
const EST_String &filename,
343 const double backoff_reverse_probability_sub(
const EST_StrVector &words,
346 const bool trace=
false)
const;
// Declaration only (defined elsewhere). Takes a word sequence and returns a
// double; presumably the reverse probability under the backoff
// representation -- confirm.
// NOTE(review): `const` on the by-value return type has no effect.
347 const double backoff_reverse_probability(
const EST_StrVector &words)
const;
349 double *prob = NULL)
const;
363 void *params,
const int level);
371 default_values(); init(o,r,wordlist);
379 default_values(); init(o,r,wordlist,predlist);
384 default_values(); init(o,r,v);
// Declaration only. Called immediately before init(...) in the inline
// bodies visible earlier in this header (presumably the constructors);
// presumably resets members to defaults -- confirm against the definition.
388 void default_values();
390 bool init(
int o, representation_t r,
392 bool init(
int o, representation_t r,
396 bool init(
int o, representation_t r,
400 int num_states(
void)
const {
return p_num_states;}
401 double samples(
void)
const {
return p_num_samples;}
402 int order()
const {
return p_order; }
403 int get_vocab_length()
const {
return vocab?vocab->
length():0; }
// Declaration only (defined elsewhere). Maps the string `s` to an int;
// presumably its index in the vocabulary. The value returned for an unknown
// word is not visible here -- confirm.
405 int get_vocab_word(
const EST_String &s)
const;
406 int get_pred_vocab_length()
const {
return pred_vocab->length(); }
407 EST_String get_pred_vocab_word(
int i)
const {
return pred_vocab->name(i); }
408 int get_pred_vocab_word(
const EST_String &s)
const
409 {
return pred_vocab->name(s); }
410 int closed_vocab()
const {
return !allow_oov; }
411 entry_t entry_type()
const {
return p_entry_type;}
412 representation_t representation()
const
413 {
return p_representation;}
417 const EST_String &prev = SENTENCE_START_MARKER,
418 const EST_String &prev_prev = SENTENCE_END_MARKER,
422 const int mincount=1,
423 const int maxcount=10);
427 const double count=1);
430 const double count=1);
// Declaration only; behaviour is defined elsewhere.
434 void make_htk_compatible();
// Declaration only. Reads a grammar from `filename` and reports the outcome
// as an EST_read_status. Which file formats are tried is not visible here;
// the friend load_ngram_* declarations in this header suggest several --
// confirm.
437 EST_read_status load(
const EST_String &filename);
439 EST_write_status save(
const EST_String &filename,
441 const bool trace=
false,
// Declaration only. Maps `word` to an int; `report` defaults to true and
// presumably controls whether a failed lookup is reported -- confirm.
444 int wordlist_index(
const EST_String &word,
const bool report=
true)
const;
// Declaration only. Inverse direction: int `i` to a string, returned by
// const reference.
445 const EST_String &wordlist_index(
int i)
const;
// Declarations only. Same pair of lookups, presumably against the predictee
// list rather than the word list -- confirm.
446 int predlist_index(
const EST_String &word)
const;
447 const EST_String &predlist_index(
int i)
const;
// Declarations only. Presumably switch the entry kind / storage layout and
// return whether the change succeeded -- confirm.
450 bool set_entry_type(entry_t new_type);
451 bool set_representation(representation_t new_representation);
// Declaration only. Takes a word sequence; `force` and `trace` both default
// to false. Their precise effect is defined elsewhere.
456 double probability(
const EST_StrVector &words,
bool force=
false,
457 const bool trace=
false)
const;
// Declaration only. Same parameter list and defaults as probability()
// above, returning a double frequency.
458 double frequency(
const EST_StrVector &words,
bool force=
false,
459 const bool trace=
false)
const;
462 double *prob,
int *state)
const;
464 {
double p;
int state;
return predict(words,&p,&state); }
466 {
int state;
return predict(words,prob,&state); }
470 {
double p;
int state;
return predict(words,&p,&state); }
472 {
int state;
return predict(words,prob,&state); }
// Declaration only. Takes a state id and a word id and returns an int;
// presumably the id of the successor state. The value returned when no
// successor exists is not visible here -- confirm.
476 int find_next_state_id(
int state,
int word)
const;
485 bool force=
false)
const;
// Declaration only. Overload taking integer word ids; `force` defaults to
// false. Semantics are defined elsewhere.
486 double reverse_probability(
const EST_IVector &words,
487 bool force=
false)
const;
// Declaration only. Presumably reports whether the n-gram `words` is
// present, judged against `threshold` -- confirm how the threshold is used.
514 bool ngram_exists(
const EST_StrVector &words,
const double threshold)
const;
// Declaration only. Presumably returns the back-off weight stored for
// `words` -- confirm.
// NOTE(review): `const` on the by-value return type has no effect, but it
// must stay in step with the out-of-line definition.
515 const double get_backoff_weight(
const EST_StrVector &words)
const;
// Declaration only. Presumably stores weight `w` for `words`; the meaning
// of the bool result is not visible here.
516 bool set_backoff_weight(
const EST_StrVector &words,
const double w);
// Declaration only. Writes to `os`; `floor` defaults to 0.0 and is
// presumably a lower bound on what is printed -- confirm.
518 void print_freqs(ostream &os,
double floor=0.0);
// Stream-insertion operator declared as a friend so it can read private
// members. Note that the grammar is taken by non-const reference.
522 friend ostream& operator<<(ostream& s,
EST_Ngrammar &n);
523 friend EST_read_status load_ngram_htk_ascii(
const EST_String filename,
525 friend EST_read_status load_ngram_htk_binary(
const EST_String filename,
527 friend EST_read_status load_ngram_arpa(
const EST_String filename,
530 friend EST_read_status load_ngram_cstr_ascii(
const EST_String filename,
532 friend EST_read_status load_ngram_cstr_bin(
const EST_String filename,
535 friend EST_write_status save_ngram_htk_ascii_sub(
const EST_String &word,
539 friend EST_write_status save_ngram_htk_ascii(
const EST_String filename,
545 friend EST_write_status save_ngram_cstr_ascii(
const EST_String filename,
549 friend EST_write_status save_ngram_cstr_bin(
const EST_String filename,
553 friend EST_write_status save_ngram_arpa(
const EST_String filename,
555 friend EST_write_status save_ngram_arpa_sub(ostream *ost,
558 friend EST_write_status save_ngram_wfst(
const EST_String filename,
// Friend declaration of a free function that operates on grammar `n`.
// NOTE(review): the count arguments are ordered (maxcount, mincount) here,
// the reverse of compute_backoff_weights(mincount, maxcount) declared in
// this header -- easy to swap at call sites.
566 friend bool Good_Turing_smooth(
EST_Ngrammar &n,
int maxcount,
int mincount);
// Friend declaration of a free function taking the grammar, a maximum
// count and a default discount; semantics are defined elsewhere.
567 friend void Good_Turing_discount(
EST_Ngrammar &ngrammar,
const int maxcount,
568 const double default_discount);
570 friend void fs_build_backoff_ngrams(
EST_Ngrammar *backoff_ngrams,
572 friend int fs_backoff_smooth(
EST_Ngrammar *backoff_ngrams,
// Declaration only. `mincount` defaults to 1 and `maxcount` to 10; how the
// two bounds are applied and what the bool result means are defined
// elsewhere.
577 bool compute_backoff_weights(
const int mincount=1,
578 const int maxcount=10);
// Grants EST_BackoffNgrammar access to this class's private and protected
// members.
583 friend class EST_BackoffNgrammar;
602 const EST_String &prev = SENTENCE_START_MARKER,
603 const EST_String &prev_prev = SENTENCE_END_MARKER,
604 const EST_String &last = SENTENCE_END_MARKER);
608 #endif // __EST_NGRAMMAR_H__