40 #ifndef __EST_NGRAMMAR_H__ 
   41 #define __EST_NGRAMMAR_H__ 
   48 #include "EST_String.h" 
   50 #include "EST_rw_status.h" 
   51 #include "EST_types.h" 
   52 #include "EST_FMatrix.h" 
   53 #include "EST_TList.h" 
   54 #include "EST_StringTrie.h" 
   55 #include "EST_simplestats.h" 
   57 #include "EST_string_aux.h" 
   61 #define SENTENCE_START_MARKER "!ENTER" 
   62 #define SENTENCE_END_MARKER "!EXIT" 
   63 #define OOV_MARKER "!OOV" 
   65 #define EST_NGRAMBIN_MAGIC 1315402337 
   68 #define GZIP_FILENAME_EXTENSION "gz" 
   69 #define COMPRESS_FILENAME_EXTENSION "Z" 
   72 #define TINY_FREQ 1.0e-10 
   95               {clear();init(
id,pdf);};
 
  109     void cumulate(
const int index, 
const double count=1)
 
  110                   {p_pdf.cumulate(index,count);};
 
  111     void cumulate(
const EST_String &word, 
const double count=1)
 
  112                   {p_pdf.cumulate(word,count);};
 
  115     int id()
 const {
return p_id; };
 
  119       {
return p_pdf.probability(w);}
 
  120     double probability(
int w)
 const {
return p_pdf.probability(w);}
 
  122       {
return p_pdf.frequency(w);}
 
  123     double frequency(
int w)
 const {
return p_pdf.frequency(w);}
 
  124     const EST_String &most_probable(
double *prob = NULL)
 const 
  125       {
return p_pdf.most_probable(prob);}
 
  // Backoff weight stored on this object; the no-argument
  // get_backoff_weight() accessor in this header returns it unchanged.
  137   double backoff_weight;
 
  149     {clear();init(d,level);};
 
  151     {clear();init(pdf,level);};
 
  164           const double count=1);
 
  166           const double count=1);
 
  171     {
return p_pdf.probability(w);}
 
  173     {
return p_pdf.frequency(w);}
 
  174   const EST_String &most_probable(
double *prob = NULL)
 const 
  175     {
return p_pdf.most_probable(prob);}
 
  177   const int level()
 const {
return p_level;}
 
  197             const double threshold) 
const;
 
  198   const double get_backoff_weight()
 const {
return backoff_weight; }
 
  // Declaration only; the body is not part of this header excerpt.
  // Const overload taking a word vector and returning a double by value.
  // NOTE(review): presumably looks up the weight for the context named by
  // `words` - confirm against the definition.
  199   const double get_backoff_weight(
const EST_StrVector &words) 
const;
 
  // Declaration only. Non-const counterpart: takes a word vector and a
  // weight `w`, returns bool.
  // NOTE(review): the meaning of the bool result is not visible here - confirm.
  200   bool set_backoff_weight(
const EST_StrVector &words, 
const double w);
 
  // Declaration only. Takes an output stream, an `order`, and a `followers`
  // string that defaults to the empty string.
  // NOTE(review): output format and the role of `followers` are defined in
  // the implementation file - confirm there.
  203   void print_freqs(ostream &os,
const int order,
EST_String followers=
"");
 
  // The three storage layouts a grammar can be in; this is the type of
  // p_representation and of the `r` argument to p_init()/init().
  214     enum representation_t {sparse, dense, backoff};
 
  // What kind of values the grammar entries hold; this is the type of
  // p_entry_type and of the argument to set_entry_type().
  219     enum entry_t {frequencies, log_frequencies};
 
  // Sentence count, held as a double.
  // NOTE(review): where it is updated is not visible in this header - confirm.
  228     double p_number_of_sentences; 
 
  // Current storage layout; returned by representation().
  235     representation_t p_representation; 
 
  // Current entry kind; returned by entry_type().
  236     entry_t p_entry_type;
 
  // Declaration only; returns bool.
  // NOTE(review): presumably sets up storage for the `sparse` layout - confirm.
  241     bool init_sparse_representation();
 
  // Declaration only; returns bool.
  // NOTE(review): presumably sets up storage for the `dense` layout - confirm.
  244     bool init_dense_representation();
 
  // Threshold used by the backoff representation.
  // NOTE(review): exact comparison semantics are in the implementation - confirm.
  252     double backoff_threshold;
 
  // NOTE(review): by its name, a floor applied to unigram frequencies in the
  // backoff representation - confirm against the implementation.
  255     double backoff_unigram_floor_freq;
 
  // Declaration only. Const member taking an n-gram `order` and a `freq`,
  // returning a double by value.
  // NOTE(review): how the discount is derived is not visible here - confirm.
  263     const double get_backoff_discount(
const int order, 
const double freq) 
const;
 
  // Declaration only; returns bool.
  // NOTE(review): presumably sets up storage for the `backoff` layout - confirm.
  265     bool init_backoff_representation();
 
  // Declaration only; no arguments, no return value.
  // NOTE(review): effect on unigram states is defined in the implementation.
  267     void backoff_restore_unigram_states();
 
  // Declaration only. Const member taking an integer word vector and an
  // optional `index` (default 0); returns an int.
  // NOTE(review): meaning of `index` and of a failed lookup - confirm.
  270     int find_dense_state_index(
const EST_IVector &words, 
int index=0) 
const;
 
  // Declaration only. Const member taking an integer `i` and returning a
  // reference to a string vector.
  // NOTE(review): lifetime/ownership of the referenced vector is not visible
  // here - confirm before holding on to the result.
  273     const EST_StrVector &make_ngram_from_index(
const int i) 
const;
 
  288         { 
return words(p_order-1); }
 
  290         { 
return words(p_order-1); }
 
  // Declaration only; returns bool.
  // NOTE(review): presumably converts storage from `sparse` to `dense` - confirm.
  294     bool sparse_to_dense();
 
  // Declaration only; returns bool.
  // NOTE(review): presumably converts storage from `dense` to `sparse` - confirm.
  295     bool dense_to_sparse();
 
  // Declaration only; no arguments, no return value.
  // NOTE(review): presumably normalises stored frequencies into
  // probabilities - confirm against the implementation.
  300     void freqs_to_probs(); 
 
  // Declaration only. Takes an int `o` and a representation `r`, returns bool.
  // NOTE(review): `o` is presumably the n-gram order - confirm.
  327     bool p_init(
int o, representation_t r);
 
  331     bool oov_preprocess(
const EST_String &filename,
 
  343     const double backoff_reverse_probability_sub(
const EST_StrVector &words,
 
  346                      const bool trace=
false) 
const;
 
  // Declaration only. Const member taking a word vector and returning a
  // double by value.
  // NOTE(review): the definition of "reverse" probability lives in the
  // implementation file - confirm there.
  347     const double backoff_reverse_probability(
const EST_StrVector &words) 
const;
 
  349                          double *prob = NULL) 
const;
 
  363               void *params, 
const int level);
 
  371     default_values(); init(o,r,wordlist); 
 
  379     default_values(); init(o,r,wordlist,predlist); 
 
  384     default_values(); init(o,r,v); 
 
  // Declaration only. The inline bodies visible in this header call it
  // immediately before init(...).
  // NOTE(review): which members it resets is defined in the implementation.
  388     void default_values();
 
  390     bool init(
int o, representation_t r, 
 
  392     bool init(
int o, representation_t r, 
 
  396     bool init(
int o, representation_t r, 
 
  400     int num_states(
void)
 const { 
return p_num_states;}
 
  401     double samples(
void)
 const { 
return p_num_samples;}
 
  402     int order()
 const { 
return p_order; }
 
  403     int get_vocab_length()
 const { 
return vocab?vocab->
length():0; }
 
  // Declaration only. Const member taking a word string and returning an int.
  // NOTE(review): presumably the vocabulary index of `s`; the value returned
  // for an unknown word is not visible here - confirm.
  405     int get_vocab_word(
const EST_String &s) 
const;
 
  406     int get_pred_vocab_length()
 const { 
return pred_vocab->length(); }
 
  407     EST_String get_pred_vocab_word(
int i)
 const { 
return pred_vocab->name(i); }
 
  408     int get_pred_vocab_word(
const EST_String &s)
 const  
  409        { 
return pred_vocab->name(s); }
 
  410     int closed_vocab()
 const {
return !allow_oov; }
 
  411     entry_t entry_type()
 const {
return p_entry_type;}
 
  412     representation_t representation()
 const  
  413        { 
return p_representation;}
 
  417            const EST_String &prev = SENTENCE_START_MARKER,
 
  418            const EST_String &prev_prev = SENTENCE_END_MARKER,
 
  422            const int mincount=1,
 
  423            const int maxcount=10);
 
  427             const double count=1);
 
  430             const double count=1);
 
  // Declaration only; no arguments, no return value.
  // NOTE(review): what is changed for HTK compatibility is defined in the
  // implementation - confirm there.
  434     void make_htk_compatible();
 
  // Declaration only. Takes a filename and returns an EST_read_status.
  // NOTE(review): supported formats and format detection are not visible in
  // this header excerpt - confirm against the implementation.
  437     EST_read_status load(
const EST_String &filename);
 
  439     EST_write_status save(
const EST_String &filename, 
 
  441               const bool trace=
false,
 
  // Declaration only. String-to-int overload; `report` defaults to true.
  // NOTE(review): presumably `report` controls diagnostics for unknown
  // words, and the result is the vocabulary index - confirm.
  444     int wordlist_index(
const EST_String &word, 
const bool report=
true) 
const;
 
  // Declaration only. Int-to-string overload; returns a const reference.
  // NOTE(review): behaviour for an out-of-range `i` is not visible here.
  445     const EST_String &wordlist_index(
int i) 
const;
 
  // Declaration only. String-to-int overload.
  // NOTE(review): presumably the index of `word` in the predictee
  // vocabulary - confirm.
  446     int predlist_index(
const EST_String &word) 
const;
 
  // Declaration only. Int-to-string overload; returns a const reference.
  // NOTE(review): behaviour for an out-of-range `i` is not visible here.
  447     const EST_String &predlist_index(
int i) 
const;
 
  // Declaration only. Takes the requested entry kind and returns bool.
  // NOTE(review): whether stored values are converted is defined in the
  // implementation - confirm.
  450     bool set_entry_type(entry_t new_type);
 
  // Declaration only. Takes the requested storage layout and returns bool.
  // NOTE(review): which conversions are supported is not visible here.
  451     bool set_representation(representation_t new_representation);
 
  // Declaration only. Const member taking a word vector; `force` and `trace`
  // both default to false. Returns a double.
  // NOTE(review): the effect of `force` and `trace` is defined in the
  // implementation - confirm there.
  456     double probability(
const EST_StrVector &words, 
bool force=
false,
 
  457                const bool trace=
false) 
const;
 
  // Declaration only. Const member taking a word vector; `force` and `trace`
  // both default to false. Returns a double.
  // NOTE(review): the effect of `force` and `trace` is defined in the
  // implementation - confirm there.
  458     double frequency(
const EST_StrVector &words, 
bool force=
false,
 
  459              const bool trace=
false) 
const;
 
  462                   double *prob,
int *state) 
const;
 
  464        {
double p; 
int state; 
return predict(words,&p,&state); }
 
  466        {
int state; 
return predict(words,prob,&state); }
 
  470        {
double p; 
int state; 
return predict(words,&p,&state); }
 
  472        {
int state; 
return predict(words,prob,&state); }
 
  // Declaration only. Const member taking a state id and a word id (both
  // int) and returning an int.
  // NOTE(review): the value returned when no successor exists is not
  // visible here - confirm.
  476     int find_next_state_id(
int state, 
int word) 
const;
 
  485                    bool force=
false) 
const;
 
  // Declaration only. Const overload taking an integer word vector; `force`
  // defaults to false. Returns a double.
  // NOTE(review): the definition of "reverse" probability and the effect of
  // `force` are in the implementation - confirm there.
  486     double reverse_probability(
const EST_IVector &words,
 
  487                    bool force=
false) 
const;
 
  // Declaration only. Const member taking a word vector and a `threshold`,
  // returning bool.
  // NOTE(review): how `threshold` is compared is not visible here - confirm.
  514     bool ngram_exists(
const EST_StrVector &words, 
const double threshold) 
const;
 
  // Declaration only. Const member taking a word vector and returning a
  // double by value.
  // NOTE(review): behaviour when the n-gram is absent is not visible here.
  515     const double get_backoff_weight(
const EST_StrVector &words) 
const;
 
  // Declaration only. Takes a word vector and a weight `w`, returns bool.
  // NOTE(review): the meaning of the bool result is not visible here - confirm.
  516     bool set_backoff_weight(
const EST_StrVector &words, 
const double w);
 
  // Declaration only. Takes an output stream and a `floor` that defaults
  // to 0.0.
  // NOTE(review): presumably entries below `floor` are treated specially -
  // confirm against the implementation.
  518     void print_freqs(ostream &os,
double floor=0.0);
 
  // Friend stream-insertion operator for EST_Ngrammar. Note that the grammar
  // is taken by non-const reference.
  522     friend ostream& operator<<(ostream& s, 
EST_Ngrammar &n);
 
  523     friend EST_read_status load_ngram_htk_ascii(
const EST_String filename, 
 
  525     friend EST_read_status load_ngram_htk_binary(
const EST_String filename, 
 
  527     friend EST_read_status load_ngram_arpa(
const EST_String filename, 
 
  530     friend EST_read_status load_ngram_cstr_ascii(
const EST_String filename, 
 
  532     friend EST_read_status load_ngram_cstr_bin(
const EST_String filename, 
 
  535     friend EST_write_status save_ngram_htk_ascii_sub(
const EST_String &word,
 
  539     friend EST_write_status save_ngram_htk_ascii(
const EST_String filename, 
 
  545     friend EST_write_status save_ngram_cstr_ascii(
const EST_String filename, 
 
  549     friend EST_write_status save_ngram_cstr_bin(
const EST_String filename, 
 
  553     friend EST_write_status save_ngram_arpa(
const EST_String filename, 
 
  555     friend EST_write_status save_ngram_arpa_sub(ostream *ost, 
 
  558     friend EST_write_status save_ngram_wfst(
const EST_String filename, 
 
  // Friend declaration: free function taking the grammar by non-const
  // reference plus `maxcount` and `mincount`; returns bool.
  // Note the argument order here is (maxcount, mincount).
  566 friend bool Good_Turing_smooth(
EST_Ngrammar &n, 
int maxcount, 
int mincount);
 
  // Friend declaration: free function taking the grammar by non-const
  // reference, a `maxcount` and a `default_discount`; returns nothing.
  // NOTE(review): when `default_discount` applies is defined in the
  // implementation - confirm there.
  567 friend void Good_Turing_discount(
EST_Ngrammar &ngrammar, 
const int maxcount,
 
  568                  const double default_discount);
 
  570 friend void fs_build_backoff_ngrams(
EST_Ngrammar *backoff_ngrams,
 
  572 friend int fs_backoff_smooth(
EST_Ngrammar *backoff_ngrams,
 
  // Declaration only. `mincount` defaults to 1 and `maxcount` to 10;
  // returns bool.
  // NOTE(review): how the two counts bound the computation is defined in
  // the implementation - confirm there.
  577     bool compute_backoff_weights(
const int mincount=1,
 
  578                  const int maxcount=10);
 
  // Grants EST_BackoffNgrammar access to this class's non-public members.
  583 friend class EST_BackoffNgrammar;
 
  602         const EST_String &prev = SENTENCE_START_MARKER, 
 
  603         const EST_String &prev_prev = SENTENCE_END_MARKER,
 
  604         const EST_String &last = SENTENCE_END_MARKER);
 
  608 #endif // __EST_NGRAMMAR_H__