48 #include "EST_String.h"
49 #include "EST_Ngrammar.h"
50 #include "EST_Token.h"
51 #include "EST_cutils.h"
// Fragment of the ARPA-format loader (the end-of-line diagnostic below names
// it EST_Ngrammar:load_ngram_arpa).  Only part of the routine is visible, so
// the notes here describe just the statements shown.
77 int this_num,this_order;
// Open the token stream on `filename`; failure is reported as misc_read_error.
79 if (ts.
open(filename) == -1)
80 return misc_read_error;
// Consume tokens until one containing the "\\data\\" marker is read, or until
// the stream reports EOF (the loop has an empty body).
83 while ((!ts.
eof()) && !ts.
get().string().contains(
"\\data\\"));
// A header entry `s` is split at '=': the text before it is parsed as the
// n-gram order, the text after it as the entry count for that order.
108 this_order=atoi(s.
before(
"="));
109 this_num=atoi(s.
after(
"="));
// Remember how many entries to expect for this order (used as the loop bound
// when reading that order's section).
114 nums[this_order] = this_num;
// NOTE(review): presumably raises `order` to the largest order seen -- the
// body of this test is not visible here.
116 if(this_order > order)
// Set up the grammar in backoff representation; a failed init aborts the load.
131 if(!n.init(order,EST_Ngrammar::backoff,vocab))
132 return misc_read_error;
// Process the sections for orders 1..order in turn.
135 for(i=1;i<=order;i++)
// Section header text for order i: a backslash, the order, then "-grams:".
141 EST_String tmp =
"\\" + itoString(i) +
"-grams:";
// Diagnostic for running out of input while looking for that header.
152 cerr <<
"Unexpected end of grammar file whilst looking for '"
153 << tmp <<
"'" << endl;
154 return misc_read_error;
// nums(i) entries are read from the i-gram section.
161 for(j=0;j<nums(i);j++)
// Read up to i word tokens into `window`, stopping early at EOF.
164 for (k=0; ((k<i) && !ts.
eof()); k++)
165 window[k] = ts.
get().string();
// Diagnostic for hitting EOF part-way through an entry.
169 cerr <<
"Unexpected end of file whilst reading " << i
170 <<
"-grams !" << endl;
171 return misc_read_error;
// The next token is parsed as a floating-point value for this entry.
174 occur = atof(ts.
get().string());
// NOTE(review): uninformative diagnostic; the condition that triggers it is
// not visible in this fragment.
179 cerr <<
"ooooooooops" << endl;
// A further token is parsed as the backoff weight and stored against `window`.
186 weight = atof(ts.
get().string());
187 n.set_backoff_weight(window,weight);
// Diagnostic for trailing material where an end of line was expected.
192 cerr <<
"EST_Ngrammar:load_ngram_arpa expect end of line at filepos "
195 return misc_read_error;
// The next token is compared with the "\\end\\" terminator.
204 if (ts.
get().string() ==
"\\end\\")
// Diagnostic and error return used when the terminator is missing.
211 cerr <<
"Missing \\end\\ !" << endl;
214 return misc_read_error;
// Fragment of the ASCII "Ngram_2" loader (the diagnostics below name it
// EST_Ngrammar:load_ngram_cstr_ascii).  Only part of the routine is visible.
// Open the token stream on `filename`; failure is reported as misc_read_error.
225 if (ts.
open(filename) == -1)
226 return misc_read_error;
// Peek (without consuming) at the first token and compare it with the
// "Ngram_2" format tag; the action taken on mismatch is not visible here.
228 if (ts.
peek().string() !=
"Ngram_2")
// The n-gram order is parsed from a token with atoi.
235 order = atoi(ts.
get().string());
// Set up a dense grammar from the two vocabulary lists; a failed init is
// reported as a probable vocabulary problem in the input file.
246 if(!n.init(order,EST_Ngrammar::dense,vocab,pred_vocab))
248 cerr <<
"Something may be wrong with the vocab lists in '"
249 << filename <<
"'" << endl;
250 return misc_read_error;
// Each entry is `order` word tokens, a ":" token, then a numeric value.
257 for (i=0; i < order; i++)
258 window[i] = ts.
get().string();
259 if (ts.
get().string() !=
":")
261 cerr <<
"EST_Ngrammar:load_ngram_cstr_ascii missing colon at filepos "
263 return misc_read_error;
// Parse the value and add it to the grammar for the words in `window`.
265 occur = atof(ts.
get().string());
266 n.accumulate(window,occur);
// Diagnostic for trailing material where an end of line was expected.
269 cerr <<
"EST_Ngrammar:load_ngram_cstr_ascii expect end of line at filepos "
271 return misc_read_error;
// Fragment of the binary "mBin_2" loader (the diagnostics below name it
// EST_Ngrammar::load_ngram_cstr_bin).  Only part of the routine is visible.
287 double approx_num_samples = 0.0;
288 long freq_data_start, freq_data_end;
// Open the file in binary mode and read the leading magic word.
293 if ((ifd=fopen(filename,
"rb")) == NULL)
294 return misc_read_error;
// NOTE(review): the fread return value is not checked, so a file shorter
// than sizeof(int) leaves `magic` unset before the comparisons below.
295 fread(&magic,
sizeof(
int),1,ifd);
// The magic word is tested both byte-swapped and as read.
// NOTE(review): presumably the first branch records that the data needs
// byte swapping and the second rejects the file -- bodies not visible here.
297 if (SWAPINT(magic) == EST_NGRAMBIN_MAGIC)
299 else if (magic != EST_NGRAMBIN_MAGIC)
// Attach the token stream to the already-open FILE* to read the text header.
301 if (ts.
open(ifd, FALSE) == -1)
302 return misc_read_error;
// Peek at the format tag that follows the magic word.
307 if (ts.
peek().string() !=
"mBin_2")
// The order is parsed with atoi and must be followed by a newline token.
315 order = atoi(ts.
get().string());
316 if (ts.
get() !=
"\n")
320 return misc_read_error;
// Two newline-terminated token runs follow.
// NOTE(review): presumably the vocab and pred_vocab lists, given the init
// call below -- the loop bodies are not visible here.
325 while ((ts.
peek() !=
"\n") && (!ts.
eof()))
328 while ((ts.
peek() !=
"\n") && (!ts.
eof()))
// Set up a dense grammar from the two vocabulary lists.
335 if(!n.init(order,EST_Ngrammar::dense,vocab,pred_vocab))
339 return misc_read_error;
// Measure the rest of the file from the current position to the end, treat
// it as an array of doubles, and read it in one go.
344 freq_data_start = ftell(ifd);
345 fseek(ifd,0,SEEK_END);
346 freq_data_end = ftell(ifd);
347 num_entries = (freq_data_end-freq_data_start)/
sizeof(
double);
// NOTE(review): the release of `dd` is not visible in this fragment; confirm
// it is freed on the error returns below as well as on success.
348 double *dd =
new double[num_entries];
351 fseek(ifd,freq_data_start,SEEK_SET);
// A short read means the file does not hold the computed number of entries.
353 if (fread(dd,
sizeof(
double),num_entries,ifd) != (
unsigned)num_entries)
355 cerr <<
"EST_Ngrammar::load_ngram_cstr_bin format does not have expected number of entries" << endl;
358 return misc_read_error;
// Byte-swap the doubles (the guarding condition is not visible here).
361 swap_bytes_double(dd,num_entries);
// Walk the grammar states with i while j indexes into the frequency data.
363 for(j=i=0;i<n.num_states();i++)
// Running out of data before all states are filled is an error.
365 if (j >= num_entries)
367 cerr <<
"EST_Ngrammar::load_ngram_cstr_bin unexpected end of frequency data" << endl;
370 return misc_read_error;
// Inner-loop condition: stop at the end of this state's distribution or when
// the frequency data is exhausted.
373 (!n.p_states[i].pdf().
item_end(k)) && (j < num_entries) ;
// Accumulate a running total of the values read.
378 approx_num_samples += dd[j];
// The following entry is inspected after a bounds check; values below -1 and
// exactly -1 get separate handling.
// NOTE(review): looks like a run-length marker scheme -- the branch bodies
// are not visible here, confirm against the writer.
383 if (j+1 >= num_entries)
385 else if (dd[j+1] < -1)
387 else if (dd[j+1] == -1)
// Store the accumulated total, truncated to int, as the sample count.
395 n.p_num_samples = (int)approx_num_samples;
// Fragment of save_ngram_htk_ascii_sub: writes output for one history word
// `word` to *ost.  The rest of the parameter list and several branches are
// not visible in this fragment.
408 save_ngram_htk_ascii_sub(
const EST_String &word, ostream *ost,
// Look up the distribution that follows `word` as a one-word history.
415 this_ngram[0] = word;
417 this_pdf = n.prob_dist(this_ngram);
// Probability mass the floor takes across (pred_vocab length - 1) entries.
423 double floor_prob_total = floor * (n.pred_vocab->
length()-1);
// The sentence-end marker gets a fixed row: " 0*", the entry count, " ", 1.
425 if (word == n.p_sentence_end_marker)
428 *ost <<
" 0*" << n.pred_vocab->
length()-1 <<
" " << 1 << endl;
// A floor that would exceed total probability 1 is rescaled to a uniform
// 1/(pred_vocab length - 1), with a warning on cerr.
432 if(floor_prob_total > 1)
434 cerr <<
"ERROR : floor is impossibly large, scaling it !" << endl;
435 floor = 1.0 / (double)(n.pred_vocab->
length()-1);
436 floor_prob_total = 1;
445 if(name != n.p_sentence_start_marker)
// Row prefix: the history word followed by " 0 ".
453 *ost << word <<
" 0 ";
// Uniform row written as <value>*<count>: 1/(len-1), "*", then len-1.
457 *ost << 1.0 / (double)(n.pred_vocab->
length()-1) <<
"*";
458 *ost << n.pred_vocab->
length()-1 <<
" " << endl;
// Test for the three special names: start marker, end marker and OOV_MARKER
// (the action taken is not visible here).
470 if ( (name == n.p_sentence_start_marker) ||
471 (name == n.p_sentence_end_marker) ||
472 (name == OOV_MARKER) )
// Emit "*" followed by the counter lcount.
480 *ost <<
"*" << lcount <<
" ";
// Floored probability: floor plus the relative frequency scaled into the
// mass left over after flooring.
// NOTE(review): no guard against total_freq == 0 is visible in this fragment.
489 double base_prob = freq / total_freq;
492 *ost << floor + ( base_prob * (1-floor_prob_total) );
// Open-vocabulary grammars take this branch (body not visible here).
506 if(!n.closed_vocab())
511 *ost << 0 <<
" ERROR !!!!!!!! ";
// The frequency of the sentence-end marker is looked up separately.
517 freq = this_pdf.frequency(n.p_sentence_end_marker);
522 *ost <<
"*" << lcount <<
" " << endl;
528 *ost <<
"*" << lcount <<
" ";
// Same floored-probability formula as above, here ending the output line.
534 double base_prob = freq / total_freq;
537 *ost << floor + ( base_prob * (1-floor_prob_total) ) << endl;
// Alternative ending that writes the bare floor value.
541 *ost << floor << endl;
// Fragment of save_ngram_htk_ascii: validates its arguments, opens the
// output and writes it by calling save_ngram_htk_ascii_sub once per history
// word.  The rest of the parameter list is not visible in this fragment.
549 save_ngram_htk_ascii(
const EST_String filename,
// Refuse grammars this format cannot hold (the test itself is not visible).
558 cerr <<
"Can only save bigrams in htk_ascii format" << endl;
559 return misc_write_error;
// Refuse a negative floor (the test itself is not visible).
564 cerr <<
"Negative floor probability does not make sense !" << endl;
565 return misc_write_error;
// NOTE(review): the stream is heap-allocated; its deletion is not visible in
// this fragment -- confirm it is released on every return path.
571 ost =
new ofstream(filename);
// A floor whose total over (pred_vocab length - 1) entries exceeds 1 is
// rescaled to 1/(pred_vocab length - 1) and the new value is reported.
576 if(floor * (n.pred_vocab->
length()-1) > 1)
578 floor = 1.0 / (double)(n.pred_vocab->
length()-1);
579 cerr <<
"ERROR : floor is impossibly large, scaling it to ";
580 cerr << floor << endl;
// An empty sentence-start marker makes saving in this format an error.
585 if(n.p_sentence_start_marker ==
"")
587 cerr <<
"Can't save in HTK format as no sentence start/end tags"
588 <<
" were given !" << endl;
589 return misc_write_error;
// Call order: the sentence-start marker first, ...
593 save_ngram_htk_ascii_sub(n.p_sentence_start_marker,ost,n,floor);
// ... then every vocabulary word that is not one of the special names, ...
596 for(i=0;i<n.vocab->
length();i++)
598 if ( (n.vocab->
name(i) != n.p_sentence_start_marker) &&
599 (n.vocab->
name(i) != n.p_sentence_end_marker) &&
600 (n.vocab->
name(i) != OOV_MARKER) )
601 save_ngram_htk_ascii_sub(n.vocab->
name(i),ost,n,floor);
// ... then OOV_MARKER for open-vocabulary grammars, ...
604 if(!n.closed_vocab())
605 save_ngram_htk_ascii_sub(OOV_MARKER,ost,n,floor);
// ... and finally the sentence-end marker.
607 save_ngram_htk_ascii_sub(n.p_sentence_end_marker,ost,n,floor);
// Body of a counting callback (the function header is not visible here;
// the ARPA writer passes a callback named count_ngram_arpa_sub).
// `count` arrives as an untyped pointer to a double and is incremented by one
// for each n-gram that exists in the grammar.
626 if(n->ngram_exists(ngram))
627 *((
double*)count) += 1;
// Body of a writing callback (the function header is not visible here;
// the ARPA writer passes a callback named save_ngram_arpa_sub).
// `ost` arrives as an untyped pointer and is cast back to ostream* for output.
636 if(n->ngram_exists(ngram))
// Each entry starts with safe_log10 of the n-gram's probability ...
638 *((ostream*)(ost)) << safe_log10(n->probability(ngram)) <<
" ";
// ... followed by the words of the n-gram, space separated.
639 for(i=0;i<ngram.
n();i++)
640 *((ostream*)(ost)) << ngram(i) <<
" ";
// For backoff grammars, n-grams shorter than the grammar order also carry
// safe_log10 of their backoff weight.
642 if ((n->representation() == EST_Ngrammar::backoff) &&
643 (n->order() > ngram.
n()) )
644 *((ostream*)(ost)) << safe_log10(n->get_backoff_weight(ngram));
648 *((ostream*)(ost)) << endl;
// Fragment of the ARPA-format writer (function name not visible here;
// it drives the count_ngram_arpa_sub / save_ngram_arpa_sub callbacks).
// NOTE(review): the stream is heap-allocated; its deletion is not visible in
// this fragment.
664 ost =
new ofstream(filename);
674 num_n = (int)n.samples();
// The file opens with the "\\data\\" marker line.
675 *ost <<
"\\data\\" << endl;
// NOTE(review): a single double is heap-allocated as the counter handed to
// the callbacks; its delete is not visible here -- a local variable passed
// by address would avoid the allocation.
677 double *count =
new double;
// Backoff grammars: one "ngram <o>=<count>" header line per order 1..order.
679 if (n.representation() == EST_Ngrammar::backoff)
681 for(o=1;o<=n.order();o++)
690 n.iterate(ngram,&count_ngram_arpa_sub,(
void*)count);
691 *ost <<
"ngram " << o <<
"=" << *count << endl;
// Then one section per order, headed by a backslash, the order and "-grams:".
694 for(o=1;o<=n.order();o++)
697 *ost <<
"\\" << o <<
"-grams:" << endl;
701 n.iterate(ngram,&save_ngram_arpa_sub,(
void*)ost);
// Here only the full order n.order() is counted and written.
// NOTE(review): presumably the non-backoff branch -- the `else` is not
// visible in this fragment.
708 for(i=0;i<n.order();i++)
711 n.iterate(ngram,&count_ngram_arpa_sub,(
void*)count);
712 *ost <<
"ngram " << n.order() <<
"=" << *count << endl;
715 *ost <<
"\\" << n.order() <<
"-grams:" << endl;
717 for(i=0;i<n.order();i++)
719 n.iterate(ngram,&save_ngram_arpa_sub,ost);
// The file closes with the "\\end\\" marker line.
723 *ost <<
"\\end\\" << endl;
// Fragment of the ASCII "Ngram_2" writer (function name not visible here).
// The visible tail of the parameter list is a trace flag and a floor value.
733 const bool trace,
double floor)
// NOTE(review): the stream is heap-allocated; its deletion is not visible in
// this fragment.
744 ost =
new ofstream(filename);
// Header: the "Ngram_2" tag with the order, then the vocab names and the
// pred_vocab names, each list space separated.
749 *ost <<
"Ngram_2 " << n.order() << endl;
750 for (i=0; i < n.vocab->
length(); i++)
751 *ost << n.vocab->
name(i) <<
" ";
753 for (i=0; i < n.pred_vocab->
length(); i++)
754 *ost << n.pred_vocab->
name(i) <<
" ";
// Dense grammars hand the frequency output to print_freqs, passing the floor.
757 if (n.representation() == EST_Ngrammar::dense)
758 n.print_freqs(*ost,floor);
// Backoff grammars loop over vocab_length^(order-1) indices and fetch a
// distribution for each.
// NOTE(review): pow() is evaluated in float and truncated to int; confirm
// the result is exact for the vocabulary sizes in use.
759 else if (n.representation() == EST_Ngrammar::backoff)
761 int total_ngrams = (int)pow(
float(n.get_vocab_length()),
float(n.order()-1));
763 for(i=0;i<total_ngrams;i++)
767 this_pdf = n.prob_dist(this_ngram);
// Output line: the history words, the predicted name, " : ", the frequency.
777 for (
int jj=0; jj < this_ngram.
n(); jj++)
778 *ost << this_ngram(jj) <<
" ";
779 *ost << name <<
" : " << freq << endl;
// Fragment of a writer that emits the grammar as an "EST_File fst" text file
// (function name not visible here).  Output goes through C stdio.
// Open the file in binary write mode and report a failure on cerr.
797 if ((ost = fopen(filename,
"wb")) == NULL)
799 cerr <<
"Ngrammar save: unable to open \"" << filename <<
800 "\" for writing" << endl;
// Header: file type and data type.
804 fprintf(ost,
"EST_File fst\n");
805 fprintf(ost,
"DataType ascii\n");
// "in" list: one vocab name per line inside a quoted, parenthesised list.
806 fprintf(ost,
"in \"(");
807 for (i=0; i < n.vocab->
length(); i++)
808 fprintf(ost,
" %s\n",(
const char *)n.vocab->
name(i));
809 fprintf(ost,
" )\"\n");
// "out" list, written from the same n.vocab list as "in".
// NOTE(review): this uses n.vocab rather than n.pred_vocab -- confirm that
// is intended.
810 fprintf(ost,
"out \"(");
811 for (i=0; i < n.vocab->
length(); i++)
812 fprintf(ost,
" %s\n",(
const char *)n.vocab->
name(i));
813 fprintf(ost,
" )\"\n");
// State count, then the end-of-header marker.
814 fprintf(ost,
"NumStates %d\n",n.num_states());
815 fprintf(ost,
"EST_Header_End\n");
// One record per state; the opening "((<i> nonfinal <i>)" line is written
// here, the rest of the record is not visible in this fragment.
817 for (i=0; i<n.num_states(); i++)
819 fprintf(ost,
"((%d nonfinal %d)\n",i,i);
// Fragment of the binary "mBin_2" writer (function name not visible here).
// The visible tail of the parameter list is a trace flag and a floor value.
830 const bool trace,
double floor)
// Sparse grammars are rejected outright.
833 if (n.representation() == EST_Ngrammar::sparse)
834 return misc_write_error;
841 int magic = EST_NGRAMBIN_MAGIC;
// Output goes either to stdout or to `filename` opened in binary write mode
// (the test that chooses between them is not visible here).
845 if ((ofd=stdout) == NULL)
846 return misc_write_error;
850 if ((ofd=fopen(filename,
"wb")) == NULL)
851 return misc_write_error;
// Layout: the raw magic int, a text line "mBin_2 <order>", then the vocab
// names and the pred_vocab names, each followed by a space.
854 fwrite(&magic,
sizeof(
int),1,ofd);
855 fprintf(ofd,
"mBin_2 %d\n",n.order());
856 for (i=0; i < n.vocab->
length(); i++)
857 fprintf(ofd,
"%s ",(
const char *)n.vocab->
name(i));
859 for (i=0; i < n.pred_vocab->
length(); i++)
860 fprintf(ofd,
"%s ",(
const char *)n.pred_vocab->
name(i));
// Progress message on cerr (the guarding condition is not visible here).
868 cerr <<
"Saving ..." << endl;
// Dense grammars: visit every state and write its distribution entries.
870 if (n.representation() == EST_Ngrammar::dense)
872 for(i=0;i<n.num_states();i++)
// Percentage progress, redrawn on one line using a carriage return.
876 cerr <<
"\r" << i*100/n.num_states() <<
"%";
884 n.p_states[i].pdf().
item_freq(k,name,freq);
// Values are written as raw doubles: `count` then `freq`, and in one place
// `count` alone.
// NOTE(review): presumably `count` is a run-length marker matching the
// negative values the binary loader checks for -- its computation is not
// visible here.
892 fwrite(&count,
sizeof(
double),1,ofd);
893 fwrite(&freq,
sizeof(
double),1,ofd);
900 fwrite(&count,
sizeof(
double),1,ofd);
// Backoff grammars: loop over vocab_length^(order-1) indices and fetch a
// distribution for each.
902 else if (n.representation() == EST_Ngrammar::backoff)
909 int total_ngrams = (int)pow(
float(n.get_vocab_length()),
float(n.order()-1));
911 for(i=0;i<total_ngrams;i++)
915 cerr <<
"\r" << i*100/total_ngrams <<
"%";
919 this_pdf = n.prob_dist(this_ngram);
934 fwrite(&count,
sizeof(
double),1,ofd);
935 fwrite(&freq,
sizeof(
double),1,ofd);
// Overwrite the progress display and end the line.
946 cerr <<
"\r \r" << endl;