40 #include "EST_Ngrammar.h"
// Entry point of the ngram test program (the diagnostics below call it
// "test_ngram"). From the statements visible here it:
//   - reads command-line options through `al` (present()/val()),
//   - loads a grammar from the file named by -g, optionally with a word list,
//   - calls test_stats() for every entry of `script`,
//   - prints raw entropy / count and entropy / perplexity per file and as a
//     summary.
// NOTE(review): braces, else-branches and exit/return statements are not
// visible in this view, so the exact control flow around the error messages
// should be confirmed against the full file.
117 int main(
int argc,
char **argv)
122 EST_String wordlist_file, script_file, in_file, format;
// Output-shaping flags; both start out false.
130 bool per_file_stats=
false;
131 bool raw_stats=
false;
// raw_entropy/count/entropy/perplexity are printed per file in the loop
// below; total_raw_H and total_count accumulate raw_entropy and count across
// files (see the += statements in that loop).
// NOTE(review): no initialiser on this line -- confirm the two totals are
// zeroed before the loop.
135 double raw_entropy,count,entropy,perplexity,total_raw_H,total_count;
// Usage/help text describing the supported options.
141 EST_String(
"[input file0] [input file1] ...\n")+
142 "-g <ifile> grammar file (required)\n"+
143 "-w <ifile> filename containing word list (required for some grammar formats)\n"+
144 "-S <ifile> script file\n"+
145 "-raw_stats print unnormalised entropy and sample count\n"+
146 "-brief print results in brief format\n"+
147 "-f print stats for each file\n"+
149 "-input_format <string>\n"+
150 " format of input data (default sentence_per_line)\n"+
151 " may also be sentence_per_file, or ngram_per_line.\n"+
154 "-prev_tag <string>\n"+
155 " tag before sentence start\n"+
156 "-prev_prev_tag <string>\n"+
157 " all words before 'prev_tag'\n"+
158 "-last_tag <string>\n"+
159 " after sentence end\n"+
161 " use default tags of "+SENTENCE_START_MARKER+
162 ","+SENTENCE_END_MARKER+
" and "+SENTENCE_END_MARKER+
"\n"+
// Word list file name comes from -w.
168 wordlist_file = al.
val(
"-w");
// NOTE(review): the guarding condition is not visible here; presumably this
// is set when -f is given (per the usage text) -- confirm.
174 per_file_stats =
true;
// Input format: taken from -input_format when that option is present.
175 if (al.
present(
"-input_format"))
176 input_format = al.
val(
"-input_format");
// Fallback value, matching the default advertised in the usage text.
// NOTE(review): presumably the else-branch of the test above -- confirm.
178 input_format =
"sentence_per_line";
// -default_tags: prev_tag becomes the sentence-start marker, while both
// prev_prev_tag and last_tag become the sentence-end marker.
187 if (al.
present(
"-default_tags"))
189 prev_tag = SENTENCE_START_MARKER;
190 prev_prev_tag = SENTENCE_END_MARKER;
191 last_tag = SENTENCE_END_MARKER;
// An explicit -prev_tag value replaces the default; a warning is printed when
// -default_tags was also given.
196 if (al.
present(
"-default_tags"))
197 cerr <<
"test_ngram: WARNING : -prev_tag overrides -default_tags"
199 prev_tag = al.
val(
"-prev_tag");
// Same pattern for -prev_prev_tag.
202 if (al.
present(
"-prev_prev_tag"))
204 if (al.
present(
"-default_tags"))
205 cerr <<
"test_ngram: WARNING : -prev_prev_tag overrides -default_tags" << endl;
206 prev_prev_tag = al.
val(
"-prev_prev_tag");
// Same pattern for -last_tag.
211 if (al.
present(
"-default_tags"))
212 cerr <<
"test_ngram: WARNING : -last_tag overrides -default_tags" << endl;
213 last_tag = al.
val(
"-last_tag");
// True exactly when the tags are partially specified: at least one of the
// three is empty AND at least one is non-empty. All-empty and all-set both
// pass this check.
216 if ( ( (prev_tag==
"") || (prev_prev_tag==
"") || (last_tag==
"") )
217 && ( (prev_tag!=
"") || (prev_prev_tag!=
"") || (last_tag!=
"") ) )
219 cerr <<
"test_ngram: ERROR : if any tags are given, ALL must be given" << endl;
// Script file (-S): its contents are read into `script` via load_StrList();
// anything other than format_ok is reported.
227 script_file = al.
val(
"-S");
229 if(load_StrList(script_file,script) != format_ok)
231 cerr <<
"test_ngram: Could not read script from file "
232 << script_file << endl;
// Grammar file name comes from -g; the message below is the diagnostic for a
// missing -g (its guarding condition is not visible here).
238 in_file = al.
val(
"-g");
241 cerr <<
"test_ngram: Must give a grammar filename using -g" << endl;
// When `script` is empty, `files` is walked.
// NOTE(review): the loop body is not visible; presumably it copies the
// command-line file names into `script` -- confirm.
247 if(script.head()==NULL)
250 for(p=files.head();p!=0;p=p->next())
// Still nothing to test after that: report it.
254 if(script.head() == NULL)
256 cerr <<
"test_ngram: No test files given" << endl;
// Grammar loading: with a word list when -w supplied a file name ...
260 if (wordlist_file !=
"")
263 if (load_StrList(wordlist_file,wordlist) != format_ok)
265 cerr <<
"test_ngram: Could not read wordlist from file " << wordlist_file
271 if (ngrammar.load(in_file,wordlist) != format_ok)
273 cerr <<
"test_ngram: Failed to load grammar" << endl;
// ... and the single-argument load() form otherwise.
279 if (ngrammar.load(in_file) != format_ok)
281 cerr <<
"test_ngram: Failed to load grammar" << endl;
// Report banner.
288 cout <<
"Ngram Test Results" << endl;
289 cout <<
"==================" << endl;
// Per-file pass: run test_stats() on each script entry, fold its
// raw_entropy/count into the running totals, and print the per-file figures.
// NOTE(review): the conditions selecting between the compact and the labelled
// output forms are not visible; presumably they follow the -brief, -raw_stats
// and -f flags -- confirm.
292 for (p = script.head(); p; p = p->next())
295 if (test_stats(ngrammar,
303 total_raw_H += raw_entropy;
304 total_count += count;
309 cout << basename(script(p)) <<
" \t";
311 cout << script(p) << endl;
316 cout << raw_entropy <<
" " << count <<
" ";
319 cout <<
" raw entropy " << raw_entropy << endl;
320 cout <<
" count " << count << endl;
325 cout << entropy <<
" " << perplexity << endl;
328 cout <<
" entropy " << entropy << endl;
329 cout <<
" perplexity " << perplexity << endl << endl;
// Diagnostic for a script entry that could not be processed.
335 cerr <<
"test_ngram: WARNING : file '" << script(p)
336 <<
"' could not be processed" << endl;
// Summary over all files: entropy is total_raw_H / total_count and
// perplexity is 2 raised to that entropy.
// NOTE(review): the division needs total_count != 0; the "No data processed"
// message at the end suggests a guard exists, but it is not visible -- confirm.
343 cout <<
"Summary for grammar " << in_file << endl;
346 cout <<
"summary \t";
351 cout << total_raw_H <<
" " << total_count <<
" ";
354 cout <<
" raw entropy " << total_raw_H << endl;
355 cout <<
" count " << total_count << endl;
360 cout << total_raw_H / total_count;
361 cout <<
" " << pow(2.0,total_raw_H / total_count);
366 cout <<
" entropy " << total_raw_H / total_count << endl;
367 cout <<
" perplexity " << pow(2.0,total_raw_H / total_count);
// Diagnostic printed when there was no data to summarise.
373 cerr <<
"test_ngram: No data processed" << endl;