42 #include "EST_Ngrammar.h"
43 #include "EST_Pathname.h"
146 int main(
int argc,
char **argv)
151 EST_String wordlist_file,wordlist_file2, out_file, format;
152 EST_String prev_tag(
""), prev_prev_tag(
""), last_tag(
"");
153 EST_String input_format(
""), oov_mode(
""), oov_marker(
"");
154 EST_Ngrammar::representation_t representation =
164 EST_String(
"[input file0] [input file1] ... -o [output file]\n")+
165 "-w <ifile> filename containing word list (required)\n"+
166 "-p <ifile> filename containing predictee word list\n"+
167 " (default is to use wordlist given by -w)\n"+
168 "-order <int> order, 1=unigram, 2=bigram etc. (default 2)\n"+
169 "-smooth <int> Good-Turing smooth the grammar up to the\n"+
170 " given frequency\n"+
171 "-o <ofile> Output file for constructed ngram\n"+
173 "-input_format <string>\n"+
174 " format of input data (default sentence_per_line)\n"+
175 " may be sentence_per_file, ngram_per_line.\n"+
176 "-otype <string> format of output file, one of cstr_ascii\n"+
177 " cstr_bin or htk_ascii\n"+
178 "-sparse build ngram in sparse representation\n"+
179 "-dense build ngram in dense representation (default)\n"+
181 " build backoff ngram (requires -smooth)\n"+
183 " frequency floor value used with some ngrams\n"+
184 "-freqsmooth <int>\n"+
185 " build frequency backed off smoothed ngram, this\n"+
186 " requires -smooth option\n"+
187 "-trace give verbose outout about build process\n"+
188 "-save_compressed save ngram in gzipped format\n"+
189 "-oov_mode <string>\n"+
190 " what to do about out-of-vocabulary words,\n"+
191 " one of skip_ngram, skip_sentence (default),\n"+
192 " skip_file, or use_oov_marker\n"+
193 "-oov_marker <string>\n"+
194 " special word for oov words (default "+OOV_MARKER+
")\n"+
195 " (use in conjunction with '-oov_mode use_oov_marker'\n"+
198 "-prev_tag <string>\n"+
199 " tag before sentence start\n"+
200 "-prev_prev_tag <string>\n"+
201 " all words before 'prev_tag'\n"+
202 "-last_tag <string>\n"+
203 " after sentence end\n"+
204 "-default_tags use default tags of "+SENTENCE_START_MARKER+
205 ","+SENTENCE_END_MARKER+
" and "+SENTENCE_END_MARKER+
"\n"+
209 if (al.
present(
"-input_format"))
210 input_format = al.
val(
"-input_format");
212 input_format =
"sentence_per_line";
215 oov_mode = al.
val(
"-oov_mode");
217 oov_mode =
"skip_sentence";
222 if(oov_mode !=
"use_oov_marker")
224 cerr <<
"Error : can only use -oov_marker with '-oov_mode use_oov_marker'" << endl;
228 oov_marker = al.
val(
"-oov_marker");
234 if( (oov_mode !=
"skip_ngram") &&
235 (oov_mode !=
"skip_sentence") &&
236 (oov_mode !=
"skip_file") &&
237 (oov_mode !=
"use_oov_marker") )
239 cerr << oov_mode <<
" is not a valid oov_mode !" << endl;
244 wordlist_file = al.
val(
"-w");
246 cerr <<
"build_ngram: Must specify a wordlist with -w" << endl;
250 if (load_StrList(wordlist_file,wordlist) != format_ok)
252 cerr <<
"build_ngram: Could not read wordlist from file "
253 << wordlist_file << endl;
261 if(input_format !=
"ngram_per_line")
263 cerr <<
"Can't have differering predictor/predictee lists unless data is in ngram_per_line format !" << endl;
267 wordlist_file2 = al.
val(
"-p");
268 if (load_StrList(wordlist_file2,wordlist2) != format_ok)
270 cerr <<
"build_ngram: Could not read predictee list from file "
271 << wordlist_file2 << endl;
280 out_file = al.
val(
"-o");
284 if (al.
present(
"-default_tags"))
286 prev_tag = SENTENCE_START_MARKER;
287 prev_prev_tag = SENTENCE_END_MARKER;
288 last_tag = SENTENCE_END_MARKER;
290 wordlist.
append(SENTENCE_START_MARKER);
291 wordlist.
append(SENTENCE_END_MARKER);
295 wordlist2.
append(SENTENCE_START_MARKER);
296 wordlist2.
append(SENTENCE_END_MARKER);
302 if (al.
present(
"-default_tags"))
303 cerr <<
"build_ngram: WARNING : -prev_tag overrides -default_tags"
305 prev_tag = al.
val(
"-prev_tag");
308 if (al.
present(
"-prev_prev_tag"))
310 if (al.
present(
"-default_tags"))
311 cerr <<
"build_ngram: WARNING : -prev_prev_tag overrides -default_tags"
313 prev_prev_tag = al.
val(
"-prev_prev_tag");
318 if (al.
present(
"-default_tags"))
319 cerr <<
"build_ngram: WARNING : -last_tag overrides -default_tags"
321 last_tag = al.
val(
"-last_tag");
324 if ( ( (prev_tag==
"") || (prev_prev_tag==
"") || (last_tag==
"") )
325 && ( (prev_tag!=
"") || (prev_prev_tag!=
"") || (last_tag!=
"") ) )
327 cerr <<
"build_ngram: ERROR : if any tags are given, ALL must be given"
333 order = al.
ival(
"-order");
336 cerr <<
"build_ngram: WARNING : No order specified with -order : defaulting to bigram"
342 format = al.
val(
"-otype");
347 floor = al.
dval(
"-floor");
354 cerr <<
"build_ngram: backoff requires smooth value" << endl;
360 cerr <<
"build_ngram: frequency smooth requires smooth value"
366 representation = EST_Ngrammar::dense;
367 else if (al.
present(
"-sparse"))
369 cerr <<
"build_ngram: Sorry, sparse representation is not yet available " << endl;
371 representation = EST_Ngrammar::sparse;
373 else if (al.
present(
"-backoff"))
374 representation = EST_Ngrammar::backoff;
376 cerr <<
"build_ngram: Defaulting to dense representation" << endl;
380 if (!ngrammar.init(order,representation,wordlist,wordlist2))
382 cerr <<
"build_ngram: Failed to initialise " << order <<
"-gram" << endl;
388 if (!ngrammar.init(order,representation,wordlist))
390 cerr <<
"build_ngram: Failed to initialise " << order <<
"-gram" << endl;
398 if (!ngrammar.build(files,prev_tag,prev_prev_tag,
399 last_tag,input_format,oov_mode,
400 al.
ival(
"-backoff"),al.
ival(
"-smooth")))
402 cerr <<
"build_ngram: Failed to build backoff " << order
407 cerr <<
"build_ngram: Built backoff " << order <<
412 if (!ngrammar.build(files,prev_tag,prev_prev_tag,
413 last_tag,input_format,oov_mode))
415 cerr <<
"build_ngram: Failed to build " << order <<
"-gram" << endl;
420 cerr <<
"build_ngram: Built " << order <<
"-gram" << endl;
427 Ngram_freqsmooth(ngrammar,al.
ival(
"-smooth"),al.
ival(
"-freqsmooth"));
431 int smoothcount = atoi(al.
val(
"-smooth"));
432 if(!Good_Turing_smooth(ngrammar,smoothcount,0))
434 cerr <<
"build_ngram: Failed to smooth " << order <<
"-gram" << endl;
439 cerr <<
"build_ngram: Good Turing smoothed " << order <<
"-gram" << endl;
444 if (al.
present(
"-save_compressed"))
447 if (ngrammar.save(tmp_file,format,trace,floor) == write_ok)
451 if (tmp.extension() == GZIP_FILENAME_EXTENSION)
452 prog_name =
"gzip --stdout";
453 else if (tmp.extension() == COMPRESS_FILENAME_EXTENSION)
454 prog_name =
"compress -c";
457 prog_name =
"gzip --stdout";
459 out_file = out_file +
"." + GZIP_FILENAME_EXTENSION;
463 cerr <<
"build_ngram: Compressing with '" << prog_name <<
"'" << endl;
466 if(compress_file(tmp_file,out_file,prog_name) != 0)
468 cerr <<
"build_ngram: Failed to compress to file "
470 (void)delete_file(tmp_file);
474 (void)delete_file(tmp_file);
477 cerr <<
"build_ngram: Saved in compressed " << format
478 <<
" format to " << out_file << endl;
482 cerr <<
"build_ngram: Failed to write temporary file "
491 if (ngrammar.save(out_file,format,trace,floor) == write_ok)
494 cerr <<
"build_ngram: Saved in " << format
495 <<
" format to " << out_file << endl;
499 cerr <<
"build_ngram: Failed to save " << format <<
" format data to "