50 float (*local_cost_function)(
const EST_Item *item1,
54 bool (*local_pruning_function)(
const int i,
62 local_cost_function lcf,
63 local_pruning_function lpf,
69 local_cost_function lcf,
// Forward declaration of the pruning predicate that main passes to dp_match.
// It takes a cell position (i, j) and the two upper bounds (max_i, max_j)
// and returns a bool.
73 bool local_prune(
const int i,
const int j,
74 const int max_i,
const int max_j);
// Forward declaration: file-local helper that reads the vocabulary from the
// file named by vfile. main calls it with the value of the -vocab option.
75 static void load_vocab(
const EST_String &vfile);
// File-level settings for the alignment.
// show_cost: the usage text describes -show_cost as showing the cost of the
// matching path; presumably this flag records that option (the assignment is
// not in view - TODO confirm).
79 static bool show_cost=FALSE;
// Threshold used by local_prune; main overwrites it from the -p option.
80 static int prune_width = 100;
// Default edit costs; main replaces them with the -i, -d and -s option values.
95 float insertion_cost = 1;
96 float deletion_cost = 1;
97 float substitution_cost = 1;
// Entry point of the dp tool. The statements visible here parse the command
// line, load the vocabulary, pick up either the three edit costs or a cost
// matrix, turn the two pattern strings into item sequences and hand them to
// dp_match. Several lines of this function are not in view, so the conditions
// guarding some of the error messages below cannot be confirmed from here.
157 int main(
int argc,
char **argv)
// The place-holder symbol starts out named <null>; -place_holder may rename it below.
166 null_sym->set_name(
"<null>");
// Usage text handed to parse_command_line.
// NOTE(review): the word pronuciations in the usage text is misspelled
// (pronunciations); it is a runtime string and is left unchanged here.
168 parse_command_line(argc, argv,
170 "dp <options> \"pattern 1\" \"pattern 2\"\n"+
171 "Find the best alignment of a pair of symbol sequences (e.g. word pronuciations).\n"+
172 "-vocab <string> file containing vocabulary\n"+
173 "-place_holder <string> which vocab item is the place holder (default is " + null_sym->name() +
" )\n"+
174 "-show_cost show cost of matching path\n"+
175 "-o <string> output file\n"+
176 "-p <int> 'beam' width\n"+
178 "-i <float> insertion cost\n"+
179 "-d <float> deletion cost\n"+
180 "-s <float> substitution cost\n"+
182 "-cost_matrix <string> file containing cost matrix\n",
// Vocabulary comes from the file named by -vocab.
188 load_vocab(al.
val(
"-vocab"));
191 cerr << argv[0] <<
": no vocab file specified" << endl;
// -p replaces the default prune_width.
196 prune_width = al.
ival(
"-p");
// Cost-matrix mode: switches distance_measure to matrix and loads the matrix file.
198 if (al.
present(
"-cost_matrix"))
202 cerr <<
"Can't have ins/del/subs costs as well as matrix !" << endl;
205 distance_measure=
"matrix";
206 cost_matrix.
load(al.
val(
"-cost_matrix"));
208 if(al.
present(
"-place_holder"))
209 null_sym->set_name(al.
val(
"-place_holder"));
// The place-holder symbol must itself be a vocabulary entry.
// NOTE(review): vocbulary in the message below is misspelled (vocabulary);
// it is a runtime string and is left unchanged here.
211 if(StrVector_index(vocab,null_sym->name()) < 0)
213 cerr <<
"The place holder symbol '" << null_sym->name();
214 cerr <<
"' is not in the vocbulary !" << endl;
220 cerr <<
"Cost matrix number of columns must match vocabulary size !" << endl;
225 cerr <<
"Cost matrix number of rows must match vocabulary size !" << endl;
// Simple-cost mode: the three edit costs are read from -i, -d and -s.
232 insertion_cost = al.
fval(
"-i");
233 deletion_cost = al.
fval(
"-d");
234 substitution_cost = al.
fval(
"-s");
238 cerr <<
"Must give either ins/del/subs costs or cost matrix !" << endl;
// Exactly two positional arguments (the two patterns) are required.
246 if(files.length() != 2)
248 cerr <<
"Must give 2 patterns !" << endl;
// Each pattern string is split on single spaces into a list of symbols.
252 StringtoStrList(files(files.head()),pattern1_l,
" ");
253 StringtoStrList(files(files.head()->next()),pattern2_l,
" ");
// Every symbol of pattern 1 must be in the vocabulary; each one is appended to path1.
// NOTE(review): the address of the same new_item object is passed to append on
// every iteration; presumably append copies the item - confirm against the
// relation API.
261 for(p=pattern1_l.head();p != 0; p=p->next())
263 if( StrVector_index(vocab,pattern1_l(p)) < 0)
265 cerr << pattern1_l(p) <<
" is not in the vocabulary !" << endl;
269 new_item.set_name(pattern1_l(p));
270 path1->append(&new_item);
// Same check and append for pattern 2 / path2.
273 for(p=pattern2_l.head();p != 0; p=p->next())
275 if( StrVector_index(vocab,pattern2_l(p)) < 0)
277 cerr << pattern2_l(p) <<
" is not in the vocabulary !" << endl;
281 new_item.set_name(pattern2_l(p));
282 path2->append(&new_item);
// Run the alignment with local_cost and local_prune as callbacks; a false
// result is reported as no match.
290 if(!dp_match(*path1,*path2,*match,
291 local_cost,local_prune,null_sym))
294 cerr <<
"No match could be found." << endl;
// Reads the vocabulary from the file named by vfile.
// The visible steps: open vfile through ts (its declaration is not in view,
// presumably a token stream - TODO confirm), report an error when open
// returns -1, and finally convert the list vocab_l into the vector vocab.
// The code that fills vocab_l is not in view.
304 static void load_vocab(
const EST_String &vfile)
309 if (ts.
open(vfile) == -1)
311 cerr <<
"can't find vocab file \"" << vfile <<
"\"" << endl;
321 StrList_to_StrVector(vocab_l,vocab);
332 if(distance_measure ==
"simple")
334 if(s1->name() == s2->name())
339 return insertion_cost;
340 else if(s2 == null_sym)
341 return deletion_cost;
343 return substitution_cost;
350 return cost_matrix(StrVector_index(vocab,s1->name()),
351 StrVector_index(vocab,s2->name()));
// Pruning predicate for the alignment grid. For cell (i, j) it measures how
// far the cell lies from the straight line joining the origin to
// (max_i, max_j), and tests that distance against prune_width.
// The statements after the final condition are not in view; presumably the
// taken branch signals that the cell should be pruned - TODO confirm.
355 bool local_prune(
const int i,
const int j,
356 const int max_i,
const int max_j)
// Slope of the line in i-per-j units.
// NOTE(review): no guard is visible for max_j == 0 (division by zero) or
// max_i == 0 (scale becomes 0 and i / scale is then not finite); casting a
// non-finite float to int below would be undefined - confirm callers never
// pass zero bounds.
362 float scale = (float)max_i / (
float)max_j;
// Coordinates on the line that correspond to this row and this column.
364 float near_j = (float)i / scale;
365 float near_i = (float)j * scale;
// Deviations are truncated to int before abs, so fractional parts are dropped
// before the comparison with prune_width.
374 if( (abs((
int)(near_i - (float)i)) > prune_width) ||
375 (abs((
int)(near_j - (
float)j)) > prune_width) )