Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
wfst_build_main.cc
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996,1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : November 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* Build a WFST from some base: */
37 /* 1 a set of context dependent rewrite rules using */
38 /* the algorithms from "An Efficient Compiler for Weighted Rewrite */
39 /* Rules", by Mehryar Mohri and Richard Sproat ACL 1996 */
40 /* and information from the techniques in Ritchie et al. 1992 */
41 /* 2 A regular grammar (but can be written as a CFG as long as it */
42 /* contains no centre embedding) */
43 /* 3 A regular expression */
44 /* 4 lts rules (but that doesn't work yet) */
45 /* */
46 /* or apply some operator on existing wfst(s): compose, concatenate, */
47 /* difference, union, */
48 /* */
49 /* Also allow determinizing and minimization as required */
50 /* */
51 /*=======================================================================*/
52 #include <cstdlib>
53 #include <cstdio>
54 #include <iostream>
55 #include <fstream>
56 #include <cstring>
57 #include "EST.h"
58 #include "EST_WFST.h"
59 
60 static int wfst_build_main(int argc, char **argv);
61 
62 
63 
64 /** @name <command>wfst_build</command> <emphasis>Build a weighted finite-state transducer</emphasis>
65  @id wfst-build-manual
66  * @toc
67  */
68 
69 //@{
70 
71 
72 /**@name Synopsis
73  */
74 //@{
75 
76 //@synopsis
77 
78 /**
79 
80 Build and/or process weighted finite state transducers (WFSTs) from
81 various input formats. This program accepts descriptions
82 in the following formats and converts them to WFSTs
83 <itemizedlist>
84 <listitem><para>regular expressions</para></listitem>
85 <listitem><para>regular grammars</para></listitem>
86 <listitem><para>Koskenniemi/Kay/Kaplan context restriction rules</para></listitem>
87 </itemizedlist>
88 In addition various operations can be performed on two WFSTs
89 <itemizedlist>
90 <listitem><para>compose: form new WFST feeding the output of the first WFST into
91 the second WFST.</para></listitem>
92 <listitem><para>union: form new WFST accepting the language of either WFST
93 </para></listitem>
94 <listitem><para>intersect: form new WFST accepting only the language common
95 to both WFSTs
96 </para></listitem>
97 <listitem><para>concat: form new WFST accepting the language from the
98 concatenation of all strings in the first WFST to all strings in the
99 second.
100 </para></listitem>
101 </itemizedlist>
102 The newly formed WFSTs can be optionally determinized and minimized.
103 
104 The option asis allows a single WFST to be loaded and determinized
105 and/or minimized
106 
107  */
108 
109 //@}
110 
111 /**@name OPTIONS
112  */
113 //@{
114 
115 //@options
116 
117 //@}
118 int main(int argc, char **argv)
119 {
120 
121  wfst_build_main(argc,argv);
122 
123  exit(0);
124  return 0;
125 }
126 
127 static int wfst_build_main(int argc, char **argv)
128 {
129  // Top level function generates a WFST from rules
130  EST_Option al;
131  EST_StrList files;
132  EST_String outfile;
133 
134  parse_command_line
135  (argc, argv,
136  EST_String("[option] [rulefile0] [rulefile1] ...\n")+
137  "Summary: Build a weighted finite state transducer from rules/wfsts\n"+
138  "-type <string> {kk} Input rule type: kk, lts, rg, tl, compose, regex\n"+
139  " union, intersect, concat, asis\n"+
140  "-determinize Determinize WFST before saving it\n"+
141  "-detmin Determinize and minimize WFST before saving it\n"+
142  "-o <ofile> Output file for saved WFST (default stdout)\n"+
143  "-otype <string> {ascii}\n"+
144  " Output type, ascii or binary\n"+
145  "-heap <int> {210000}\n"+
146  " Set size of Lisp heap, needed for large rulesets\n"+
147  "-q Quiet mode, no summary generated\n",
148  files, al);
149 
150  if (al.present("-o"))
151  outfile = al.val("-o");
152  else
153  outfile = "-";
154 
155  siod_init(al.ival("-heap"));
156 
157  LISP ruleset;
158  LISP inalpha, outalpha;
159  EST_WFST *wfst = new EST_WFST;
160  gc_protect(&ruleset);
161 
162  if (al.val("-type") == "kk")
163  {
164  ruleset = car(vload(files(files.head()),1));
165  kkcompile(ruleset,*wfst);
166  }
167  else if (al.val("-type") == "lts")
168  {
169  ruleset = car(vload(files(files.head()),1));
170  ltscompile(ruleset,*wfst);
171  }
172  else if (al.val("-type") == "rg")
173  {
174  ruleset = car(vload(files(files.head()),1));
175  rgcompile(ruleset,*wfst);
176  }
177  else if (al.val("-type") == "tl")
178  {
179  ruleset = car(vload(files(files.head()),1));
180  tlcompile(ruleset,*wfst);
181  }
182  else if (al.val("-type") == "asis")
183  {
184  if (wfst->load(files.nth(0)) != format_ok) exit(-1);
185  }
186  else if (al.val("-type") == "compose")
187  {
188  EST_WFST a,b;
189 
190  if (files.length() != 2)
191  EST_error("compose requires two WFSTs to combine");
192 
193  if (a.load(files.nth(0)) != format_ok) exit(-1);
194  if (b.load(files.nth(1)) != format_ok) exit(-1);
195 
196  wfst->compose(a,b);
197  }
198  else if (al.val("-type") == "union")
199  {
200  EST_WFST a,b;
201 
202  if (files.length() != 2)
203  EST_error("union requires two WFSTs to combine");
204 
205  if (a.load(files.nth(0)) != format_ok) exit(-1);
206  if (b.load(files.nth(1)) != format_ok) exit(-1);
207 
208  wfst->uunion(a,b);
209  }
210  else if (al.val("-type") == "intersect")
211  {
212  EST_WFST a,b;
213 
214  if (files.length() != 2)
215  EST_error("intersect requires two WFSTs to combine");
216  if (a.load(files.nth(0)) != format_ok) exit(-1);
217  if (b.load(files.nth(1)) != format_ok) exit(-1);
218 
219  wfst->intersection(a,b);
220  }
221  else if (al.val("-type") == "concat")
222  {
223  EST_WFST a,b;
224 
225  if (files.length() != 2)
226  EST_error("concat requires two WFSTs to combine");
227  if (a.load(files.nth(0)) != format_ok) exit(-1);
228  if (b.load(files.nth(1)) != format_ok) exit(-1);
229 
230  wfst->concat(a,b);
231  }
232  else if (al.val("-type") == "difference")
233  {
234  EST_WFST a,b;
235 
236  if (files.length() != 2)
237  EST_error("difference requires two WFSTs to combine");
238  if (a.load(files.nth(0)) != format_ok) exit(-1);
239  if (b.load(files.nth(1)) != format_ok) exit(-1);
240 
241  wfst->difference(a,b);
242  }
243  else if (al.val("-type") == "regex")
244  {
245  ruleset = car(vload(files(files.head()),1));
246  inalpha = siod_nth(0,ruleset);
247  outalpha = siod_nth(1,ruleset);
248  wfst->build_from_regex(inalpha,outalpha,car(cdr(cdr(ruleset))));
249  }
250  else
251  {
252  cerr << "wfst_build: unknown rule type \"" << al.val("-type")
253  << "\"" << endl;
254  exit(-1);
255  }
256 
257  if (al.present("-determinize"))
258  {
259  EST_WFST *dwfst = new EST_WFST;
260  dwfst->determinize(*wfst);
261  if (!al.present("-q"))
262  {
263  cout << "wfst_build summary: " << endl;
264  cout << " non-deterministic wfst: " <<
265  wfst->summary() << endl;
266  cout << " deterministic wfst: " <<
267  dwfst->summary() << endl;
268  }
269  delete wfst;
270  wfst = dwfst;
271  }
272  else if (al.present("-detmin"))
273  {
274  if (!al.present("-q"))
275  {
276  cout << "wfst_build summary: " << endl;
277  cout << " non-deterministic wfst: " <<
278  wfst->summary() << endl;
279  }
280  EST_WFST *dwfst = new EST_WFST;
281  dwfst->determinize(*wfst);
282  delete wfst;
283  if (!al.present("-q"))
284  cout << " deterministic wfst: " <<
285  dwfst->summary() << endl;
286  EST_WFST *mwfst = new EST_WFST;
287  mwfst->minimize(*dwfst);
288  if (!al.present("-q"))
289  cout << " minimized wfst: " <<
290  mwfst->summary() << endl;
291  delete dwfst;
292  wfst = mwfst;
293  }
294  else
295  {
296  if (!al.present("-q"))
297  cout << "wfst_build: " << wfst->summary() << endl;
298  }
299 
300  wfst->save(outfile,al.val("-otype"));
301  delete wfst;
302  gc_unprotect(&ruleset);
303 
304  return 0;
305 }
306