Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
pitchmark_main.cc
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Paul Taylor */
34 /* Date : 1997, 1998, 1999 */
35 /*-----------------------------------------------------------------------*/
36 /* Pitchmarking program */
37 /*************************************************************************/
38 
39 #include <cstdlib>
40 #include <iostream>
41 #include <fstream>
42 #include "EST_unix.h"
43 #include "EST_cmd_line_options.h"
44 #include "EST_cmd_line.h"
45 #include "EST_speech_class.h"
46 #include "sigpr/EST_pitchmark.h"
47 
48 
49 void set_options(EST_Features &op, EST_Option &al);
50 
51 static EST_write_status save_msec(EST_Track &pm, EST_String filename);
52 static EST_write_status save_ogi_bin(EST_Track &pm, EST_String filename,
53  int sr);
54 void pm_to_label(EST_Track &pm, EST_Relation &lab);
55 
56 
57 /*void pm_to_label(EST_Track &pm, EST_Relation &lab);
58 void find_pm(EST_Wave &sig, EST_Track &pm);
59 
60 void pm_min_check(EST_Track &pm, float min);
61 void pm_sanity_check(EST_Track &pm, float new_end,
62  float max, float min, float def);
63 
64 void pm_fill(EST_Track &pm, float new_end, float max,
65  float min, float def);
66 
67 void pm_to_f0(EST_Track &pm, EST_Track &f0);
68 */
69 
70 
71 /** @name <command> pitchmark </command> <emphasis> Find instants of glottal closure in Laryngograph file</emphasis>
72 
73  * @id pitchmark-manual
74  * @toc */
75 
76 //@{
77 
78 
79 /**@name Synopsis
80  */
81 //@{
82 
83 //@synopsis
84 
85 /**
86 <command>pitchmark</command> locates instants of glottal closure in a
87 laryngograph waveform, and performs post-processing to produce even
88 pitchmarks. EST does not currently provide any means of pitchmarking a
89 speech waveform.
90 
91 Pitchmarking is performed by calling the
92 <function>pitchmark()</function> function, which carries out the
93 following operations:
94 
95 <orderedlist> <listitem><para>Double low pass filter the signal. This
96 removes noise in the signal. The parameter
97 <parameter>lx_lf</parameter> specifies the low pass cutoff frequency,
98 and <parameter>lx_lo</parameter> specifies the order. Double filtering
99 (feeding the waveform through the filter, then reversing the waveform
100 and feeding it through again) is performed to reduce any phase shift
101 between the input and output of the filtering operation.
102 </para></listitem>
103 
104 <listitem><para>Double high pass filter the signal. This removes the
105 very low frequency swell that is often observed in laryngograph
106 waveforms. The parameter <parameter>lx_hf</parameter> specifies the high pass cutoff frequency,
107 and <parameter>lx_ho</parameter> specifies the order.
108 Double filtering is performed to reduce any phase shift
109 between the input and output of the filtering operation.
110 </para></listitem>
111 
112 <listitem><para>Calculate the delta signal. The filtered waveform is
113 differentiated using the <function>delta()</function>
114 function.</para></listitem>
115 
116 <listitem><para>Low pass filter the delta signal. Some noise may still
117 be present in the signal, and this is removed by further low pass
118 filtering. Experimentation has shown that simple mean smoothing is
119 often more effective than FIR smoothing at this point. The parameter
120 <parameter>mo</parameter> is used to specify the size of the mean
121 smoothing window. If FIR smoothing is chosen, the parameter
122 <parameter>df_lf</parameter> specifies the low pass cutoff frequency,
123 and <parameter>df_lo</parameter> specifies the order. Double filtering
124 is again used to avoid phase distortion.
125 
126 </para></listitem>
127 
128 <listitem><para>Pick zero crossings. Now simple zero-crossing is used
129 to find the pitchmarks themselves. </para></listitem>
130 
131 </orderedlist>
132 
133 <command>pitchmark</command> also performs post-processing on the pitchmarks.
134 This can be used to eliminate pitchmarks which occur too closely together,
135 or to provide estimated evenly spaced pitchmarks during unvoiced regions.
136 The -fill option switches <action>this facility on</action>,
137 and -min, -max, -def,
138 -end and -wave_end control its operation.
139 
140 */
141 
142 //@}
143 
144 /**@name OPTIONS
145  */
146 //@{
147 
148 //@options
149 
150 //@}
151 
152 
153 int main (int argc, char *argv[])
154 {
155  EST_Track pm;
156  EST_Wave lx;
157  EST_Option al;
158  EST_Features op;
159  EST_String out_file("-");
160  EST_StrList files;
161 
162  parse_command_line
163  (argc, argv,
164  EST_String("[input file] -o [output file] [options]")+
165  "Summary: pitchmark laryngograph (lx) files\n"
166  "use \"-\" to make input and output files stdin/out\n"
167  "-h Options help\n\n"+
168  options_wave_input()+
169  options_track_output()+
170  "-lx_lf <int> lx low frequency cutoff\n\n"
171  "-lx_lo <int> lx low order\n\n"
172  "-lx_hf <int> lx high frequency cutoff\n\n"
173  "-lx_ho <int> lx high order\n\n"
174  "-df_lf <int> df low frequeny cutoff\n\n"
175  "-df_lo <int> df low order\n\n"
176  "-med_o <int> median smoothing order\n\n"
177  "-mean_o <int> mean smoothing order\n\n"
178  "-inv Invert polarity of lx signal. Often the lx signal \n"
179  " is upside down. This option inverts the signal prior to \n"
180  " processing.\n\n"
181  "-fill Insert and remove pitchmarks according to min, max\n"
182  " and def period values. Often it is desirable to place limits\n"
183  " on the values of the pitchmarks. This option enforces a \n"
184  " minimum and maximum pitch period (specified by -man and -max).\n"
185  " If the maximum pitch setting is low enough, this will \n"
186  " esnure that unvoiced regions have evenly spaced pitchmarks \n\n"
187  "-min <float> Minimum allowed pitch period, in seconds\n\n"
188  "-max <float> Maximum allowed pitch period, in seconds\n\n"
189  "-def <float> Default pitch period in seconds, used for a guide\n"
190  " as to what length pitch periods should be in unvoiced \n"
191  " sections \n\n"
192  "-pm <ifile> Input is raw pitchmark file. This option is \n"
193  " used to perform filling operations on an already existing \n"
194  " set of pitchmarks \n\n"
195  "-f0 <ofile> Calculate F0 from pitchmarks and save to file\n\n"
196  "-end <float> Specify the end time of the last pitchmark, for use \n"
197  " with the -fill option\n\n"
198  "-wave_end Use the end of a waveform to specify when the \n"
199  " last pitchmark position should be. The waveform file is only \n"
200  " read to determine its end, no processing is performed\n\n"
201  "-inter Output intermediate waveforms. This will output the \n"
202  " signal at various stages of processing. Examination of these \n"
203  " waveforms is extremely useful in setting the parameters for \n"
204  " similar waveforms\n\n"
205  "-style <string> \"track\" or \"lab\"\n\n", files, al);
206 
207  set_options(op, al);
208 
209  out_file = al.present("-o") ? al.val("-o") : (EST_String)"-";
210 
211  if (!al.present("-pm") || (al.present("-pm") && al.present("-wave_end")))
212  if (read_wave(lx, files.first(), al) != read_ok)
213  exit(-1);
214 
215  if (al.present("-pm"))
216  pm.load(al.val("-pm"));
217  else
218  {
219  if (al.present("-inv"))
220  invert(lx);
221  pm = pitchmark(lx, op);
222  }
223 
224  // this allows the end to be aligned with the end of a waveform
225  op.set("pm_end", lx.end());
226 
227  if (al.present("-f0"))
228  {
229  EST_Track f0;
230  pm_to_f0(pm, f0);
231  f0.save(al.val("-f0"));
232  }
233 
234  // various options for filling he gaps between distant pitchmarks
235  // and removing pitchmarks that are too close together
236 
237  if (al.present("-fill"))
238  {
239  pm_fill(pm, op.F("pm_end"), op.F("max_period"),
240  op.F("min_period"), op.F("def_period"));
241  pm_fill(pm, op.F("pm_end"), op.F("max_period"),
242  op.F("min_period"), op.F("def_period"));
243  }
244  else if (al.present("-min"))
245  pm_min_check(pm, al.fval("-min"));
246 
247  if (al.present("-style"))
248  {
249  // label format
250  if (al.val("-style").contains("lab"))
251  {
252  EST_Relation lab;
253  pm_to_label(pm, lab);
254  if (lab.save(out_file + ".pm_lab") != write_ok)
255  exit(-1);
256  }
257  // save file in "traditional" milli-second format
258  if (al.val("-style").contains("msec"))
259  save_msec(pm, out_file + ".pm");
260 
261  // ogi binary integer sample point format
262  if (al.val("-style").contains("ogi_bin"))
263  save_ogi_bin(pm, out_file + ".pmv", lx.sample_rate());
264  }
265  else if (pm.save(out_file, al.val("-otype", 0)) != write_ok)
266  {
267  cerr << "pitchmark: failed to write output to \""
268  << out_file << "\"" << endl;
269  exit(-1);
270  }
271  return 0;
272 }
273 
274 static EST_write_status save_msec(EST_Track &pm, EST_String filename)
275 {
276  ostream *outf;
277 
278  if (filename == "-")
279  outf = &cout;
280  else
281  outf = new ofstream(filename);
282 
283  if (!(*outf))
284  return write_fail;
285 
286  outf->precision(5);
287  outf->setf(ios::fixed, ios::floatfield);
288  outf->width(8);
289 
290  for (int i = 0; i < pm.num_frames(); ++i)
291  *outf << pm.t(i) * 1000.0 << endl;
292 
293  return write_ok;
294 }
295 
296 static EST_write_status save_ogi_bin(EST_Track &pm, EST_String filename, int sr)
297 {
298  int *d;
299  FILE *fp;
300  int i;
301 
302  d = new int[pm.num_frames()];
303 
304  for (i = 0; i < pm.num_frames(); ++i)
305  d[i] = int(pm.t(i) * (float) sr);
306 
307  if ((fp = fopen(filename, "wb")) == NULL)
308  return misc_write_error;
309 
310  if (fwrite(d, pm.num_frames(), sizeof(int), fp) != 1)
311  {
312  fclose(fp);
313  return misc_write_error;
314  }
315  delete d;
316 
317  return write_ok;
318 }
319 
320 void override_lib_ops(EST_Option &op, EST_Option &al)
321 {
322  op.override_ival("lx_low_frequency", 400);
323  op.override_ival("lx_low_order", 19);
324  op.override_ival("lx_high_frequency", 40);
325  op.override_ival("lx_high_order", 19);
326  op.override_ival("df_low_frequency", 1000);
327  op.override_ival("df_low_order", 19);
328  op.override_fval("min_period", 0.003);
329  op.override_fval("max_period", 0.02);
330  op.override_fval("def_period", 0.01);
331  op.override_fval("pm_end", -1.0);
332 
333  if (al.present("-lx_lf"))
334  op.override_ival("lx_low_frequency", al.ival("-lx_lf", 0));
335  if (al.present("-lx_lo"))
336  op.override_ival("lx_low_order", al.ival("-lx_lo", 0));
337  if (al.present("-lx_hf"))
338  op.override_ival("lx_high_frequency", al.ival("-lx_hf", 0));
339  if (al.present("-lx_ho"))
340  op.override_ival("lx_high_order", al.ival("-lx_ho", 0));
341  if (al.present("-med_o"))
342  op.override_ival("median_order", al.ival("-med_o", 0));
343  if (al.present("-mean_o"))
344  op.override_ival("mean_order", al.ival("-mean_o", 0));
345  if (al.present("-df_lf"))
346  op.override_ival("df_low_frequency", al.ival("-df_lf", 0));
347  if (al.present("-df_lo"))
348  op.override_ival("df_low_order", al.ival("-df_lo", 0));
349  if (al.present("-min"))
350  op.override_fval("min_period", al.fval("-min", 0));
351  if (al.present("-max"))
352  op.override_fval("max_period", al.fval("-max", 0));
353  if (al.present("-def"))
354  op.override_fval("def_period", al.fval("-def", 0));
355  if (al.present("-end"))
356  op.override_fval("pm_end", al.fval("-end", 0));
357  if (al.present("-inter"))
358  op.override_ival("pm_debug", 1);
359 }
360 
361 void set_options(EST_Features &op, EST_Option &al)
362 {
363  op.set("lx_low_frequency", LX_LOW_FREQUENCY);
364  op.set("lx_low_order", LX_LOW_ORDER);
365  op.set("lx_high_frequency", LX_HIGH_FREQUENCY);
366  op.set("lx_high_order", LX_HIGH_ORDER);
367  op.set("df_low_frequency", DF_LOW_FREQUENCY);
368  op.set("df_low_order", DF_LOW_ORDER);
369  op.set("min_period", MIN_PERIOD);
370  op.set("max_period", MAX_PERIOD);
371  op.set("def_period", DEF_PERIOD);
372  op.set("pm_end", PM_END);
373 
374  if (al.present("-lx_lf"))
375  op.set("lx_low_frequency", al.ival("-lx_lf", 0));
376  if (al.present("-lx_lo"))
377  op.set("lx_low_order", al.ival("-lx_lo", 0));
378  if (al.present("-lx_hf"))
379  op.set("lx_high_frequency", al.ival("-lx_hf", 0));
380  if (al.present("-lx_ho"))
381  op.set("lx_high_order", al.ival("-lx_ho", 0));
382  if (al.present("-med_o"))
383  op.set("median_order", al.ival("-med_o", 0));
384  if (al.present("-mean_o"))
385  op.set("mean_order", al.ival("-mean_o", 0));
386  if (al.present("-df_lf"))
387  op.set("df_low_frequency", al.ival("-df_lf", 0));
388  if (al.present("-df_lo"))
389  op.set("df_low_order", al.ival("-df_lo", 0));
390  if (al.present("-min"))
391  op.set("min_period", al.fval("-min", 0));
392  if (al.present("-max"))
393  op.set("max_period", al.fval("-max", 0));
394  if (al.present("-def"))
395  op.set("def_period", al.fval("-def", 0));
396  if (al.present("-end"))
397  op.set("pm_end", al.fval("-end", 0));
398  if (al.present("-inter"))
399  op.set("pm_debug", 1);
400 }
401 
402 /** @name Examples
403 </para>
404 <formalpara><title>Basic Pitchmarking</title>
405 <para>
406 <screen>
407 $ pitchmark kdt_010.lar -o kdt_010.pm -otype est
408 </screen>
409 </para>
410 </formalpara>
411 
412 <formalpara><title>Pitchmarking with unvoiced regions
413 filled</title> <para> The following fills unvoiced regions with pitch
414 periods that are about 0.01 seconds long. It also post-processes the
415 set of pitchmarks and ensures that none are above 0.02 seconds long and
416 none below 0.003. A final unvoiced region extending to the end of the
417 wave is specified by using the -wave_end option.
418 </para> </formalpara><para>
419 <screen>
420 $ pitchmark kdt_010.lar -o kdt_010.pm -otype est -fill -min 0.003 \
421  -max 0.02 -def 0.01 -wave_end
422 </screen>
423 
424 */
425 
426 //@{
427 //@}