Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
EST_Token.cc
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : April 1996 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* A Tokenize class, both for Tokens (Strings plus alpha) */
38 /* EST_TokenStream for strings, FILE *, files, pipes etc */
39 /* */
40 /*=======================================================================*/
41 #include <cstdio>
42 #include <iostream>
43 #include "EST_unix.h"
44 #include <cstdlib>
45 #include <climits>
46 #include <cstring>
47 #include "EST_math.h"
48 #include "EST_Token.h"
49 #include "EST_string_aux.h"
50 #include "EST_cutils.h"
51 #include "EST_error.h"
52 
53 const EST_String EST_Token_Default_WhiteSpaceChars = " \t\n\r";
54 const EST_String EST_Token_Default_SingleCharSymbols = "(){}[]";
55 const EST_String EST_Token_Default_PrePunctuationSymbols = "\"'`({[";
56 const EST_String EST_Token_Default_PunctuationSymbols = "\"'`.,:;!?]})";
57 const EST_String Token_Origin_FD = "existing file descriptor";
58 const EST_String Token_Origin_Stream = "existing istream";
59 const EST_String Token_Origin_String = "existing string";
60 
61 static EST_Regex RXanywhitespace("[ \t\n\r]");
62 
static inline char *check_extend_str_in(char *str, int pos, int *max)
{
    // Make sure the heap buffer `str` (capacity *max, allocated with
    // new[]) can be written at index `pos`.
    //
    // Returns `str` untouched when pos < *max.  Otherwise a larger
    // buffer is allocated, the old contents are copied across, the old
    // buffer is deleted, *max is updated to the new capacity and the new
    // buffer is returned.  The caller must replace its pointer with the
    // returned value.
    if (pos < *max)
        return str;

    const int old_cap = *max;
    int new_cap = (pos > old_cap) ? 2 * pos : 2 * old_cap;
    if (new_cap <= pos)         // e.g. capacity 0: doubling would not grow
        new_cap = pos + 1;

    char *grown = new char[new_cap];

    // Copy raw bytes, never more than the old buffer actually held.
    // (strncpy would stop at the first NUL and zero the remainder, and
    // with pos > old capacity could read past the end of the old buffer.)
    const int keep = (pos < old_cap) ? pos : old_cap;
    if (keep > 0)
        memcpy(grown, str, (size_t)keep);

    delete [] str;
    *max = new_cap;
    return grown;
}

// Inline fast path: only call the function when growth is really needed.
#define check_extend_str(STR, POS, MAX) \
	(((POS)>= *(MAX))?check_extend_str_in((STR),(POS),(MAX)):(STR))
86 
87 ostream& operator<<(ostream& s, const EST_Token &p)
88 {
89  s << "[TOKEN " << p.pname << "]";
90  return s;
91 }
92 
93 
94 EST_Token &EST_Token::operator = (const EST_Token &a)
95 {
96  linenum = a.linenum;
97  linepos = a.linepos;
98  p_filepos = a.p_filepos;
99  p_quoted = a.p_quoted;
100  space = a.space;
101  prepunc = a.prepunc;
102  pname = a.pname;
103  punc = a.punc;
104  return *this;
105 }
106 
108 {
109  return "line "+itoString(linenum)+" char "+itoString(linepos);
110 }
111 
112 EST_Token &EST_Token::operator = (const EST_String &a)
113 {
114  pname = a;
115  return *this;
116 }
117 
118 EST_TokenStream::EST_TokenStream()
119 {
120  tok_wspacelen = 64; // will grow if necessary
121  tok_wspace = new char[tok_wspacelen];
122  tok_stufflen = 512; // will grow if necessary
123  tok_stuff = new char[tok_stufflen];
124  tok_prepuncslen = 32; // will grow if necessary
125  tok_prepuncs = new char[tok_prepuncslen];
126 
127  default_values();
128 }
129 
130 EST_TokenStream::EST_TokenStream(EST_TokenStream &s)
131 {
132  (void)s;
133 
134  cerr << "TokenStream: warning passing TokenStream not as reference"
135  << endl;
136 
137  // You *really* shouldn't use this AT ALL unless you
138  // fully understand its consequences, you'll be copying open
139  // files and moving file pointers all over the place
140  // basically *DON'T* do this, pass the stream by reference
141 
142  // Now there may be occasions when you do want to do this for example
143  // when you need to do far look ahead or check point as you read
144  // but they are obscure and I'm not sure how to do that for all
145  // the file forms supported by the TokenStream. If you do
146  // I can write a clone function that might do it.
147 
148 }
149 
150 void EST_TokenStream::default_values()
151 {
152  type = tst_none;
153  peeked_tokp = FALSE;
154  peeked_charp = FALSE;
155  eof_flag = FALSE;
156  quotes = FALSE;
157  p_filepos = 0;
158  linepos = 1;
159  WhiteSpaceChars = EST_Token_Default_WhiteSpaceChars;
160  SingleCharSymbols = EST_String::Empty;
161  PrePunctuationSymbols = EST_String::Empty;
162  PunctuationSymbols = EST_String::Empty;
163  build_table();
164  close_at_end=TRUE;
165 }
166 
168 {
169  if (type != tst_none)
170  close();
171  delete [] tok_wspace;
172  delete [] tok_stuff;
173  delete [] tok_prepuncs;
174 
175 }
176 
177 ostream& operator<<(ostream& s, EST_TokenStream &p)
178 {
179  s << "[TOKENSTREAM ";
180  switch (p.type)
181  {
182  case tst_none:
183  cerr << "UNSET"; break;
184  case tst_file:
185  cerr << "FILE"; break;
186  case tst_pipe:
187  cerr << "PIPE"; break;
188  case tst_istream:
189  cerr << "ISTREAM"; break;
190  case tst_string:
191  cerr << "STRING"; break;
192  default:
193  cerr << "UNKNOWN" << endl;
194  }
195  s << "]";
196 
197  return s;
198 }
199 
200 int EST_TokenStream::open(const EST_String &filename)
201 {
202  if (type != tst_none)
203  close();
204  default_values();
205  fp = fopen(filename,"rb");
206  if (fp == NULL)
207  {
208  cerr << "Cannot open file " << filename << " as tokenstream"
209  << endl;
210  return -1;
211  }
212  Origin = filename;
213  type = tst_file;
214 
215  return 0;
216 }
217 
218 int EST_TokenStream::open(FILE *ofp, int close_when_finished)
219 {
220  // absorb already open stream
221  if (type != tst_none)
222  close();
223  default_values();
224  fp = ofp;
225  if (fp == NULL)
226  {
227  cerr << "Cannot absorb NULL filestream as tokenstream" << endl;
228  return -1;
229  }
230  Origin = Token_Origin_FD;
231  type = tst_file;
232 
233  close_at_end = close_when_finished;
234 
235  return 0;
236 }
237 
238 int EST_TokenStream::open(istream &newis)
239 {
240  // absorb already open istream
241  if (type != tst_none)
242  close();
243  default_values();
244  is = &newis;
245  Origin = Token_Origin_Stream;
246  type = tst_istream;
247 
248  return 0;
249 }
250 
252 {
253  // Make a tokenstream from an internal existing string/buffer
254  const char *buf;
255  if (type != tst_none)
256  close();
257  default_values();
258  buf = (const char *)newbuffer;
259  buffer_length = newbuffer.length();
260  buffer = new char[buffer_length+1];
261  memmove(buffer,buf,buffer_length+1);
262  pos = 0;
263  Origin = Token_Origin_String;
264  type = tst_string;
265 
266  return 0;
267 }
268 
269 int EST_TokenStream::seek_end()
270 {
271  // This isn't actually useful but people expect it
272  peeked_charp = FALSE;
273  peeked_tokp = FALSE;
274 
275  switch (type)
276  {
277  case tst_none:
278  cerr << "EST_TokenStream unset" << endl;
279  return -1;
280  break;
281  case tst_file:
282  fseek(fp,0,SEEK_END);
283  p_filepos = ftell(fp);
284  return p_filepos;
285  case tst_pipe:
286  cerr << "EST_TokenStream seek on pipe not supported" << endl;
287  return -1;
288  break;
289  case tst_istream:
290  cerr << "EST_TokenStream seek on istream not yet supported" << endl;
291  return -1;
292  break;
293  case tst_string:
294  pos = buffer_length;
295  return pos;
296  default:
297  cerr << "EST_TokenStream: unknown type" << endl;
298  return -1;
299  }
300 
301  return -1; // can't get here
302 }
303 
304 int EST_TokenStream::seek(int position)
305 {
306  peeked_charp = FALSE;
307  peeked_tokp = FALSE;
308 
309  switch (type)
310  {
311  case tst_none:
312  cerr << "EST_TokenStream unset" << endl;
313  return -1;
314  break;
315  case tst_file:
316  p_filepos = position;
317  return fseek(fp,position,SEEK_SET);
318  case tst_pipe:
319  cerr << "EST_TokenStream seek on pipe not supported" << endl;
320  return -1;
321  break;
322  case tst_istream:
323  cerr << "EST_TokenStream seek on istream not yet supported" << endl;
324  return -1;
325  break;
326  case tst_string:
327  if (position >= pos)
328  {
329  pos = position;
330  return -1;
331  }
332  else
333  {
334  pos = position;
335  return 0;
336  }
337  break;
338  default:
339  cerr << "EST_TokenStream: unknown type" << endl;
340  return -1;
341  }
342 
343  return -1; // can't get here
344 
345 }
346 
// File-scope wrapper round the C library fread().  Inside
// EST_TokenStream member functions the name `fread` refers to the
// member function, so they call this instead.
static int stdio_fread(void *buff,int size,int nitems,FILE *fp)
{
    const size_t items_done = ::fread(buff, (size_t)size, (size_t)nitems, fp);
    return (int)items_done;
}
352 
353 int EST_TokenStream::fread(void *buff, int size, int nitems)
354 {
355  // switching into binary mode for current position
356  int items_read;
357 
358  // so we can continue to read afterwards
359  if (peeked_tokp)
360  {
361  cerr << "ERROR " << pos_description()
362  << " peeked into binary data" << endl;
363  return 0;
364  }
365 
366  peeked_charp = FALSE;
367  peeked_tokp = FALSE;
368 
369  switch (type)
370  {
371  case tst_none:
372  cerr << "EST_TokenStream unset" << endl;
373  return 0;
374  break;
375  case tst_file:
376  items_read = stdio_fread(buff,(size_t)size,(size_t)nitems,fp);
377  p_filepos += items_read*size;
378  return items_read;
379  case tst_pipe:
380  cerr << "EST_TokenStream fread pipe not yet supported" << endl;
381  return 0;
382  break;
383  case tst_istream:
384  cerr << "EST_TokenStream fread istream not yet supported" << endl;
385  return 0;
386  case tst_string:
387  if ((buffer_length-pos)/size < nitems)
388  items_read = (buffer_length-pos)/size;
389  else
390  items_read = nitems;
391  memcpy(buff,&buffer[pos],items_read*size);
392  pos += items_read*size;
393  return items_read;
394  default:
395  cerr << "EST_TokenStream: unknown type" << endl;
396  return EOF;
397  }
398 
399  return 0; // can't get here
400 
401 }
402 
404 {
405  // close any files (if they were used)
406 
407  switch (type)
408  {
409  case tst_none:
410  break;
411  case tst_file:
412  if (close_at_end)
413  fclose(fp);
414  case tst_pipe:
415  // close(fd);
416  break;
417  case tst_istream:
418  break;
419  case tst_string:
420  delete [] buffer;
421  buffer = 0;
422  break;
423  default:
424  cerr << "EST_TokenStream: unknown type" << endl;
425  break;
426  }
427 
428  type = tst_none;
429  peeked_charp = FALSE;
430  peeked_tokp = FALSE;
431 
432 }
433 
435 {
436  // For paul, the only person I know who uses this
437 
438  switch (type)
439  {
440  case tst_none:
441  break;
442  case tst_file:
443  fp = freopen(Origin,"rb",fp);
444  p_filepos = 0;
445  break;
446  case tst_pipe:
447  cerr << "EST_TokenStream: can't rewind pipe" << endl;
448  return -1;
449  break;
450  case tst_istream:
451  cerr << "EST_TokenStream: can't rewind istream" << endl;
452  break;
453  case tst_string:
454  pos = 0;
455  break;
456  default:
457  cerr << "EST_TokenStream: unknown type" << endl;
458  break;
459  }
460 
461  linepos = 1;
462  peeked_charp = FALSE;
463  peeked_tokp = FALSE;
464  eof_flag = FALSE;
465 
466  return 0;
467 }
468 
469 EST_TokenStream & EST_TokenStream::operator >>(EST_Token &p)
470 {
471  return get(p);
472 }
473 
474 EST_TokenStream & EST_TokenStream::operator >>(EST_String &p)
475 {
476  EST_Token t;
477 
478  get(t);
479  p = t.string();
480  return *this;
481 }
482 
484 {
485  tok = get();
486  return *this;
487 }
488 
490 {
491  // Returns a concatenated token form here to next symbol that matches s
492  // including s (though not adding s on the result)
493  // Not really for the purist but lots of times very handy
494  // Note this is not very efficient
495  EST_String result;
496  EST_Token t;
497 
498  for (result=EST_String::Empty; (t=get()) != s; )
499  {
500  result += t.whitespace() + t.prepunctuation() +
501  t.string() + t.punctuation();
502  if (eof())
503  {
504  cerr << "EST_TokenStream: end of file when looking for \"" <<
505  s << "\"" << endl;
506  break;
507  }
508  }
509 
510  return EST_Token(result);
511 }
512 
514 {
515  // Swallow the lot up to end of line
516  // assumes \n is a whitespace character
517 
519 
520  while (!eoln())
521  {
522  EST_Token &t=get();
523  result += t.whitespace() + t.prepunctuation();
524 
525  if (quotes)
526  result += quote_string(t.string());
527  else
528  result += t.string();
529 
530  result += t.punctuation();
531 
532  if (eof())
533  {
534 // cerr << "EST_TokenStream: end of file when looking for end of line"
535 // << endl;
536  break;
537  }
538  }
539  // So that the next call works I have to step over the eoln condition
540  // That involves removing the whitespace upto and including the next
541  // \n in the peek token.
542 
543  char *w = wstrdup(peek().whitespace());
544  int i;
545  for (i=0; w[i] != 0; i++)
546  if (w[i] == '\n') // maybe not portable
547  peek().set_whitespace(&w[i+1]);
548 
549  wfree(w);
550 
551  static EST_Token result_t;
552 
553  result_t.set_token(result);
554 
555  return result_t;
556 }
557 
558 EST_Token &EST_TokenStream::must_get(EST_String expected, bool *ok)
559 {
560  EST_Token &tok = get();
561 
562  if (tok != expected)
563  {
564  if (ok != NULL)
565  {
566  *ok=FALSE;
567  return tok;
568  }
569  else
570  EST_error("Expected '%s' got '%s' at %s",
571  (const char *)expected,
572  (const char *)(EST_String)tok,
573  (const char *)pos_description());
574  }
575 
576  if (ok != NULL)
577  *ok=TRUE;
578  return tok;
579 }
580 
581 void EST_TokenStream::build_table()
582 {
583  int i;
584  const char *p;
585  unsigned char c;
586 
587  for (i=0; i<256; ++i)
588  p_table[i]=0;
589 
590  for (p=WhiteSpaceChars; *p; ++p)
591  if (p_table[c=(unsigned char)*p])
592  EST_warning("Character '%c' has two classes, '%c' and '%c'",
593  *p, c, ' ');
594  else
595  p_table[c] = ' ';
596 
597  for (p=SingleCharSymbols; *p; ++p)
598  if (p_table[c=(unsigned char)*p])
599  EST_warning("Character '%c' has two classes, '%c' and '%c'",
600  *p, p_table[c], '!');
601  else
602  p_table[c] = '@';
603 
604  for (p=PunctuationSymbols; *p; ++p)
605  if (p_table[c=(unsigned char)*p] == '@')
606  continue;
607  else if (p_table[c])
608  EST_warning("Character '%c' has two classes, '%c' and '%c'",
609  *p, p_table[c], '.');
610  else
611  p_table[c] = '.';
612 
613  for(p=PrePunctuationSymbols; *p; ++p)
614  if (p_table[c=(unsigned char)*p] == '@')
615  continue;
616  else if (p_table[c] == '.')
617  p_table[c] = '"';
618  else if (p_table[c])
619  EST_warning("Character '%c' has two classes, '%c' and '%c'",
620  *p, p_table[c], '$');
621  else
622  p_table[c] = '$';
623 
624  p_table_wrong=0;
625 }
626 
627 inline int EST_TokenStream::getpeeked_internal(void)
628 {
629  peeked_charp = FALSE;
630  return peeked_char;
631 }
632 
633 inline
634 int EST_TokenStream::getch_internal()
635 {
636  // Return next character in stream
637  if (EST_TokenStream::peeked_charp)
638  {
639  return getpeeked_internal();
640  }
641 
642  switch (type)
643  {
644  case tst_none:
645  cerr << "EST_TokenStream unset" << endl;
646  return EOF;
647  break;
648  case tst_file:
649  p_filepos++;
650  {
651  char lc;
652  if (stdio_fread(&lc,1,1,fp) == 0)
653  return EOF;
654  else
655  return (int)lc;
656  }
657 /* return getc(fp); */
658  case tst_pipe:
659  cerr << "EST_TokenStream pipe not yet supported" << endl;
660  return EOF;
661  break;
662  case tst_istream:
663  p_filepos++;
664  return is->get();
665  case tst_string:
666  if (pos < buffer_length)
667  {
668  p_filepos++;
669  return buffer[pos++];
670  }
671  else
672  return EOF;
673  default:
674  cerr << "EST_TokenStream: unknown type" << endl;
675  return EOF;
676  }
677 
678  return EOF; // can't get here
679 }
680 
681 int EST_TokenStream::getch(void)
682 {
683  return getch_internal();
684 }
685 
686 inline int EST_TokenStream::peekch_internal()
687 {
688  // Return next character in stream (without reading it)
689 
690  if (!peeked_charp)
691  peeked_char = getch_internal();
692  peeked_charp = TRUE;
693  return peeked_char;
694 }
695 
696 
697 int EST_TokenStream::peekch(void)
698 {
699  return peekch_internal();
700 
701 }
702 
703 #define CLASS(C,CL) (p_table[(unsigned char)(C)]==(CL))
704 
705 #define CLASS2(C,CL1,CL2) (p_table[(unsigned char)(C)]==(CL1)||p_table[(unsigned char)(C)]==(CL2))
706 
708 {
709  if (peeked_tokp)
710  {
711  peeked_tokp = FALSE;
712  return current_tok;
713  }
714 
715  if (p_table_wrong)
716  build_table();
717 
718  char *word;
719  int c,i,j;
720 
721  for (i=0; (CLASS(c=getch_internal(),' ') &&
722  ( c != EOF )); i++)
723  {
724  if (c == '\n') linepos++;
725  tok_wspace = check_extend_str(tok_wspace,i,&tok_wspacelen);
726  tok_wspace[i] = c;
727  }
728  tok_wspace[i] = '\0';
729 
730  current_tok.init();
731 
732  if (c != EOF)
733  {
734  current_tok.set_filepos(p_filepos-1);
735 
736  if ((quotes) && // quoted strings (with escapes) are allowed
737  (c == quote))
738  {
739  for (i=0;
740  ((c = getch_internal()) != EOF)
741  ;)
742  {
743  if (c == quote)
744  break;
745  tok_stuff = check_extend_str(tok_stuff,i,&tok_stufflen);
746  if (c == escape)
747  c = getch_internal();
748  tok_stuff[i++] = c;
749  }
750  current_tok.set_quoted(TRUE);
751  }
752  else // standard whitespace separated tokens
753  {
754  for (i=0,tok_stuff[i++]=c;
755  (
756  !CLASS(c,'@') &&
757  !CLASS(c=peekch_internal(),' ') &&
758  !CLASS(c,'@') &&
759  ( c != EOF )) ;)
760  {
761  tok_stuff = check_extend_str(tok_stuff,i,&tok_stufflen);
762  // note, we must have peeked to get here.
763  tok_stuff[i++] = getpeeked_internal();
764  }
765  }
766  tok_stuff[i] = '\0';
767  // Are there any punctuation symbols at the start?
768  for (j=0;
769  ((j < i) && CLASS2(tok_stuff[j], '$', '"'));
770  j++);
771  if ((j > 0) && (j < i)) // there are
772  {
773  tok_prepuncs = check_extend_str(tok_prepuncs,j+1,&tok_prepuncslen);
774  memmove(tok_prepuncs,tok_stuff,j);
775  tok_prepuncs[j] = '\0';
776  current_tok.set_prepunctuation(tok_prepuncs);
777  word=&tok_stuff[j];
778  i-=j; // reduce size by number of prepuncs
779  }
780  else
781  {
783  word = tok_stuff;
784  }
785  // Are there any punctuation symbols at the end
786  for (j=i-1;
787  ((j > 0) && CLASS2(word[j],'.','"'));
788  j--);
789  if (word[j+1] != '\0')
790  {
791  current_tok.set_punctuation(&word[j+1]);
792  word[j+1] = '\0';
793  }
794  else
795  current_tok.set_punctuation(EST_String::Empty);
796 
797  current_tok.set_token(word);
798  if (tok_wspace[0] == '\0') // feature paths will have null whitespace
799  current_tok.set_whitespace(EST_String::Empty);
800  else
801  current_tok.set_whitespace(tok_wspace);
802  }
803  else
804  {
805  current_tok.set_token(EST_String::Empty);
806  current_tok.set_whitespace(tok_wspace);
807  current_tok.set_punctuation(EST_String::Empty);
809  eof_flag = TRUE;
810  }
811 
812  return current_tok;
813 }
814 
816 {
817  // This doesn't really work if there are blank lines (and you want
818  // to know about them)
819 
820  if ((peek().whitespace().contains("\n")) || eof())
821  return TRUE;
822  else
823  return FALSE;
824 
825 }
826 
827 EST_String quote_string(const EST_String &s,
828  const EST_String &quote,
829  const EST_String &escape,
830  int force)
831 {
832  // Quotes s always if force true, or iff s contains whitespace,
833  // quotes or escapes force is false
834  // Note quote and escape are assumed to be string of length 1
835  EST_String quoted_form;
836  if ((force) ||
837  (s.contains(quote)) ||
838  (s.contains(escape)) ||
839  (s.contains(RXanywhitespace)) ||
840  (s.length() == 0))
841  {
842  // bigger than the quoted form could ever be
843  int i,j;
844  char *quoted = new char[s.length()*(quote.length()+escape.length())+
845  1+quote.length()+quote.length()];
846  quoted[0] = quote(0);
847  for (i=1,j=0; j < s.length(); j++,i++)
848  {
849  if (s(j) == quote(0))
850  quoted[i++] = escape(0);
851  else if (s(j) == escape(0))
852  quoted[i++] = escape(0);
853  quoted[i] = s(j);
854  }
855  quoted[i++] = quote(0);
856  quoted[i] = '\0';
857  quoted_form = quoted;
858  delete [] quoted;
859  return quoted_form;
860  }
861  else
862  return s;
863 }
864 
866 {
867  return Origin+":"+itoString(linepos);
868 }