Edinburgh Speech Tools  2.4-release
 All Classes Functions Variables Typedefs Enumerations Enumerator Friends Pages
EST_String.h
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 
34 #ifndef __EST_STRING_H__
35 #define __EST_STRING_H__
36 
37 class EST_String;
38 class EST_Regex;
39 
40 #define EST_Regex_max_subexpressions 10
41 
42 #include <cstring>
43 #include <iostream>
44 #include <climits>
45 using namespace std;
46 #include "EST_Chunk.h"
47 #include "EST_strcasecmp.h"
48 #include "EST_bool.h"
49 
50 extern "C" void abort(void);
51 
52 /** A non-copyleft implementation of a string class to use with
53  * compilers that aren't GNU C++.
54  *
55  * Strings are reference-counted and reasonably efficient (eg you
56  * can pass them around, into and out of functions and so on
57  * without worrying too much about the cost).
58  *
59  * The associated class EST_Regex can be used to represent regular
60  * expressions.
61  *
62  * @see EST_Chunk
63  * @see EST_Regex
64  * @see string_example
65  * @author Alan W Black <awb@cstr.ed.ac.uk>
66  * @author Richard Caley <rjc@cstr.ed.ac.uk>
67  * @version $Id: EST_String.h,v 1.10 2014/10/13 13:26:19 robert Exp $
68  */
69 
70 class EST_String {
71 
72  /** For better libg++ compatibility.
73  *
74  * Includes String from char constructor which
75  * tends to mask errors in use. Also reverses the () and [] operators.
76  */
77 # define __FSF_COMPATIBILITY__ (0)
78 
79  /** Allow gsub() to be used in multi-threaded applications
80  * This will cause gsub to use a local table of substitution points
81  * walloced for each gsub. Otherwise one global one is used which
82  * should be faster, but non reentrant.
83  */
84 # define __GSUB_REENTRANT__ (1)
85 
86 /// Gripe about weird arguments like Nulls
87 #define __STRING_ARG_GRIPE__ (1)
88 
89 /// When we find something to gripe about we die then and there.
90 #define __GRIPE_FATAL__ (1)
91 
92 #if __GRIPE_FATAL__
93 # define gripe(WHAT) (cerr<< ("oops! " WHAT "\n"),abort())
94 #else
95 # define gripe(WHAT) (cerr<< ("oops! " WHAT "\n"))
96 #endif
97 
98 #if __STRING_ARG_GRIPE__
99 # define safe_strlen(S) ((S)?strlen(S):(gripe("null strlen"),0))
100 # define CHECK_STRING_ARG(S) if (!(S)) gripe("null string arg")
101 #else
102 # define safe_strlen(S) ((S)?strlen(S):0)
103 # define CHECK_STRING_ARG(S) /* empty */
104 #endif
105 
106 public:
107  /// Global version string.
108  static const char *version;
109 
110  /// Constant empty string
111  static const EST_String Empty;
112 
113  /// Type of string size field.
114  typedef int EST_string_size;
115  /// Maximum string size.
116 # define MAX_STRING_SIZE (INT_MAX)
117 
118 private:
119  /// Smart pointer to actual memory.
120  EST_ChunkPtr memory;
121  /// Size of string.
122  EST_string_size size;
123 
124  // Make sure this is exactly the same as an EST_String. This is being too
125  // clever by half.
126 
127  struct EST_dumb_string {
128  EST_ChunkPtr memory;
129  EST_string_size size;
130  } ;
131 
132  /// Flags indicating which bit of a string to extract.
133  enum EST_chop_direction {
134  Chop_Before = -1,
135  Chop_At = 0,
136  Chop_After = 1
137  };
138 
139  /// Simple utility which removes const-ness from memory
140  static inline EST_ChunkPtr &NON_CONST_CHUNKPTR(const EST_ChunkPtr &ecp)
141  { return *((EST_ChunkPtr *)&ecp);}
142 
143  /// private constructor which uses the buffer given.
144  EST_String(int len, EST_ChunkPtr cp) {
145  size=len;
146  memory = cp;
147  }
148 
149  /// Is more than one String represented by the same memory?
150  int shareing (void) { return memory.shareing();}
151 
152  /**@name Finding substrings */
153  //@{
154  /// Find substring
155  int locate(const char *it, int len, int from, int &start, int &end) const;
156  /// Find substring
157  int locate(const EST_String &s, int from, int &start, int &end) const
158  { return locate((const char *)s.memory, s.size, from, start, end); }
159  /// Find match for regexp.
160  int locate(EST_Regex &ex, int from, int &start, int &end, int *starts=NULL, int *ends=NULL) const;
161  //@}
162 
163 
164  /**@name Extract Substrings */
165  //@{
166  int extract(const char *it, int len, int from, int &start, int &end) const;
167  int extract(const EST_String &s, int from, int &start, int &end) const
168  { return extract((const char *)s.memory, s.size, from, start, end); }
169  int extract(EST_Regex &ex, int from, int &start, int &end) const;
170  //@}
171 
172  /**@name Chop out part of string */
173  //@{
174  /// Locate substring and chop.
175  EST_String chop_internal(const char *s, int length, int pos, EST_chop_direction directionult) const;
176  /// Chop at given position.
177  EST_String chop_internal(int pos, int length, EST_chop_direction directionult) const;
178 
179  /// Locate match for expression and chop.
180  EST_String chop_internal(EST_Regex &ex, int pos, EST_chop_direction directionult) const;
181  //@}
182 
183  /**@name Global search and replace */
184  //@{
185  /// Substitute for string
186  int gsub_internal(const char *os, int olength, const char *s, int length);
187  /// Substitute for matches of regexp.
188  int gsub_internal(EST_Regex &ex, const char *s, int length);
189  //@}
190 
191  /// Split the string down into parts.
192  int split_internal(EST_String result[], int max, const char* s_seperator, int slen, EST_Regex *re_separator, char quote) const;
193 
194  int Int(bool *ok_p) const;
195  long Long(bool *ok_p) const;
196  float Float(bool *ok_p) const;
197  double Double(bool *ok_p) const;
198 public:
199 
200  /// Construct an empty string.
201  EST_String(void) :memory() {size=0;}
202 
203  /// Construct from char *
204  EST_String(const char *s);
205 
206  /// Construct from part of char * or fill with given character.
207  EST_String(const char *s, int start_or_fill, int len);
208 
209  /// Construct from C string.
210  EST_String(const char *s, int s_size, int start, int len);
211 
212  // Create from EST_String
213  EST_String(const EST_String &s, int start, int len);
214 
215  /** Copy constructor
216  * We have to declare our own copy constructor to lie to the
217  * compiler about the constness of the RHS.
218  */
219  EST_String(const EST_String &s) {
220  memory = NON_CONST_CHUNKPTR(s.memory);
221  size = s.size;
222  }
223 
224 #if __FSF_COMPATIBILITY__
225  /** Construct from single char.
226  * This constructor is not usually included as it can mask errors.
227  * @see __FSF_COMPATIBILITY__
228  */
229  EST_String(const char c);
230 #endif
231 
232  /// Destructor: zeroes the stored length and resets the smart pointer `memory` to NULL.
233  ~EST_String() {
234  size=0;
235  memory=NULL;
236  }
237 
238  /// Length of string (\em not the length of the underlying chunk)
239  int length(void) const { return size; }
240  /// Size of underlying chunk.
241  int space (void) const { return memory.size(); }
242  /// Get a const-pointer to the actual memory.
243  const char *str(void) const { return size==0?"":(const char *)memory; }
244  /// Get a writable pointer to the actual memory.
245  char *updatable_str(void) { return size==0?(char *)"":(char *)memory; }
246  void make_updatable(void) { cp_make_updatable(memory, size+1);}
247 
248 
249  /// Make a string from one character by wrapping c in a NUL-terminated two-element buffer.
250  static EST_String FromChar(const char c)
251  { char buf[2]; buf[0] = c; buf[1] = '\0'; return EST_String(buf); }
252 
253  /// Build string from an integer.
254  static EST_String Number(int i, int base=10);
255 
256  /// Build string from a long integer.
257  static EST_String Number(long i, int base=10);
258 
259  /// Build string from a double.
260  static EST_String Number(double d);
261 
262  /// Build string from a float
263  static EST_String Number(float f);
264 
265  /// Convert to an integer
266  int Int(bool &ok) const { return Int(&ok); }
267  int Int(void) const { return Int((bool *)NULL); }
268 
269  /// Convert to a long
270  long Long(bool &ok) const { return Long(&ok); }
271  long Long(void) const { return Long((bool *)NULL); }
272 
273  /// Convert to a float
274  float Float(bool &ok) const { return Float(&ok); }
275  float Float(void) const { return Float((bool *)NULL); }
276 
277  /// Convert to a double
278  double Double(bool &ok) const { return Double(&ok); }
279  double Double(void) const { return Double((bool *)NULL); }
280 
281  /**@name Before */
282  //@{
283  /// Part before position
284  EST_String before(int pos, int len=0) const
285  { return chop_internal(pos, len, Chop_Before); }
286  /// Part before first matching substring after pos.
287  EST_String before(const char *s, int pos=0) const
288  { return chop_internal(s, safe_strlen(s), pos, Chop_Before); }
289  /// Part before first matching substring after pos.
290  EST_String before(const EST_String &s, int pos=0) const
291  { return chop_internal(s.str(), s.size, pos, Chop_Before); }
292  /// Part before first match of regexp after pos.
293  EST_String before(EST_Regex &e, int pos=0) const
294  { return chop_internal(e, pos, Chop_Before); }
295  //@}
296 
297  /**@name At */
298  //@{
299  /// Return part at position
300  EST_String at(int from, int len=0) const
301  { return EST_String(str(),size,from<0?(size+from):from,len); }
302  /// Return part where substring found (not useful, included for completeness)
303  EST_String at(const char *s, int pos=0) const
304  { return chop_internal(s, safe_strlen(s), pos, Chop_At); }
305  /// Return part where substring found (not useful, included for completeness)
306  EST_String at(const EST_String &s, int pos=0) const
307  { return chop_internal(s.str(), s.size, pos, Chop_At); }
308  /// Return part matching regexp.
309  EST_String at(EST_Regex &e, int pos=0) const
310  { return chop_internal(e, pos, Chop_At); }
311  //@}
312 
313  /**@name After */
314  //@{
315  /// Part after pos+len
316  EST_String after(int pos, int len=1) const
317  { return chop_internal(pos, len, Chop_After); }
318  /// Part after substring.
319  EST_String after(const char *s, int pos=0) const
320  { return chop_internal(s, safe_strlen(s), pos, Chop_After); }
321  /// Part after substring.
322  EST_String after(const EST_String &s, int pos=0) const
323  { return chop_internal(s.str(), s.size, pos, Chop_After); }
324  /// Part after match of regular expression.
325  EST_String after(EST_Regex &e, int pos=0) const
326  { return chop_internal(e, pos, Chop_After); }
327  //@}
328 
329  /**@name Search for something */
330  //@{
331  /// Search for string s of length len via locate(); on success set mlen to the match length and return the match start, otherwise return -1 (mlen untouched).
332  int search(const char *s, int len, int &mlen, int pos=0) const
333  { int first, last;
334  if (!locate(s, len, pos, first, last))
335  return -1;
336  mlen = last - first; return first;
337  }
338 
339  /// Search for the EST_String s via locate(); on success set mlen to the match length and return the match start, otherwise return -1 (mlen untouched).
340  int search(const EST_String s, int &mlen, int pos=0) const
341  { int first, last;
342  if (!locate(s, pos, first, last))
343  return -1;
344  mlen = last - first; return first;
345  }
346 
347  /// Search for a match of the regular expression via locate(), forwarding starts/ends; on success set mlen to the match length and return the match start, otherwise return -1.
348  int search(EST_Regex &re, int &mlen, int pos=0, int *starts=NULL, int *ends=NULL) const
349  { int first=0, last=0;
350  if (!locate(re, pos, first, last, starts, ends))
351  return -1;
352  mlen = last - first; return first;
353  }
354  //@}
355 
356 
357  /**@name Get position of something */
358  //@{
359  /// Position of substring (starting at pos)
360  int index(const char *s, int pos=0) const
361  { int start, end; return locate(s, safe_strlen(s), pos, start, end)?start:-1; }
362  /// Position of substring (starting at pos)
363  int index(const EST_String &s, int pos=0) const
364  { int start, end; return locate(s, pos, start, end)?start:-1; }
365  /// Position of match of regexp (starting at pos)
366  int index(EST_Regex &ex, int pos=0) const
367  { int start, end; return locate(ex, pos, start, end)?start:-1; }
368  //@}
369 
370  /**@name Does string contain something? */
371  //@{
372  /// Does it contain this substring?
373  int contains(const char *s, int pos=-1) const
374  { int start, end; return extract(s, safe_strlen(s), pos, start, end); }
375  /// Does it contain this substring?
376  int contains(const EST_String &s, int pos=-1) const
377  { int start, end; return extract(s, pos, start, end); }
378  /// Does it contain this character?
379  int contains(const char c, int pos=-1) const
380  { int start, end; char s[2] = {c,0}; return extract(s, 1, pos, start, end); }
381  /// Does it contain a match for this regular expression?
382  int contains(EST_Regex &ex, int pos=-1) const
383  { int start, end; return extract(ex, pos, start, end); }
384  //@}
385 
386  /**@name Does string exactly match? */
387  //@{
388  /// Exactly match this string?
389  int matches(const char *e, int pos=0) const;
390  /// Exactly match this string?
391  int matches(const EST_String &e, int pos=0) const;
392  /// Exactly matches this regular expression, can return ends of sub-expressions.
393  int matches(EST_Regex &e, int pos=0, int *starts=NULL, int *ends=NULL) const;
394  //@}
395 
396  /**@name Global replacement */
397  //@{
398  /// Substitute one string for another.
399  int gsub(const char *os, const EST_String &s)
400  { return gsub_internal(os, safe_strlen(os), s, s.size); }
401  /// Substitute one string for another.
402  int gsub(const char *os, const char *s)
403  { return gsub_internal(os, safe_strlen(os), s, safe_strlen(s)); }
404  /// Substitute one string for another.
405  int gsub(const EST_String &os, const EST_String &s)
406  { return gsub_internal(os, os.size, s, s.size); }
407  /// Substitute one string for another.
408  int gsub(const EST_String &os, const char *s)
409  { return gsub_internal(os, os.size, s, safe_strlen(s)); }
410 
411  /// Substitute string for matches of regular expression.
412  int gsub(EST_Regex &ex, const EST_String &s)
413  { return gsub_internal(ex, s, s.size); }
414  /// Substitute string for matches of regular expression.
415  int gsub(EST_Regex &ex, const char *s)
416  { return gsub_internal(ex, s, safe_strlen(s)); }
417  /// Substitute string for matches of regular expression.
418  int gsub(EST_Regex &ex, int bracket_num)
419  { return gsub_internal(ex, NULL, bracket_num); }
420  /// Substitute the result of a match into a string.
421  int subst(EST_String source,
422  int (&starts)[EST_Regex_max_subexpressions],
423  int (&ends)[EST_Regex_max_subexpressions]);
424  //@}
425 
426  /**@name Frequency counts */
427  //@{
428  /// Number of occurrences of substring
429  int freq(const char *s) const;
430  /// Number of occurrences of substring
431  int freq(const EST_String &s) const;
432  /// Number of matches of regular expression.
433  int freq(EST_Regex &s) const;
434  //@}
435 
436  /**@name Quoting */
437  //@{
438  /// Return the string in quotes with internal quotes protected.
439  EST_String quote(const char quotec) const;
440  /// Return in quotes if there is something to protect (e.g. spaces)
441  EST_String quote_if_needed(const char quotec) const;
442  /// Remove quotes and unprotect internal quotes.
443  EST_String unquote(const char quotec) const;
444  /// Remove quotes if any.
445  EST_String unquote_if_needed(const char quotec) const;
446  //@}
447 
448 #if __FSF_COMPATIBILITY__
449  const char operator [] (int i) const { return memory[i]; }
450  char &operator () (int i) { return memory(i); }
451 #else
452  /**@name Operators */
453  //@{
454  /// Function style access to constant strings.
455  const char operator () (int i) const { return memory[i]; }
456  /// Array style access to writable strings.
457  char &operator [] (int i) { return memory(i); }
458 #endif
459 
460  /// Cast to const char * by simply giving access to pointer.
461  operator const char*() const {return str(); }
462  operator const char*() {return str(); }
463  /// Cast to char *, may involve copying.
464  operator char*() { return updatable_str(); }
465 
466  /**@name Add to end of string. */
467  //@{
468  /// Add C string to end of EST_String
469  EST_String &operator += (const char *b);
470  /// Add EST_String to end of EST_String
471  EST_String &operator += (const EST_String b);
472  //@}
473 
474  /**@name Assignment */
475  //@{
476  /// Assign C string to EST_String
477  EST_String &operator = (const char *str);
478  /// Assign single character to EST_String
479  EST_String &operator = (const char c);
480  /// Assign EST_String to EST_String.
481  EST_String &operator = (const EST_String &s);
482  //@}
483 
484  /**@name Concatenation */
485  //@{
486  /// Concatenate two EST_Strings
487  friend EST_String operator + (const EST_String &a, const EST_String &b);
488  /// Concatenate C String with EST_String
489  friend EST_String operator + (const char *a, const EST_String &b);
490  /// Concatenate EST_String with C String
491  friend EST_String operator + (const EST_String &a, const char *b);
492  //@}
493 
494  /// Repeat string N times
495  friend EST_String operator * (const EST_String &s, int n);
496 
497  /**@name relational operators */
498  //@{
499  ///
500  friend int operator == (const char *a, const EST_String &b);
501  ///
502  friend int operator == (const EST_String &a, const char *b)
503  { return b == a; }
504  ///
505  friend int operator == (const EST_String &a, const EST_String &b);
506 
507  ///
508  friend int operator != (const char *a, const EST_String &b)
509  { return !(a==b); }
510  ///
511  friend int operator != (const EST_String &a, const char *b)
512  { return !(a==b); }
513  ///
514  friend int operator != (const EST_String &a, const EST_String &b)
515  { return !(a==b); }
516 
517  ///
518  friend inline int operator < (const char *a, const EST_String &b)
519  { return compare(a,b) < 0; }
520  ///
521  friend inline int operator < (const EST_String &a, const char *b)
522  { return compare(a,b) < 0; }
523  ///
524  friend inline int operator < (const EST_String &a, const EST_String &b)
525  { return compare(a,b) < 0; }
526  ///
527  friend inline int operator > (const char *a, const EST_String &b)
528  { return compare(a,b) > 0; }
529  ///
530  friend inline int operator > (const EST_String &a, const char *b)
531  { return compare(a,b) > 0; }
532  ///
533  friend inline int operator > (const EST_String &a, const EST_String &b)
534  { return compare(a,b) > 0; }
535  ///
536  friend inline int operator <= (const char *a, const EST_String &b)
537  { return compare(a,b) <= 0; }
538  ///
539  friend inline int operator <= (const EST_String &a, const char *b)
540  { return compare(a,b) <= 0; }
541  ///
542  friend inline int operator <= (const EST_String &a, const EST_String &b)
543  { return compare(a,b) <= 0; }
544  ///
545  friend inline int operator >= (const char *a, const EST_String &b)
546  { return compare(a,b) >= 0; }
547  ///
548  friend inline int operator >= (const EST_String &a, const char *b)
549  { return compare(a,b) >= 0; }
550  ///
551  friend inline int operator >= (const EST_String &a, const EST_String &b)
552  { return compare(a,b) >= 0; }
553  //@}
554 
555  //@}
556 
557  /**@name String comparison.
558  * All these operators return -1, 0 or 1 to indicate the sort
559  * order of the strings.
560  */
561  //@{
562  ///
563  friend int compare(const EST_String &a, const EST_String &b);
564  ///
565  friend int compare(const EST_String &a, const char *b);
566  ///
567  friend inline int compare(const char *a, const EST_String &b)
568  { return -compare(b,a); }
569  /** Case folded comparison.
570  *
571  * The table argument can define how upper and lower
572  * case characters correspond. The default works for
573  * ASCII.
574  */
575  //@{
576  friend int fcompare(const EST_String &a, const EST_String &b,
577  const unsigned char *table);
578 
579  friend inline int fcompare(const EST_String &a, const EST_String &b)
580  { return fcompare(a,b,NULL); }
581 
582 
583  friend int fcompare(const EST_String &a, const char *b,
584  const unsigned char *table);
585  ///
586  friend inline int fcompare(const EST_String &a, const EST_String &b,
587  const EST_String &table)
588  { return fcompare(a, b, (const unsigned char *)(const char *)table); }
589  //@}
590  //@}
591  //@}
592 
593 
594  /**@name Split a string into parts.
595  *
596  * These functions divide up a string producing an array of
597  * substrings.
598  */
599  //@{
600  /// Split at a given separator.
601  friend int split(const EST_String & s, EST_String result[],
602  int max, const EST_String& seperator, char quote=0)
603  { return s.split_internal(result, max, (const char *)seperator, seperator.length(), NULL, quote); }
604  /// Split at a given C-string separator; its length is taken with safe_strlen(), so a NULL separator is handled the same way as in the other const char * wrappers of this class.
605  friend int split(const EST_String &s, EST_String result[],
606  int max, const char *seperator, char quote=0)
607  { return s.split_internal(result, max, seperator, safe_strlen(seperator), NULL, quote); }
608  /// Split at each match of the regular expression.
609  friend int split(const EST_String & s, EST_String result[], int max,
610  EST_Regex& seperator, char quote=0)
611  { return s.split_internal(result, max, NULL, 0, &seperator, quote); }
612  //@}
613 
614  /// Convert to upper case.
615  friend EST_String upcase(const EST_String &s);
616  /// Convert to lower case.
617  friend EST_String downcase(const EST_String &s);
618 
619  /** Concatenate a number of strings.
620  * This is more efficient than multiple uses of + or +=
621  */
622  static EST_String cat(const EST_String s1,
623  const EST_String s2 = Empty,
624  const EST_String s3 = Empty,
625  const EST_String s4 = Empty,
626  const EST_String s5 = Empty,
627  const EST_String s6 = Empty,
628  const EST_String s7 = Empty,
629  const EST_String s8 = Empty,
630  const EST_String s9 = Empty
631  );
632 
633  /* Hacky way to ignore volatile */
634  EST_String & ignore_volatile(void) volatile { return *((EST_String *)(void *)this); }
635 
636  /// Stream output for EST_String.
637  friend ostream &operator << (ostream &s, const EST_String &str);
638  friend class EST_Regex;
639 
640 };
641 
642 EST_ChunkPtr chunk_allocate(int bytes);
643 EST_ChunkPtr chunk_allocate(int bytes, const char *initial, int initial_len);
644 EST_ChunkPtr chunk_allocate(int bytes, const EST_ChunkPtr &initial, int initial_start, int initial_len);
645 
646 int operator == (const char *a, const EST_String &b);
647 int operator == (const EST_String &a, const EST_String &b);
648 ostream &operator << (ostream &s, const EST_String &str);
649 
650 #include "EST_Regex.h"
651 
652 #endif