Logo ROOT   6.10/09
Reference Guide
TRegexp.cxx
Go to the documentation of this file.
1 // @(#)root/base:$Id$
2 // Author: Fons Rademakers 04/08/95
3 
4 /*************************************************************************
5  * Copyright (C) 1995-2000, Rene Brun and Fons Rademakers. *
6  * All rights reserved. *
7  * *
8  * For the licensing terms see $ROOTSYS/LICENSE. *
9  * For the list of contributors see $ROOTSYS/README/CREDITS. *
10  *************************************************************************/
11 
12 /** \class TRegexp
13 \ingroup Base
14 
15 Regular expression class.
16 
17 ~~~ {.cpp}
18  '^' // start-of-line anchor
19  '$' // end-of-line anchor
20  '.' // matches any character
21  '[' // start a character class
22  ']' // end a character class
23  '^' // negates character class if 1st character
24  '*' // Kleene closure (matches 0 or more)
25  '+' // Positive closure (1 or more)
26  '?' // Optional closure (0 or 1)
27 ~~~
28 Note that the '|' operator (union) is not supported, nor are
29 parentheses (grouping). Therefore "a|b" does not match "a".
30 
31 Standard classes like [:alnum:], [:alpha:], etc. are not supported,
32 only [a-zA-Z], [^ntf] and so on.
33 */
34 
35 #include "TRegexp.h"
36 #include "TString.h"
37 #include "TError.h"
38 #include "ThreadLocalStorage.h"
39 
40 const unsigned TRegexp::fgMaxpat = 2048;
41 
42 
44 
45 ////////////////////////////////////////////////////////////////////////////////
46 /// Create a regular expression from the input string. If wildcard is
47 /// true then the input string will first be interpreted as a wildcard
48 /// expression by MakeWildcard(), and the result then interpreted as a
49 /// regular expression.
50 
51 TRegexp::TRegexp(const char *re, Bool_t wildcard)
52 {
53  if (wildcard)
54  GenPattern(MakeWildcard(re));
55  else
56  GenPattern(re);
57 }
58 
59 ////////////////////////////////////////////////////////////////////////////////
60 /// Create a regular expression from a TString.
61 
63 {
64  GenPattern(re.Data());
65 }
66 
67 ////////////////////////////////////////////////////////////////////////////////
68 /// Copy ctor.
69 
71 {
72  CopyPattern(r);
73 }
74 
75 ////////////////////////////////////////////////////////////////////////////////
76 /// Destructor.
77 
79 {
80  delete [] fPattern;
81 }
82 
83 ////////////////////////////////////////////////////////////////////////////////
84 /// Assignment operator.
85 
87 {
88  if (this != &r) {
89  delete [] fPattern;
90  CopyPattern(r);
91  }
92  return *this;
93 }
94 
95 ////////////////////////////////////////////////////////////////////////////////
96 /// Assignment operator taking a char* and assigning it to a regexp.
97 
98 TRegexp& TRegexp::operator=(const char *str)
99 {
100  delete [] fPattern;
101  GenPattern(str);
102  return *this;
103 }
104 
105 ////////////////////////////////////////////////////////////////////////////////
106 /// Assignment operator taking a TString.
107 
109 {
110  delete [] fPattern;
111  GenPattern(str.Data());
112  return *this;
113 }
114 
115 ////////////////////////////////////////////////////////////////////////////////
116 /// Generate the regular expression pattern.
117 
118 void TRegexp::GenPattern(const char *str)
119 {
120  fPattern = new Pattern_t[fgMaxpat];
121  int error = ::Makepat(str, fPattern, fgMaxpat);
122  fStat = (error < 3) ? (EStatVal) error : kToolong;
123 }
124 
125 ////////////////////////////////////////////////////////////////////////////////
126 /// Copy the regular expression pattern.
127 
129 {
130  fPattern = new Pattern_t[fgMaxpat];
131  memcpy(fPattern, r.fPattern, fgMaxpat * sizeof(Pattern_t));
132  fStat = r.fStat;
133 }
134 
135 ////////////////////////////////////////////////////////////////////////////////
136 /// This routine transforms a wildcarding regular expression into
137 /// a general regular expression used for pattern matching.
138 /// When using wildcards the regular expression is assumed to be
139 /// preceded by a "^" (BOL) and terminated by a "$" (EOL). Also, all
140 /// "*"'s and "?"'s (closures) are assumed to be preceded by a "." (i.e. any
141 /// character, except "/"'s) and all .'s are escaped (so *.ps is different
142 /// from *.eps). The special treatment of "/" allows the easy matching of
143 /// pathnames, e.g. "*.root" will match "aap.root", but not "pipo/aap.root".
144 
145 const char *TRegexp::MakeWildcard(const char *re)
146 {
147  TTHREAD_TLS_ARRAY(char,fgMaxpat,buf);
148  char *s = buf;
149  if (!re) return "";
150  int len = strlen(re);
151  int slen = 0;
152 
153  if (!len) return "";
154 
155  for (int i = 0; i < len; i++) {
156  if ((unsigned)slen > fgMaxpat - 10) {
157  Error("MakeWildcard", "regexp too large");
158  break;
159  }
160  if (i == 0 && re[i] != '^') {
161  *s++ = '^';
162  slen++;
163  }
164  if (re[i] == '*') {
165 #ifndef R__WIN32
166  //const char *wc = "[a-zA-Z0-9-+_\\.,: []<>]";
167  const char *wc = "[^/]";
168 #else
169  //const char *wc = "[a-zA-Z0-9-+_., []<>]";
170  const char *wc = "[^\\/:]";
171 #endif
172  strcpy(s, wc);
173  s += strlen(wc);
174  slen += strlen(wc);
175  }
176  if (re[i] == '.') {
177  *s++ = '\\';
178  slen++;
179  }
180  if (re[i] == '?') {
181 #ifndef R__WIN32
182  //const char *wc = "[a-zA-Z0-9-+_\\.,: []<>]";
183  const char *wc = "[^/]";
184 #else
185  //const char *wc = "[a-zA-Z0-9-+_., []<>]";
186  const char *wc = "[^\\/:]";
187 #endif
188  strcpy(s, wc);
189  s += strlen(wc);
190  slen += strlen(wc);
191  } else {
192  *s++ = re[i];
193  slen++;
194  }
195  if (i == len-1 && re[i] != '$') {
196  *s++ = '$';
197  slen++;
198  }
199  }
200  *s = '\0';
201  return buf;
202 }
203 
204 ////////////////////////////////////////////////////////////////////////////////
205 /// Find the first occurrence of the regexp in string and return the
206 /// position, or -1 if there is no match. Len is length of the matched
207 /// string and i is the offset at which the matching should start.
208 
209 Ssiz_t TRegexp::Index(const TString& string, Ssiz_t* len, Ssiz_t i) const
210 {
211  if (fStat != kOK)
212  Error("TRegexp::Index", "Bad Regular Expression");
213 
214  const char* startp;
215  const char* s = string.Data();
216  Ssiz_t slen = string.Length();
217  if (slen < i) return kNPOS;
218  const char* endp = ::Matchs(s+i, slen-i, fPattern, &startp);
219  if (endp) {
220  *len = endp - startp;
221  return startp - s;
222  } else {
223  *len = 0;
224  return kNPOS;
225  }
226 }
227 
228 ////////////////////////////////////////////////////////////////////////////////
229 /// Check status of regexp.
230 
232 {
233  EStatVal temp = fStat;
234  fStat = kOK;
235  return temp;
236 }
237 
238 ////////////////////////////////////////////////////////////////////////////////
239 // //
240 // TString member functions, put here so the linker will include //
241 // them only if regular expressions are used. //
242 // //
243 ////////////////////////////////////////////////////////////////////////////////
244 
245 ////////////////////////////////////////////////////////////////////////////////
246 /// Find the first occurrence of the regexp in string and return the
247 /// position, or -1 if there is no match. Start is the offset at which
248 /// the search should start.
249 
250 Ssiz_t TString::Index(const TRegexp& r, Ssiz_t start) const
251 {
252  Ssiz_t len;
253  return r.Index(*this, &len, start); // len not used
254 }
255 
256 ////////////////////////////////////////////////////////////////////////////////
257 /// Find the first occurrence of the regexp in string and return the
258 /// position, or -1 if there is no match. Extent is length of the matched
259 /// string and start is the offset at which the matching should start.
260 
261 Ssiz_t TString::Index(const TRegexp& r, Ssiz_t* extent, Ssiz_t start) const
262 {
263  return r.Index(*this, extent, start);
264 }
265 
266 ////////////////////////////////////////////////////////////////////////////////
267 /// Return the substring found by applying the regexp starting at start.
268 
270 {
271  Ssiz_t len;
272  Ssiz_t begin = Index(r, &len, start);
273  return TSubString(*this, begin, len);
274 }
275 
276 ////////////////////////////////////////////////////////////////////////////////
277 /// Return the substring found by applying the regexp.
278 
280 {
281  return (*this)(r,0);
282 }
283 
284 ////////////////////////////////////////////////////////////////////////////////
285 /// Search for tokens delimited by regular expression 'delim' (default " ")
286 /// in this string; search starts at 'from' and the token is returned in 'tok'.
287 /// Returns in 'from' the next position after the delimiter.
288 /// Returns kTRUE if a token is found, kFALSE if not or if some inconsistency
289 /// occurred.
290 /// This method allows to loop over tokens in this way:
291 /// ~~~ {.cpp}
292 /// TString myl = "tok1 tok2|tok3";
293 /// TString tok;
294 /// Ssiz_t from = 0;
295 /// while (myl.Tokenize(tok, from, "[ |]")) {
296 /// // Analyse tok
297 /// ...
298 /// }
299 /// ~~~
300 /// more convenient of the other Tokenize method when saving the tokens is not
301 /// needed.
302 
303 Bool_t TString::Tokenize(TString &tok, Ssiz_t &from, const char *delim) const
304 {
305  Bool_t found = kFALSE;
306 
307  // Reset the token
308  tok = "";
309 
310  // Make sure inputs make sense
311  Int_t len = Length();
312  if (len <= 0 || from > (len - 1) || from < 0)
313  return found;
314 
315  // Ensure backward compatibility to allow one or more times the delimiting character
316  TString rdelim(delim);
317  if(rdelim.Length() == 1) {
318  rdelim = "[" + rdelim + "]+";
319  }
320  TRegexp rg(rdelim);
321 
322  // Find delimiter
323  Int_t ext = 0;
324  Int_t pos = Index(rg, &ext, from);
325 
326  // Assign to token
327  if (pos == kNPOS || pos > from) {
328  Ssiz_t last = (pos != kNPOS) ? (pos - 1) : len;
329  tok = (*this)(from, last-from+1);
330  }
331  found = kTRUE;
332 
333  // Update start-of-search index
334  from = pos + ext;
335  if (pos == kNPOS) {
336  from = pos;
337  if (tok.IsNull()) {
338  // Empty, last token
339  found = kFALSE;
340  }
341  }
342  // Make sure that 'from' has a meaningful value
343  from = (from < len) ? from : len;
344 
345  // Done
346  return found;
347 }
A zero length substring is legal.
Definition: TString.h:75
TRegexp(const char *re, Bool_t wildcard=kFALSE)
Create a regular expression from the input string.
Definition: TRegexp.cxx:51
Pattern_t * fPattern
Definition: TRegexp.h:37
EStatVal
Definition: TRegexp.h:34
TRegexp & operator=(const TRegexp &re)
Assignment operator.
Definition: TRegexp.cxx:86
const Ssiz_t kNPOS
Definition: RtypesCore.h:115
Regular expression class.
Definition: TRegexp.h:31
Ssiz_t Index(const char *pat, Ssiz_t i=0, ECaseCompare cmp=kExact) const
Definition: TString.h:587
Basic string class.
Definition: TString.h:129
int Int_t
Definition: RtypesCore.h:41
bool Bool_t
Definition: RtypesCore.h:59
void GenPattern(const char *re)
Generate the regular expression pattern.
Definition: TRegexp.cxx:118
int Makepat(const char *, Pattern_t *, int)
Make a pattern template from the string pointed to by exp.
Definition: Match.cxx:129
void Error(const char *location, const char *msgfmt,...)
char & operator()(Ssiz_t i)
Definition: TString.h:662
unsigned short Pattern_t
Definition: Match.h:26
EStatVal fStat
Definition: TRegexp.h:38
TRandom2 r(17)
const char * MakeWildcard(const char *re)
This routine transforms a wildcarding regular expression into a general regular expression used for p...
Definition: TRegexp.cxx:145
Ssiz_t Length() const
Definition: TString.h:388
const char * Matchs(const char *, size_t len, const Pattern_t *, const char **)
Match a string with a pattern.
Definition: Match.cxx:220
const Bool_t kFALSE
Definition: RtypesCore.h:92
int Ssiz_t
Definition: RtypesCore.h:63
TObjArray * Tokenize(const TString &delim) const
This function is used to isolate sequential tokens in a TString.
Definition: TString.cxx:2251
#define ClassImp(name)
Definition: Rtypes.h:336
void CopyPattern(const TRegexp &re)
Copy the regular expression pattern.
Definition: TRegexp.cxx:128
virtual ~TRegexp()
Destructor.
Definition: TRegexp.cxx:78
Ssiz_t Index(const TString &str, Ssiz_t *len, Ssiz_t start=0) const
Find the first occurrence of the regexp in string and return the position, or -1 if there is no match...
Definition: TRegexp.cxx:209
Bool_t IsNull() const
Definition: TString.h:385
EStatVal Status()
Check status of regexp.
Definition: TRegexp.cxx:231
const Bool_t kTRUE
Definition: RtypesCore.h:91
static const unsigned fgMaxpat
Definition: TRegexp.h:39
const char * Data() const
Definition: TString.h:347