ROOT  6.06/09
Reference Guide
Match.cxx
Go to the documentation of this file.
1 // @(#)root/base:$Id$
2 // Author: Fons Rademakers 04/08/95
3 
4 /*************************************************************************
5  * Copyright (C) 1995-2000, Rene Brun and Fons Rademakers. *
6  * All rights reserved. *
7  * *
8  * For the licensing terms see $ROOTSYS/LICENSE. *
9  * For the list of contributors see $ROOTSYS/README/CREDITS. *
10  *************************************************************************/
11 
12 //////////////////////////////////////////////////////////////////////////
13 // //
14 // Author: Allen I. Holub //
15 // //
16 // (c) C Gazette. May be used freely as long as author and publication //
17 // are acknowledged. //
18 // //
19 //////////////////////////////////////////////////////////////////////////
20 
21 #include <stdio.h>
22 #include <ctype.h>
23 #include <string.h>
24 
25 
26 #include "Match.h"
27 
28 
29 // Metacharacters in the input:
30 #define BOL '^' // start-of-line anchor
31 #define EOL '$' // end-of-line anchor
32 #define ANY '.' // matches any character
33 #define CCL '[' // start a character class
34 #define CCLEND ']' // end a character class
35 #define NCCL '^' // negates character class if 1st character
36 #define CLOSURE '*' // Kleene closure (matches 0 or more)
37 #define PCLOSE '+' // Positive closure (1 or more)
38 #define OPT '?' // Optional closure (0 or 1)
39 
40 enum Eaction { // These are put in the pattern string
41  // to represent metacharacters.
42  kBOL = (0x8000 | '^'),
43  kEOL = (0x8000 | '$'),
44  kANY = (0x8000 | '.'),
45  kCCL = (0x8000 | '['),
46  kOPT = (0x8000 | '?'),
47  kCLOSE = (0x8000 | '*'),
48  kPCLOSE = (0x8000 | '+'),
49  kEND = (0x8000 | 0) // end of pattern
50 
51 };
52 
53 #if 1
54 #define ISHEXDIGIT(x) isxdigit((unsigned char)x)
55 #else
56 #define ISHEXDIGIT(x) (isdigit(x) \
57  || ('a'<=(x) && (x)<='f') \
58  || ('A'<=(x) && (x)<='F') )
59 #endif
60 
61 #define ISOCTDIGIT(x) ('0'<=(x) && (x)<='7')
62 
63 // ----------------------------------------------------------------------
64 #define MAPSIZE 16 // need this many Pattern_t elements for
65  // character class bit map
66 
67 ////////////////////////////////////////////////////////////////////////////////
68 /// Advance a pointer into the pattern template to the next pattern element,
69 /// this is a+1 for all pattern elements but kCCL, where you have to skip
70 /// past both the kCCL character and the bitmap that follows that character.
71 
72 inline void ADVANCE(const Pattern_t*& pat)
73 {
74  if (*pat++ == (Pattern_t) kCCL)
75  pat += MAPSIZE;
76 }
77 
78 //
79 // Bitmap functions. Set bit b in the map and
80 // test bit b to see if it was set previously.
81 //
82 
83 ////////////////////////////////////////////////////////////////////////////////
84 
85 #ifdef SETBIT // from Rtypes.h
86 #undef SETBIT
87 #endif
88 
89 static void SETBIT(unsigned char b, Pattern_t* map)
90 {
91  map[b >> 4] |= 1 << (b & 0x0f);
92 }
93 
94 ////////////////////////////////////////////////////////////////////////////////
95 
96 static int TSTBIT(unsigned char b, const Pattern_t* map)
97 {
98  return map[b >> 4] & (1 << (b & 0x0f));
99 }
100 
101 // ----------------------------------------------------------------------
102 
103 #define E_NONE 0 // Possible return values from pat_err
104 #define E_ILLEGAL 1 // Set in Makepat() to indicate prob-
105 #define E_NOMEM 2 // lems that came up while making the
106 #define E_PAT 3 // pattern template.
107 
108 // ----------------------------------------------------------------------
109 
110 static const char* doccl(Pattern_t*, const char*);
111 static int hex2bin(int);
112 static int oct2bin(int);
113 static int omatch(const char**, size_t*, const Pattern_t*, const char*);
114 static const char* patcmp(const char*, size_t, const Pattern_t*, const char*);
115 static int esc(const char**);
116 
117 // ----------------------------------------------------------------------
118 
119 ////////////////////////////////////////////////////////////////////////////////
120 /// Make a pattern template from the string pointed to by exp. Stop when
121 /// `\0` is found in exp. The pattern template is assembled
122 /// in pat whose length is given by maxpat.
123 ///
124 /// Return:
125 /// - E_ILLEGAL Illegal input pattern.
126 /// - E_NOMEM out of memory.
127 /// - E_PAT pattern too long.
128 
129 int Makepat(const char* exp, // Regular expression
130  Pattern_t* pat, // Assembled compiled pattern
131  int maxpat) // Length of pat
132 {
133  Pattern_t* cur; // pointer to current pattern element
134  Pattern_t* prev; // pointer to previous pattern element
135  int theError = E_ILLEGAL;
136 
137  if (!*exp)
138  goto exit;
139 
140  if (*exp == CLOSURE || *exp == PCLOSE || *exp == OPT)
141  goto exit;
142 
143  theError = E_NOMEM;
144  if (!pat) goto exit; // Check for bad pat
145 
146  prev = cur = pat;
147  theError = E_PAT;
148 
149  while (*exp) {
150 
151  if (cur >= &pat[maxpat - 1]) goto exit;
152 
153  switch (*exp) {
154  case ANY:
155  *cur = (Pattern_t)kANY;
156  prev = cur++;
157  ++exp;
158  break;
159 
160  case BOL:
161  *cur = (cur == pat) ? (Pattern_t)kBOL : (unsigned char)*exp;
162  prev = cur++;
163  ++exp;
164  break;
165 
166  case EOL:
167  *cur = (!exp[1]) ? (Pattern_t)kEOL : (unsigned char)*exp;
168  prev = cur++;
169  ++exp;
170  break;
171 
172  case CCL:
173  if (((cur - pat) + MAPSIZE) >= maxpat)
174  goto exit; // not enough room for bit map
175  prev = cur;
176  *cur++ = (Pattern_t)kCCL;
177  exp = doccl(cur, exp);
178  if (*exp != CCLEND) goto exit;
179  ++exp;
180  cur += MAPSIZE;
181  break;
182 
183  case OPT:
184  case CLOSURE:
185  case PCLOSE:
186  switch (*prev) {
187  case kBOL:
188  case kEOL:
189  case kOPT:
190  case kPCLOSE:
191  case kCLOSE:
192  goto exit;
193  }
194 
195  memmove( prev+1, prev, (cur-prev)*sizeof(*cur));
196  *prev = (Pattern_t) (*exp == OPT) ? kOPT :
197  (*exp == PCLOSE) ? kPCLOSE :
198  kCLOSE;
199  ++cur;
200  ++exp;
201  break;
202 
203  default:
204  prev = cur;
205  *cur++ = esc(&exp);
206  break;
207  }
208  }
209 
210  *cur = (Pattern_t)kEND;
211  theError = E_NONE;
212 
213 exit:
214  return theError;
215 }
216 
217 ////////////////////////////////////////////////////////////////////////////////
218 /// Match a string with a pattern.
219 
220 const char *Matchs(const char* str,
221  size_t slen,
222  const Pattern_t* pat,
223  const char** startpat)
224 {
225  if (!pat) return 0;
226  const char* endp = 0;
227  if (*pat == (Pattern_t)kBOL) {
228  // the rest has to match directly
229  endp = patcmp(str, slen, pat+1, str);
230  } else {
231  // scoot along the string until it matches, or no more string
232  const char* start = str;
233  while ((endp = patcmp(str, slen, pat, start)) == 0 && slen != 0)
234  ++str, --slen;
235  }
236  *startpat = str;
237  return endp;
238 }
239 
240 ////////////////////////////////////////////////////////////////////////////////
241 /// Set bits in the map corresponding to characters specified in the src
242 /// character class.
243 
244 static const char *doccl(Pattern_t* map, const char* src)
245 {
246  int negative;
247 
248  ++src; // skip past the [
249  negative = (*src == NCCL);
250  if (negative) // check for negative ccl
251  ++src;
252  memset(map, 0, MAPSIZE*sizeof(*map)); // bitmap initially empty
253 
254  while (*src && *src != CCLEND) {
255  int isCCL = (*src == CCL); // support [] in pattern, don't put single [ before closing ]
256  unsigned char first = esc(&src);
257  SETBIT(first, map);
258  if (isCCL && *src && *src == CCLEND) {
259  first = esc(&src);
260  SETBIT(first, map);
261  }
262  if (*src == '-' && src[1] && src[1] != CCLEND) {
263  ++src; // skip to end-of-sequence char
264  unsigned char last = esc(&src);
265  if (first <= last) while (first < last ) SETBIT(++first, map);
266  else while (last < first) SETBIT(last++, map);
267  }
268  }
269 
270  if (negative) {
271  for (int i = MAPSIZE; --i >= 0;)
272  *map++ ^= ~0; // invert all bits
273  }
274 
275  return src;
276 }
277 
278 ////////////////////////////////////////////////////////////////////////////////
279 /// Like strcmp, but compares str against pat. Each element of str is
280 /// compared with the template until either a mis-match is found or the end
281 /// of the template is reached. In the former case a 0 is returned; in the
282 /// latter, a pointer into str (pointing after the last character in the
283 /// matched pattern) is returned. start points at the first character in
284 /// the string, which might not be the same thing as str if the search
285 /// started in the middle of the string.
286 
287 static const char *patcmp(const char* str,
288  size_t slen,
289  const Pattern_t* pat,
290  const char* start)
291 {
292  if (!pat) // make sure pattern is valid
293  return 0;
294 
295  while (*pat != (Pattern_t)kEND) {
296  if (*pat == (Pattern_t)kOPT) {
297 
298  // Zero or one matches. It doesn't matter if omatch fails---it will
299  // advance str past the character on success, though. Always advance
300  // the pattern past both the kOPT and the operand.
301  omatch(&str, &slen, ++pat, start);
302  ADVANCE(pat);
303 
304  } else if (*pat != (Pattern_t)kCLOSE &&
305  *pat != (Pattern_t)kPCLOSE) {
306 
307  // Do a simple match. Note that omatch() fails if there's still
308  // something in pat but we're at end of string.
309 
310  if (!omatch(&str, &slen, pat, start))
311  return 0;
312  ADVANCE(pat);
313 
314  } else { // Process a Kleene or positive closure
315 
316  if (*pat++ == (Pattern_t)kPCLOSE) // one match required
317  if (!omatch(&str, &slen, pat, start))
318  return 0;
319 
320  // Match as many as possible, zero is okay
321 
322  const char* bocl = str; // beginning of closure string.
323  while (slen && omatch(&str, &slen, pat, start))
324  ;
325  ADVANCE(pat); // skip over the closure
326  if (*pat == (Pattern_t)kEND)
327  break;
328 
329  // 'str' now points to the character that made made us fail. Try to
330  // process the rest of the string. If the character following the
331  // closure could have been in the closure (as in the pattern "[a-z]*t")
332  // the final 't' will be sucked up in the while loop. So, if the match
333  // fails, back up a notch and try to match the rest of the string
334  // again, repeating this process until we get back to the
335  // beginning of the closure. The recursion goes as many levels
336  // deep as there are closures in the pattern.
337 
338  const char* end;
339  while ((end = patcmp(str, slen, pat, start)) == 0) {
340  ++slen, --str;
341  if (str < bocl) break;
342  }
343  return end;
344 
345  } // closure
346  } // while (*pat != kEND)
347 
348  //
349  // omatch() advances str to point at the next character to be matched. So
350  // str points at the character following the last character matched when
351  // you reach the end of the template. The exceptions are templates
352  // containing only a BOLN or EOLN token. In these cases omatch doesn't
353  // advance.
354  //
355 
356  return str;
357 }
358 
359 ////////////////////////////////////////////////////////////////////////////////
360 /// Match one pattern element, pointed at by pat, against the character at
361 /// **strp. Return 0 on a failure, 1 on success. *strp is advanced to skip
362 /// over the matched character on a successful match. Closure is handled one
363 /// level up by patcmp().
364 ///
365 /// "start" points at the character at the left edge of the line. This might
366 /// not be the same thing as *strp if the search is starting in the middle
367 /// of the string. An end-of- line anchor matches end of string only.
368 
369 static int omatch(const char** strp,
370  size_t* slenp,
371  const Pattern_t* pat,
372  const char* start)
373 {
374  switch (*pat) {
375  // Match beginning of line
376  case kBOL: return (*strp == start);
377 
378  // Match end of line
379  case kEOL: return (*slenp == 0);
380 
381  // Notice: cases above don't advance.
382  // Match any except newline
383  case kANY: if (**strp == '\n') return 0;
384  break;
385 
386  // Set match
387  case kCCL: if (*slenp == 0 || !TSTBIT(**strp, pat + 1)) return 0;
388  break;
389 
390  // Literal match
391  default: if (*slenp == 0 || (unsigned char) **strp != *pat) return 0;
392  break;
393  }
394 
395  if (*slenp) {
396  ++*strp;
397  --*slenp;
398  }
399  return 2;
400 }
401 
402 ////////////////////////////////////////////////////////////////////////////////
403 /// Convert the hex digit represented by 'c' to an int. 'c'
404 /// must be one of: 0123456789abcdefABCDEF
405 
406 static int hex2bin(int c)
407 {
408  return (isdigit(c) ? (c)-'0': ((toupper((unsigned char)c))-'A')+10) & 0xf;
409 }
410 
411 ////////////////////////////////////////////////////////////////////////////////
412 /// Convert the hex digit represented by 'c' to an int. 'c'
413 /// must be a digit in the range '0'-'7'.
414 
415 static int oct2bin(int c)
416 {
417  return ( ((c)-'0') & 0x7 );
418 }
419 
420 ////////////////////////////////////////////////////////////////////////////////
421 /// Map escape sequences into their equivalent symbols. Return
422 /// the equivalent ASCII character. *s is advanced past the
423 /// escape sequence. If no escape sequence is present, the
424 /// current character is returned and the string is advanced by
425 /// one. The following are recognized:
426 ///
427 /// - `\b` backspace
428 /// - `\f` formfeed
429 /// - `\n` newline
430 /// - `\r` carriage return
431 /// - `\s` space
432 /// - `\t` tab
433 /// - `\e` ASCII ESC character ('\033')
434 /// - `\DDD` number formed of 1-3 octal digits
435 /// - `\xDD` number formed of 1-2 hex digits
436 /// - `\^C` C = any letter. Control code
437 
438 static int esc(const char** s)
439 {
440  int rval;
441 
442  if (**s != '\\')
443  rval = *((*s)++);
444  else {
445  ++(*s); // Skip the backslash (\)
446  switch (toupper((unsigned char)**s)) {
447  case '\0': rval = '\\'; break;
448  case 'B': rval = '\b'; break;
449  case 'F': rval = '\f'; break;
450  case 'N': rval = '\n'; break;
451  case 'R': rval = '\r'; break;
452  case 'S': rval = ' '; break;
453  case 'T': rval = '\t'; break;
454  case 'E': rval = '\033'; break;
455 
456  case '^':
457  rval = *++(*s) ;
458  rval = toupper((unsigned char)rval) - '@';
459  break;
460 
461  case 'X':
462  rval = 0;
463  ++(*s);
464  if (ISHEXDIGIT(**s)) {
465  rval = hex2bin((unsigned char) *(*s)++);
466  }
467  if (ISHEXDIGIT(**s)) {
468  rval <<= 4;
469  rval |= hex2bin((unsigned char) *(*s)++);
470  }
471  --(*s);
472  break;
473 
474  default:
475  if (!ISOCTDIGIT(**s))
476  rval = **s;
477  else {
478  rval = oct2bin(*(*s)++);
479  if (ISOCTDIGIT(**s)) {
480  rval <<= 3;
481  rval |= oct2bin(*(*s)++);
482  }
483  if (ISOCTDIGIT(**s)) {
484  rval <<= 3;
485  rval |= oct2bin(*(*s)++);
486  }
487  --(*s);
488  }
489  break;
490  }
491  ++(*s);
492  }
493  return (unsigned char)rval;
494 }
Definition: Match.cxx:46
#define E_NOMEM
Definition: Match.cxx:105
#define ISHEXDIGIT(x)
Definition: Match.cxx:54
const char * Matchs(const char *str, size_t slen, const Pattern_t *pat, const char **startpat)
Match a string with a pattern.
Definition: Match.cxx:220
static int omatch(const char **, size_t *, const Pattern_t *, const char *)
Match one pattern element, pointed at by pat, against the character at **strp.
Definition: Match.cxx:369
#define MAPSIZE
Definition: Match.cxx:64
#define ANY
Definition: Match.cxx:32
static const char * patcmp(const char *, size_t, const Pattern_t *, const char *)
Like strcmp, but compares str against pat.
Definition: Match.cxx:287
#define CCLEND
Definition: Match.cxx:34
#define CCL
Definition: Match.cxx:33
#define EOL
Definition: Match.cxx:31
Definition: Match.cxx:44
#define CLOSURE
Definition: Match.cxx:36
Definition: Match.cxx:43
static int hex2bin(int)
Convert the hex digit represented by 'c' to an int.
Definition: Match.cxx:406
Definition: Match.cxx:47
static void SETBIT(unsigned char b, Pattern_t *map)
Definition: Match.cxx:89
#define E_PAT
Definition: Match.cxx:106
Eaction
Definition: Match.cxx:40
#define OPT
Definition: Match.cxx:38
unsigned short Pattern_t
Definition: Match.h:26
#define ISOCTDIGIT(x)
Definition: Match.cxx:61
static const char * doccl(Pattern_t *, const char *)
Set bits in the map corresponding to characters specified in the src character class.
Definition: Match.cxx:244
#define E_ILLEGAL
Definition: Match.cxx:104
#define NCCL
Definition: Match.cxx:35
#define BOL
Definition: Match.cxx:30
static int TSTBIT(unsigned char b, const Pattern_t *map)
Definition: Match.cxx:96
void ADVANCE(const Pattern_t *&pat)
Advance a pointer into the pattern template to the next pattern element, this is a+1 for all pattern ...
Definition: Match.cxx:72
static int oct2bin(int)
Convert the hex digit represented by 'c' to an int.
Definition: Match.cxx:415
Definition: Match.cxx:49
Definition: Match.cxx:45
Definition: Match.cxx:42
int Makepat(const char *exp, Pattern_t *pat, int maxpat)
Make a pattern template from the string pointed to by exp.
Definition: Match.cxx:129
static int esc(const char **)
Map escape sequences into their equivalent symbols.
Definition: Match.cxx:438
#define PCLOSE
Definition: Match.cxx:37
double exp(double)
#define E_NONE
Definition: Match.cxx:103