Logo ROOT   6.16/01
Reference Guide
Match.cxx
Go to the documentation of this file.
1// @(#)root/base:$Id$
2// Author: Fons Rademakers 04/08/95
3
4/*************************************************************************
5 * Copyright (C) 1995-2000, Rene Brun and Fons Rademakers. *
6 * All rights reserved. *
7 * *
8 * For the licensing terms see $ROOTSYS/LICENSE. *
9 * For the list of contributors see $ROOTSYS/README/CREDITS. *
10 *************************************************************************/
11
12//////////////////////////////////////////////////////////////////////////
13// //
14// Author: Allen I. Holub //
15// //
16// (c) C Gazette. May be used freely as long as author and publication //
17// are acknowledged. //
18// //
19//////////////////////////////////////////////////////////////////////////
20
21#include <stdio.h>
22#include <ctype.h>
23#include <string.h>
24
25
26#include "Match.h"
27
28
29// Metacharacters in the input:
30#define BOL '^' // start-of-line anchor
31#define EOL '$' // end-of-line anchor
32#define ANY '.' // matches any character
33#define CCL '[' // start a character class
34#define CCLEND ']' // end a character class
35#define NCCL '^' // negates character class if 1st character
36#define CLOSURE '*' // Kleene closure (matches 0 or more)
37#define PCLOSE '+' // Positive closure (1 or more)
38#define OPT '?' // Optional closure (0 or 1)
39
40enum Eaction { // These are put in the pattern string
41 // to represent metacharacters.
42 kBOL = (0x8000 | '^'),
43 kEOL = (0x8000 | '$'),
44 kANY = (0x8000 | '.'),
45 kCCL = (0x8000 | '['),
46 kOPT = (0x8000 | '?'),
47 kCLOSE = (0x8000 | '*'),
48 kPCLOSE = (0x8000 | '+'),
49 kEND = (0x8000 | 0) // end of pattern
50
51};
52
53#if 1
54#define ISHEXDIGIT(x) isxdigit((unsigned char)x)
55#else
56#define ISHEXDIGIT(x) (isdigit(x) \
57 || ('a'<=(x) && (x)<='f') \
58 || ('A'<=(x) && (x)<='F') )
59#endif
60
61#define ISOCTDIGIT(x) ('0'<=(x) && (x)<='7')
62
63// ----------------------------------------------------------------------
64#define MAPSIZE 16 // need this many Pattern_t elements for
65 // character class bit map
66
67////////////////////////////////////////////////////////////////////////////////
68/// Advance a pointer into the pattern template to the next pattern element,
69/// this is a+1 for all pattern elements but kCCL, where you have to skip
70/// past both the kCCL character and the bitmap that follows that character.
71
72inline void ADVANCE(const Pattern_t*& pat)
73{
74 if (*pat++ == (Pattern_t) kCCL)
75 pat += MAPSIZE;
76}
77
78//
79// Bitmap functions. Set bit b in the map and
80// test bit b to see if it was set previously.
81//
82
83////////////////////////////////////////////////////////////////////////////////
84
85#ifdef SETBIT // from Rtypes.h
86#undef SETBIT
87#endif
88
89static void SETBIT(unsigned char b, Pattern_t* map)
90{
91 map[b >> 4] |= 1 << (b & 0x0f);
92}
93
94////////////////////////////////////////////////////////////////////////////////
95
96static int TSTBIT(unsigned char b, const Pattern_t* map)
97{
98 return map[b >> 4] & (1 << (b & 0x0f));
99}
100
101// ----------------------------------------------------------------------
102
103#define E_NONE 0 // Possible return values from pat_err
104#define E_ILLEGAL 1 // Set in Makepat() to indicate prob-
105#define E_NOMEM 2 // lems that came up while making the
106#define E_PAT 3 // pattern template.
107
108// ----------------------------------------------------------------------
109
110static const char* doccl(Pattern_t*, const char*);
111static int hex2bin(int);
112static int oct2bin(int);
113static int omatch(const char**, size_t*, const Pattern_t*, const char*);
114static const char* patcmp(const char*, size_t, const Pattern_t*, const char*);
115static int esc(const char**);
116
117// ----------------------------------------------------------------------
118
119////////////////////////////////////////////////////////////////////////////////
120/// Make a pattern template from the string pointed to by exp. Stop when
121/// `\0` is found in exp. The pattern template is assembled
122/// in pat whose length is given by maxpat.
123///
124/// Return:
125/// - E_ILLEGAL Illegal input pattern.
126/// - E_NOMEM out of memory.
127/// - E_PAT pattern too long.
128
129int Makepat(const char* exp, // Regular expression
130 Pattern_t* pat, // Assembled compiled pattern
131 int maxpat) // Length of pat
132{
133 Pattern_t* cur; // pointer to current pattern element
134 Pattern_t* prev; // pointer to previous pattern element
135 int theError = E_ILLEGAL;
136
137 if (!*exp)
138 goto exit;
139
140 if (*exp == CLOSURE || *exp == PCLOSE || *exp == OPT)
141 goto exit;
142
143 theError = E_NOMEM;
144 if (!pat) goto exit; // Check for bad pat
145
146 prev = cur = pat;
147 theError = E_PAT;
148
149 while (*exp) {
150
151 if (cur >= &pat[maxpat - 1]) goto exit;
152
153 switch (*exp) {
154 case ANY:
155 *cur = (Pattern_t)kANY;
156 prev = cur++;
157 ++exp;
158 break;
159
160 case BOL:
161 *cur = (cur == pat) ? (Pattern_t)kBOL : (unsigned char)*exp;
162 prev = cur++;
163 ++exp;
164 break;
165
166 case EOL:
167 *cur = (!exp[1]) ? (Pattern_t)kEOL : (unsigned char)*exp;
168 prev = cur++;
169 ++exp;
170 break;
171
172 case CCL:
173 if (((cur - pat) + MAPSIZE) >= maxpat)
174 goto exit; // not enough room for bit map
175 prev = cur;
176 *cur++ = (Pattern_t)kCCL;
177 exp = doccl(cur, exp);
178 if (*exp != CCLEND) goto exit;
179 ++exp;
180 cur += MAPSIZE;
181 break;
182
183 case OPT:
184 case CLOSURE:
185 case PCLOSE:
186 switch (*prev) {
187 case kBOL:
188 case kEOL:
189 case kOPT:
190 case kPCLOSE:
191 case kCLOSE:
192 goto exit;
193 }
194
195 memmove( prev+1, prev, (cur-prev)*sizeof(*cur));
196 *prev = (Pattern_t) (*exp == OPT) ? kOPT :
197 (*exp == PCLOSE) ? kPCLOSE :
198 kCLOSE;
199 ++cur;
200 ++exp;
201 break;
202
203 default:
204 prev = cur;
205 *cur++ = esc(&exp);
206 break;
207 }
208 }
209
210 *cur = (Pattern_t)kEND;
211 theError = E_NONE;
212
213exit:
214 return theError;
215}
216
217////////////////////////////////////////////////////////////////////////////////
218/// Match a string with a pattern.
219
220const char *Matchs(const char* str,
221 size_t slen,
222 const Pattern_t* pat,
223 const char** startpat)
224{
225 if (!pat) return 0;
226 const char* endp = 0;
227 if (*pat == (Pattern_t)kBOL) {
228 // the rest has to match directly
229 endp = patcmp(str, slen, pat+1, str);
230 } else {
231 // scoot along the string until it matches, or no more string
232 const char* start = str;
233 while ((endp = patcmp(str, slen, pat, start)) == 0 && slen != 0)
234 ++str, --slen;
235 }
236 *startpat = str;
237 return endp;
238}
239
240////////////////////////////////////////////////////////////////////////////////
241/// Set bits in the map corresponding to characters specified in the src
242/// character class.
243
244static const char *doccl(Pattern_t* map, const char* src)
245{
246 int negative;
247
248 ++src; // skip past the [
249 negative = (*src == NCCL);
250 if (negative) // check for negative ccl
251 ++src;
252 memset(map, 0, MAPSIZE*sizeof(*map)); // bitmap initially empty
253
254 while (*src && *src != CCLEND) {
255 int isCCL = (*src == CCL); // support [] in pattern, don't put single [ before closing ]
256 unsigned char first = esc(&src);
257 SETBIT(first, map);
258 if (isCCL && *src && *src == CCLEND) {
259 first = esc(&src);
260 SETBIT(first, map);
261 }
262 if (*src == '-' && src[1] && src[1] != CCLEND) {
263 ++src; // skip to end-of-sequence char
264 unsigned char last = esc(&src);
265 if (first <= last) while (first < last ) SETBIT(++first, map);
266 else while (last < first) SETBIT(last++, map);
267 }
268 }
269
270 if (negative) {
271 for (int i = MAPSIZE; --i >= 0;)
272 *map++ ^= ~0; // invert all bits
273 }
274
275 return src;
276}
277
278////////////////////////////////////////////////////////////////////////////////
279/// Like strcmp, but compares str against pat. Each element of str is
280/// compared with the template until either a mis-match is found or the end
281/// of the template is reached. In the former case a 0 is returned; in the
282/// latter, a pointer into str (pointing after the last character in the
283/// matched pattern) is returned. start points at the first character in
284/// the string, which might not be the same thing as str if the search
285/// started in the middle of the string.
286
287static const char *patcmp(const char* str,
288 size_t slen,
289 const Pattern_t* pat,
290 const char* start)
291{
292 if (!pat) // make sure pattern is valid
293 return 0;
294
295 while (*pat != (Pattern_t)kEND) {
296 if (*pat == (Pattern_t)kOPT) {
297
298 // Zero or one matches. It doesn't matter if omatch fails---it will
299 // advance str past the character on success, though. Always advance
300 // the pattern past both the kOPT and the operand.
301 omatch(&str, &slen, ++pat, start);
302 ADVANCE(pat);
303
304 } else if (*pat != (Pattern_t)kCLOSE &&
305 *pat != (Pattern_t)kPCLOSE) {
306
307 // Do a simple match. Note that omatch() fails if there's still
308 // something in pat but we're at end of string.
309
310 if (!omatch(&str, &slen, pat, start))
311 return 0;
312 ADVANCE(pat);
313
314 } else { // Process a Kleene or positive closure
315
316 if (*pat++ == (Pattern_t)kPCLOSE) // one match required
317 if (!omatch(&str, &slen, pat, start))
318 return 0;
319
320 // Match as many as possible, zero is okay
321
322 const char* bocl = str; // beginning of closure string.
323 while (slen && omatch(&str, &slen, pat, start))
324 ;
325 ADVANCE(pat); // skip over the closure
326 if (*pat == (Pattern_t)kEND)
327 break;
328
329 // 'str' now points to the character that made made us fail. Try to
330 // process the rest of the string. If the character following the
331 // closure could have been in the closure (as in the pattern "[a-z]*t")
332 // the final 't' will be sucked up in the while loop. So, if the match
333 // fails, back up a notch and try to match the rest of the string
334 // again, repeating this process until we get back to the
335 // beginning of the closure. The recursion goes as many levels
336 // deep as there are closures in the pattern.
337
338 const char* end;
339 while ((end = patcmp(str, slen, pat, start)) == 0) {
340 ++slen, --str;
341 if (str < bocl) break;
342 }
343 return end;
344
345 } // closure
346 } // while (*pat != kEND)
347
348 //
349 // omatch() advances str to point at the next character to be matched. So
350 // str points at the character following the last character matched when
351 // you reach the end of the template. The exceptions are templates
352 // containing only a BOLN or EOLN token. In these cases omatch doesn't
353 // advance.
354 //
355
356 return str;
357}
358
359////////////////////////////////////////////////////////////////////////////////
360/// Match one pattern element, pointed at by pat, against the character at
361/// **strp. Return 0 on a failure, 1 on success. *strp is advanced to skip
362/// over the matched character on a successful match. Closure is handled one
363/// level up by patcmp().
364///
365/// "start" points at the character at the left edge of the line. This might
366/// not be the same thing as *strp if the search is starting in the middle
367/// of the string. An end-of- line anchor matches end of string only.
368
369static int omatch(const char** strp,
370 size_t* slenp,
371 const Pattern_t* pat,
372 const char* start)
373{
374 switch (*pat) {
375 // Match beginning of line
376 case kBOL: return (*strp == start);
377
378 // Match end of line
379 case kEOL: return (*slenp == 0);
380
381 // Notice: cases above don't advance.
382 // Match any except newline
383 case kANY: if (**strp == '\n') return 0;
384 break;
385
386 // Set match
387 case kCCL: if (*slenp == 0 || !TSTBIT(**strp, pat + 1)) return 0;
388 break;
389
390 // Literal match
391 default: if (*slenp == 0 || (unsigned char) **strp != *pat) return 0;
392 break;
393 }
394
395 if (*slenp) {
396 ++*strp;
397 --*slenp;
398 }
399 return 2;
400}
401
402////////////////////////////////////////////////////////////////////////////////
403/// Convert the hex digit represented by 'c' to an int. 'c'
404/// must be one of: 0123456789abcdefABCDEF
405
406static int hex2bin(int c)
407{
408 return (isdigit(c) ? (c)-'0': ((toupper((unsigned char)c))-'A')+10) & 0xf;
409}
410
411////////////////////////////////////////////////////////////////////////////////
412/// Convert the hex digit represented by 'c' to an int. 'c'
413/// must be a digit in the range '0'-'7'.
414
415static int oct2bin(int c)
416{
417 return ( ((c)-'0') & 0x7 );
418}
419
420////////////////////////////////////////////////////////////////////////////////
421/// Map escape sequences into their equivalent symbols. Return
422/// the equivalent ASCII character. *s is advanced past the
423/// escape sequence. If no escape sequence is present, the
424/// current character is returned and the string is advanced by
425/// one. The following are recognized:
426///
427/// - `\b` backspace
428/// - `\f` formfeed
429/// - `\n` newline
430/// - `\r` carriage return
431/// - `\s` space
432/// - `\t` tab
433/// - `\e` ASCII ESC character ('\033')
434/// - `\DDD` number formed of 1-3 octal digits
435/// - `\xDD` number formed of 1-2 hex digits
436/// - `\^C` C = any letter. Control code
437
438static int esc(const char** s)
439{
440 int rval;
441
442 if (**s != '\\')
443 rval = *((*s)++);
444 else {
445 ++(*s); // Skip the backslash (\‍)
446 switch (toupper((unsigned char)**s)) {
447 case '\0': rval = '\\'; break;
448 case 'B': rval = '\b'; break;
449 case 'F': rval = '\f'; break;
450 case 'N': rval = '\n'; break;
451 case 'R': rval = '\r'; break;
452 case 'S': rval = ' '; break;
453 case 'T': rval = '\t'; break;
454 case 'E': rval = '\033'; break;
455
456 case '^':
457 rval = *++(*s) ;
458 rval = toupper((unsigned char)rval) - '@';
459 break;
460
461 case 'X':
462 rval = 0;
463 ++(*s);
464 if (ISHEXDIGIT(**s)) {
465 rval = hex2bin((unsigned char) *(*s)++);
466 }
467 if (ISHEXDIGIT(**s)) {
468 rval <<= 4;
469 rval |= hex2bin((unsigned char) *(*s)++);
470 }
471 --(*s);
472 break;
473
474 default:
475 if (!ISOCTDIGIT(**s))
476 rval = **s;
477 else {
478 rval = oct2bin(*(*s)++);
479 if (ISOCTDIGIT(**s)) {
480 rval <<= 3;
481 rval |= oct2bin(*(*s)++);
482 }
483 if (ISOCTDIGIT(**s)) {
484 rval <<= 3;
485 rval |= oct2bin(*(*s)++);
486 }
487 --(*s);
488 }
489 break;
490 }
491 ++(*s);
492 }
493 return (unsigned char)rval;
494}
#define MAPSIZE
Definition: Match.cxx:64
int Makepat(const char *exp, Pattern_t *pat, int maxpat)
Make a pattern template from the string pointed to by exp.
Definition: Match.cxx:129
const char * Matchs(const char *str, size_t slen, const Pattern_t *pat, const char **startpat)
Match a string with a pattern.
Definition: Match.cxx:220
#define ISHEXDIGIT(x)
Definition: Match.cxx:54
#define OPT
Definition: Match.cxx:38
#define E_NOMEM
Definition: Match.cxx:105
static int omatch(const char **, size_t *, const Pattern_t *, const char *)
Match one pattern element, pointed at by pat, against the character at **strp.
Definition: Match.cxx:369
#define BOL
Definition: Match.cxx:30
static void SETBIT(unsigned char b, Pattern_t *map)
Definition: Match.cxx:89
#define NCCL
Definition: Match.cxx:35
Eaction
Definition: Match.cxx:40
@ kEND
Definition: Match.cxx:49
@ kCCL
Definition: Match.cxx:45
@ kCLOSE
Definition: Match.cxx:47
@ kOPT
Definition: Match.cxx:46
@ kEOL
Definition: Match.cxx:43
@ kBOL
Definition: Match.cxx:42
@ kANY
Definition: Match.cxx:44
@ kPCLOSE
Definition: Match.cxx:48
static int hex2bin(int)
Convert the hex digit represented by 'c' to an int.
Definition: Match.cxx:406
#define PCLOSE
Definition: Match.cxx:37
static int TSTBIT(unsigned char b, const Pattern_t *map)
Definition: Match.cxx:96
#define EOL
Definition: Match.cxx:31
#define E_NONE
Definition: Match.cxx:103
static const char * doccl(Pattern_t *, const char *)
Set bits in the map corresponding to characters specified in the src character class.
Definition: Match.cxx:244
static const char * patcmp(const char *, size_t, const Pattern_t *, const char *)
Like strcmp, but compares str against pat.
Definition: Match.cxx:287
#define CCLEND
Definition: Match.cxx:34
#define ANY
Definition: Match.cxx:32
#define ISOCTDIGIT(x)
Definition: Match.cxx:61
#define E_ILLEGAL
Definition: Match.cxx:104
#define CCL
Definition: Match.cxx:33
#define E_PAT
Definition: Match.cxx:106
void ADVANCE(const Pattern_t *&pat)
Advance a pointer into the pattern template to the next pattern element, this is a+1 for all pattern ...
Definition: Match.cxx:72
static int oct2bin(int)
Convert the hex digit represented by 'c' to an int.
Definition: Match.cxx:415
#define CLOSURE
Definition: Match.cxx:36
static int esc(const char **)
Map escape sequences into their equivalent symbols.
Definition: Match.cxx:438
unsigned short Pattern_t
Definition: Match.h:26
#define b(i)
Definition: RSha256.hxx:100
#define c(i)
Definition: RSha256.hxx:101
double exp(double)
static constexpr double s
Definition: first.py:1