Logo ROOT   6.08/07
Reference Guide
TPRegexp.cxx
Go to the documentation of this file.
1 // @(#)root/base:$Id$
2 // Author: Eddy Offermann 24/06/05
3 
4 /*************************************************************************
5  * Copyright (C) 1995-2005, Rene Brun and Fons Rademakers. *
6  * All rights reserved. *
7  * *
8  * For the licensing terms see $ROOTSYS/LICENSE. *
9  * For the list of contributors see $ROOTSYS/README/CREDITS. *
10  *************************************************************************/
11 
12 /** \class TPRegexp
13 \ingroup Base
14 
15 C++ Wrapper for the "Perl Compatible Regular Expressions" library
16  The PCRE lib can be found at: http://www.pcre.org/
17 
18 Extensive documentation about Regular expressions in Perl can be
19 found at : http://perldoc.perl.org/perlre.html
20 */
21 
22 #include "Riostream.h"
23 #include "TPRegexp.h"
24 #include "TObjArray.h"
25 #include "TObjString.h"
26 #include "TError.h"
27 
28 #include <pcre.h>
29 
30 #include <vector>
31 #include <stdexcept>
32 
33 struct PCREPriv_t {
34  pcre *fPCRE;
35  pcre_extra *fPCREExtra;
36 
37  PCREPriv_t() { fPCRE = 0; fPCREExtra = 0; }
38 };
39 
40 
42 
44 
45 ////////////////////////////////////////////////////////////////////////////////
46 /// Default ctor.
47 
49 {
50  fPriv = new PCREPriv_t;
51  fPCREOpts = 0;
52 }
53 
54 ////////////////////////////////////////////////////////////////////////////////
55 /// Create and initialize with pat.
56 
58 {
59  fPattern = pat;
60  fPriv = new PCREPriv_t;
61  fPCREOpts = 0;
62 }
63 
64 ////////////////////////////////////////////////////////////////////////////////
65 /// Copy ctor.
66 
68 {
69  fPattern = p.fPattern;
70  fPriv = new PCREPriv_t;
71  fPCREOpts = p.fPCREOpts;
72 }
73 
74 ////////////////////////////////////////////////////////////////////////////////
75 /// Cleanup.
76 
78 {
79  if (fPriv->fPCRE)
80  pcre_free(fPriv->fPCRE);
81  if (fPriv->fPCREExtra)
82  pcre_free(fPriv->fPCREExtra);
83  delete fPriv;
84 }
85 
86 ////////////////////////////////////////////////////////////////////////////////
87 /// Assignment operator.
88 
90 {
91  if (this != &p) {
92  fPattern = p.fPattern;
93  if (fPriv->fPCRE)
94  pcre_free(fPriv->fPCRE);
95  fPriv->fPCRE = 0;
96  if (fPriv->fPCREExtra)
97  pcre_free(fPriv->fPCREExtra);
98  fPriv->fPCREExtra = 0;
99  fPCREOpts = p.fPCREOpts;
100  }
101  return *this;
102 }
103 
104 ////////////////////////////////////////////////////////////////////////////////
105 /// Translate Perl modifier flags into pcre flags.
106 /// The supported modStr characters are: g, i, m, o, s, x, and the
107 /// special d for debug. The meaning of the letters is:
108 /// - m
109 /// Treat string as multiple lines. That is, change "^" and "$" from
110 /// matching the start or end of the string to matching the start or
111 /// end of any line anywhere within the string.
112 /// - s
113 /// Treat string as single line. That is, change "." to match any
114 /// character whatsoever, even a newline, which normally it would not match.
115 /// Used together, as /ms, they let the "." match any character whatsoever,
116 /// while still allowing "^" and "$" to match, respectively, just after and
117 /// just before newlines within the string.
118 /// - i
119 /// Do case-insensitive pattern matching.
120 /// - x
121 /// Extend your pattern's legibility by permitting whitespace and comments.
122 /// - p
123 /// Preserve the string matched such that ${^PREMATCH}, ${^MATCH},
124 /// and ${^POSTMATCH} are available for use after matching.
125 /// - g and c
126 /// Global matching, and keep the Current position after failed matching.
127 /// Unlike i, m, s and x, these two flags affect the way the regex is used
128 /// rather than the regex itself. See Using regular expressions in Perl in
129 /// perlretut for further explanation of the g and c modifiers.
130 /// For more detail see: http://perldoc.perl.org/perlre.html#Modifiers.
131 
132 UInt_t TPRegexp::ParseMods(const TString &modStr) const
133 {
134  UInt_t opts = 0;
135 
136  if (modStr.Length() <= 0)
137  return fPCREOpts;
138 
139  //translate perl flags into pcre flags
140  const char *m = modStr;
141  while (*m) {
142  switch (*m) {
143  case 'g':
144  opts |= kPCRE_GLOBAL;
145  break;
146  case 'i':
147  opts |= PCRE_CASELESS;
148  break;
149  case 'm':
150  opts |= PCRE_MULTILINE;
151  break;
152  case 'o':
153  opts |= kPCRE_OPTIMIZE;
154  break;
155  case 's':
156  opts |= PCRE_DOTALL;
157  break;
158  case 'x':
159  opts |= PCRE_EXTENDED;
160  break;
161  case 'd': // special flag to enable debug printing (not Perl compat.)
162  opts |= kPCRE_DEBUG_MSGS;
163  break;
164  default:
165  Error("ParseMods", "illegal pattern modifier: %c", *m);
166  opts = 0;
167  }
168  ++m;
169  }
170  return opts;
171 }
172 
173 ////////////////////////////////////////////////////////////////////////////////
174 /// Return PCRE modifier options as string.
175 /// For meaning of mods see ParseMods().
176 
178 {
179  TString ret;
180 
181  if (fPCREOpts & kPCRE_GLOBAL) ret += 'g';
182  if (fPCREOpts & PCRE_CASELESS) ret += 'i';
183  if (fPCREOpts & PCRE_MULTILINE) ret += 'm';
184  if (fPCREOpts & PCRE_DOTALL) ret += 's';
185  if (fPCREOpts & PCRE_EXTENDED) ret += 'x';
186  if (fPCREOpts & kPCRE_OPTIMIZE) ret += 'o';
187  if (fPCREOpts & kPCRE_DEBUG_MSGS) ret += 'd';
188 
189  return ret;
190 }
191 
192 ////////////////////////////////////////////////////////////////////////////////
193 /// Compile the fPattern.
194 
196 {
197  if (fPriv->fPCRE)
198  pcre_free(fPriv->fPCRE);
199 
200  if (fPCREOpts & kPCRE_DEBUG_MSGS)
201  Info("Compile", "PREGEX compiling %s", fPattern.Data());
202 
203  const char *errstr;
204  Int_t patIndex;
205  fPriv->fPCRE = pcre_compile(fPattern.Data(), fPCREOpts & kPCRE_INTMASK,
206  &errstr, &patIndex, 0);
207 
208  if (!fPriv->fPCRE) {
209  if (fgThrowAtCompileError) {
210  throw std::runtime_error
211  (TString::Format("TPRegexp::Compile() compilation of TPRegexp(%s) failed at: %d because %s",
212  fPattern.Data(), patIndex, errstr).Data());
213  } else {
214  Error("Compile", "compilation of TPRegexp(%s) failed at: %d because %s",
215  fPattern.Data(), patIndex, errstr);
216  return;
217  }
218  }
219 
220  if (fPriv->fPCREExtra || (fPCREOpts & kPCRE_OPTIMIZE))
221  Optimize();
222 }
223 
224 ////////////////////////////////////////////////////////////////////////////////
225 /// Send the pattern through the optimizer.
226 
228 {
229  if (fPriv->fPCREExtra)
230  pcre_free(fPriv->fPCREExtra);
231 
232  if (fPCREOpts & kPCRE_DEBUG_MSGS)
233  Info("Optimize", "PREGEX studying %s", fPattern.Data());
234 
235  const char *errstr;
236  // pcre_study allows less options - see pcre_internal.h PUBLIC_STUDY_OPTIONS.
237  fPriv->fPCREExtra = pcre_study(fPriv->fPCRE, 0, &errstr);
238 
239  if (!fPriv->fPCREExtra && errstr) {
240  Error("Optimize", "Optimization of TPRegexp(%s) failed: %s",
241  fPattern.Data(), errstr);
242  }
243 }
244 
245 ////////////////////////////////////////////////////////////////////////////////
246 /// Returns the number of expanded '$' constructs.
247 
249  const TString &replacePattern,
250  Int_t *offVec, Int_t nrMatch) const
251 {
252  Int_t nrSubs = 0;
253  const char *p = replacePattern;
254 
255  Int_t state = 0;
256  Int_t subnum = 0;
257  while (state != -1) {
258  switch (state) {
259  case 0:
260  if (!*p) {
261  state = -1;
262  break;
263  }
264  if (*p == '$') {
265  state = 1;
266  subnum = 0;
267  if (p[1] == '&') {
268  p++;
269  if (isdigit(p[1]))
270  p++;
271  } else if (!isdigit(p[1])) {
272  Error("ReplaceSubs", "badly formed replacement pattern: %s",
273  replacePattern.Data());
274  }
275  } else
276  final += *p;
277  break;
278  case 1:
279  if (isdigit(*p)) {
280  subnum *= 10;
281  subnum += (*p)-'0';
282  } else {
283  if (fPCREOpts & kPCRE_DEBUG_MSGS)
284  Info("ReplaceSubs", "PREGEX appending substr #%d", subnum);
285  if (subnum < 0 || subnum > nrMatch-1) {
286  Error("ReplaceSubs","bad string number: %d",subnum);
287  } else {
288  const TString subStr = s(offVec[2*subnum],offVec[2*subnum+1]-offVec[2*subnum]);
289  final += subStr;
290  nrSubs++;
291  }
292  state = 0;
293  continue; // send char to start state
294  }
295  }
296  p++;
297  }
298  return nrSubs;
299 }
300 
301 ////////////////////////////////////////////////////////////////////////////////
302 /// Perform the actual matching - protected method.
303 
305  Int_t nMaxMatch, TArrayI *pos) const
306 {
307  Int_t *offVec = new Int_t[3*nMaxMatch];
308  // pcre_exec allows less options - see pcre_internal.h PUBLIC_EXEC_OPTIONS.
309  Int_t nrMatch = pcre_exec(fPriv->fPCRE, fPriv->fPCREExtra, s.Data(),
310  s.Length(), start, 0,
311  offVec, 3*nMaxMatch);
312 
313  if (nrMatch == PCRE_ERROR_NOMATCH)
314  nrMatch = 0;
315  else if (nrMatch <= 0) {
316  Error("Match","pcre_exec error = %d", nrMatch);
317  delete [] offVec;
318  return 0;
319  }
320 
321  if (pos)
322  pos->Set(2*nrMatch, offVec);
323  delete [] offVec;
324 
325  return nrMatch;
326 }
327 
328 ////////////////////////////////////////////////////////////////////////////////
329 /// The number of matches is returned, this equals the full match +
330 /// sub-pattern matches.
331 /// nMaxMatch is the maximum allowed number of matches.
332 /// pos contains the string indices of the matches. Its usage is
333 /// shown in the routine MatchS.
334 /// For meaning of mods see ParseMods().
335 
336 Int_t TPRegexp::Match(const TString &s, const TString &mods, Int_t start,
337  Int_t nMaxMatch, TArrayI *pos)
338 {
339  UInt_t opts = ParseMods(mods);
340 
341  if (!fPriv->fPCRE || opts != fPCREOpts) {
342  fPCREOpts = opts;
343  Compile();
344  }
345 
346  return MatchInternal(s, start, nMaxMatch, pos);
347 }
348 
349 
350 ////////////////////////////////////////////////////////////////////////////////
351 /// Returns a TObjArray of matched substrings as TObjString's.
352 /// The TObjArray is owner of the objects and must be deleted by the user.
353 /// The first entry is the full matched pattern, followed by the sub-patterns.
354 /// If a pattern was not matched, it will return an empty substring:
355 /// ~~~ {.cpp}
356 /// TObjArray *subStrL = TPRegexp("(a|(z))(bc)").MatchS("abc");
357 /// for (Int_t i = 0; i < subStrL->GetLast()+1; i++) {
358 /// const TString subStr = ((TObjString *)subStrL->At(i))->GetString();
359 /// std::cout << "\"" << subStr << "\" ";
360 /// }
361 /// std::cout << std::endl;
362 /// ~~~
363 /// produces: "abc" "a" "" "bc"
364 ///
365 /// For meaning of mods see ParseMods().
366 
367 TObjArray *TPRegexp::MatchS(const TString &s, const TString &mods,
368  Int_t start, Int_t nMaxMatch)
369 {
370  TArrayI pos;
371  Int_t nrMatch = Match(s, mods, start, nMaxMatch, &pos);
372 
373  TObjArray *subStrL = new TObjArray();
374  subStrL->SetOwner();
375 
376  for (Int_t i = 0; i < nrMatch; i++) {
377  Int_t startp = pos[2*i];
378  Int_t stopp = pos[2*i+1];
379  if (startp >= 0 && stopp >= 0) {
380  const TString subStr = s(pos[2*i], pos[2*i+1]-pos[2*i]);
381  subStrL->Add(new TObjString(subStr));
382  } else
383  subStrL->Add(new TObjString());
384  }
385 
386  return subStrL;
387 }
388 
389 ////////////////////////////////////////////////////////////////////////////////
390 /// Perform pattern substitution with optional back-ref replacement
391 /// - protected method.
392 
394  Int_t start, Int_t nMaxMatch,
395  Bool_t doDollarSubst) const
396 {
397  Int_t *offVec = new Int_t[3*nMaxMatch];
398 
399  TString final;
400  Int_t nrSubs = 0;
401  Int_t offset = start;
402  Int_t last = 0;
403 
404  while (kTRUE) {
405 
406  // find next matching subs
407  // pcre_exec allows less options - see pcre_internal.h PUBLIC_EXEC_OPTIONS.
408  Int_t nrMatch = pcre_exec(fPriv->fPCRE, fPriv->fPCREExtra, s.Data(),
409  s.Length(), offset, 0,
410  offVec, 3*nMaxMatch);
411 
412  if (nrMatch == PCRE_ERROR_NOMATCH) {
413  nrMatch = 0;
414  break;
415  } else if (nrMatch <= 0) {
416  Error("Substitute", "pcre_exec error = %d", nrMatch);
417  break;
418  }
419 
420  // append anything previously unmatched, but not substituted
421  if (last <= offVec[0]) {
422  final += s(last,offVec[0]-last);
423  last = offVec[1];
424  }
425 
426  // replace stuff in s
427  if (doDollarSubst) {
428  ReplaceSubs(s, final, replacePattern, offVec, nrMatch);
429  } else {
430  final += replacePattern;
431  }
432  ++nrSubs;
433 
434  // if global gotta check match at every pos
435  if (!(fPCREOpts & kPCRE_GLOBAL))
436  break;
437 
438  if (offVec[0] != offVec[1])
439  offset = offVec[1];
440  else {
441  // matched empty string
442  if (offVec[1] == s.Length())
443  break;
444  offset = offVec[1]+1;
445  }
446  }
447 
448  delete [] offVec;
449 
450  final += s(last,s.Length()-last);
451  s = final;
452 
453  return nrSubs;
454 }
455 
456 ////////////////////////////////////////////////////////////////////////////////
457 /// Substitute replaces the string s by a new string in which matching
458 /// patterns are replaced by the replacePattern string. The number of
459 /// substitutions are returned.
460 /// ~~~ {.cpp}
461 /// TString s("aap noot mies");
462 /// const Int_t nrSub = TPRegexp("(\\w*) noot (\\w*)").Substitute(s,"$2 noot $1");
463 /// std::cout << nrSub << " \"" << s << "\"" <<std::endl;
464 /// ~~~
465 /// produces: 1 "mies noot aap"
466 ///
467 /// For meaning of mods see ParseMods().
468 
469 Int_t TPRegexp::Substitute(TString &s, const TString &replacePattern,
470  const TString &mods, Int_t start, Int_t nMaxMatch)
471 {
472  UInt_t opts = ParseMods(mods);
473 
474  if (!fPriv->fPCRE || opts != fPCREOpts) {
475  fPCREOpts = opts;
476  Compile();
477  }
478 
479  return SubstituteInternal(s, replacePattern, start, nMaxMatch, kTRUE);
480 }
481 
482 
483 ////////////////////////////////////////////////////////////////////////////////
484 /// Returns true if underlying PCRE structure has been successfully
485 /// generated via regexp compilation.
486 
488 {
489  return fPriv->fPCRE != 0;
490 }
491 
492 ////////////////////////////////////////////////////////////////////////////////
493 /// Get value of static flag controlling whether exception should be thrown upon an
494 /// error during regular expression compilation by the PCRE engine.
495 
497 {
498  return fgThrowAtCompileError;
499 }
500 
501 ////////////////////////////////////////////////////////////////////////////////
502 /// Set static flag controlling whether exception should be thrown upon an
503 /// error during regular expression compilation by the PCRE engine.
504 
506 {
507  fgThrowAtCompileError = throwp;
508 }
509 
510 ////////////////////////////////////////////////////////////////////////////////
511 // //
512 // TString member functions, put here so the linker will include //
513 // them only if regular expressions are used. //
514 // //
515 ////////////////////////////////////////////////////////////////////////////////
516 
517 ////////////////////////////////////////////////////////////////////////////////
518 /// Find the first occurrence of the regexp in string and return the position.
519 /// Start is the offset at which the search should start.
520 
522 {
523  TArrayI pos;
524  Int_t nrMatch = r.Match(*this,"",start,10,&pos);
525  if (nrMatch > 0)
526  return pos[0];
527  else
528  return -1;
529 }
530 
531 ////////////////////////////////////////////////////////////////////////////////
532 /// Find the first occurrence of the regexp in string and return the position.
533 /// Extent is length of the matched string and start is the offset at which
534 /// the matching should start.
535 
536 Ssiz_t TString::Index(TPRegexp& r, Ssiz_t* extent, Ssiz_t start) const
537 {
538  TArrayI pos;
539  const Int_t nrMatch = r.Match(*this,"",start,10,&pos);
540  if (nrMatch > 0) {
541  *extent = pos[1]-pos[0];
542  return pos[0];
543  } else {
544  *extent = 0;
545  return -1;
546  }
547 }
548 
549 ////////////////////////////////////////////////////////////////////////////////
550 /// Return the substring found by applying the regexp starting at start.
551 
553 {
554  Ssiz_t len;
555  Ssiz_t begin = Index(r, &len, start);
556  return TSubString(*this, begin, len);
557 }
558 
559 ////////////////////////////////////////////////////////////////////////////////
560 /// Return the substring found by applying the regexp.
561 
563 {
564  return (*this)(r, 0);
565 }
566 
567 
568 /** \class TPMERegexp
569 
570 Wrapper for PCRE library (Perl Compatible Regular Expressions).
571 Based on PME - PCRE Made Easy by Zachary Hansen.
572 
573 Supports main Perl operations using regular expressions (Match,
574 Substitute and Split). To retrieve the results one can simply use
575 operator[] returning a TString.
576 
577 See $ROOTSYS/tutorials/regexp_pme.C for examples.
578 */
579 
581 
582 ////////////////////////////////////////////////////////////////////////////////
583 /// Default constructor. This regexp will match an empty string.
584 
586  TPRegexp(),
587  fNMaxMatches(10),
588  fNMatches(0),
589  fAddressOfLastString(0),
590  fLastGlobalPosition(0)
591 {
592  Compile();
593 }
594 
595 ////////////////////////////////////////////////////////////////////////////////
596 /// Constructor.
597 ///
598 /// \param[in] s string to compile into regular expression
599 /// \param[in] opts perl-style character flags to be set on TPME object
600 /// \param[in] nMatchMax maximum number of matches
601 
602 TPMERegexp::TPMERegexp(const TString& s, const TString& opts, Int_t nMatchMax) :
603  TPRegexp(s),
604  fNMaxMatches(nMatchMax),
605  fNMatches(0),
608 {
609  fPCREOpts = ParseMods(opts);
610  Compile();
611 }
612 
613 ////////////////////////////////////////////////////////////////////////////////
614 /// Constructor.
615 ///
616 /// \param[in] s string to compile into regular expression
617 /// \param[in] opts PCRE-style option flags to be set on TPME object
618 /// \param[in] nMatchMax maximum number of matches
619 
620 TPMERegexp::TPMERegexp(const TString& s, UInt_t opts, Int_t nMatchMax) :
621  TPRegexp(s),
622  fNMaxMatches(nMatchMax),
623  fNMatches(0),
626 {
627  fPCREOpts = opts;
628  Compile();
629 }
630 
631 ////////////////////////////////////////////////////////////////////////////////
632 /// Copy constructor.
633 /// Only PCRE specifics are copied, not last-match or global-match
634 /// information.
635 
637  TPRegexp(r),
639  fNMatches(0),
642 {
643  Compile();
644 }
645 
646 ////////////////////////////////////////////////////////////////////////////////
647 /// Reset the pattern and options.
648 /// If 'nMatchMax' other than -1 (the default) is passed, it is also set.
649 
650 void TPMERegexp::Reset(const TString& s, const TString& opts, Int_t nMatchMax)
651 {
652  Reset(s, ParseMods(opts), nMatchMax);
653 }
654 
655 ////////////////////////////////////////////////////////////////////////////////
656 /// Reset the pattern and options.
657 /// If 'nMatchMax' other than -1 (the default) is passed, it is also set.
658 
659 void TPMERegexp::Reset(const TString& s, UInt_t opts, Int_t nMatchMax)
660 {
661  fPattern = s;
662  fPCREOpts = opts;
663  Compile();
664 
665  if (nMatchMax != -1)
666  fNMatches = nMatchMax;
667  fNMatches = 0;
669 }
670 
671 ////////////////////////////////////////////////////////////////////////////////
672 /// Copy global-match state from 're' so that this regexp can continue
673 /// parsing the string from where 're' left off.
674 ///
675 /// Alternatively, GetGlobalPosition() can be used to retrieve the
676 /// last match position so that it can be passed to Match().
677 ///
678 /// Ideally, as it is done in PERL, the last match position would be
679 /// stored in the TString itself.
680 
682 {
685 }
686 
687 ////////////////////////////////////////////////////////////////////////////////
688 /// Reset state of global match.
689 /// This happens automatically when a new string is passed for matching.
690 /// But be careful, as the address of last TString object is used
691 /// to make this decision.
692 
694 {
696 }
697 
698 ////////////////////////////////////////////////////////////////////////////////
699 /// Runs a match on s against the regex 'this' was created with.
700 ///
701 /// \param[in] s string to match against
702 /// \param[in] start offset at which to start matching
703 /// \return number of matches found
704 
706 {
707  // If we got a new string, reset the global position counter.
708  if (fAddressOfLastString != (void*) &s) {
710  }
711 
712  if (fPCREOpts & kPCRE_GLOBAL) {
713  start += fLastGlobalPosition;
714  }
715 
716  //fprintf(stderr, "string: '%s' length: %d offset: %d\n", s.Data(), s.length(), offset);
718 
719  //fprintf(stderr, "MatchInternal_exec result = %d\n", fNMatches);
720 
721  fLastStringMatched = s;
722  fAddressOfLastString = (void*) &s;
723 
724  if (fPCREOpts & kPCRE_GLOBAL) {
725  if (fNMatches == PCRE_ERROR_NOMATCH) {
726  // fprintf(stderr, "TPME RESETTING: reset for no match\n");
727  fLastGlobalPosition = 0; // reset the position for next match (perl does this)
728  } else if (fNMatches > 0) {
729  // fprintf(stderr, "TPME RESETTING: setting to %d\n", marks[0].second);
730  fLastGlobalPosition = fMarkers[1]; // set to the end of the match
731  } else {
732  // fprintf(stderr, "TPME RESETTING: reset for no unknown\n");
734  }
735  }
736 
737  return fNMatches;
738 }
739 
740 ////////////////////////////////////////////////////////////////////////////////
741 /// Splits into at most maxfields. If maxfields is unspecified or
742 /// 0, trailing empty matches are discarded. If maxfields is
743 /// positive, no more than maxfields fields will be returned and
744 /// trailing empty matches are preserved. If maxfields is negative,
745 /// all fields (including trailing empty ones) are returned. This
746 /// *should* be the same as the perl behaviour.
747 ///
748 /// If pattern produces sub-matches, these are also stored in
749 /// the result.
750 ///
751 /// A pattern matching the null string will split the value of EXPR
752 /// into separate characters at each point it matches that way.
753 ///
754 /// \param[in] s string to split
755 /// \param[in] maxfields maximum number of fields to be split out. 0 means
756 /// split all fields, but discard any trailing empty bits.
757 /// Negative means split all fields and keep trailing empty bits.
758 /// Positive means keep up to N fields including any empty fields
759 /// less than N. Anything remaining is in the last field.
760 /// \return number of fields found
761 
762 Int_t TPMERegexp::Split(const TString& s, Int_t maxfields)
763 {
764  typedef std::pair<int, int> MarkerLoc_t;
765  typedef std::vector<MarkerLoc_t> MarkerLocVec_t;
766 
767  // stores the marks for the split
768  MarkerLocVec_t oMarks;
769 
770  // this is a list of current trailing empty matches if maxfields is
771  // unspecified or 0. If there is stuff in it and a non-empty match
772  // is found, then everything in here is pushed into oMarks and then
773  // the new match is pushed on. If the end of the string is reached
774  // and there are empty matches in here, they are discarded.
775  MarkerLocVec_t oCurrentTrailingEmpties;
776 
777  Int_t nOffset = 0;
778  Int_t nMatchesFound = 0;
779 
780  // while we are still finding matches and maxfields is 0 or negative
781  // (meaning we get all matches), or we haven't gotten to the number
782  // of specified matches
783  Int_t matchRes;
784  while ((matchRes = Match(s, nOffset)) &&
785  ((maxfields < 1) || nMatchesFound < maxfields)) {
786  ++nMatchesFound;
787 
788  if (fMarkers[1] - fMarkers[0] == 0) {
789  oMarks.push_back(MarkerLoc_t(nOffset, nOffset + 1));
790  ++nOffset;
791  if (nOffset >= s.Length())
792  break;
793  else
794  continue;
795  }
796 
797  // match can be empty
798  if (nOffset != fMarkers[0]) {
799  if (!oCurrentTrailingEmpties.empty()) {
800  oMarks.insert(oMarks.end(),
801  oCurrentTrailingEmpties.begin(),
802  oCurrentTrailingEmpties.end());
803  oCurrentTrailingEmpties.clear();
804  }
805  oMarks.push_back(MarkerLoc_t(nOffset, fMarkers[0]));
806  } else {
807  // empty match
808  if (maxfields == 0) {
809  // store for possible later inclusion
810  oCurrentTrailingEmpties.push_back(MarkerLoc_t(nOffset, nOffset));
811  } else {
812  oMarks.push_back(MarkerLoc_t(nOffset, nOffset));
813  }
814  }
815 
816  nOffset = fMarkers[1];
817 
818  if (matchRes > 1) {
819  for (Int_t i = 1; i < matchRes; ++i)
820  oMarks.push_back(MarkerLoc_t(fMarkers[2*i], fMarkers[2*i + 1]));
821  }
822  }
823 
824 
825  // if there were no matches found, push the whole thing on
826  if (nMatchesFound == 0) {
827  oMarks.push_back(MarkerLoc_t(0, s.Length()));
828  }
829  // if we ran out of matches, then append the rest of the string
830  // onto the end of the last split field
831  else if (maxfields > 0 && nMatchesFound >= maxfields) {
832  oMarks[oMarks.size() - 1].second = s.Length();
833  }
834  // else we have to add another entry for the end of the string
835  else {
836  Bool_t last_empty = (nOffset == s.Length());
837  if (!last_empty || maxfields < 0) {
838  if (!oCurrentTrailingEmpties.empty()) {
839  oMarks.insert(oMarks.end(),
840  oCurrentTrailingEmpties.begin(),
841  oCurrentTrailingEmpties.end());
842  }
843  oMarks.push_back(MarkerLoc_t(nOffset, s.Length()));
844  }
845  }
846 
847  fNMatches = oMarks.size();
849  for (Int_t i = 0; i < fNMatches; ++i) {
850  fMarkers[2*i] = oMarks[i].first;
851  fMarkers[2*i + 1] = oMarks[i].second;
852  }
853 
854  // fprintf(stderr, "match returning %d\n", fNMatches);
855  return fNMatches;
856 }
857 
858 ////////////////////////////////////////////////////////////////////////////////
859 /// Substitute matching part of s with r, dollar back-ref
860 /// substitution is performed if doDollarSubst is true (default).
861 /// Returns the number of substitutions made.
862 ///
863 /// After the substitution, another pass is made over the resulting
864 /// string and the following special tokens are interpreted:
865 /// - `\l` lowercase next char,
866 /// - `\u` uppercase next char,
867 /// - `\L` lowercase till `\E`,
868 /// - `\U` uppercase till `\E`, and
869 /// - `\E` end case modification.
870 
872 {
873  Int_t cnt = SubstituteInternal(s, r, 0, fNMaxMatches, doDollarSubst);
874 
875  TString ret;
876  Int_t state = 0;
877  Ssiz_t pos = 0, len = s.Length();
878  const Char_t *data = s.Data();
879  while (pos < len) {
880  Char_t c = data[pos];
881  if (c == '\\') {
882  c = data[pos+1]; // Rely on string-data being null-terminated.
883  switch (c) {
884  case 0 : ret += '\\'; break;
885  case 'l': state = 1; break;
886  case 'u': state = 2; break;
887  case 'L': state = 3; break;
888  case 'U': state = 4; break;
889  case 'E': state = 0; break;
890  default : ret += '\\'; ret += c; break;
891  }
892  pos += 2;
893  } else {
894  switch (state) {
895  case 0: ret += c; break;
896  case 1: ret += (Char_t) tolower(c); state = 0; break;
897  case 2: ret += (Char_t) toupper(c); state = 0; break;
898  case 3: ret += (Char_t) tolower(c); break;
899  case 4: ret += (Char_t) toupper(c); break;
900  default: Error("TPMERegexp::Substitute", "invalid state.");
901  }
902  ++pos;
903  }
904  }
905 
906  s = ret;
907 
908  return cnt;
909 }
910 
911 ////////////////////////////////////////////////////////////////////////////////
912 /// Returns the sub-string from the internal fMarkers vector.
913 /// Requires having run match or split first.
914 
916 {
917  if (index >= fNMatches)
918  return "";
919 
920  Int_t begin = fMarkers[2*index];
921  Int_t end = fMarkers[2*index + 1];
922  return fLastStringMatched(begin, end-begin);
923 }
924 
925 ////////////////////////////////////////////////////////////////////////////////
926 /// Print the regular expression and modifier options.
927 /// If 'option' contains "all", prints also last string match and
928 /// match results.
929 
931 {
932  TString opt = option;
933  opt.ToLower();
934 
935  Printf("Regexp='%s', Opts='%s'", fPattern.Data(), GetModifiers().Data());
936  if (opt.Contains("all")) {
937  Printf(" last string='%s'", fLastStringMatched.Data());
938  Printf(" number of matches = %d", fNMatches);
939  for (Int_t i=0; i<fNMatches; ++i)
940  Printf(" %d - %s", i, operator[](i).Data());
941  }
942 }
943 
944 
945 /** \class TStringToken
946 Provides iteration through tokens of a given string.
947 
948  - fFullStr stores the string to be split. It is never modified.
949  - fSplitRe is the perl-re that is used to separate the tokens.
950  - fReturnVoid if true, empty strings will be returned.
951 
952 Current token is stored in the TString base-class.
953 During construction no match is done, use NextToken() to get the first
954 and all subsequent tokens.
955 */
956 
958 
959 ////////////////////////////////////////////////////////////////////////////////
960 /// Constructor.
961 
962 TStringToken::TStringToken(const TString& fullStr, const TString& splitRe, Bool_t retVoid) :
963  fFullStr (fullStr),
964  fSplitRe (splitRe),
965  fReturnVoid (retVoid),
966  fPos (0)
967 {
968 }
969 
970 ////////////////////////////////////////////////////////////////////////////////
971 /// Get the next token, it is stored in this TString.
972 /// Returns true if new token is available, false otherwise.
973 
975 {
976  TArrayI x;
977  while (fPos < fFullStr.Length()) {
978  if (fSplitRe.Match(fFullStr, "", fPos, 2, &x)) {
979  TString::operator=(fFullStr(fPos, x[0] - fPos));
980  fPos = x[1];
981  } else {
982  TString::operator=(fFullStr(fPos, fFullStr.Length() - fPos));
983  fPos = fFullStr.Length() + 1;
984  }
985  if (Length() || fReturnVoid)
986  return kTRUE;
987  }
988 
989  // Special case: void-strings are requested and the full-string
990  // ends with the separator. Thus we return another empty string.
991  if (fPos == fFullStr.Length() && fReturnVoid) {
992  TString::operator=("");
993  fPos = fFullStr.Length() + 1;
994  return kTRUE;
995  }
996 
997  return kFALSE;
998 }
A zero length substring is legal.
Definition: TString.h:83
Int_t SubstituteInternal(TString &s, const TString &replace, Int_t start, Int_t nMaxMatch0, Bool_t doDollarSubst) const
Perform pattern substitution with optional back-ref replacement.
Definition: TPRegexp.cxx:393
TString fLastStringMatched
Definition: TPRegexp.h:113
RooCmdArg Optimize(Int_t flag=2)
An array of TObjects.
Definition: TObjArray.h:39
Int_t fNMatches
Definition: TPRegexp.h:110
Int_t MatchInternal(const TString &s, Int_t start, Int_t nMaxMatch, TArrayI *pos=0) const
Perform the actual matching - protected method.
Definition: TPRegexp.cxx:304
Collectable string class.
Definition: TObjString.h:32
return c
const char Option_t
Definition: RtypesCore.h:62
TArrayI fMarkers
Definition: TPRegexp.h:111
void AssignGlobalState(const TPMERegexp &re)
Copy global-match state from &#39;re; so that this regexp can continue parsing the string from where &#39;re&#39;...
Definition: TPRegexp.cxx:681
virtual void SetOwner(Bool_t enable=kTRUE)
Set whether this collection is the owner (enable==true) of its content.
Int_t ReplaceSubs(const TString &s, TString &final, const TString &replacePattern, Int_t *ovec, Int_t nmatch) const
Returns the number of expanded &#39;$&#39; constructs.
Definition: TPRegexp.cxx:248
TObjArray * MatchS(const TString &s, const TString &mods="", Int_t start=0, Int_t nMaxMatch=10)
Returns a TObjArray of matched substrings as TObjString&#39;s.
Definition: TPRegexp.cxx:367
static void SetThrowAtCompileError(Bool_t throwp)
Set static flag controlling whether exception should be thrown upon an error during regular expressio...
Definition: TPRegexp.cxx:505
virtual ~TPRegexp()
Cleanup.
Definition: TPRegexp.cxx:77
Ssiz_t Index(const char *pat, Ssiz_t i=0, ECaseCompare cmp=kExact) const
Definition: TString.h:582
Basic string class.
Definition: TString.h:137
TString GetModifiers() const
Return PCRE modifier options as string.
Definition: TPRegexp.cxx:177
void ToLower()
Change string to lower-case.
Definition: TString.cxx:1089
int Int_t
Definition: RtypesCore.h:41
TString & operator=(char s)
Assign character c to TString.
Definition: TString.cxx:246
bool Bool_t
Definition: RtypesCore.h:59
const Bool_t kFALSE
Definition: Rtypes.h:92
Int_t Substitute(TString &s, const TString &replace, const TString &mods="", Int_t start=0, Int_t nMatchMax=10)
Substitute replaces the string s by a new string in which matching patterns are replaced by the repla...
Definition: TPRegexp.cxx:469
Array of integers (32 bits per element).
Definition: TArrayI.h:29
Int_t fNMaxMatches
Definition: TPRegexp.h:109
Double_t x[n]
Definition: legend1.C:17
static TString Format(const char *fmt,...)
Static method which formats a string using a printf-style format descriptor and returns a TString...
Definition: TString.cxx:2335
Provides iteration through tokens of a given string.
Definition: TPRegexp.h:149
void Compile()
Compile the fPattern.
Definition: TPRegexp.cxx:195
void Info(const char *location, const char *msgfmt,...)
std::vector< std::vector< double > > Data
Bool_t IsValid() const
Returns true if underlying PCRE structure has been successfully generated via regexp compilation...
Definition: TPRegexp.cxx:487
void Set(Int_t n)
Set size of this array to n ints.
Definition: TArrayI.cxx:105
void Error(const char *location, const char *msgfmt,...)
char & operator()(Ssiz_t i)
Definition: TString.h:657
TRandom2 r(17)
Int_t Split(const TString &s, Int_t maxfields=0)
Splits into at most maxfields.
Definition: TPRegexp.cxx:762
void * fAddressOfLastString
Definition: TPRegexp.h:114
TPRegexp & operator=(const TPRegexp &p)
Assignment operator.
Definition: TPRegexp.cxx:89
unsigned int UInt_t
Definition: RtypesCore.h:42
TMarker * m
Definition: textangle.C:8
virtual void Print(Option_t *option="")
Print the regular expression and modifier options.
Definition: TPRegexp.cxx:930
Ssiz_t Length() const
Definition: TString.h:390
#define Printf
Definition: TGeoToOCC.h:18
int Ssiz_t
Definition: RtypesCore.h:63
RooCmdArg Index(RooCategory &icat)
#define ClassImp(name)
Definition: Rtypes.h:279
UInt_t fPCREOpts
Definition: TPRegexp.h:54
TPMERegexp()
Default constructor. This regexp will match an empty string.
Definition: TPRegexp.cxx:585
Bool_t Contains(const char *pat, ECaseCompare cmp=kExact) const
Definition: TString.h:567
Bool_t NextToken()
Get the next token, it is stored in this TString.
Definition: TPRegexp.cxx:974
static Bool_t fgThrowAtCompileError
Definition: TPRegexp.h:56
UInt_t ParseMods(const TString &mods) const
Translate Perl modifier flags into pcre flags.
Definition: TPRegexp.cxx:132
static Bool_t GetThrowAtCompileError()
Get value of static flag controlling whether exception should be thrown upon an error during regular ...
Definition: TPRegexp.cxx:496
Int_t Match(const TString &s, UInt_t start=0)
Runs a match on s against the regex 'this' was created with.
Definition: TPRegexp.cxx:705
char Char_t
Definition: RtypesCore.h:29
Wrapper for PCRE library (Perl Compatible Regular Expressions).
Definition: TPRegexp.h:103
Int_t fLastGlobalPosition
Definition: TPRegexp.h:116
void Add(TObject *obj)
Definition: TObjArray.h:75
void Reset(const TString &s, const TString &opts="", Int_t nMatchMax=-1)
Reset the pattern and options.
Definition: TPRegexp.cxx:650
Int_t Substitute(TString &s, const TString &r, Bool_t doDollarSubst=kTRUE)
Substitute matching part of s with r, dollar back-ref substitution is performed if doDollarSubst is t...
Definition: TPRegexp.cxx:871
const Bool_t kTRUE
Definition: Rtypes.h:91
Int_t Match(const TString &s, const TString &mods="", Int_t start=0, Int_t nMaxMatch=10, TArrayI *pos=0)
The number of matches is returned; this equals the full match + sub-pattern matches.
Definition: TPRegexp.cxx:336
TPRegexp()
Default ctor.
Definition: TPRegexp.cxx:48
TString operator[](Int_t)
Returns the sub-string from the internal fMarkers vector.
Definition: TPRegexp.cxx:915
const char * cnt
Definition: TXMLSetup.cxx:75
TString fPattern
Definition: TPRegexp.h:52
void Optimize()
Send the pattern through the optimizer.
Definition: TPRegexp.cxx:227
const char * Data() const
Definition: TString.h:349
void ResetGlobalState()
Reset state of global match.
Definition: TPRegexp.cxx:693