"Fossies" - the Fresh Open Source Software Archive

Member "pcre-8.44/pcrecpp.cc" (12 Feb 2020, 34971 Bytes) of package /linux/misc/pcre-8.44.tar.bz2:


As a special service, "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "pcrecpp.cc", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 8.43_vs_8.44.

    1 // Copyright (c) 2010, Google Inc.
    2 // All rights reserved.
    3 //
    4 // Redistribution and use in source and binary forms, with or without
    5 // modification, are permitted provided that the following conditions are
    6 // met:
    7 //
    8 //     * Redistributions of source code must retain the above copyright
    9 // notice, this list of conditions and the following disclaimer.
   10 //     * Redistributions in binary form must reproduce the above
   11 // copyright notice, this list of conditions and the following disclaimer
   12 // in the documentation and/or other materials provided with the
   13 // distribution.
   14 //     * Neither the name of Google Inc. nor the names of its
   15 // contributors may be used to endorse or promote products derived from
   16 // this software without specific prior written permission.
   17 //
   18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   29 //
   30 // Author: Sanjay Ghemawat
   31 
   32 #ifdef HAVE_CONFIG_H
   33 #include "config.h"
   34 #endif
   35 
   36 #include <stdlib.h>
   37 #include <stdio.h>
   38 #include <ctype.h>
   39 #include <limits.h>      /* for SHRT_MIN, USHRT_MAX, etc */
   40 #include <string.h>      /* for memcpy */
   41 #include <assert.h>
   42 #include <errno.h>
   43 #include <string>
   44 #include <algorithm>
   45 
   46 #include "pcrecpp_internal.h"
   47 #include "pcre.h"
   48 #include "pcrecpp.h"
   49 #include "pcre_stringpiece.h"
   50 
   51 
   52 namespace pcrecpp {
   53 
   54 // Maximum number of args we can set
   55 static const int kMaxArgs = 16;
   56 static const int kVecSize = (1 + kMaxArgs) * 3;  // results + PCRE workspace
   57 
   58 // Special object that stands-in for no argument
   59 Arg RE::no_arg((void*)NULL);
   60 
   61 // This is for ABI compatibility with old versions of pcre (pre-7.6),
   62 // which defined a global no_arg variable instead of putting it in the
   63 // RE class.  This works on GCC >= 3, at least.  It definitely works
   64 // for ELF, but may not for other object formats (Mach-O, for
   65 // instance, does not support aliases.)  We could probably have a more
   66 // inclusive test if we ever needed it.  (Note that not only the
   67 // __attribute__ syntax, but also __USER_LABEL_PREFIX__, are
   68 // gnu-specific.)
   69 #if defined(__GNUC__) && __GNUC__ >= 3 && defined(__ELF__) \
   70        && !defined(__INTEL_COMPILER) && !defined(__LCC__)
   71 # define ULP_AS_STRING(x)            ULP_AS_STRING_INTERNAL(x)
   72 # define ULP_AS_STRING_INTERNAL(x)   #x
   73 # define USER_LABEL_PREFIX_STR       ULP_AS_STRING(__USER_LABEL_PREFIX__)
   74 extern Arg no_arg
   75   __attribute__((alias(USER_LABEL_PREFIX_STR "_ZN7pcrecpp2RE6no_argE")));
   76 #endif
   77 
   78 // If a regular expression has no error, its error_ field points here
   79 static const string empty_string;
   80 
   81 // If the user doesn't ask for any options, we just use this one
   82 static RE_Options default_options;
   83 
   84 // Specials for the start of patterns. See comments where start_options is used
   85 // below. (PH June 2018)
   86 static const char *start_options[] = {
   87   "(*UTF8)",
   88   "(*UTF)",
   89   "(*UCP)",
   90   "(*NO_START_OPT)",
   91   "(*NO_AUTO_POSSESS)",
   92   "(*LIMIT_RECURSION=",
   93   "(*LIMIT_MATCH=",
   94   "(*CRLF)",
   95   "(*LF)",
   96   "(*CR)",
   97   "(*BSR_UNICODE)",
   98   "(*BSR_ANYCRLF)",
   99   "(*ANYCRLF)",
  100   "(*ANY)",
  101   "" };
  102 
  103 void RE::Init(const string& pat, const RE_Options* options) {
  104   pattern_ = pat;
  105   if (options == NULL) {
  106     options_ = default_options;
  107   } else {
  108     options_ = *options;
  109   }
  110   error_ = &empty_string;
  111   re_full_ = NULL;
  112   re_partial_ = NULL;
  113 
  114   re_partial_ = Compile(UNANCHORED);
  115   if (re_partial_ != NULL) {
  116     re_full_ = Compile(ANCHOR_BOTH);
  117   }
  118 }
  119 
  120 void RE::Cleanup() {
  121   if (re_full_ != NULL)         (*pcre_free)(re_full_);
  122   if (re_partial_ != NULL)      (*pcre_free)(re_partial_);
  123   if (error_ != &empty_string)  delete error_;
  124 }
  125 
  126 
  127 RE::~RE() {
  128   Cleanup();
  129 }
  130 
  131 
  132 pcre* RE::Compile(Anchor anchor) {
  133   // First, convert RE_Options into pcre options
  134   int pcre_options = 0;
  135   pcre_options = options_.all_options();
  136 
  137   // Special treatment for anchoring.  This is needed because at
  138   // runtime pcre only provides an option for anchoring at the
  139   // beginning of a string (unless you use offset).
  140   //
  141   // There are three types of anchoring we want:
  142   //    UNANCHORED      Compile the original pattern, and use
  143   //                    a pcre unanchored match.
  144   //    ANCHOR_START    Compile the original pattern, and use
  145   //                    a pcre anchored match.
  146   //    ANCHOR_BOTH     Tack a "\z" to the end of the original pattern
  147   //                    and use a pcre anchored match.
  148 
  149   const char* compile_error;
  150   int eoffset;
  151   pcre* re;
  152   if (anchor != ANCHOR_BOTH) {
  153     re = pcre_compile(pattern_.c_str(), pcre_options,
  154                       &compile_error, &eoffset, NULL);
  155   } else {
  156     // Tack a '\z' at the end of RE.  Parenthesize it first so that
  157     // the '\z' applies to all top-level alternatives in the regexp.
  158 
  159     /* When this code was written (for PCRE 6.0) it was enough just to
  160     parenthesize the entire pattern. Unfortunately, when the feature of
  161     starting patterns with (*UTF8) or (*CR) etc. was added to PCRE patterns,
  162     this code was never updated. This bug was not noticed till 2018, long after
  163     PCRE became obsolescent and its maintainer no longer around. Since PCRE is
  164     frozen, I have added a hack to check for all the existing "start of
  165     pattern" specials - knowing that no new ones will ever be added. I am not a
  166     C++ programmer, so the code style is no doubt crude. It is also
  167     inefficient, but is only run when the pattern starts with "(*".
  168     PH June 2018. */
  169 
  170     string wrapped = "";
  171 
  172     if (pattern_.c_str()[0] == '(' && pattern_.c_str()[1] == '*') {
  173       int kk, klen, kmat;
  174       for (;;) {   // Loop for any number of leading items
  175 
  176         for (kk = 0; start_options[kk][0] != 0; kk++) {
  177           klen = strlen(start_options[kk]);
  178           kmat = strncmp(pattern_.c_str(), start_options[kk], klen);
  179           if (kmat >= 0) break;
  180         }
  181         if (kmat != 0) break;  // Not found
  182 
  183         // If the item ended in "=" we must copy digits up to ")".
  184 
  185         if (start_options[kk][klen-1] == '=') {
  186           while (isdigit(pattern_.c_str()[klen])) klen++;
  187           if (pattern_.c_str()[klen] != ')') break;  // Syntax error
  188           klen++;
  189         }
  190 
  191         // Move the item from the pattern to the start of the wrapped string.
  192 
  193         wrapped += pattern_.substr(0, klen);
  194         pattern_.erase(0, klen);
  195       }
  196     }
  197 
  198     // Wrap the rest of the pattern.
  199 
  200     wrapped += "(?:";  // A non-counting grouping operator
  201     wrapped += pattern_;
  202     wrapped += ")\\z";
  203     re = pcre_compile(wrapped.c_str(), pcre_options,
  204                       &compile_error, &eoffset, NULL);
  205   }
  206   if (re == NULL) {
  207     if (error_ == &empty_string) error_ = new string(compile_error);
  208   }
  209   return re;
  210 }
  211 
  212 /***** Matching interfaces *****/
  213 
  214 bool RE::FullMatch(const StringPiece& text,
  215                    const Arg& ptr1,
  216                    const Arg& ptr2,
  217                    const Arg& ptr3,
  218                    const Arg& ptr4,
  219                    const Arg& ptr5,
  220                    const Arg& ptr6,
  221                    const Arg& ptr7,
  222                    const Arg& ptr8,
  223                    const Arg& ptr9,
  224                    const Arg& ptr10,
  225                    const Arg& ptr11,
  226                    const Arg& ptr12,
  227                    const Arg& ptr13,
  228                    const Arg& ptr14,
  229                    const Arg& ptr15,
  230                    const Arg& ptr16) const {
  231   const Arg* args[kMaxArgs];
  232   int n = 0;
  233   if (&ptr1  == &no_arg) { goto done; } args[n++] = &ptr1;
  234   if (&ptr2  == &no_arg) { goto done; } args[n++] = &ptr2;
  235   if (&ptr3  == &no_arg) { goto done; } args[n++] = &ptr3;
  236   if (&ptr4  == &no_arg) { goto done; } args[n++] = &ptr4;
  237   if (&ptr5  == &no_arg) { goto done; } args[n++] = &ptr5;
  238   if (&ptr6  == &no_arg) { goto done; } args[n++] = &ptr6;
  239   if (&ptr7  == &no_arg) { goto done; } args[n++] = &ptr7;
  240   if (&ptr8  == &no_arg) { goto done; } args[n++] = &ptr8;
  241   if (&ptr9  == &no_arg) { goto done; } args[n++] = &ptr9;
  242   if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
  243   if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
  244   if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
  245   if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
  246   if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
  247   if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
  248   if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
  249  done:
  250 
  251   int consumed;
  252   int vec[kVecSize];
  253   return DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
  254 }
  255 
  256 bool RE::PartialMatch(const StringPiece& text,
  257                       const Arg& ptr1,
  258                       const Arg& ptr2,
  259                       const Arg& ptr3,
  260                       const Arg& ptr4,
  261                       const Arg& ptr5,
  262                       const Arg& ptr6,
  263                       const Arg& ptr7,
  264                       const Arg& ptr8,
  265                       const Arg& ptr9,
  266                       const Arg& ptr10,
  267                       const Arg& ptr11,
  268                       const Arg& ptr12,
  269                       const Arg& ptr13,
  270                       const Arg& ptr14,
  271                       const Arg& ptr15,
  272                       const Arg& ptr16) const {
  273   const Arg* args[kMaxArgs];
  274   int n = 0;
  275   if (&ptr1  == &no_arg) { goto done; } args[n++] = &ptr1;
  276   if (&ptr2  == &no_arg) { goto done; } args[n++] = &ptr2;
  277   if (&ptr3  == &no_arg) { goto done; } args[n++] = &ptr3;
  278   if (&ptr4  == &no_arg) { goto done; } args[n++] = &ptr4;
  279   if (&ptr5  == &no_arg) { goto done; } args[n++] = &ptr5;
  280   if (&ptr6  == &no_arg) { goto done; } args[n++] = &ptr6;
  281   if (&ptr7  == &no_arg) { goto done; } args[n++] = &ptr7;
  282   if (&ptr8  == &no_arg) { goto done; } args[n++] = &ptr8;
  283   if (&ptr9  == &no_arg) { goto done; } args[n++] = &ptr9;
  284   if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
  285   if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
  286   if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
  287   if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
  288   if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
  289   if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
  290   if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
  291  done:
  292 
  293   int consumed;
  294   int vec[kVecSize];
  295   return DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
  296 }
  297 
  298 bool RE::Consume(StringPiece* input,
  299                  const Arg& ptr1,
  300                  const Arg& ptr2,
  301                  const Arg& ptr3,
  302                  const Arg& ptr4,
  303                  const Arg& ptr5,
  304                  const Arg& ptr6,
  305                  const Arg& ptr7,
  306                  const Arg& ptr8,
  307                  const Arg& ptr9,
  308                  const Arg& ptr10,
  309                  const Arg& ptr11,
  310                  const Arg& ptr12,
  311                  const Arg& ptr13,
  312                  const Arg& ptr14,
  313                  const Arg& ptr15,
  314                  const Arg& ptr16) const {
  315   const Arg* args[kMaxArgs];
  316   int n = 0;
  317   if (&ptr1  == &no_arg) { goto done; } args[n++] = &ptr1;
  318   if (&ptr2  == &no_arg) { goto done; } args[n++] = &ptr2;
  319   if (&ptr3  == &no_arg) { goto done; } args[n++] = &ptr3;
  320   if (&ptr4  == &no_arg) { goto done; } args[n++] = &ptr4;
  321   if (&ptr5  == &no_arg) { goto done; } args[n++] = &ptr5;
  322   if (&ptr6  == &no_arg) { goto done; } args[n++] = &ptr6;
  323   if (&ptr7  == &no_arg) { goto done; } args[n++] = &ptr7;
  324   if (&ptr8  == &no_arg) { goto done; } args[n++] = &ptr8;
  325   if (&ptr9  == &no_arg) { goto done; } args[n++] = &ptr9;
  326   if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
  327   if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
  328   if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
  329   if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
  330   if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
  331   if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
  332   if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
  333  done:
  334 
  335   int consumed;
  336   int vec[kVecSize];
  337   if (DoMatchImpl(*input, ANCHOR_START, &consumed,
  338                   args, n, vec, kVecSize)) {
  339     input->remove_prefix(consumed);
  340     return true;
  341   } else {
  342     return false;
  343   }
  344 }
  345 
  346 bool RE::FindAndConsume(StringPiece* input,
  347                         const Arg& ptr1,
  348                         const Arg& ptr2,
  349                         const Arg& ptr3,
  350                         const Arg& ptr4,
  351                         const Arg& ptr5,
  352                         const Arg& ptr6,
  353                         const Arg& ptr7,
  354                         const Arg& ptr8,
  355                         const Arg& ptr9,
  356                         const Arg& ptr10,
  357                         const Arg& ptr11,
  358                         const Arg& ptr12,
  359                         const Arg& ptr13,
  360                         const Arg& ptr14,
  361                         const Arg& ptr15,
  362                         const Arg& ptr16) const {
  363   const Arg* args[kMaxArgs];
  364   int n = 0;
  365   if (&ptr1  == &no_arg) { goto done; } args[n++] = &ptr1;
  366   if (&ptr2  == &no_arg) { goto done; } args[n++] = &ptr2;
  367   if (&ptr3  == &no_arg) { goto done; } args[n++] = &ptr3;
  368   if (&ptr4  == &no_arg) { goto done; } args[n++] = &ptr4;
  369   if (&ptr5  == &no_arg) { goto done; } args[n++] = &ptr5;
  370   if (&ptr6  == &no_arg) { goto done; } args[n++] = &ptr6;
  371   if (&ptr7  == &no_arg) { goto done; } args[n++] = &ptr7;
  372   if (&ptr8  == &no_arg) { goto done; } args[n++] = &ptr8;
  373   if (&ptr9  == &no_arg) { goto done; } args[n++] = &ptr9;
  374   if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
  375   if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
  376   if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
  377   if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
  378   if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
  379   if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
  380   if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
  381  done:
  382 
  383   int consumed;
  384   int vec[kVecSize];
  385   if (DoMatchImpl(*input, UNANCHORED, &consumed,
  386                   args, n, vec, kVecSize)) {
  387     input->remove_prefix(consumed);
  388     return true;
  389   } else {
  390     return false;
  391   }
  392 }
  393 
  394 bool RE::Replace(const StringPiece& rewrite,
  395                  string *str) const {
  396   int vec[kVecSize];
  397   int matches = TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
  398   if (matches == 0)
  399     return false;
  400 
  401   string s;
  402   if (!Rewrite(&s, rewrite, *str, vec, matches))
  403     return false;
  404 
  405   assert(vec[0] >= 0);
  406   assert(vec[1] >= 0);
  407   str->replace(vec[0], vec[1] - vec[0], s);
  408   return true;
  409 }
  410 
  411 // Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF.
  412 // Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF.
  413 // Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF.
  414 
  415 static int NewlineMode(int pcre_options) {
  416   // TODO: if we can make it threadsafe, cache this var
  417   int newline_mode = 0;
  418   /* if (newline_mode) return newline_mode; */  // do this once it's cached
  419   if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
  420                       PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) {
  421     newline_mode = (pcre_options &
  422                     (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
  423                      PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF));
  424   } else {
  425     int newline;
  426     pcre_config(PCRE_CONFIG_NEWLINE, &newline);
  427     if (newline == 10)
  428       newline_mode = PCRE_NEWLINE_LF;
  429     else if (newline == 13)
  430       newline_mode = PCRE_NEWLINE_CR;
  431     else if (newline == 3338)
  432       newline_mode = PCRE_NEWLINE_CRLF;
  433     else if (newline == -1)
  434       newline_mode = PCRE_NEWLINE_ANY;
  435     else if (newline == -2)
  436       newline_mode = PCRE_NEWLINE_ANYCRLF;
  437     else
  438       assert(NULL == "Unexpected return value from pcre_config(NEWLINE)");
  439   }
  440   return newline_mode;
  441 }
  442 
  443 int RE::GlobalReplace(const StringPiece& rewrite,
  444                       string *str) const {
  445   int count = 0;
  446   int vec[kVecSize];
  447   string out;
  448   int start = 0;
  449   bool last_match_was_empty_string = false;
  450 
  451   while (start <= static_cast<int>(str->length())) {
  452     // If the previous match was for the empty string, we shouldn't
  453     // just match again: we'll match in the same way and get an
  454     // infinite loop.  Instead, we do the match in a special way:
  455     // anchored -- to force another try at the same position --
  456     // and with a flag saying that this time, ignore empty matches.
  457     // If this special match returns, that means there's a non-empty
  458     // match at this position as well, and we can continue.  If not,
  459     // we do what perl does, and just advance by one.
  460     // Notice that perl prints '@@@' for this;
  461     //    perl -le '$_ = "aa"; s/b*|aa/@/g; print'
  462     int matches;
  463     if (last_match_was_empty_string) {
  464       matches = TryMatch(*str, start, ANCHOR_START, false, vec, kVecSize);
  465       if (matches <= 0) {
  466         int matchend = start + 1;     // advance one character.
  467         // If the current char is CR and we're in CRLF mode, skip LF too.
  468         // Note it's better to call pcre_fullinfo() than to examine
  469         // all_options(), since options_ could have changed bewteen
  470         // compile-time and now, but this is simpler and safe enough.
  471         // Modified by PH to add ANY and ANYCRLF.
  472         if (matchend < static_cast<int>(str->length()) &&
  473             (*str)[start] == '\r' && (*str)[matchend] == '\n' &&
  474             (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
  475              NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
  476              NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)) {
  477           matchend++;
  478         }
  479         // We also need to advance more than one char if we're in utf8 mode.
  480 #ifdef SUPPORT_UTF
  481         if (options_.utf8()) {
  482           while (matchend < static_cast<int>(str->length()) &&
  483                  ((*str)[matchend] & 0xc0) == 0x80)
  484             matchend++;
  485         }
  486 #endif
  487         if (start < static_cast<int>(str->length()))
  488           out.append(*str, start, matchend - start);
  489         start = matchend;
  490         last_match_was_empty_string = false;
  491         continue;
  492       }
  493     } else {
  494       matches = TryMatch(*str, start, UNANCHORED, true, vec, kVecSize);
  495       if (matches <= 0)
  496         break;
  497     }
  498     int matchstart = vec[0], matchend = vec[1];
  499     assert(matchstart >= start);
  500     assert(matchend >= matchstart);
  501     out.append(*str, start, matchstart - start);
  502     Rewrite(&out, rewrite, *str, vec, matches);
  503     start = matchend;
  504     count++;
  505     last_match_was_empty_string = (matchstart == matchend);
  506   }
  507 
  508   if (count == 0)
  509     return 0;
  510 
  511   if (start < static_cast<int>(str->length()))
  512     out.append(*str, start, str->length() - start);
  513   swap(out, *str);
  514   return count;
  515 }
  516 
  517 bool RE::Extract(const StringPiece& rewrite,
  518                  const StringPiece& text,
  519                  string *out) const {
  520   int vec[kVecSize];
  521   int matches = TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
  522   if (matches == 0)
  523     return false;
  524   out->erase();
  525   return Rewrite(out, rewrite, text, vec, matches);
  526 }
  527 
  528 /*static*/ string RE::QuoteMeta(const StringPiece& unquoted) {
  529   string result;
  530 
  531   // Escape any ascii character not in [A-Za-z_0-9].
  532   //
  533   // Note that it's legal to escape a character even if it has no
  534   // special meaning in a regular expression -- so this function does
  535   // that.  (This also makes it identical to the perl function of the
  536   // same name; see `perldoc -f quotemeta`.)  The one exception is
  537   // escaping NUL: rather than doing backslash + NUL, like perl does,
  538   // we do '\0', because pcre itself doesn't take embedded NUL chars.
  539   for (int ii = 0; ii < unquoted.size(); ++ii) {
  540     // Note that using 'isalnum' here raises the benchmark time from
  541     // 32ns to 58ns:
  542     if (unquoted[ii] == '\0') {
  543       result += "\\0";
  544     } else if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
  545                (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
  546                (unquoted[ii] < '0' || unquoted[ii] > '9') &&
  547                unquoted[ii] != '_' &&
  548                // If this is the part of a UTF8 or Latin1 character, we need
  549                // to copy this byte without escaping.  Experimentally this is
  550                // what works correctly with the regexp library.
  551                !(unquoted[ii] & 128)) {
  552       result += '\\';
  553       result += unquoted[ii];
  554     } else {
  555       result += unquoted[ii];
  556     }
  557   }
  558 
  559   return result;
  560 }
  561 
  562 /***** Actual matching and rewriting code *****/
  563 
  564 int RE::TryMatch(const StringPiece& text,
  565                  int startpos,
  566                  Anchor anchor,
  567                  bool empty_ok,
  568                  int *vec,
  569                  int vecsize) const {
  570   pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
  571   if (re == NULL) {
  572     //fprintf(stderr, "Matching against invalid re: %s\n", error_->c_str());
  573     return 0;
  574   }
  575 
  576   pcre_extra extra = { 0, 0, 0, 0, 0, 0, 0, 0 };
  577   if (options_.match_limit() > 0) {
  578     extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
  579     extra.match_limit = options_.match_limit();
  580   }
  581   if (options_.match_limit_recursion() > 0) {
  582     extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
  583     extra.match_limit_recursion = options_.match_limit_recursion();
  584   }
  585 
  586   // int options = 0;
  587   // Changed by PH as a result of bugzilla #1288
  588   int options = (options_.all_options() & PCRE_NO_UTF8_CHECK);
  589 
  590   if (anchor != UNANCHORED)
  591     options |= PCRE_ANCHORED;
  592   if (!empty_ok)
  593     options |= PCRE_NOTEMPTY;
  594 
  595   int rc = pcre_exec(re,              // The regular expression object
  596                      &extra,
  597                      (text.data() == NULL) ? "" : text.data(),
  598                      text.size(),
  599                      startpos,
  600                      options,
  601                      vec,
  602                      vecsize);
  603 
  604   // Handle errors
  605   if (rc == PCRE_ERROR_NOMATCH) {
  606     return 0;
  607   } else if (rc < 0) {
  608     //fprintf(stderr, "Unexpected return code: %d when matching '%s'\n",
  609     //        re, pattern_.c_str());
  610     return 0;
  611   } else if (rc == 0) {
  612     // pcre_exec() returns 0 as a special case when the number of
  613     // capturing subpatterns exceeds the size of the vector.
  614     // When this happens, there is a match and the output vector
  615     // is filled, but we miss out on the positions of the extra subpatterns.
  616     rc = vecsize / 2;
  617   }
  618 
  619   return rc;
  620 }
  621 
  622 bool RE::DoMatchImpl(const StringPiece& text,
  623                      Anchor anchor,
  624                      int* consumed,
  625                      const Arg* const* args,
  626                      int n,
  627                      int* vec,
  628                      int vecsize) const {
  629   assert((1 + n) * 3 <= vecsize);  // results + PCRE workspace
  630   int matches = TryMatch(text, 0, anchor, true, vec, vecsize);
  631   assert(matches >= 0);  // TryMatch never returns negatives
  632   if (matches == 0)
  633     return false;
  634 
  635   *consumed = vec[1];
  636 
  637   if (n == 0 || args == NULL) {
  638     // We are not interested in results
  639     return true;
  640   }
  641 
  642   if (NumberOfCapturingGroups() < n) {
  643     // RE has fewer capturing groups than number of arg pointers passed in
  644     return false;
  645   }
  646 
  647   // If we got here, we must have matched the whole pattern.
  648   // We do not need (can not do) any more checks on the value of 'matches' here
  649   // -- see the comment for TryMatch.
  650   for (int i = 0; i < n; i++) {
  651     const int start = vec[2*(i+1)];
  652     const int limit = vec[2*(i+1)+1];
  653     if (!args[i]->Parse(text.data() + start, limit-start)) {
  654       // TODO: Should we indicate what the error was?
  655       return false;
  656     }
  657   }
  658 
  659   return true;
  660 }
  661 
  662 bool RE::DoMatch(const StringPiece& text,
  663                  Anchor anchor,
  664                  int* consumed,
  665                  const Arg* const args[],
  666                  int n) const {
  667   assert(n >= 0);
  668   size_t const vecsize = (1 + n) * 3;  // results + PCRE workspace
  669                                        // (as for kVecSize)
  670   int space[21];   // use stack allocation for small vecsize (common case)
  671   int* vec = vecsize <= 21 ? space : new int[vecsize];
  672   bool retval = DoMatchImpl(text, anchor, consumed, args, n, vec, (int)vecsize);
  673   if (vec != space) delete [] vec;
  674   return retval;
  675 }
  676 
  677 bool RE::Rewrite(string *out, const StringPiece &rewrite,
  678                  const StringPiece &text, int *vec, int veclen) const {
  679   for (const char *s = rewrite.data(), *end = s + rewrite.size();
  680        s < end; s++) {
  681     int c = *s;
  682     if (c == '\\') {
  683       c = *++s;
  684       if (isdigit(c)) {
  685         int n = (c - '0');
  686         if (n >= veclen) {
  687           //fprintf(stderr, requested group %d in regexp %.*s\n",
  688           //        n, rewrite.size(), rewrite.data());
  689           return false;
  690         }
  691         int start = vec[2 * n];
  692         if (start >= 0)
  693           out->append(text.data() + start, vec[2 * n + 1] - start);
  694       } else if (c == '\\') {
  695         *out += '\\';
  696       } else {
  697         //fprintf(stderr, "invalid rewrite pattern: %.*s\n",
  698         //        rewrite.size(), rewrite.data());
  699         return false;
  700       }
  701     } else {
  702       *out += c;
  703     }
  704   }
  705   return true;
  706 }
  707 
  708 // Return the number of capturing subpatterns, or -1 if the
  709 // regexp wasn't valid on construction.
  710 int RE::NumberOfCapturingGroups() const {
  711   if (re_partial_ == NULL) return -1;
  712 
  713   int result;
  714   int pcre_retval = pcre_fullinfo(re_partial_,  // The regular expression object
  715                                   NULL,         // We did not study the pattern
  716                                   PCRE_INFO_CAPTURECOUNT,
  717                                   &result);
  718   assert(pcre_retval == 0);
  719   return result;
  720 }
  721 
  722 /***** Parsers for various types *****/
  723 
  724 bool Arg::parse_null(const char* str, int n, void* dest) {
  725   (void)str;
  726   (void)n;
  727   // We fail if somebody asked us to store into a non-NULL void* pointer
  728   return (dest == NULL);
  729 }
  730 
  731 bool Arg::parse_string(const char* str, int n, void* dest) {
  732   if (dest == NULL) return true;
  733   reinterpret_cast<string*>(dest)->assign(str, n);
  734   return true;
  735 }
  736 
  737 bool Arg::parse_stringpiece(const char* str, int n, void* dest) {
  738   if (dest == NULL) return true;
  739   reinterpret_cast<StringPiece*>(dest)->set(str, n);
  740   return true;
  741 }
  742 
  743 bool Arg::parse_char(const char* str, int n, void* dest) {
  744   if (n != 1) return false;
  745   if (dest == NULL) return true;
  746   *(reinterpret_cast<char*>(dest)) = str[0];
  747   return true;
  748 }
  749 
  750 bool Arg::parse_uchar(const char* str, int n, void* dest) {
  751   if (n != 1) return false;
  752   if (dest == NULL) return true;
  753   *(reinterpret_cast<unsigned char*>(dest)) = str[0];
  754   return true;
  755 }
  756 
  // Largest number spec that we are willing to parse
  static const int kMaxNumberLength = 32;

  // Prepares the "n" bytes of number text at "str" for a strtoxxx() call,
  // which keeps consuming input until it meets a character that cannot be
  // part of the number.
  //
  // REQUIRES "buf" must have length at least kMaxNumberLength+1
  // REQUIRES "n > 0"
  // REQUIRES "str[n]" must be readable; it is inspected but never modified
  // Copies "str" into "buf" and null-terminates if necessary.
  // Returns one of:
  //      a. "str" if no termination is needed
  //      b. "buf" if the string was copied and null-terminated
  //      c. "" if the input was invalid and has no hope of being parsed
  static const char* TerminateNumber(char* buf, const char* str, int n) {
    // The <ctype.h> classifiers are only defined for values representable as
    // unsigned char (or EOF), so plain char is converted before every call.
    if ((n > 0) && isspace(static_cast<unsigned char>(*str))) {
      // We are less forgiving than the strtoxxx() routines and do not
      // allow leading spaces.
      return "";
    }

    // See if the character right after the input text may potentially
    // look like a digit.
    const unsigned char next = static_cast<unsigned char>(str[n]);
    const bool extends_digits = isdigit(next) ||
                                ((next >= 'a') && (next <= 'f')) ||
                                ((next >= 'A') && (next <= 'F'));

    // With base 16 or base 0 the strtoxxx() routines accept a "0x"/"0X"
    // prefix, so a text of "0" (optionally signed, hence n <= 2) followed by
    // 'x' could be read as the start of a longer hexadecimal number.
    const bool completes_hex_prefix =
        (n >= 1) && (n <= 2) && (str[n - 1] == '0') &&
        ((next == 'x') || (next == 'X'));

    if (!extends_digits && !completes_hex_prefix) {
      // We can parse right out of the supplied string, so return it.
      return str;
    }

    if (n > kMaxNumberLength) return ""; // Input too big to be a valid number
    memcpy(buf, str, n);
    buf[n] = '\0';
    return buf;
  }
  788 
  789 bool Arg::parse_long_radix(const char* str,
  790                            int n,
  791                            void* dest,
  792                            int radix) {
  793   if (n == 0) return false;
  794   char buf[kMaxNumberLength+1];
  795   str = TerminateNumber(buf, str, n);
  796   char* end;
  797   errno = 0;
  798   long r = strtol(str, &end, radix);
  799   if (end != str + n) return false;   // Leftover junk
  800   if (errno) return false;
  801   if (dest == NULL) return true;
  802   *(reinterpret_cast<long*>(dest)) = r;
  803   return true;
  804 }
  805 
  806 bool Arg::parse_ulong_radix(const char* str,
  807                             int n,
  808                             void* dest,
  809                             int radix) {
  810   if (n == 0) return false;
  811   char buf[kMaxNumberLength+1];
  812   str = TerminateNumber(buf, str, n);
  813   if (str[0] == '-') return false;    // strtoul() on a negative number?!
  814   char* end;
  815   errno = 0;
  816   unsigned long r = strtoul(str, &end, radix);
  817   if (end != str + n) return false;   // Leftover junk
  818   if (errno) return false;
  819   if (dest == NULL) return true;
  820   *(reinterpret_cast<unsigned long*>(dest)) = r;
  821   return true;
  822 }
  823 
  824 bool Arg::parse_short_radix(const char* str,
  825                             int n,
  826                             void* dest,
  827                             int radix) {
  828   long r;
  829   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
  830   if (r < SHRT_MIN || r > SHRT_MAX) return false;       // Out of range
  831   if (dest == NULL) return true;
  832   *(reinterpret_cast<short*>(dest)) = static_cast<short>(r);
  833   return true;
  834 }
  835 
  836 bool Arg::parse_ushort_radix(const char* str,
  837                              int n,
  838                              void* dest,
  839                              int radix) {
  840   unsigned long r;
  841   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
  842   if (r > USHRT_MAX) return false;                      // Out of range
  843   if (dest == NULL) return true;
  844   *(reinterpret_cast<unsigned short*>(dest)) = static_cast<unsigned short>(r);
  845   return true;
  846 }
  847 
  848 bool Arg::parse_int_radix(const char* str,
  849                           int n,
  850                           void* dest,
  851                           int radix) {
  852   long r;
  853   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
  854   if (r < INT_MIN || r > INT_MAX) return false;         // Out of range
  855   if (dest == NULL) return true;
  856   *(reinterpret_cast<int*>(dest)) = r;
  857   return true;
  858 }
  859 
  860 bool Arg::parse_uint_radix(const char* str,
  861                            int n,
  862                            void* dest,
  863                            int radix) {
  864   unsigned long r;
  865   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
  866   if (r > UINT_MAX) return false;                       // Out of range
  867   if (dest == NULL) return true;
  868   *(reinterpret_cast<unsigned int*>(dest)) = r;
  869   return true;
  870 }
  871 
  872 bool Arg::parse_longlong_radix(const char* str,
  873                                int n,
  874                                void* dest,
  875                                int radix) {
  876 #ifndef HAVE_LONG_LONG
  877   return false;
  878 #else
  879   if (n == 0) return false;
  880   char buf[kMaxNumberLength+1];
  881   str = TerminateNumber(buf, str, n);
  882   char* end;
  883   errno = 0;
  884 #if defined HAVE_STRTOQ
  885   long long r = strtoq(str, &end, radix);
  886 #elif defined HAVE_STRTOLL
  887   long long r = strtoll(str, &end, radix);
  888 #elif defined HAVE__STRTOI64
  889   long long r = _strtoi64(str, &end, radix);
  890 #elif defined HAVE_STRTOIMAX
  891   long long r = strtoimax(str, &end, radix);
  892 #else
  893 #error parse_longlong_radix: cannot convert input to a long-long
  894 #endif
  895   if (end != str + n) return false;   // Leftover junk
  896   if (errno) return false;
  897   if (dest == NULL) return true;
  898   *(reinterpret_cast<long long*>(dest)) = r;
  899   return true;
  900 #endif   /* HAVE_LONG_LONG */
  901 }
  902 
  903 bool Arg::parse_ulonglong_radix(const char* str,
  904                                 int n,
  905                                 void* dest,
  906                                 int radix) {
  907 #ifndef HAVE_UNSIGNED_LONG_LONG
  908   return false;
  909 #else
  910   if (n == 0) return false;
  911   char buf[kMaxNumberLength+1];
  912   str = TerminateNumber(buf, str, n);
  913   if (str[0] == '-') return false;    // strtoull() on a negative number?!
  914   char* end;
  915   errno = 0;
  916 #if defined HAVE_STRTOQ
  917   unsigned long long r = strtouq(str, &end, radix);
  918 #elif defined HAVE_STRTOLL
  919   unsigned long long r = strtoull(str, &end, radix);
  920 #elif defined HAVE__STRTOI64
  921   unsigned long long r = _strtoui64(str, &end, radix);
  922 #elif defined HAVE_STRTOIMAX
  923   unsigned long long r = strtoumax(str, &end, radix);
  924 #else
  925 #error parse_ulonglong_radix: cannot convert input to a long-long
  926 #endif
  927   if (end != str + n) return false;   // Leftover junk
  928   if (errno) return false;
  929   if (dest == NULL) return true;
  930   *(reinterpret_cast<unsigned long long*>(dest)) = r;
  931   return true;
  932 #endif   /* HAVE_UNSIGNED_LONG_LONG */
  933 }
  934 
  935 bool Arg::parse_double(const char* str, int n, void* dest) {
  936   if (n == 0) return false;
  937   static const int kMaxLength = 200;
  938   char buf[kMaxLength];
  939   if (n >= kMaxLength) return false;
  940   memcpy(buf, str, n);
  941   buf[n] = '\0';
  942   errno = 0;
  943   char* end;
  944   double r = strtod(buf, &end);
  945   if (end != buf + n) return false;   // Leftover junk
  946   if (errno) return false;
  947   if (dest == NULL) return true;
  948   *(reinterpret_cast<double*>(dest)) = r;
  949   return true;
  950 }
  951 
  952 bool Arg::parse_float(const char* str, int n, void* dest) {
  953   double r;
  954   if (!parse_double(str, n, &r)) return false;
  955   if (dest == NULL) return true;
  956   *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
  957   return true;
  958 }
  959 
  960 
  961 #define DEFINE_INTEGER_PARSERS(name)                                    \
  962   bool Arg::parse_##name(const char* str, int n, void* dest) {          \
  963     return parse_##name##_radix(str, n, dest, 10);                      \
  964   }                                                                     \
  965   bool Arg::parse_##name##_hex(const char* str, int n, void* dest) {    \
  966     return parse_##name##_radix(str, n, dest, 16);                      \
  967   }                                                                     \
  968   bool Arg::parse_##name##_octal(const char* str, int n, void* dest) {  \
  969     return parse_##name##_radix(str, n, dest, 8);                       \
  970   }                                                                     \
  971   bool Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
  972     return parse_##name##_radix(str, n, dest, 0);                       \
  973   }
  974 
  975 DEFINE_INTEGER_PARSERS(short)      /*                                   */
  976 DEFINE_INTEGER_PARSERS(ushort)     /*                                   */
  977 DEFINE_INTEGER_PARSERS(int)        /* Don't use semicolons after these  */
  978 DEFINE_INTEGER_PARSERS(uint)       /* statements because they can cause */
  979 DEFINE_INTEGER_PARSERS(long)       /* compiler warnings if the checking */
  980 DEFINE_INTEGER_PARSERS(ulong)      /* level is turned up high enough.   */
  981 DEFINE_INTEGER_PARSERS(longlong)   /*                                   */
  982 DEFINE_INTEGER_PARSERS(ulonglong)  /*                                   */
  983 
  984 #undef DEFINE_INTEGER_PARSERS
  985 
  986 }   // namespace pcrecpp