"Fossies" - the Fresh Open Source Software Archive

Member "pcre-8.43/pcrecpp.cc" (23 Jan 2019, 34930 Bytes) of package /linux/misc/pcre-8.43.tar.bz2:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "pcrecpp.cc", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 8.42_vs_8.43.

    1 // Copyright (c) 2010, Google Inc.
    2 // All rights reserved.
    3 //
    4 // Redistribution and use in source and binary forms, with or without
    5 // modification, are permitted provided that the following conditions are
    6 // met:
    7 //
    8 //     * Redistributions of source code must retain the above copyright
    9 // notice, this list of conditions and the following disclaimer.
   10 //     * Redistributions in binary form must reproduce the above
   11 // copyright notice, this list of conditions and the following disclaimer
   12 // in the documentation and/or other materials provided with the
   13 // distribution.
   14 //     * Neither the name of Google Inc. nor the names of its
   15 // contributors may be used to endorse or promote products derived from
   16 // this software without specific prior written permission.
   17 //
   18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   29 //
   30 // Author: Sanjay Ghemawat
   31 
   32 #ifdef HAVE_CONFIG_H
   33 #include "config.h"
   34 #endif
   35 
   36 #include <stdlib.h>
   37 #include <stdio.h>
   38 #include <ctype.h>
   39 #include <limits.h>      /* for SHRT_MIN, USHRT_MAX, etc */
   40 #include <string.h>      /* for memcpy */
   41 #include <assert.h>
   42 #include <errno.h>
   43 #include <string>
   44 #include <algorithm>
   45 
   46 #include "pcrecpp_internal.h"
   47 #include "pcre.h"
   48 #include "pcrecpp.h"
   49 #include "pcre_stringpiece.h"
   50 
   51 
   52 namespace pcrecpp {
   53 
   54 // Maximum number of args we can set
   55 static const int kMaxArgs = 16;
   56 static const int kVecSize = (1 + kMaxArgs) * 3;  // results + PCRE workspace
   57 
   58 // Special object that stands-in for no argument
   59 Arg RE::no_arg((void*)NULL);
   60 
// This is for ABI compatibility with old versions of pcre (pre-7.6),
// which defined a global no_arg variable instead of putting it in the
// RE class.  This works on GCC >= 3, at least.  It definitely works
// for ELF, but may not for other object formats (Mach-O, for
// instance, does not support aliases.)  We could probably have a more
// inclusive test if we ever needed it.  (Note that not only the
// __attribute__ syntax, but also __USER_LABEL_PREFIX__, are
// gnu-specific.)
#if defined(__GNUC__) && __GNUC__ >= 3 && defined(__ELF__) && !defined(__INTEL_COMPILER)
// Two-level stringification: the outer macro lets __USER_LABEL_PREFIX__ be
// macro-expanded before the inner macro turns it into a string literal.
# define ULP_AS_STRING(x)            ULP_AS_STRING_INTERNAL(x)
# define ULP_AS_STRING_INTERNAL(x)   #x
# define USER_LABEL_PREFIX_STR       ULP_AS_STRING(__USER_LABEL_PREFIX__)
// Namespace-scope no_arg, declared as an alias of the symbol whose name is
// the user label prefix followed by the mangled name of pcrecpp::RE::no_arg.
extern Arg no_arg
  __attribute__((alias(USER_LABEL_PREFIX_STR "_ZN7pcrecpp2RE6no_argE")));
#endif
   76 
// If a regular expression has no error, its error_ field points here.
// RE::Cleanup() compares error_ against the address of this object to decide
// whether the error string is one it must delete.
static const string empty_string;

// If the user doesn't ask for any options, we just use this one.
// RE::Init() copies it into options_ when it is given a NULL options pointer.
static RE_Options default_options;
   82 
// Specials for the start of patterns. See comments where start_options is used
// below. (PH June 2018)
//
// The table is terminated by an empty string.  Entries ending in "=" take a
// decimal number followed by ")" in the pattern.
// NOTE(review): the entries appear to be listed in descending byte order and
// the lookup loop in RE::Compile may rely on that -- keep the order when
// editing this table.
static const char *start_options[] = {
  "(*UTF8)",
  "(*UTF)",
  "(*UCP)",
  "(*NO_START_OPT)",
  "(*NO_AUTO_POSSESS)",
  "(*LIMIT_RECURSION=",
  "(*LIMIT_MATCH=",
  "(*CRLF)",
  "(*CR)",
  "(*BSR_UNICODE)",
  "(*BSR_ANYCRLF)",
  "(*ANYCRLF)",
  "(*ANY)",
  "" };
  100 
  101 void RE::Init(const string& pat, const RE_Options* options) {
  102   pattern_ = pat;
  103   if (options == NULL) {
  104     options_ = default_options;
  105   } else {
  106     options_ = *options;
  107   }
  108   error_ = &empty_string;
  109   re_full_ = NULL;
  110   re_partial_ = NULL;
  111 
  112   re_partial_ = Compile(UNANCHORED);
  113   if (re_partial_ != NULL) {
  114     re_full_ = Compile(ANCHOR_BOTH);
  115   }
  116 }
  117 
  118 void RE::Cleanup() {
  119   if (re_full_ != NULL)         (*pcre_free)(re_full_);
  120   if (re_partial_ != NULL)      (*pcre_free)(re_partial_);
  121   if (error_ != &empty_string)  delete error_;
  122 }
  123 
  124 
  125 RE::~RE() {
  126   Cleanup();
  127 }
  128 
  129 
  130 pcre* RE::Compile(Anchor anchor) {
  131   // First, convert RE_Options into pcre options
  132   int pcre_options = 0;
  133   pcre_options = options_.all_options();
  134 
  135   // Special treatment for anchoring.  This is needed because at
  136   // runtime pcre only provides an option for anchoring at the
  137   // beginning of a string (unless you use offset).
  138   //
  139   // There are three types of anchoring we want:
  140   //    UNANCHORED      Compile the original pattern, and use
  141   //                    a pcre unanchored match.
  142   //    ANCHOR_START    Compile the original pattern, and use
  143   //                    a pcre anchored match.
  144   //    ANCHOR_BOTH     Tack a "\z" to the end of the original pattern
  145   //                    and use a pcre anchored match.
  146 
  147   const char* compile_error;
  148   int eoffset;
  149   pcre* re;
  150   if (anchor != ANCHOR_BOTH) {
  151     re = pcre_compile(pattern_.c_str(), pcre_options,
  152                       &compile_error, &eoffset, NULL);
  153   } else {
  154     // Tack a '\z' at the end of RE.  Parenthesize it first so that
  155     // the '\z' applies to all top-level alternatives in the regexp.
  156 
  157     /* When this code was written (for PCRE 6.0) it was enough just to
  158     parenthesize the entire pattern. Unfortunately, when the feature of
  159     starting patterns with (*UTF8) or (*CR) etc. was added to PCRE patterns,
  160     this code was never updated. This bug was not noticed till 2018, long after
  161     PCRE became obsolescent and its maintainer no longer around. Since PCRE is
  162     frozen, I have added a hack to check for all the existing "start of
  163     pattern" specials - knowing that no new ones will ever be added. I am not a
  164     C++ programmer, so the code style is no doubt crude. It is also
  165     inefficient, but is only run when the pattern starts with "(*".
  166     PH June 2018. */
  167 
  168     string wrapped = "";
  169 
  170     if (pattern_.c_str()[0] == '(' && pattern_.c_str()[1] == '*') {
  171       int kk, klen, kmat;
  172       for (;;) {   // Loop for any number of leading items
  173 
  174         for (kk = 0; start_options[kk][0] != 0; kk++) {
  175           klen = strlen(start_options[kk]);
  176           kmat = strncmp(pattern_.c_str(), start_options[kk], klen);
  177           if (kmat >= 0) break;
  178         }
  179         if (kmat != 0) break;  // Not found
  180 
  181         // If the item ended in "=" we must copy digits up to ")".
  182 
  183         if (start_options[kk][klen-1] == '=') {
  184           while (isdigit(pattern_.c_str()[klen])) klen++;
  185           if (pattern_.c_str()[klen] != ')') break;  // Syntax error
  186           klen++;
  187         }
  188 
  189         // Move the item from the pattern to the start of the wrapped string.
  190 
  191         wrapped += pattern_.substr(0, klen);
  192         pattern_.erase(0, klen);
  193       }
  194     }
  195 
  196     // Wrap the rest of the pattern.
  197 
  198     wrapped += "(?:";  // A non-counting grouping operator
  199     wrapped += pattern_;
  200     wrapped += ")\\z";
  201     re = pcre_compile(wrapped.c_str(), pcre_options,
  202                       &compile_error, &eoffset, NULL);
  203   }
  204   if (re == NULL) {
  205     if (error_ == &empty_string) error_ = new string(compile_error);
  206   }
  207   return re;
  208 }
  209 
  210 /***** Matching interfaces *****/
  211 
  212 bool RE::FullMatch(const StringPiece& text,
  213                    const Arg& ptr1,
  214                    const Arg& ptr2,
  215                    const Arg& ptr3,
  216                    const Arg& ptr4,
  217                    const Arg& ptr5,
  218                    const Arg& ptr6,
  219                    const Arg& ptr7,
  220                    const Arg& ptr8,
  221                    const Arg& ptr9,
  222                    const Arg& ptr10,
  223                    const Arg& ptr11,
  224                    const Arg& ptr12,
  225                    const Arg& ptr13,
  226                    const Arg& ptr14,
  227                    const Arg& ptr15,
  228                    const Arg& ptr16) const {
  229   const Arg* args[kMaxArgs];
  230   int n = 0;
  231   if (&ptr1  == &no_arg) { goto done; } args[n++] = &ptr1;
  232   if (&ptr2  == &no_arg) { goto done; } args[n++] = &ptr2;
  233   if (&ptr3  == &no_arg) { goto done; } args[n++] = &ptr3;
  234   if (&ptr4  == &no_arg) { goto done; } args[n++] = &ptr4;
  235   if (&ptr5  == &no_arg) { goto done; } args[n++] = &ptr5;
  236   if (&ptr6  == &no_arg) { goto done; } args[n++] = &ptr6;
  237   if (&ptr7  == &no_arg) { goto done; } args[n++] = &ptr7;
  238   if (&ptr8  == &no_arg) { goto done; } args[n++] = &ptr8;
  239   if (&ptr9  == &no_arg) { goto done; } args[n++] = &ptr9;
  240   if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
  241   if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
  242   if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
  243   if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
  244   if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
  245   if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
  246   if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
  247  done:
  248 
  249   int consumed;
  250   int vec[kVecSize];
  251   return DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
  252 }
  253 
  254 bool RE::PartialMatch(const StringPiece& text,
  255                       const Arg& ptr1,
  256                       const Arg& ptr2,
  257                       const Arg& ptr3,
  258                       const Arg& ptr4,
  259                       const Arg& ptr5,
  260                       const Arg& ptr6,
  261                       const Arg& ptr7,
  262                       const Arg& ptr8,
  263                       const Arg& ptr9,
  264                       const Arg& ptr10,
  265                       const Arg& ptr11,
  266                       const Arg& ptr12,
  267                       const Arg& ptr13,
  268                       const Arg& ptr14,
  269                       const Arg& ptr15,
  270                       const Arg& ptr16) const {
  271   const Arg* args[kMaxArgs];
  272   int n = 0;
  273   if (&ptr1  == &no_arg) { goto done; } args[n++] = &ptr1;
  274   if (&ptr2  == &no_arg) { goto done; } args[n++] = &ptr2;
  275   if (&ptr3  == &no_arg) { goto done; } args[n++] = &ptr3;
  276   if (&ptr4  == &no_arg) { goto done; } args[n++] = &ptr4;
  277   if (&ptr5  == &no_arg) { goto done; } args[n++] = &ptr5;
  278   if (&ptr6  == &no_arg) { goto done; } args[n++] = &ptr6;
  279   if (&ptr7  == &no_arg) { goto done; } args[n++] = &ptr7;
  280   if (&ptr8  == &no_arg) { goto done; } args[n++] = &ptr8;
  281   if (&ptr9  == &no_arg) { goto done; } args[n++] = &ptr9;
  282   if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
  283   if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
  284   if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
  285   if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
  286   if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
  287   if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
  288   if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
  289  done:
  290 
  291   int consumed;
  292   int vec[kVecSize];
  293   return DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
  294 }
  295 
  296 bool RE::Consume(StringPiece* input,
  297                  const Arg& ptr1,
  298                  const Arg& ptr2,
  299                  const Arg& ptr3,
  300                  const Arg& ptr4,
  301                  const Arg& ptr5,
  302                  const Arg& ptr6,
  303                  const Arg& ptr7,
  304                  const Arg& ptr8,
  305                  const Arg& ptr9,
  306                  const Arg& ptr10,
  307                  const Arg& ptr11,
  308                  const Arg& ptr12,
  309                  const Arg& ptr13,
  310                  const Arg& ptr14,
  311                  const Arg& ptr15,
  312                  const Arg& ptr16) const {
  313   const Arg* args[kMaxArgs];
  314   int n = 0;
  315   if (&ptr1  == &no_arg) { goto done; } args[n++] = &ptr1;
  316   if (&ptr2  == &no_arg) { goto done; } args[n++] = &ptr2;
  317   if (&ptr3  == &no_arg) { goto done; } args[n++] = &ptr3;
  318   if (&ptr4  == &no_arg) { goto done; } args[n++] = &ptr4;
  319   if (&ptr5  == &no_arg) { goto done; } args[n++] = &ptr5;
  320   if (&ptr6  == &no_arg) { goto done; } args[n++] = &ptr6;
  321   if (&ptr7  == &no_arg) { goto done; } args[n++] = &ptr7;
  322   if (&ptr8  == &no_arg) { goto done; } args[n++] = &ptr8;
  323   if (&ptr9  == &no_arg) { goto done; } args[n++] = &ptr9;
  324   if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
  325   if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
  326   if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
  327   if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
  328   if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
  329   if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
  330   if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
  331  done:
  332 
  333   int consumed;
  334   int vec[kVecSize];
  335   if (DoMatchImpl(*input, ANCHOR_START, &consumed,
  336                   args, n, vec, kVecSize)) {
  337     input->remove_prefix(consumed);
  338     return true;
  339   } else {
  340     return false;
  341   }
  342 }
  343 
  344 bool RE::FindAndConsume(StringPiece* input,
  345                         const Arg& ptr1,
  346                         const Arg& ptr2,
  347                         const Arg& ptr3,
  348                         const Arg& ptr4,
  349                         const Arg& ptr5,
  350                         const Arg& ptr6,
  351                         const Arg& ptr7,
  352                         const Arg& ptr8,
  353                         const Arg& ptr9,
  354                         const Arg& ptr10,
  355                         const Arg& ptr11,
  356                         const Arg& ptr12,
  357                         const Arg& ptr13,
  358                         const Arg& ptr14,
  359                         const Arg& ptr15,
  360                         const Arg& ptr16) const {
  361   const Arg* args[kMaxArgs];
  362   int n = 0;
  363   if (&ptr1  == &no_arg) { goto done; } args[n++] = &ptr1;
  364   if (&ptr2  == &no_arg) { goto done; } args[n++] = &ptr2;
  365   if (&ptr3  == &no_arg) { goto done; } args[n++] = &ptr3;
  366   if (&ptr4  == &no_arg) { goto done; } args[n++] = &ptr4;
  367   if (&ptr5  == &no_arg) { goto done; } args[n++] = &ptr5;
  368   if (&ptr6  == &no_arg) { goto done; } args[n++] = &ptr6;
  369   if (&ptr7  == &no_arg) { goto done; } args[n++] = &ptr7;
  370   if (&ptr8  == &no_arg) { goto done; } args[n++] = &ptr8;
  371   if (&ptr9  == &no_arg) { goto done; } args[n++] = &ptr9;
  372   if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
  373   if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
  374   if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
  375   if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
  376   if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
  377   if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
  378   if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
  379  done:
  380 
  381   int consumed;
  382   int vec[kVecSize];
  383   if (DoMatchImpl(*input, UNANCHORED, &consumed,
  384                   args, n, vec, kVecSize)) {
  385     input->remove_prefix(consumed);
  386     return true;
  387   } else {
  388     return false;
  389   }
  390 }
  391 
  392 bool RE::Replace(const StringPiece& rewrite,
  393                  string *str) const {
  394   int vec[kVecSize];
  395   int matches = TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
  396   if (matches == 0)
  397     return false;
  398 
  399   string s;
  400   if (!Rewrite(&s, rewrite, *str, vec, matches))
  401     return false;
  402 
  403   assert(vec[0] >= 0);
  404   assert(vec[1] >= 0);
  405   str->replace(vec[0], vec[1] - vec[0], s);
  406   return true;
  407 }
  408 
  409 // Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF.
  410 // Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF.
  411 // Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF.
  412 
  413 static int NewlineMode(int pcre_options) {
  414   // TODO: if we can make it threadsafe, cache this var
  415   int newline_mode = 0;
  416   /* if (newline_mode) return newline_mode; */  // do this once it's cached
  417   if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
  418                       PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) {
  419     newline_mode = (pcre_options &
  420                     (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
  421                      PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF));
  422   } else {
  423     int newline;
  424     pcre_config(PCRE_CONFIG_NEWLINE, &newline);
  425     if (newline == 10)
  426       newline_mode = PCRE_NEWLINE_LF;
  427     else if (newline == 13)
  428       newline_mode = PCRE_NEWLINE_CR;
  429     else if (newline == 3338)
  430       newline_mode = PCRE_NEWLINE_CRLF;
  431     else if (newline == -1)
  432       newline_mode = PCRE_NEWLINE_ANY;
  433     else if (newline == -2)
  434       newline_mode = PCRE_NEWLINE_ANYCRLF;
  435     else
  436       assert(NULL == "Unexpected return value from pcre_config(NEWLINE)");
  437   }
  438   return newline_mode;
  439 }
  440 
  441 int RE::GlobalReplace(const StringPiece& rewrite,
  442                       string *str) const {
  443   int count = 0;
  444   int vec[kVecSize];
  445   string out;
  446   int start = 0;
  447   bool last_match_was_empty_string = false;
  448 
  449   while (start <= static_cast<int>(str->length())) {
  450     // If the previous match was for the empty string, we shouldn't
  451     // just match again: we'll match in the same way and get an
  452     // infinite loop.  Instead, we do the match in a special way:
  453     // anchored -- to force another try at the same position --
  454     // and with a flag saying that this time, ignore empty matches.
  455     // If this special match returns, that means there's a non-empty
  456     // match at this position as well, and we can continue.  If not,
  457     // we do what perl does, and just advance by one.
  458     // Notice that perl prints '@@@' for this;
  459     //    perl -le '$_ = "aa"; s/b*|aa/@/g; print'
  460     int matches;
  461     if (last_match_was_empty_string) {
  462       matches = TryMatch(*str, start, ANCHOR_START, false, vec, kVecSize);
  463       if (matches <= 0) {
  464         int matchend = start + 1;     // advance one character.
  465         // If the current char is CR and we're in CRLF mode, skip LF too.
  466         // Note it's better to call pcre_fullinfo() than to examine
  467         // all_options(), since options_ could have changed bewteen
  468         // compile-time and now, but this is simpler and safe enough.
  469         // Modified by PH to add ANY and ANYCRLF.
  470         if (matchend < static_cast<int>(str->length()) &&
  471             (*str)[start] == '\r' && (*str)[matchend] == '\n' &&
  472             (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
  473              NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
  474              NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)) {
  475           matchend++;
  476         }
  477         // We also need to advance more than one char if we're in utf8 mode.
  478 #ifdef SUPPORT_UTF
  479         if (options_.utf8()) {
  480           while (matchend < static_cast<int>(str->length()) &&
  481                  ((*str)[matchend] & 0xc0) == 0x80)
  482             matchend++;
  483         }
  484 #endif
  485         if (start < static_cast<int>(str->length()))
  486           out.append(*str, start, matchend - start);
  487         start = matchend;
  488         last_match_was_empty_string = false;
  489         continue;
  490       }
  491     } else {
  492       matches = TryMatch(*str, start, UNANCHORED, true, vec, kVecSize);
  493       if (matches <= 0)
  494         break;
  495     }
  496     int matchstart = vec[0], matchend = vec[1];
  497     assert(matchstart >= start);
  498     assert(matchend >= matchstart);
  499     out.append(*str, start, matchstart - start);
  500     Rewrite(&out, rewrite, *str, vec, matches);
  501     start = matchend;
  502     count++;
  503     last_match_was_empty_string = (matchstart == matchend);
  504   }
  505 
  506   if (count == 0)
  507     return 0;
  508 
  509   if (start < static_cast<int>(str->length()))
  510     out.append(*str, start, str->length() - start);
  511   swap(out, *str);
  512   return count;
  513 }
  514 
  515 bool RE::Extract(const StringPiece& rewrite,
  516                  const StringPiece& text,
  517                  string *out) const {
  518   int vec[kVecSize];
  519   int matches = TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
  520   if (matches == 0)
  521     return false;
  522   out->erase();
  523   return Rewrite(out, rewrite, text, vec, matches);
  524 }
  525 
  526 /*static*/ string RE::QuoteMeta(const StringPiece& unquoted) {
  527   string result;
  528 
  529   // Escape any ascii character not in [A-Za-z_0-9].
  530   //
  531   // Note that it's legal to escape a character even if it has no
  532   // special meaning in a regular expression -- so this function does
  533   // that.  (This also makes it identical to the perl function of the
  534   // same name; see `perldoc -f quotemeta`.)  The one exception is
  535   // escaping NUL: rather than doing backslash + NUL, like perl does,
  536   // we do '\0', because pcre itself doesn't take embedded NUL chars.
  537   for (int ii = 0; ii < unquoted.size(); ++ii) {
  538     // Note that using 'isalnum' here raises the benchmark time from
  539     // 32ns to 58ns:
  540     if (unquoted[ii] == '\0') {
  541       result += "\\0";
  542     } else if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
  543                (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
  544                (unquoted[ii] < '0' || unquoted[ii] > '9') &&
  545                unquoted[ii] != '_' &&
  546                // If this is the part of a UTF8 or Latin1 character, we need
  547                // to copy this byte without escaping.  Experimentally this is
  548                // what works correctly with the regexp library.
  549                !(unquoted[ii] & 128)) {
  550       result += '\\';
  551       result += unquoted[ii];
  552     } else {
  553       result += unquoted[ii];
  554     }
  555   }
  556 
  557   return result;
  558 }
  559 
  560 /***** Actual matching and rewriting code *****/
  561 
  562 int RE::TryMatch(const StringPiece& text,
  563                  int startpos,
  564                  Anchor anchor,
  565                  bool empty_ok,
  566                  int *vec,
  567                  int vecsize) const {
  568   pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
  569   if (re == NULL) {
  570     //fprintf(stderr, "Matching against invalid re: %s\n", error_->c_str());
  571     return 0;
  572   }
  573 
  574   pcre_extra extra = { 0, 0, 0, 0, 0, 0, 0, 0 };
  575   if (options_.match_limit() > 0) {
  576     extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
  577     extra.match_limit = options_.match_limit();
  578   }
  579   if (options_.match_limit_recursion() > 0) {
  580     extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
  581     extra.match_limit_recursion = options_.match_limit_recursion();
  582   }
  583 
  584   // int options = 0;
  585   // Changed by PH as a result of bugzilla #1288
  586   int options = (options_.all_options() & PCRE_NO_UTF8_CHECK);
  587 
  588   if (anchor != UNANCHORED)
  589     options |= PCRE_ANCHORED;
  590   if (!empty_ok)
  591     options |= PCRE_NOTEMPTY;
  592 
  593   int rc = pcre_exec(re,              // The regular expression object
  594                      &extra,
  595                      (text.data() == NULL) ? "" : text.data(),
  596                      text.size(),
  597                      startpos,
  598                      options,
  599                      vec,
  600                      vecsize);
  601 
  602   // Handle errors
  603   if (rc == PCRE_ERROR_NOMATCH) {
  604     return 0;
  605   } else if (rc < 0) {
  606     //fprintf(stderr, "Unexpected return code: %d when matching '%s'\n",
  607     //        re, pattern_.c_str());
  608     return 0;
  609   } else if (rc == 0) {
  610     // pcre_exec() returns 0 as a special case when the number of
  611     // capturing subpatterns exceeds the size of the vector.
  612     // When this happens, there is a match and the output vector
  613     // is filled, but we miss out on the positions of the extra subpatterns.
  614     rc = vecsize / 2;
  615   }
  616 
  617   return rc;
  618 }
  619 
  620 bool RE::DoMatchImpl(const StringPiece& text,
  621                      Anchor anchor,
  622                      int* consumed,
  623                      const Arg* const* args,
  624                      int n,
  625                      int* vec,
  626                      int vecsize) const {
  627   assert((1 + n) * 3 <= vecsize);  // results + PCRE workspace
  628   int matches = TryMatch(text, 0, anchor, true, vec, vecsize);
  629   assert(matches >= 0);  // TryMatch never returns negatives
  630   if (matches == 0)
  631     return false;
  632 
  633   *consumed = vec[1];
  634 
  635   if (n == 0 || args == NULL) {
  636     // We are not interested in results
  637     return true;
  638   }
  639 
  640   if (NumberOfCapturingGroups() < n) {
  641     // RE has fewer capturing groups than number of arg pointers passed in
  642     return false;
  643   }
  644 
  645   // If we got here, we must have matched the whole pattern.
  646   // We do not need (can not do) any more checks on the value of 'matches' here
  647   // -- see the comment for TryMatch.
  648   for (int i = 0; i < n; i++) {
  649     const int start = vec[2*(i+1)];
  650     const int limit = vec[2*(i+1)+1];
  651     if (!args[i]->Parse(text.data() + start, limit-start)) {
  652       // TODO: Should we indicate what the error was?
  653       return false;
  654     }
  655   }
  656 
  657   return true;
  658 }
  659 
  660 bool RE::DoMatch(const StringPiece& text,
  661                  Anchor anchor,
  662                  int* consumed,
  663                  const Arg* const args[],
  664                  int n) const {
  665   assert(n >= 0);
  666   size_t const vecsize = (1 + n) * 3;  // results + PCRE workspace
  667                                        // (as for kVecSize)
  668   int space[21];   // use stack allocation for small vecsize (common case)
  669   int* vec = vecsize <= 21 ? space : new int[vecsize];
  670   bool retval = DoMatchImpl(text, anchor, consumed, args, n, vec, (int)vecsize);
  671   if (vec != space) delete [] vec;
  672   return retval;
  673 }
  674 
  675 bool RE::Rewrite(string *out, const StringPiece &rewrite,
  676                  const StringPiece &text, int *vec, int veclen) const {
  677   for (const char *s = rewrite.data(), *end = s + rewrite.size();
  678        s < end; s++) {
  679     int c = *s;
  680     if (c == '\\') {
  681       c = *++s;
  682       if (isdigit(c)) {
  683         int n = (c - '0');
  684         if (n >= veclen) {
  685           //fprintf(stderr, requested group %d in regexp %.*s\n",
  686           //        n, rewrite.size(), rewrite.data());
  687           return false;
  688         }
  689         int start = vec[2 * n];
  690         if (start >= 0)
  691           out->append(text.data() + start, vec[2 * n + 1] - start);
  692       } else if (c == '\\') {
  693         *out += '\\';
  694       } else {
  695         //fprintf(stderr, "invalid rewrite pattern: %.*s\n",
  696         //        rewrite.size(), rewrite.data());
  697         return false;
  698       }
  699     } else {
  700       *out += c;
  701     }
  702   }
  703   return true;
  704 }
  705 
  706 // Return the number of capturing subpatterns, or -1 if the
  707 // regexp wasn't valid on construction.
  708 int RE::NumberOfCapturingGroups() const {
  709   if (re_partial_ == NULL) return -1;
  710 
  711   int result;
  712   int pcre_retval = pcre_fullinfo(re_partial_,  // The regular expression object
  713                                   NULL,         // We did not study the pattern
  714                                   PCRE_INFO_CAPTURECOUNT,
  715                                   &result);
  716   assert(pcre_retval == 0);
  717   return result;
  718 }
  719 
  720 /***** Parsers for various types *****/
  721 
  722 bool Arg::parse_null(const char* str, int n, void* dest) {
  723   (void)str;
  724   (void)n;
  725   // We fail if somebody asked us to store into a non-NULL void* pointer
  726   return (dest == NULL);
  727 }
  728 
  729 bool Arg::parse_string(const char* str, int n, void* dest) {
  730   if (dest == NULL) return true;
  731   reinterpret_cast<string*>(dest)->assign(str, n);
  732   return true;
  733 }
  734 
  735 bool Arg::parse_stringpiece(const char* str, int n, void* dest) {
  736   if (dest == NULL) return true;
  737   reinterpret_cast<StringPiece*>(dest)->set(str, n);
  738   return true;
  739 }
  740 
  741 bool Arg::parse_char(const char* str, int n, void* dest) {
  742   if (n != 1) return false;
  743   if (dest == NULL) return true;
  744   *(reinterpret_cast<char*>(dest)) = str[0];
  745   return true;
  746 }
  747 
  748 bool Arg::parse_uchar(const char* str, int n, void* dest) {
  749   if (n != 1) return false;
  750   if (dest == NULL) return true;
  751   *(reinterpret_cast<unsigned char*>(dest)) = str[0];
  752   return true;
  753 }
  754 
  755 // Largest number spec that we are willing to parse
  756 static const int kMaxNumberLength = 32;
  757 
  758 // REQUIRES "buf" must have length at least kMaxNumberLength+1
  759 // REQUIRES "n > 0"
  760 // Copies "str" into "buf" and null-terminates if necessary.
  761 // Returns one of:
  762 //      a. "str" if no termination is needed
  763 //      b. "buf" if the string was copied and null-terminated
  764 //      c. "" if the input was invalid and has no hope of being parsed
  765 static const char* TerminateNumber(char* buf, const char* str, int n) {
  766   if ((n > 0) && isspace(*str)) {
  767     // We are less forgiving than the strtoxxx() routines and do not
  768     // allow leading spaces.
  769     return "";
  770   }
  771 
  772   // See if the character right after the input text may potentially
  773   // look like a digit.
  774   if (isdigit(str[n]) ||
  775       ((str[n] >= 'a') && (str[n] <= 'f')) ||
  776       ((str[n] >= 'A') && (str[n] <= 'F'))) {
  777     if (n > kMaxNumberLength) return ""; // Input too big to be a valid number
  778     memcpy(buf, str, n);
  779     buf[n] = '\0';
  780     return buf;
  781   } else {
  782     // We can parse right out of the supplied string, so return it.
  783     return str;
  784   }
  785 }
  786 
  787 bool Arg::parse_long_radix(const char* str,
  788                            int n,
  789                            void* dest,
  790                            int radix) {
  791   if (n == 0) return false;
  792   char buf[kMaxNumberLength+1];
  793   str = TerminateNumber(buf, str, n);
  794   char* end;
  795   errno = 0;
  796   long r = strtol(str, &end, radix);
  797   if (end != str + n) return false;   // Leftover junk
  798   if (errno) return false;
  799   if (dest == NULL) return true;
  800   *(reinterpret_cast<long*>(dest)) = r;
  801   return true;
  802 }
  803 
  804 bool Arg::parse_ulong_radix(const char* str,
  805                             int n,
  806                             void* dest,
  807                             int radix) {
  808   if (n == 0) return false;
  809   char buf[kMaxNumberLength+1];
  810   str = TerminateNumber(buf, str, n);
  811   if (str[0] == '-') return false;    // strtoul() on a negative number?!
  812   char* end;
  813   errno = 0;
  814   unsigned long r = strtoul(str, &end, radix);
  815   if (end != str + n) return false;   // Leftover junk
  816   if (errno) return false;
  817   if (dest == NULL) return true;
  818   *(reinterpret_cast<unsigned long*>(dest)) = r;
  819   return true;
  820 }
  821 
  822 bool Arg::parse_short_radix(const char* str,
  823                             int n,
  824                             void* dest,
  825                             int radix) {
  826   long r;
  827   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
  828   if (r < SHRT_MIN || r > SHRT_MAX) return false;       // Out of range
  829   if (dest == NULL) return true;
  830   *(reinterpret_cast<short*>(dest)) = static_cast<short>(r);
  831   return true;
  832 }
  833 
  834 bool Arg::parse_ushort_radix(const char* str,
  835                              int n,
  836                              void* dest,
  837                              int radix) {
  838   unsigned long r;
  839   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
  840   if (r > USHRT_MAX) return false;                      // Out of range
  841   if (dest == NULL) return true;
  842   *(reinterpret_cast<unsigned short*>(dest)) = static_cast<unsigned short>(r);
  843   return true;
  844 }
  845 
  846 bool Arg::parse_int_radix(const char* str,
  847                           int n,
  848                           void* dest,
  849                           int radix) {
  850   long r;
  851   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
  852   if (r < INT_MIN || r > INT_MAX) return false;         // Out of range
  853   if (dest == NULL) return true;
  854   *(reinterpret_cast<int*>(dest)) = r;
  855   return true;
  856 }
  857 
  858 bool Arg::parse_uint_radix(const char* str,
  859                            int n,
  860                            void* dest,
  861                            int radix) {
  862   unsigned long r;
  863   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
  864   if (r > UINT_MAX) return false;                       // Out of range
  865   if (dest == NULL) return true;
  866   *(reinterpret_cast<unsigned int*>(dest)) = r;
  867   return true;
  868 }
  869 
  870 bool Arg::parse_longlong_radix(const char* str,
  871                                int n,
  872                                void* dest,
  873                                int radix) {
  874 #ifndef HAVE_LONG_LONG
  875   return false;
  876 #else
  877   if (n == 0) return false;
  878   char buf[kMaxNumberLength+1];
  879   str = TerminateNumber(buf, str, n);
  880   char* end;
  881   errno = 0;
  882 #if defined HAVE_STRTOQ
  883   long long r = strtoq(str, &end, radix);
  884 #elif defined HAVE_STRTOLL
  885   long long r = strtoll(str, &end, radix);
  886 #elif defined HAVE__STRTOI64
  887   long long r = _strtoi64(str, &end, radix);
  888 #elif defined HAVE_STRTOIMAX
  889   long long r = strtoimax(str, &end, radix);
  890 #else
  891 #error parse_longlong_radix: cannot convert input to a long-long
  892 #endif
  893   if (end != str + n) return false;   // Leftover junk
  894   if (errno) return false;
  895   if (dest == NULL) return true;
  896   *(reinterpret_cast<long long*>(dest)) = r;
  897   return true;
  898 #endif   /* HAVE_LONG_LONG */
  899 }
  900 
  901 bool Arg::parse_ulonglong_radix(const char* str,
  902                                 int n,
  903                                 void* dest,
  904                                 int radix) {
  905 #ifndef HAVE_UNSIGNED_LONG_LONG
  906   return false;
  907 #else
  908   if (n == 0) return false;
  909   char buf[kMaxNumberLength+1];
  910   str = TerminateNumber(buf, str, n);
  911   if (str[0] == '-') return false;    // strtoull() on a negative number?!
  912   char* end;
  913   errno = 0;
  914 #if defined HAVE_STRTOQ
  915   unsigned long long r = strtouq(str, &end, radix);
  916 #elif defined HAVE_STRTOLL
  917   unsigned long long r = strtoull(str, &end, radix);
  918 #elif defined HAVE__STRTOI64
  919   unsigned long long r = _strtoui64(str, &end, radix);
  920 #elif defined HAVE_STRTOIMAX
  921   unsigned long long r = strtoumax(str, &end, radix);
  922 #else
  923 #error parse_ulonglong_radix: cannot convert input to a long-long
  924 #endif
  925   if (end != str + n) return false;   // Leftover junk
  926   if (errno) return false;
  927   if (dest == NULL) return true;
  928   *(reinterpret_cast<unsigned long long*>(dest)) = r;
  929   return true;
  930 #endif   /* HAVE_UNSIGNED_LONG_LONG */
  931 }
  932 
  933 bool Arg::parse_double(const char* str, int n, void* dest) {
  934   if (n == 0) return false;
  935   static const int kMaxLength = 200;
  936   char buf[kMaxLength];
  937   if (n >= kMaxLength) return false;
  938   memcpy(buf, str, n);
  939   buf[n] = '\0';
  940   errno = 0;
  941   char* end;
  942   double r = strtod(buf, &end);
  943   if (end != buf + n) return false;   // Leftover junk
  944   if (errno) return false;
  945   if (dest == NULL) return true;
  946   *(reinterpret_cast<double*>(dest)) = r;
  947   return true;
  948 }
  949 
  950 bool Arg::parse_float(const char* str, int n, void* dest) {
  951   double r;
  952   if (!parse_double(str, n, &r)) return false;
  953   if (dest == NULL) return true;
  954   *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
  955   return true;
  956 }
  957 
  958 
  959 #define DEFINE_INTEGER_PARSERS(name)                                    \
  960   bool Arg::parse_##name(const char* str, int n, void* dest) {          \
  961     return parse_##name##_radix(str, n, dest, 10);                      \
  962   }                                                                     \
  963   bool Arg::parse_##name##_hex(const char* str, int n, void* dest) {    \
  964     return parse_##name##_radix(str, n, dest, 16);                      \
  965   }                                                                     \
  966   bool Arg::parse_##name##_octal(const char* str, int n, void* dest) {  \
  967     return parse_##name##_radix(str, n, dest, 8);                       \
  968   }                                                                     \
  969   bool Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
  970     return parse_##name##_radix(str, n, dest, 0);                       \
  971   }
  972 
  973 DEFINE_INTEGER_PARSERS(short)      /*                                   */
  974 DEFINE_INTEGER_PARSERS(ushort)     /*                                   */
  975 DEFINE_INTEGER_PARSERS(int)        /* Don't use semicolons after these  */
  976 DEFINE_INTEGER_PARSERS(uint)       /* statements because they can cause */
  977 DEFINE_INTEGER_PARSERS(long)       /* compiler warnings if the checking */
  978 DEFINE_INTEGER_PARSERS(ulong)      /* level is turned up high enough.   */
  979 DEFINE_INTEGER_PARSERS(longlong)   /*                                   */
  980 DEFINE_INTEGER_PARSERS(ulonglong)  /*                                   */
  981 
  982 #undef DEFINE_INTEGER_PARSERS
  983 
  984 }   // namespace pcrecpp