"Fossies" - the Fresh Open Source Software Archive

Member "pcre-8.42/pcrecpp.cc" (14 Jun 2016, 32958 Bytes) of package /linux/misc/pcre-8.42.tar.bz2:


As a special service, "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "pcrecpp.cc", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 8.38_vs_8.39.

    1 // Copyright (c) 2010, Google Inc.
    2 // All rights reserved.
    3 //
    4 // Redistribution and use in source and binary forms, with or without
    5 // modification, are permitted provided that the following conditions are
    6 // met:
    7 //
    8 //     * Redistributions of source code must retain the above copyright
    9 // notice, this list of conditions and the following disclaimer.
   10 //     * Redistributions in binary form must reproduce the above
   11 // copyright notice, this list of conditions and the following disclaimer
   12 // in the documentation and/or other materials provided with the
   13 // distribution.
   14 //     * Neither the name of Google Inc. nor the names of its
   15 // contributors may be used to endorse or promote products derived from
   16 // this software without specific prior written permission.
   17 //
   18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   29 //
   30 // Author: Sanjay Ghemawat
   31 
   32 #ifdef HAVE_CONFIG_H
   33 #include "config.h"
   34 #endif
   35 
   36 #include <stdlib.h>
   37 #include <stdio.h>
   38 #include <ctype.h>
   39 #include <limits.h>      /* for SHRT_MIN, USHRT_MAX, etc */
   40 #include <string.h>      /* for memcpy */
   41 #include <assert.h>
   42 #include <errno.h>
   43 #include <string>
   44 #include <algorithm>
   45 
   46 #include "pcrecpp_internal.h"
   47 #include "pcre.h"
   48 #include "pcrecpp.h"
   49 #include "pcre_stringpiece.h"
   50 
   51 
   52 namespace pcrecpp {
   53 
   54 // Maximum number of args we can set
   55 static const int kMaxArgs = 16;
   56 static const int kVecSize = (1 + kMaxArgs) * 3;  // results + PCRE workspace
   57 
   58 // Special object that stands-in for no argument
   59 Arg RE::no_arg((void*)NULL);
   60 
   61 // This is for ABI compatibility with old versions of pcre (pre-7.6),
   62 // which defined a global no_arg variable instead of putting it in the
   63 // RE class.  This works on GCC >= 3, at least.  It definitely works
   64 // for ELF, but may not for other object formats (Mach-O, for
   65 // instance, does not support aliases.)  We could probably have a more
   66 // inclusive test if we ever needed it.  (Note that not only the
   67 // __attribute__ syntax, but also __USER_LABEL_PREFIX__, are
   68 // gnu-specific.)
   69 #if defined(__GNUC__) && __GNUC__ >= 3 && defined(__ELF__) && !defined(__INTEL_COMPILER)
   70 # define ULP_AS_STRING(x)            ULP_AS_STRING_INTERNAL(x)
   71 # define ULP_AS_STRING_INTERNAL(x)   #x
   72 # define USER_LABEL_PREFIX_STR       ULP_AS_STRING(__USER_LABEL_PREFIX__)
   73 extern Arg no_arg
   74   __attribute__((alias(USER_LABEL_PREFIX_STR "_ZN7pcrecpp2RE6no_argE")));
   75 #endif
   76 
   77 // If a regular expression has no error, its error_ field points here
   78 static const string empty_string;
   79 
   80 // If the user doesn't ask for any options, we just use this one
   81 static RE_Options default_options;
   82 
   83 void RE::Init(const string& pat, const RE_Options* options) {
   84   pattern_ = pat;
   85   if (options == NULL) {
   86     options_ = default_options;
   87   } else {
   88     options_ = *options;
   89   }
   90   error_ = &empty_string;
   91   re_full_ = NULL;
   92   re_partial_ = NULL;
   93 
   94   re_partial_ = Compile(UNANCHORED);
   95   if (re_partial_ != NULL) {
   96     re_full_ = Compile(ANCHOR_BOTH);
   97   }
   98 }
   99 
  100 void RE::Cleanup() {
  101   if (re_full_ != NULL)         (*pcre_free)(re_full_);
  102   if (re_partial_ != NULL)      (*pcre_free)(re_partial_);
  103   if (error_ != &empty_string)  delete error_;
  104 }
  105 
  106 
  107 RE::~RE() {
  108   Cleanup();
  109 }
  110 
  111 
  112 pcre* RE::Compile(Anchor anchor) {
  113   // First, convert RE_Options into pcre options
  114   int pcre_options = 0;
  115   pcre_options = options_.all_options();
  116 
  117   // Special treatment for anchoring.  This is needed because at
  118   // runtime pcre only provides an option for anchoring at the
  119   // beginning of a string (unless you use offset).
  120   //
  121   // There are three types of anchoring we want:
  122   //    UNANCHORED      Compile the original pattern, and use
  123   //                    a pcre unanchored match.
  124   //    ANCHOR_START    Compile the original pattern, and use
  125   //                    a pcre anchored match.
  126   //    ANCHOR_BOTH     Tack a "\z" to the end of the original pattern
  127   //                    and use a pcre anchored match.
  128 
  129   const char* compile_error;
  130   int eoffset;
  131   pcre* re;
  132   if (anchor != ANCHOR_BOTH) {
  133     re = pcre_compile(pattern_.c_str(), pcre_options,
  134                       &compile_error, &eoffset, NULL);
  135   } else {
  136     // Tack a '\z' at the end of RE.  Parenthesize it first so that
  137     // the '\z' applies to all top-level alternatives in the regexp.
  138     string wrapped = "(?:";  // A non-counting grouping operator
  139     wrapped += pattern_;
  140     wrapped += ")\\z";
  141     re = pcre_compile(wrapped.c_str(), pcre_options,
  142                       &compile_error, &eoffset, NULL);
  143   }
  144   if (re == NULL) {
  145     if (error_ == &empty_string) error_ = new string(compile_error);
  146   }
  147   return re;
  148 }
  149 
  150 /***** Matching interfaces *****/
  151 
  152 bool RE::FullMatch(const StringPiece& text,
  153                    const Arg& ptr1,
  154                    const Arg& ptr2,
  155                    const Arg& ptr3,
  156                    const Arg& ptr4,
  157                    const Arg& ptr5,
  158                    const Arg& ptr6,
  159                    const Arg& ptr7,
  160                    const Arg& ptr8,
  161                    const Arg& ptr9,
  162                    const Arg& ptr10,
  163                    const Arg& ptr11,
  164                    const Arg& ptr12,
  165                    const Arg& ptr13,
  166                    const Arg& ptr14,
  167                    const Arg& ptr15,
  168                    const Arg& ptr16) const {
  169   const Arg* args[kMaxArgs];
  170   int n = 0;
  171   if (&ptr1  == &no_arg) { goto done; } args[n++] = &ptr1;
  172   if (&ptr2  == &no_arg) { goto done; } args[n++] = &ptr2;
  173   if (&ptr3  == &no_arg) { goto done; } args[n++] = &ptr3;
  174   if (&ptr4  == &no_arg) { goto done; } args[n++] = &ptr4;
  175   if (&ptr5  == &no_arg) { goto done; } args[n++] = &ptr5;
  176   if (&ptr6  == &no_arg) { goto done; } args[n++] = &ptr6;
  177   if (&ptr7  == &no_arg) { goto done; } args[n++] = &ptr7;
  178   if (&ptr8  == &no_arg) { goto done; } args[n++] = &ptr8;
  179   if (&ptr9  == &no_arg) { goto done; } args[n++] = &ptr9;
  180   if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
  181   if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
  182   if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
  183   if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
  184   if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
  185   if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
  186   if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
  187  done:
  188 
  189   int consumed;
  190   int vec[kVecSize];
  191   return DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
  192 }
  193 
  194 bool RE::PartialMatch(const StringPiece& text,
  195                       const Arg& ptr1,
  196                       const Arg& ptr2,
  197                       const Arg& ptr3,
  198                       const Arg& ptr4,
  199                       const Arg& ptr5,
  200                       const Arg& ptr6,
  201                       const Arg& ptr7,
  202                       const Arg& ptr8,
  203                       const Arg& ptr9,
  204                       const Arg& ptr10,
  205                       const Arg& ptr11,
  206                       const Arg& ptr12,
  207                       const Arg& ptr13,
  208                       const Arg& ptr14,
  209                       const Arg& ptr15,
  210                       const Arg& ptr16) const {
  211   const Arg* args[kMaxArgs];
  212   int n = 0;
  213   if (&ptr1  == &no_arg) { goto done; } args[n++] = &ptr1;
  214   if (&ptr2  == &no_arg) { goto done; } args[n++] = &ptr2;
  215   if (&ptr3  == &no_arg) { goto done; } args[n++] = &ptr3;
  216   if (&ptr4  == &no_arg) { goto done; } args[n++] = &ptr4;
  217   if (&ptr5  == &no_arg) { goto done; } args[n++] = &ptr5;
  218   if (&ptr6  == &no_arg) { goto done; } args[n++] = &ptr6;
  219   if (&ptr7  == &no_arg) { goto done; } args[n++] = &ptr7;
  220   if (&ptr8  == &no_arg) { goto done; } args[n++] = &ptr8;
  221   if (&ptr9  == &no_arg) { goto done; } args[n++] = &ptr9;
  222   if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
  223   if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
  224   if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
  225   if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
  226   if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
  227   if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
  228   if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
  229  done:
  230 
  231   int consumed;
  232   int vec[kVecSize];
  233   return DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
  234 }
  235 
  236 bool RE::Consume(StringPiece* input,
  237                  const Arg& ptr1,
  238                  const Arg& ptr2,
  239                  const Arg& ptr3,
  240                  const Arg& ptr4,
  241                  const Arg& ptr5,
  242                  const Arg& ptr6,
  243                  const Arg& ptr7,
  244                  const Arg& ptr8,
  245                  const Arg& ptr9,
  246                  const Arg& ptr10,
  247                  const Arg& ptr11,
  248                  const Arg& ptr12,
  249                  const Arg& ptr13,
  250                  const Arg& ptr14,
  251                  const Arg& ptr15,
  252                  const Arg& ptr16) const {
  253   const Arg* args[kMaxArgs];
  254   int n = 0;
  255   if (&ptr1  == &no_arg) { goto done; } args[n++] = &ptr1;
  256   if (&ptr2  == &no_arg) { goto done; } args[n++] = &ptr2;
  257   if (&ptr3  == &no_arg) { goto done; } args[n++] = &ptr3;
  258   if (&ptr4  == &no_arg) { goto done; } args[n++] = &ptr4;
  259   if (&ptr5  == &no_arg) { goto done; } args[n++] = &ptr5;
  260   if (&ptr6  == &no_arg) { goto done; } args[n++] = &ptr6;
  261   if (&ptr7  == &no_arg) { goto done; } args[n++] = &ptr7;
  262   if (&ptr8  == &no_arg) { goto done; } args[n++] = &ptr8;
  263   if (&ptr9  == &no_arg) { goto done; } args[n++] = &ptr9;
  264   if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
  265   if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
  266   if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
  267   if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
  268   if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
  269   if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
  270   if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
  271  done:
  272 
  273   int consumed;
  274   int vec[kVecSize];
  275   if (DoMatchImpl(*input, ANCHOR_START, &consumed,
  276                   args, n, vec, kVecSize)) {
  277     input->remove_prefix(consumed);
  278     return true;
  279   } else {
  280     return false;
  281   }
  282 }
  283 
  284 bool RE::FindAndConsume(StringPiece* input,
  285                         const Arg& ptr1,
  286                         const Arg& ptr2,
  287                         const Arg& ptr3,
  288                         const Arg& ptr4,
  289                         const Arg& ptr5,
  290                         const Arg& ptr6,
  291                         const Arg& ptr7,
  292                         const Arg& ptr8,
  293                         const Arg& ptr9,
  294                         const Arg& ptr10,
  295                         const Arg& ptr11,
  296                         const Arg& ptr12,
  297                         const Arg& ptr13,
  298                         const Arg& ptr14,
  299                         const Arg& ptr15,
  300                         const Arg& ptr16) const {
  301   const Arg* args[kMaxArgs];
  302   int n = 0;
  303   if (&ptr1  == &no_arg) { goto done; } args[n++] = &ptr1;
  304   if (&ptr2  == &no_arg) { goto done; } args[n++] = &ptr2;
  305   if (&ptr3  == &no_arg) { goto done; } args[n++] = &ptr3;
  306   if (&ptr4  == &no_arg) { goto done; } args[n++] = &ptr4;
  307   if (&ptr5  == &no_arg) { goto done; } args[n++] = &ptr5;
  308   if (&ptr6  == &no_arg) { goto done; } args[n++] = &ptr6;
  309   if (&ptr7  == &no_arg) { goto done; } args[n++] = &ptr7;
  310   if (&ptr8  == &no_arg) { goto done; } args[n++] = &ptr8;
  311   if (&ptr9  == &no_arg) { goto done; } args[n++] = &ptr9;
  312   if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
  313   if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
  314   if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
  315   if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
  316   if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
  317   if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
  318   if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
  319  done:
  320 
  321   int consumed;
  322   int vec[kVecSize];
  323   if (DoMatchImpl(*input, UNANCHORED, &consumed,
  324                   args, n, vec, kVecSize)) {
  325     input->remove_prefix(consumed);
  326     return true;
  327   } else {
  328     return false;
  329   }
  330 }
  331 
  332 bool RE::Replace(const StringPiece& rewrite,
  333                  string *str) const {
  334   int vec[kVecSize];
  335   int matches = TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
  336   if (matches == 0)
  337     return false;
  338 
  339   string s;
  340   if (!Rewrite(&s, rewrite, *str, vec, matches))
  341     return false;
  342 
  343   assert(vec[0] >= 0);
  344   assert(vec[1] >= 0);
  345   str->replace(vec[0], vec[1] - vec[0], s);
  346   return true;
  347 }
  348 
  349 // Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF.
  350 // Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF.
  351 // Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF.
  352 
  353 static int NewlineMode(int pcre_options) {
  354   // TODO: if we can make it threadsafe, cache this var
  355   int newline_mode = 0;
  356   /* if (newline_mode) return newline_mode; */  // do this once it's cached
  357   if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
  358                       PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) {
  359     newline_mode = (pcre_options &
  360                     (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
  361                      PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF));
  362   } else {
  363     int newline;
  364     pcre_config(PCRE_CONFIG_NEWLINE, &newline);
  365     if (newline == 10)
  366       newline_mode = PCRE_NEWLINE_LF;
  367     else if (newline == 13)
  368       newline_mode = PCRE_NEWLINE_CR;
  369     else if (newline == 3338)
  370       newline_mode = PCRE_NEWLINE_CRLF;
  371     else if (newline == -1)
  372       newline_mode = PCRE_NEWLINE_ANY;
  373     else if (newline == -2)
  374       newline_mode = PCRE_NEWLINE_ANYCRLF;
  375     else
  376       assert(NULL == "Unexpected return value from pcre_config(NEWLINE)");
  377   }
  378   return newline_mode;
  379 }
  380 
  381 int RE::GlobalReplace(const StringPiece& rewrite,
  382                       string *str) const {
  383   int count = 0;
  384   int vec[kVecSize];
  385   string out;
  386   int start = 0;
  387   bool last_match_was_empty_string = false;
  388 
  389   while (start <= static_cast<int>(str->length())) {
  390     // If the previous match was for the empty string, we shouldn't
  391     // just match again: we'll match in the same way and get an
  392     // infinite loop.  Instead, we do the match in a special way:
  393     // anchored -- to force another try at the same position --
  394     // and with a flag saying that this time, ignore empty matches.
  395     // If this special match returns, that means there's a non-empty
  396     // match at this position as well, and we can continue.  If not,
  397     // we do what perl does, and just advance by one.
  398     // Notice that perl prints '@@@' for this;
  399     //    perl -le '$_ = "aa"; s/b*|aa/@/g; print'
  400     int matches;
  401     if (last_match_was_empty_string) {
  402       matches = TryMatch(*str, start, ANCHOR_START, false, vec, kVecSize);
  403       if (matches <= 0) {
  404         int matchend = start + 1;     // advance one character.
  405         // If the current char is CR and we're in CRLF mode, skip LF too.
  406         // Note it's better to call pcre_fullinfo() than to examine
  407         // all_options(), since options_ could have changed bewteen
  408         // compile-time and now, but this is simpler and safe enough.
  409         // Modified by PH to add ANY and ANYCRLF.
  410         if (matchend < static_cast<int>(str->length()) &&
  411             (*str)[start] == '\r' && (*str)[matchend] == '\n' &&
  412             (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
  413              NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
  414              NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)) {
  415           matchend++;
  416         }
  417         // We also need to advance more than one char if we're in utf8 mode.
  418 #ifdef SUPPORT_UTF8
  419         if (options_.utf8()) {
  420           while (matchend < static_cast<int>(str->length()) &&
  421                  ((*str)[matchend] & 0xc0) == 0x80)
  422             matchend++;
  423         }
  424 #endif
  425         if (start < static_cast<int>(str->length()))
  426           out.append(*str, start, matchend - start);
  427         start = matchend;
  428         last_match_was_empty_string = false;
  429         continue;
  430       }
  431     } else {
  432       matches = TryMatch(*str, start, UNANCHORED, true, vec, kVecSize);
  433       if (matches <= 0)
  434         break;
  435     }
  436     int matchstart = vec[0], matchend = vec[1];
  437     assert(matchstart >= start);
  438     assert(matchend >= matchstart);
  439     out.append(*str, start, matchstart - start);
  440     Rewrite(&out, rewrite, *str, vec, matches);
  441     start = matchend;
  442     count++;
  443     last_match_was_empty_string = (matchstart == matchend);
  444   }
  445 
  446   if (count == 0)
  447     return 0;
  448 
  449   if (start < static_cast<int>(str->length()))
  450     out.append(*str, start, str->length() - start);
  451   swap(out, *str);
  452   return count;
  453 }
  454 
  455 bool RE::Extract(const StringPiece& rewrite,
  456                  const StringPiece& text,
  457                  string *out) const {
  458   int vec[kVecSize];
  459   int matches = TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
  460   if (matches == 0)
  461     return false;
  462   out->erase();
  463   return Rewrite(out, rewrite, text, vec, matches);
  464 }
  465 
  466 /*static*/ string RE::QuoteMeta(const StringPiece& unquoted) {
  467   string result;
  468 
  469   // Escape any ascii character not in [A-Za-z_0-9].
  470   //
  471   // Note that it's legal to escape a character even if it has no
  472   // special meaning in a regular expression -- so this function does
  473   // that.  (This also makes it identical to the perl function of the
  474   // same name; see `perldoc -f quotemeta`.)  The one exception is
  475   // escaping NUL: rather than doing backslash + NUL, like perl does,
  476   // we do '\0', because pcre itself doesn't take embedded NUL chars.
  477   for (int ii = 0; ii < unquoted.size(); ++ii) {
  478     // Note that using 'isalnum' here raises the benchmark time from
  479     // 32ns to 58ns:
  480     if (unquoted[ii] == '\0') {
  481       result += "\\0";
  482     } else if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
  483                (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
  484                (unquoted[ii] < '0' || unquoted[ii] > '9') &&
  485                unquoted[ii] != '_' &&
  486                // If this is the part of a UTF8 or Latin1 character, we need
  487                // to copy this byte without escaping.  Experimentally this is
  488                // what works correctly with the regexp library.
  489                !(unquoted[ii] & 128)) {
  490       result += '\\';
  491       result += unquoted[ii];
  492     } else {
  493       result += unquoted[ii];
  494     }
  495   }
  496 
  497   return result;
  498 }
  499 
  500 /***** Actual matching and rewriting code *****/
  501 
  502 int RE::TryMatch(const StringPiece& text,
  503                  int startpos,
  504                  Anchor anchor,
  505                  bool empty_ok,
  506                  int *vec,
  507                  int vecsize) const {
  508   pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
  509   if (re == NULL) {
  510     //fprintf(stderr, "Matching against invalid re: %s\n", error_->c_str());
  511     return 0;
  512   }
  513 
  514   pcre_extra extra = { 0, 0, 0, 0, 0, 0, 0, 0 };
  515   if (options_.match_limit() > 0) {
  516     extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
  517     extra.match_limit = options_.match_limit();
  518   }
  519   if (options_.match_limit_recursion() > 0) {
  520     extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
  521     extra.match_limit_recursion = options_.match_limit_recursion();
  522   }
  523 
  524   // int options = 0;
  525   // Changed by PH as a result of bugzilla #1288
  526   int options = (options_.all_options() & PCRE_NO_UTF8_CHECK);
  527 
  528   if (anchor != UNANCHORED)
  529     options |= PCRE_ANCHORED;
  530   if (!empty_ok)
  531     options |= PCRE_NOTEMPTY;
  532 
  533   int rc = pcre_exec(re,              // The regular expression object
  534                      &extra,
  535                      (text.data() == NULL) ? "" : text.data(),
  536                      text.size(),
  537                      startpos,
  538                      options,
  539                      vec,
  540                      vecsize);
  541 
  542   // Handle errors
  543   if (rc == PCRE_ERROR_NOMATCH) {
  544     return 0;
  545   } else if (rc < 0) {
  546     //fprintf(stderr, "Unexpected return code: %d when matching '%s'\n",
  547     //        re, pattern_.c_str());
  548     return 0;
  549   } else if (rc == 0) {
  550     // pcre_exec() returns 0 as a special case when the number of
  551     // capturing subpatterns exceeds the size of the vector.
  552     // When this happens, there is a match and the output vector
  553     // is filled, but we miss out on the positions of the extra subpatterns.
  554     rc = vecsize / 2;
  555   }
  556 
  557   return rc;
  558 }
  559 
  560 bool RE::DoMatchImpl(const StringPiece& text,
  561                      Anchor anchor,
  562                      int* consumed,
  563                      const Arg* const* args,
  564                      int n,
  565                      int* vec,
  566                      int vecsize) const {
  567   assert((1 + n) * 3 <= vecsize);  // results + PCRE workspace
  568   int matches = TryMatch(text, 0, anchor, true, vec, vecsize);
  569   assert(matches >= 0);  // TryMatch never returns negatives
  570   if (matches == 0)
  571     return false;
  572 
  573   *consumed = vec[1];
  574 
  575   if (n == 0 || args == NULL) {
  576     // We are not interested in results
  577     return true;
  578   }
  579 
  580   if (NumberOfCapturingGroups() < n) {
  581     // RE has fewer capturing groups than number of arg pointers passed in
  582     return false;
  583   }
  584 
  585   // If we got here, we must have matched the whole pattern.
  586   // We do not need (can not do) any more checks on the value of 'matches' here
  587   // -- see the comment for TryMatch.
  588   for (int i = 0; i < n; i++) {
  589     const int start = vec[2*(i+1)];
  590     const int limit = vec[2*(i+1)+1];
  591     if (!args[i]->Parse(text.data() + start, limit-start)) {
  592       // TODO: Should we indicate what the error was?
  593       return false;
  594     }
  595   }
  596 
  597   return true;
  598 }
  599 
  600 bool RE::DoMatch(const StringPiece& text,
  601                  Anchor anchor,
  602                  int* consumed,
  603                  const Arg* const args[],
  604                  int n) const {
  605   assert(n >= 0);
  606   size_t const vecsize = (1 + n) * 3;  // results + PCRE workspace
  607                                        // (as for kVecSize)
  608   int space[21];   // use stack allocation for small vecsize (common case)
  609   int* vec = vecsize <= 21 ? space : new int[vecsize];
  610   bool retval = DoMatchImpl(text, anchor, consumed, args, n, vec, (int)vecsize);
  611   if (vec != space) delete [] vec;
  612   return retval;
  613 }
  614 
  615 bool RE::Rewrite(string *out, const StringPiece &rewrite,
  616                  const StringPiece &text, int *vec, int veclen) const {
  617   for (const char *s = rewrite.data(), *end = s + rewrite.size();
  618        s < end; s++) {
  619     int c = *s;
  620     if (c == '\\') {
  621       c = *++s;
  622       if (isdigit(c)) {
  623         int n = (c - '0');
  624         if (n >= veclen) {
  625           //fprintf(stderr, requested group %d in regexp %.*s\n",
  626           //        n, rewrite.size(), rewrite.data());
  627           return false;
  628         }
  629         int start = vec[2 * n];
  630         if (start >= 0)
  631           out->append(text.data() + start, vec[2 * n + 1] - start);
  632       } else if (c == '\\') {
  633         *out += '\\';
  634       } else {
  635         //fprintf(stderr, "invalid rewrite pattern: %.*s\n",
  636         //        rewrite.size(), rewrite.data());
  637         return false;
  638       }
  639     } else {
  640       *out += c;
  641     }
  642   }
  643   return true;
  644 }
  645 
  646 // Return the number of capturing subpatterns, or -1 if the
  647 // regexp wasn't valid on construction.
  648 int RE::NumberOfCapturingGroups() const {
  649   if (re_partial_ == NULL) return -1;
  650 
  651   int result;
  652   int pcre_retval = pcre_fullinfo(re_partial_,  // The regular expression object
  653                                   NULL,         // We did not study the pattern
  654                                   PCRE_INFO_CAPTURECOUNT,
  655                                   &result);
  656   assert(pcre_retval == 0);
  657   return result;
  658 }
  659 
  660 /***** Parsers for various types *****/
  661 
  662 bool Arg::parse_null(const char* str, int n, void* dest) {
  663   (void)str;
  664   (void)n;
  665   // We fail if somebody asked us to store into a non-NULL void* pointer
  666   return (dest == NULL);
  667 }
  668 
  669 bool Arg::parse_string(const char* str, int n, void* dest) {
  670   if (dest == NULL) return true;
  671   reinterpret_cast<string*>(dest)->assign(str, n);
  672   return true;
  673 }
  674 
  675 bool Arg::parse_stringpiece(const char* str, int n, void* dest) {
  676   if (dest == NULL) return true;
  677   reinterpret_cast<StringPiece*>(dest)->set(str, n);
  678   return true;
  679 }
  680 
  681 bool Arg::parse_char(const char* str, int n, void* dest) {
  682   if (n != 1) return false;
  683   if (dest == NULL) return true;
  684   *(reinterpret_cast<char*>(dest)) = str[0];
  685   return true;
  686 }
  687 
  688 bool Arg::parse_uchar(const char* str, int n, void* dest) {
  689   if (n != 1) return false;
  690   if (dest == NULL) return true;
  691   *(reinterpret_cast<unsigned char*>(dest)) = str[0];
  692   return true;
  693 }
  694 
  695 // Largest number spec that we are willing to parse
  696 static const int kMaxNumberLength = 32;
  697 
  698 // REQUIRES "buf" must have length at least kMaxNumberLength+1
  699 // REQUIRES "n > 0"
  700 // Copies "str" into "buf" and null-terminates if necessary.
  701 // Returns one of:
  702 //      a. "str" if no termination is needed
  703 //      b. "buf" if the string was copied and null-terminated
  704 //      c. "" if the input was invalid and has no hope of being parsed
  705 static const char* TerminateNumber(char* buf, const char* str, int n) {
  706   if ((n > 0) && isspace(*str)) {
  707     // We are less forgiving than the strtoxxx() routines and do not
  708     // allow leading spaces.
  709     return "";
  710   }
  711 
  712   // See if the character right after the input text may potentially
  713   // look like a digit.
  714   if (isdigit(str[n]) ||
  715       ((str[n] >= 'a') && (str[n] <= 'f')) ||
  716       ((str[n] >= 'A') && (str[n] <= 'F'))) {
  717     if (n > kMaxNumberLength) return ""; // Input too big to be a valid number
  718     memcpy(buf, str, n);
  719     buf[n] = '\0';
  720     return buf;
  721   } else {
  722     // We can parse right out of the supplied string, so return it.
  723     return str;
  724   }
  725 }
  726 
  727 bool Arg::parse_long_radix(const char* str,
  728                            int n,
  729                            void* dest,
  730                            int radix) {
  731   if (n == 0) return false;
  732   char buf[kMaxNumberLength+1];
  733   str = TerminateNumber(buf, str, n);
  734   char* end;
  735   errno = 0;
  736   long r = strtol(str, &end, radix);
  737   if (end != str + n) return false;   // Leftover junk
  738   if (errno) return false;
  739   if (dest == NULL) return true;
  740   *(reinterpret_cast<long*>(dest)) = r;
  741   return true;
  742 }
  743 
  744 bool Arg::parse_ulong_radix(const char* str,
  745                             int n,
  746                             void* dest,
  747                             int radix) {
  748   if (n == 0) return false;
  749   char buf[kMaxNumberLength+1];
  750   str = TerminateNumber(buf, str, n);
  751   if (str[0] == '-') return false;    // strtoul() on a negative number?!
  752   char* end;
  753   errno = 0;
  754   unsigned long r = strtoul(str, &end, radix);
  755   if (end != str + n) return false;   // Leftover junk
  756   if (errno) return false;
  757   if (dest == NULL) return true;
  758   *(reinterpret_cast<unsigned long*>(dest)) = r;
  759   return true;
  760 }
  761 
  762 bool Arg::parse_short_radix(const char* str,
  763                             int n,
  764                             void* dest,
  765                             int radix) {
  766   long r;
  767   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
  768   if (r < SHRT_MIN || r > SHRT_MAX) return false;       // Out of range
  769   if (dest == NULL) return true;
  770   *(reinterpret_cast<short*>(dest)) = static_cast<short>(r);
  771   return true;
  772 }
  773 
  774 bool Arg::parse_ushort_radix(const char* str,
  775                              int n,
  776                              void* dest,
  777                              int radix) {
  778   unsigned long r;
  779   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
  780   if (r > USHRT_MAX) return false;                      // Out of range
  781   if (dest == NULL) return true;
  782   *(reinterpret_cast<unsigned short*>(dest)) = static_cast<unsigned short>(r);
  783   return true;
  784 }
  785 
  786 bool Arg::parse_int_radix(const char* str,
  787                           int n,
  788                           void* dest,
  789                           int radix) {
  790   long r;
  791   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
  792   if (r < INT_MIN || r > INT_MAX) return false;         // Out of range
  793   if (dest == NULL) return true;
  794   *(reinterpret_cast<int*>(dest)) = r;
  795   return true;
  796 }
  797 
  798 bool Arg::parse_uint_radix(const char* str,
  799                            int n,
  800                            void* dest,
  801                            int radix) {
  802   unsigned long r;
  803   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
  804   if (r > UINT_MAX) return false;                       // Out of range
  805   if (dest == NULL) return true;
  806   *(reinterpret_cast<unsigned int*>(dest)) = r;
  807   return true;
  808 }
  809 
  810 bool Arg::parse_longlong_radix(const char* str,
  811                                int n,
  812                                void* dest,
  813                                int radix) {
  814 #ifndef HAVE_LONG_LONG
  815   return false;
  816 #else
  817   if (n == 0) return false;
  818   char buf[kMaxNumberLength+1];
  819   str = TerminateNumber(buf, str, n);
  820   char* end;
  821   errno = 0;
  822 #if defined HAVE_STRTOQ
  823   long long r = strtoq(str, &end, radix);
  824 #elif defined HAVE_STRTOLL
  825   long long r = strtoll(str, &end, radix);
  826 #elif defined HAVE__STRTOI64
  827   long long r = _strtoi64(str, &end, radix);
  828 #elif defined HAVE_STRTOIMAX
  829   long long r = strtoimax(str, &end, radix);
  830 #else
  831 #error parse_longlong_radix: cannot convert input to a long-long
  832 #endif
  833   if (end != str + n) return false;   // Leftover junk
  834   if (errno) return false;
  835   if (dest == NULL) return true;
  836   *(reinterpret_cast<long long*>(dest)) = r;
  837   return true;
  838 #endif   /* HAVE_LONG_LONG */
  839 }
  840 
  841 bool Arg::parse_ulonglong_radix(const char* str,
  842                                 int n,
  843                                 void* dest,
  844                                 int radix) {
  845 #ifndef HAVE_UNSIGNED_LONG_LONG
  846   return false;
  847 #else
  848   if (n == 0) return false;
  849   char buf[kMaxNumberLength+1];
  850   str = TerminateNumber(buf, str, n);
  851   if (str[0] == '-') return false;    // strtoull() on a negative number?!
  852   char* end;
  853   errno = 0;
  854 #if defined HAVE_STRTOQ
  855   unsigned long long r = strtouq(str, &end, radix);
  856 #elif defined HAVE_STRTOLL
  857   unsigned long long r = strtoull(str, &end, radix);
  858 #elif defined HAVE__STRTOI64
  859   unsigned long long r = _strtoui64(str, &end, radix);
  860 #elif defined HAVE_STRTOIMAX
  861   unsigned long long r = strtoumax(str, &end, radix);
  862 #else
  863 #error parse_ulonglong_radix: cannot convert input to a long-long
  864 #endif
  865   if (end != str + n) return false;   // Leftover junk
  866   if (errno) return false;
  867   if (dest == NULL) return true;
  868   *(reinterpret_cast<unsigned long long*>(dest)) = r;
  869   return true;
  870 #endif   /* HAVE_UNSIGNED_LONG_LONG */
  871 }
  872 
  873 bool Arg::parse_double(const char* str, int n, void* dest) {
  874   if (n == 0) return false;
  875   static const int kMaxLength = 200;
  876   char buf[kMaxLength];
  877   if (n >= kMaxLength) return false;
  878   memcpy(buf, str, n);
  879   buf[n] = '\0';
  880   errno = 0;
  881   char* end;
  882   double r = strtod(buf, &end);
  883   if (end != buf + n) return false;   // Leftover junk
  884   if (errno) return false;
  885   if (dest == NULL) return true;
  886   *(reinterpret_cast<double*>(dest)) = r;
  887   return true;
  888 }
  889 
  890 bool Arg::parse_float(const char* str, int n, void* dest) {
  891   double r;
  892   if (!parse_double(str, n, &r)) return false;
  893   if (dest == NULL) return true;
  894   *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
  895   return true;
  896 }
  897 
  898 
  899 #define DEFINE_INTEGER_PARSERS(name)                                    \
  900   bool Arg::parse_##name(const char* str, int n, void* dest) {          \
  901     return parse_##name##_radix(str, n, dest, 10);                      \
  902   }                                                                     \
  903   bool Arg::parse_##name##_hex(const char* str, int n, void* dest) {    \
  904     return parse_##name##_radix(str, n, dest, 16);                      \
  905   }                                                                     \
  906   bool Arg::parse_##name##_octal(const char* str, int n, void* dest) {  \
  907     return parse_##name##_radix(str, n, dest, 8);                       \
  908   }                                                                     \
  909   bool Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
  910     return parse_##name##_radix(str, n, dest, 0);                       \
  911   }
  912 
  913 DEFINE_INTEGER_PARSERS(short)      /*                                   */
  914 DEFINE_INTEGER_PARSERS(ushort)     /*                                   */
  915 DEFINE_INTEGER_PARSERS(int)        /* Don't use semicolons after these  */
  916 DEFINE_INTEGER_PARSERS(uint)       /* statements because they can cause */
  917 DEFINE_INTEGER_PARSERS(long)       /* compiler warnings if the checking */
  918 DEFINE_INTEGER_PARSERS(ulong)      /* level is turned up high enough.   */
  919 DEFINE_INTEGER_PARSERS(longlong)   /*                                   */
  920 DEFINE_INTEGER_PARSERS(ulonglong)  /*                                   */
  921 
  922 #undef DEFINE_INTEGER_PARSERS
  923 
  924 }   // namespace pcrecpp