"Fossies" - the Fresh Open Source Software Archive

Member "flyspray-1.0-rc9/plugins/dokuwiki/inc/parser/lexer.php" (22 Apr 2019, 20304 Bytes) of package /linux/privat/flyspray-1.0-rc9.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) PHP syntax highlighting (style: standard), with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "lexer.php", see the Fossies "Dox" file reference documentation.

    1 <?php
    2 /**
    3 * Author Marcus Baker: http://www.lastcraft.com
    4 * Version adapted from Simple Test: http://sourceforge.net/projects/simpletest/
    5 * For an intro to the Lexer see:
    6 * http://www.phppatterns.com/index.php/article/articleview/106/1/2/
    7 * @author Marcus Baker
    8 * @package Doku
    9 * @subpackage Lexer
   10 * @version $Id$
   11 */
   12 
   /**
    * Installation base path: two directory levels above this file, always
    * ending in a slash. A DOKU_INC that is already defined is left untouched.
    */
   if (!defined('DOKU_INC')) {
       define('DOKU_INC', realpath(dirname(__FILE__) . '/../../') . '/');
   }

   /**#@+
    * Lexer state constant handed to the parser handlers together with
    * each chunk of text
    */
   define('DOKU_LEXER_ENTER', 1);
   define('DOKU_LEXER_MATCHED', 2);
   define('DOKU_LEXER_UNMATCHED', 3);
   define('DOKU_LEXER_EXIT', 4);
   define('DOKU_LEXER_SPECIAL', 5);
   /**#@-*/
   27 
   /**
    *    Compounded regular expression. Any of the contained patterns
    *    could match and when one does, its label is returned.
    *
    *    Patterns are stored in $_patterns exactly as they were added. The
    *    escaped, parenthesised forms used for matching are kept separately
    *    in $_compiled, so the compound regex can be rebuilt at any time
    *    (e.g. when a pattern is added after a match was already performed)
    *    without escaping a pattern twice.
    *    @package Doku
    *    @subpackage Lexer
    */
   class Doku_LexerParallelRegex {
       /** @var array raw patterns in the order they were added */
       var $_patterns;
       /** @var array label for each pattern, same index as $_patterns */
       var $_labels;
       /** @var string|null cached compound regex, null when it must be rebuilt */
       var $_regex;
       /** @var boolean true for case sensitive matching */
       var $_case;
       /** @var array escaped and parenthesised patterns, same index as $_patterns */
       var $_compiled;

       /**
        *    Constructor. Starts with no patterns.
        *    @param boolean $case    True for case sensitive, false
        *                            for insensitive.
        *    @access public
        */
       function __construct($case) {
           $this->_case = $case;
           $this->_patterns = array();
           $this->_labels = array();
           $this->_compiled = array();
           $this->_regex = null;
       }

       /**
        *    Adds a pattern with an optional label. A pattern that was
        *    already added with the identical label is ignored; this keeps
        *    the compound regex from growing when the same patterns are
        *    registered repeatedly.
        *    @param mixed $pattern       Perl style regex. Must be UTF-8
        *                                encoded. If its a string, the (, )
        *                                lose their meaning unless they
        *                                form part of a lookahead or
        *                                lookbehind assertation.
        *    @param string $label        Label of regex to be returned
        *                                on a match. Label must be ASCII
        *    @access public
        */
       function addPattern($pattern, $label = true) {
           // Look at every strictly equal occurrence: the same pattern may
           // legitimately be registered more than once with different labels.
           foreach (array_keys($this->_patterns, $pattern, true) as $index) {
               if ($this->_labels[$index] === $label) {
                   return; // exact duplicate, cached regex stays valid
               }
           }
           $this->_patterns[] = $pattern;
           $this->_labels[] = $label;
           $this->_regex = null; // force a rebuild on next use
       }

       /**
        *    Attempts to match all patterns at once against
        *    a string.
        *    @param string $subject      String to match against.
        *    @param string $match        First matched portion of
        *                                subject.
        *    @return mixed               Label of the matching pattern (true
        *                                if none applies), false on failure.
        *    @access public
        */
       function match($subject, &$match) {
           if (count($this->_patterns) == 0) {
               return false;
           }
           if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
               $match = "";
               return false;
           }

           $match = $matches[0];
           $size = count($matches);
           for ($i = 1; $i < $size; $i++) {
               // Compare against the empty string rather than testing
               // truthiness: a matched token of "0" is a real match.
               if ($matches[$i] !== '' && isset($this->_labels[$i - 1])) {
                   return $this->_labels[$i - 1];
               }
           }
           return true;
       }

       /**
        *    Attempts to split the string against all patterns at once
        *
        *    @param string $subject      String to match against.
        *    @param array $split         The split result: array containing, pre-match, match & post-match strings
        *    @return mixed               Label of the matching pattern (true
        *                                if it has none), false on failure.
        *    @access public
        *
        *    @author Christopher Smith <chris@jalakai.co.uk>
        */
       function explode($subject, &$split) {
           if (count($this->_patterns) == 0) {
               return false;
           }

           if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
               $split = array($subject, "", "");
               return false;
           }

           // preg_match() drops trailing groups that did not take part in
           // the match, so the last entry belongs to the winning pattern.
           $idx = count($matches)-2;

           // The compiled pattern is wrapped in ( ), which PCRE accepts as
           // a delimiter pair, so only the flags have to be appended.
           list($pre, $post) = preg_split($this->_compiled[$idx].$this->_getPerlMatchingFlags(), $subject, 2);

           $split = array($pre, $matches[0], $post);
           return isset($this->_labels[$idx]) ? $this->_labels[$idx] : true;
       }

       /**
        *    Compounds the patterns into a single
        *    regular expression separated with the
        *    "or" operator. Caches the regex.
        *    Will automatically escape (, ) and / tokens.
        *    The raw patterns in $_patterns are left unchanged; the
        *    processed forms are written to $_compiled.
        *    @return string        The compound regex including delimiters
        *                          and flags.
        *    @access private
        */
       function _getCompoundedRegex() {
           if ($this->_regex == null) {
               $this->_compiled = array();
               $cnt = count($this->_patterns);
               for ($i = 0; $i < $cnt; $i++) {

                   // Replace option settings, lookaheads / lookbehinds and
                   // non-capturing groups with a marker so that their
                   // parentheses survive the quoting step below
                   $m = "\1\1";
                   $pattern = preg_replace(
                           array (
                               '/\(\?(i|m|s|x|U)\)/U',
                               '/\(\?(\-[i|m|s|x|U])\)/U',
                               '/\(\?\=(.*)\)/sU',
                               '/\(\?\!(.*)\)/sU',
                               '/\(\?\<\=(.*)\)/sU',
                               '/\(\?\<\!(.*)\)/sU',
                               '/\(\?\:(.*)\)/sU',
                           ),
                           array (
                               $m.'SO:\\1'.$m,
                               $m.'SOR:\\1'.$m,
                               $m.'LA:IS:\\1'.$m,
                               $m.'LA:NOT:\\1'.$m,
                               $m.'LB:IS:\\1'.$m,
                               $m.'LB:NOT:\\1'.$m,
                               $m.'GRP:\\1'.$m,
                           ),
                           $this->_patterns[$i]
                       );
                   // Quote the rest
                   $pattern = str_replace(
                       array('/', '(', ')'),
                       array('\/', '\(', '\)'),
                       $pattern
                       );

                   // Restore lookaheads / lookbehinds
                   $pattern = preg_replace(
                           array (
                               '/'.$m.'SO:(.{1})'.$m.'/',
                               '/'.$m.'SOR:(.{2})'.$m.'/',
                               '/'.$m.'LA:IS:(.*)'.$m.'/sU',
                               '/'.$m.'LA:NOT:(.*)'.$m.'/sU',
                               '/'.$m.'LB:IS:(.*)'.$m.'/sU',
                               '/'.$m.'LB:NOT:(.*)'.$m.'/sU',
                               '/'.$m.'GRP:(.*)'.$m.'/sU',
                           ),
                           array (
                               '(?\\1)',
                               '(?\\1)',
                               '(?=\\1)',
                               '(?!\\1)',
                               '(?<=\\1)',
                               '(?<!\\1)',
                               '(?:\\1)',
                           ),
                           $pattern
                   );

                   // One capturing group per pattern; its position in the
                   // match result identifies which pattern matched
                   $this->_compiled[$i] = '('.$pattern.')';
               }
               $this->_regex = "/" . implode("|", $this->_compiled) . "/" . $this->_getPerlMatchingFlags();
           }
           return $this->_regex;
       }

       /**
        *    Accessor for perl regex mode flags to use.
        *    @return string       Perl regex flags.
        *    @access private
        */
       function _getPerlMatchingFlags() {
           return ($this->_case ? "msS" : "msSi");
       }
   }
  223 
  /**
   *    Stack of lexer states (modes). The state on top of the
   *    stack is the current one; the start state at the bottom
   *    is never removed.
   *    @package Doku
   *    @subpackage Lexer
   */
  class Doku_LexerStateStack {
      var $_stack;

      /**
       *    Constructor. The stack initially holds only the start state.
       *    @param string $start        Starting state name.
       *    @access public
       */
      function __construct($start) {
          $this->_stack = array();
          $this->_stack[] = $start;
      }

      /**
       *    Returns the state on top of the stack.
       *    @return string       State.
       *    @access public
       */
      function getCurrent() {
          $top = count($this->_stack) - 1;
          return $this->_stack[$top];
      }

      /**
       *    Pushes a state, making it the current one.
       *    @param string $state        New state.
       *    @access public
       */
      function enter($state) {
          $this->_stack[] = $state;
      }

      /**
       *    Pops the current state so the previous one
       *    becomes current again.
       *    @return boolean    False if only one state is left
       *                       and nothing was removed.
       *    @access public
       */
      function leave() {
          if (count($this->_stack) != 1) {
              array_pop($this->_stack);
              return true;
          }
          return false;
      }
  }
  275 
  /**
   *    Accepts text and breaks it into tokens.
   *    Some optimisation to make sure the
   *    content is only scanned by the PHP regex
   *    parser once. Lexer modes must not start
   *    with leading underscores.
   *    @package Doku
   *    @subpackage Lexer
   */
  class Doku_Lexer {
      /** @var array mode name => Doku_LexerParallelRegex */
      var $_regexes;
      /** @var object receiver of the token callbacks */
      var $_parser;
      /** @var Doku_LexerStateStack stack of active modes */
      var $_mode;
      /** @var array mode name => name of the handler method to call instead */
      var $_mode_handlers;
      /** @var boolean true for case sensitive matching */
      var $_case;

      /**
       *    Sets up the lexer in case insensitive matching
       *    by default.
       *    @param Doku_Parser $parser  Handling strategy by
       *                                    reference.
       *    @param string $start            Starting handler.
       *    @param boolean $case            True for case sensitive.
       *    @access public
       */
      function __construct(&$parser, $start = "accept", $case = false) {
          $this->_case = $case;
          $this->_regexes = array();
          $this->_parser = &$parser;
          $this->_mode = new Doku_LexerStateStack($start);
          $this->_mode_handlers = array();
      }

      /**
       *    Adds a token search pattern for a particular
       *    parsing mode. The pattern does not change the
       *    current mode.
       *    @param string $pattern      Perl style regex, but ( and )
       *                                lose the usual meaning.
       *    @param string $mode         Should only apply this
       *                                pattern when dealing with
       *                                this type of input.
       *    @access public
       */
      function addPattern($pattern, $mode = "accept") {
          // "true" is the label Doku_LexerParallelRegex uses for
          // patterns that do not change the mode
          $this->_addModePattern($pattern, $mode, true);
      }

      /**
       *    Adds a pattern that will enter a new parsing
       *    mode. Useful for entering parenthesis, strings,
       *    tags, etc.
       *    @param string $pattern      Perl style regex, but ( and )
       *                                lose the usual meaning.
       *    @param string $mode         Should only apply this
       *                                pattern when dealing with
       *                                this type of input.
       *    @param string $new_mode     Change parsing to this new
       *                                nested mode.
       *    @access public
       */
      function addEntryPattern($pattern, $mode, $new_mode) {
          $this->_addModePattern($pattern, $mode, $new_mode);
      }

      /**
       *    Adds a pattern that will exit the current mode
       *    and re-enter the previous one.
       *    @param string $pattern      Perl style regex, but ( and )
       *                                lose the usual meaning.
       *    @param string $mode         Mode to leave.
       *    @access public
       */
      function addExitPattern($pattern, $mode) {
          $this->_addModePattern($pattern, $mode, "__exit");
      }

      /**
       *    Adds a pattern that has a special mode. Acts as an entry
       *    and exit pattern in one go, effectively calling a special
       *    parser handler for this token only.
       *    @param string $pattern      Perl style regex, but ( and )
       *                                lose the usual meaning.
       *    @param string $mode         Should only apply this
       *                                pattern when dealing with
       *                                this type of input.
       *    @param string $special      Use this mode for this one token.
       *    @access public
       */
      function addSpecialPattern($pattern, $mode, $special) {
          $this->_addModePattern($pattern, $mode, "_$special");
      }

      /**
       *    Registers a pattern with the compound regex of a mode,
       *    creating that regex on first use.
       *    @param string $pattern      Perl style regex.
       *    @param string $mode         Mode the pattern belongs to.
       *    @param mixed $label         Label returned when the pattern
       *                                matches: true, a mode name,
       *                                "__exit" or "_" plus a mode name.
       *    @access private
       */
      function _addModePattern($pattern, $mode, $label) {
          if (! isset($this->_regexes[$mode])) {
              $this->_regexes[$mode] = new Doku_LexerParallelRegex($this->_case);
          }
          $this->_regexes[$mode]->addPattern($pattern, $label);
      }

      /**
       *    Adds a mapping from a mode to another handler.
       *    @param string $mode        Mode to be remapped.
       *    @param string $handler     New target handler.
       *    @access public
       */
      function mapHandler($mode, $handler) {
          $this->_mode_handlers[$mode] = $handler;
      }

      /**
       *    Splits the page text into tokens. Will fail
       *    if the handlers report an error or if no
       *    content is consumed. If successful then each
       *    unparsed and parsed token invokes a call to the
       *    held listener.
       *    @param string $raw        Raw text to tokenise.
       *    @return boolean           True on success, else false.
       *    @access public
       */
      function parse($raw) {
          if (! isset($this->_parser)) {
              return false;
          }
          $initialLength = strlen($raw);
          $length = $initialLength;
          $pos = 0;
          // _reduce() eats the consumed part off the front of $raw, so
          // positions are derived from how much text is left
          while (is_array($parsed = $this->_reduce($raw))) {
              list($unmatched, $matched, $mode) = $parsed;
              $currentLength = strlen($raw);
              $matchPos = $initialLength - $currentLength - strlen($matched);
              if (! $this->_dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                  return false;
              }
              // nothing was consumed: bail out instead of looping forever
              if ($currentLength == $length) {
                  return false;
              }
              $length = $currentLength;
              $pos = $initialLength - $currentLength;
          }
          if (!$parsed) {
              return false;
          }
          // whatever is left matched no pattern at all
          return $this->_invokeParser($raw, DOKU_LEXER_UNMATCHED, $pos);
      }

      /**
       *    Sends the matched token and any leading unmatched
       *    text to the parser changing the lexer to a new
       *    mode if one is listed.
       *    @param string $unmatched    Unmatched leading portion.
       *    @param string $matched      Actual token match.
       *    @param mixed $mode          Label of the matched pattern: a
       *                                mode name to enter, "__exit",
       *                                "_" plus a special mode name, or
       *                                a non-string for no mode change.
       *    @param int $initialPos      Byte index of the unmatched
       *                                portion in the raw document.
       *    @param int $matchPos        Byte index of the matched token
       *                                in the raw document.
       *    @return boolean             False if there was any error
       *                                from the parser.
       *    @access private
       */
      function _dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos) {
          // $mode carries no default value: it is followed by required
          // parameters, so a default could never be used and only
          // triggers a deprecation notice on PHP 8.
          if (! $this->_invokeParser($unmatched, DOKU_LEXER_UNMATCHED, $initialPos) ){
              return false;
          }
          if ($this->_isModeEnd($mode)) {
              if (! $this->_invokeParser($matched, DOKU_LEXER_EXIT, $matchPos)) {
                  return false;
              }
              return $this->_mode->leave();
          }
          if ($this->_isSpecialMode($mode)) {
              // enter the special mode for this single token only
              $this->_mode->enter($this->_decodeSpecial($mode));
              if (! $this->_invokeParser($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                  return false;
              }
              return $this->_mode->leave();
          }
          if (is_string($mode)) {
              $this->_mode->enter($mode);
              return $this->_invokeParser($matched, DOKU_LEXER_ENTER, $matchPos);
          }
          return $this->_invokeParser($matched, DOKU_LEXER_MATCHED, $matchPos);
      }

      /**
       *    Tests to see if the new mode is actually to leave
       *    the current mode and pop an item from the matching
       *    mode stack.
       *    @param mixed $mode     Mode to test.
       *    @return boolean        True if this is the exit mode.
       *    @access private
       */
      function _isModeEnd($mode) {
          return ($mode === "__exit");
      }

      /**
       *    Test to see if the mode is one where this mode
       *    is entered for this token only and automatically
       *    leaves immediately afterwards.
       *    @param mixed $mode     Mode to test.
       *    @return boolean        True if this is a special mode.
       *    @access private
       */
      function _isSpecialMode($mode) {
          // unlabelled patterns report boolean true, which is never special
          return is_string($mode) && (strncmp($mode, "_", 1) == 0);
      }

      /**
       *    Strips the magic underscore marking single token
       *    modes.
       *    @param string $mode    Mode to decode.
       *    @return string         Underlying mode name.
       *    @access private
       */
      function _decodeSpecial($mode) {
          return substr($mode, 1);
      }

      /**
       *    Calls the parser method named after the current
       *    mode. Empty content will be ignored. The lexer
       *    has a parser handler for each mode in the lexer.
       *    @param string $content        Text parsed.
       *    @param int $is_match          One of the DOKU_LEXER_* state
       *                                  constants describing the token.
       *    @param int $pos         Current byte index location in raw doc
       *                                thats being parsed
       *    @return mixed                 True for ignored empty content,
       *                                  otherwise whatever the parser
       *                                  handler returns.
       *    @access private
       */
      function _invokeParser($content, $is_match, $pos) {
          if (($content === "") || ($content === false)) {
              return true;
          }
          $handler = $this->_mode->getCurrent();
          if (isset($this->_mode_handlers[$handler])) {
              $handler = $this->_mode_handlers[$handler];
          }

          // modes starting with plugin_ are all handled by the same
          // handler but with an additional parameter
          if(substr($handler,0,7)=='plugin_'){
              list($handler,$plugin) = explode('_',$handler,2);
              return $this->_parser->$handler($content, $is_match, $pos, $plugin);
          }

          return $this->_parser->$handler($content, $is_match, $pos);
      }

      /**
       *    Tries to match a chunk of text and if successful
       *    removes the recognised chunk and any leading
       *    unparsed data. Empty strings will not be matched.
       *    @param string $raw         The subject to parse. This is the
       *                               content that will be eaten.
       *    @return array              Three item list of unparsed
       *                               content followed by the
       *                               recognised token and finally the
       *                               action the parser is to take.
       *                               True if no match, false if there
       *                               is a parsing error.
       *    @access private
       */
      function _reduce(&$raw) {
          if (! isset($this->_regexes[$this->_mode->getCurrent()])) {
              return false;
          }
          if ($raw === "") {
              return true;
          }
          if ($action = $this->_regexes[$this->_mode->getCurrent()]->explode($raw, $split)) {
              // keep only the text after the match for the next round
              list($unparsed, $match, $raw) = $split;
              return array($unparsed, $match, $action);
          }
          return true;
      }
  }
  558 
  /**
   * Puts a backslash in front of the regex characters
   * \ . + * ? [ ^ ] $ { } = ! < > | :
   * while leaving (, ) and / untouched.
   *
   * @param string|array $str  Text to escape
   * @return string|array      Escaped text
   */
  function Doku_Lexer_Escape($str) {
      // One pass over a character class; $0 is the character that matched.
      return preg_replace('/[\\\\.+*?\\[\\^\\]${}=!<>|:]/', '\\\\$0', $str);
  }
  606 
  607 //Setup VIM: ex: et ts=4 enc=utf-8 :