"Fossies" - the Fresh Open Source Software Archive

Member "zim-0.71.1/zim/tokenparser.py" (21 May 2019, 6440 Bytes) of package /linux/privat/zim-0.71.1.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Python source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively, you can view or download the uninterpreted source code file here. For more information about "tokenparser.py" see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 0.69.1_vs_0.70.

    1 
    2 # Copyright 2016-2017 Jaap Karssenberg <jaap.karssenberg@gmail.com>
    3 
    4 # Tokens come in 3 variants
    5 #   tuple((tag, attrib))  e.g. (HEADING, {'level': 3})
    6 #   tuple((TEXT, string))   e.g. (TEXT, 'Some heading ...')
    7 #   tuple((END, tag))     e.g. (END, HEADING)
    8 #
    9 # Extra constraint is parsing must be per line, therefore a TEXT
   10 # item cannot contain newline other than at the end of the string
   11 
   12 
   13 from zim.parser import Builder
   14 from zim.formats import NUMBEREDLIST, BULLETLIST, LISTITEM, PARAGRAPH
   15 
   16 TEXT = 'T'
   17 END = '/'
   18 
   19 class TokenBuilder(Builder):
   20 
   21     def __init__(self):
   22         self._tokens = []
   23 
   24     @property
   25     def tokens(self):
   26         return topLevelLists(self._tokens)
   27 
   28     def start(self, tag, attrib=None):
   29         self._tokens.append((tag, attrib))
   30 
   31     def text(self, text):
   32         if '\n' in text:
   33             for line in text.splitlines(True):
   34                 self._tokens.append((TEXT, line))
   35         else:
   36             self._tokens.append((TEXT, text))
   37 
   38     def end(self, tag):
   39         self._tokens.append((END, tag))
   40 
   41     def append(self, tag, attrib=None, text=None):
   42         if text:
   43             if '\n' in text:
   44                 self._tokens.append((tag, attrib))
   45                 for line in text.splitlines(True):
   46                     self._tokens.append((TEXT, line))
   47                 self._tokens.append((END, tag))
   48             else:
   49                 self._tokens.extend([
   50                     (tag, attrib),
   51                     (TEXT, text),
   52                     (END, tag)
   53                 ])
   54         else:
   55             self._tokens.extend([
   56                 (tag, attrib),
   57                 (END, tag)
   58             ])
   59 
   60 
   61 class TokenParser(object):
   62 
   63     def __init__(self, builder):
   64         self.builder = builder
   65 
   66     def parse(self, tokens):
   67         for t in reverseTopLevelLists(tokens):
   68             if t[0] == END:
   69                 self.builder.end(t[1])
   70             elif t[0] == TEXT:
   71                 self.builder.text(t[1])
   72             else:
   73                 self.builder.start(*t)
   74 
   75 
   76 class TokenVisitor(object):
   77     # Adaptor for the visit interface
   78 
   79     def __init__(self, tokens):
   80         self.tokens = tokens
   81 
   82     def visit(self, builder):
   83         parser = TokenParser(builder)
   84         builder.parse(self.tokens)
   85 
   86 
   87 def skip_to_end_token(token_iter, end_token):
   88     eol = 0
   89     nesting = 0
   90     for t in token_iter:
   91         if t[0] == end_token:
   92             nesting += 1
   93         elif t == (END, end_token):
   94             nesting -= 1
   95             if nesting < 0:
   96                 break
   97         elif t[0] == TEXT:
   98             eol += t[1].count('\n')
   99 
  100     return eol
  101 
  102 
def topLevelLists(tokens):
    """Transform a token stream so that lists live at the top level.

    Make tree more HTML-like:
    - Move UL / OL to top level, outside P
    - Put sub-UL / sub-OL inside LI element
    - Make indent blocks their own para

    <p><ul>...</ul></p> --> <ul>...</ul>
    <p><ul>...</ul>.. --> <ul>...</ul><p>..
    ..<ul>...</ul>.. --> ..</p><ul>...</ul><p>..
    ..<ul>...</ul></p> --> ..</p><ul>...</ul>
    """

    tokeniter = iter(tokens)
    newtokens = []
    for t in tokeniter:
        if t[0] in (NUMBEREDLIST, BULLETLIST):
            # Close or drop the enclosing paragraph before the list.
            # NOTE(review): assumes newtokens is non-empty here, i.e. a
            # list never opens the stream -- presumably guaranteed by a
            # root open-tag emitted first; confirm against callers
            if newtokens[-1][0] == PARAGRAPH:
                # Paragraph opened immediately before the list: drop it
                newtokens.pop()
            else:
                # List starts mid-paragraph: split the paragraph here
                newtokens.append((END, PARAGRAPH))

            newtokens.append(t)
            # Consume the whole list, nesting sub-lists into list items
            newtokens.extend(_changeList(tokeniter))

            nexttoken = next(tokeniter)
            while nexttoken[0] in (BULLETLIST, NUMBEREDLIST):
                # edge case due to messed up indenting: jumping back to
                # lower level than start of list will cause new list
                newtokens.append(nexttoken)
                newtokens.extend(_changeList(tokeniter))
                nexttoken = next(tokeniter)

            assert not (nexttoken[0] == END and nexttoken[1] in (BULLETLIST, NUMBEREDLIST))

            if nexttoken == (END, PARAGRAPH):
                # Paragraph ended with the list: nothing left to re-open
                pass
            else:
                # Re-open the paragraph for the content after the list
                newtokens.append((PARAGRAPH, None))
                newtokens.append(nexttoken)
        else:
            newtokens.append(t)

    return newtokens
  146 
def _changeList(tokeniter):
    # </li><ul>...</ul> --> <ul>...</ul></li>
    # Consume tokens up to and including the (END, list-tag) that closes
    # the current list, moving any nested sub-list inside the LISTITEM
    # that precedes it.
    newtokens = []
    for t in tokeniter:
        if t[0] in (NUMBEREDLIST, BULLETLIST):
            if newtokens:
                # Pull back the (END, LISTITEM) just emitted so the
                # sub-list ends up nested inside that list item
                listend = newtokens.pop()
                if not listend == (END, LISTITEM):
                    raise AssertionError
                newtokens.append(t)
                newtokens.extend(_changeList(tokeniter)) # recurs
                newtokens.append(listend)
            else:
                # edge case, list skipped a level without LISTITEM -- remove
                # one nesting level by recursing while dropping start and end
                newtokens.extend(_changeList(tokeniter)) # recurs
                if not newtokens.pop() == (END, t[0]):
                    raise AssertionError
        else:
            newtokens.append(t)

        # Stop once the list we were called for is closed
        if t[0] == END and t[1] in (NUMBEREDLIST, BULLETLIST):
            break

    return newtokens
  172 
  173 
def reverseTopLevelLists(tokens):
    # Undo effect of topLevelLists()
    #
    # <br><ul>...</ul><br> --> <p><ul>...</ul></p>
    # <br><ul>...</ul><p>.. --> <p><ul>...</ul>..
    # ..</p><ul>...</ul><p>.. ..<ul>...</ul>..
    # ..</p><ul>...</ul><br> --> ..<ul>...</ul></p>
    #

    def isbr(token):
        # True for a whitespace-only TEXT token containing a newline
        # NOTE(review): defined but not referenced below -- presumably
        # left over from an earlier version of this transform
        return token[0] == TEXT and token[1].isspace() and '\n' in token[1]

    tokeniter = iter(tokens)
    newtokens = []
    for t in tokeniter:
        if t[0] in (NUMBEREDLIST, BULLETLIST):
            if newtokens and newtokens[-1] == (END, PARAGRAPH):
                # Paragraph closed right before the list: merge list
                # back into it by dropping the close token
                newtokens.pop()
            else:
                # Stand-alone list: wrap it in a new paragraph
                newtokens.append((PARAGRAPH, None))

            newtokens.append(t)
            # Consume the list, moving nested sub-lists back out of
            # their LISTITEM elements
            newtokens.extend(_reverseChangeList(tokeniter))

            nexttoken = next(tokeniter)
            # Two consecutive lists should not occur after the forward
            # transform; treat it as corrupt input
            if nexttoken[0] in (BULLETLIST, NUMBEREDLIST) \
            or nexttoken[0] == END and nexttoken[1] in (BULLETLIST, NUMBEREDLIST):
                raise AssertionError

            if nexttoken[0] == PARAGRAPH:
                # A new paragraph follows: current one absorbs the list
                pass
            else:
                # Close the wrapping paragraph after the list
                newtokens.append((END, PARAGRAPH))
                newtokens.append(nexttoken)
        else:
            newtokens.append(t)

    return newtokens
  212 
  213 
def _reverseChangeList(tokeniter):
    # <ul>...</ul></li> --> </li><ul>...</ul>
    # Consume tokens up to and including the (END, list-tag) that closes
    # the current list, moving nested sub-lists back outside of their
    # enclosing LISTITEM element (inverse of _changeList()).
    newtokens = []
    for t in tokeniter:
        if t[0] in (NUMBEREDLIST, BULLETLIST):
            listtokens = _reverseChangeList(tokeniter) # recurs
            # The sub-list must be immediately followed by the close of
            # the LISTITEM it was nested in; emit that close first
            liend = next(tokeniter)
            if not liend == (END, LISTITEM):
                raise AssertionError
            newtokens.append(liend)
            newtokens.append(t)
            newtokens.extend(listtokens)
        else:
            newtokens.append(t)

        # Stop once the list we were called for is closed
        if t[0] == END and t[1] in (NUMBEREDLIST, BULLETLIST):
            break

    return newtokens
  233 
  234 
  235 def testTokenStream(token_iter):
  236     nesting = []
  237     for t in token_iter:
  238         assert isinstance(t, tuple) and len(t) == 2, 'Malformed token'
  239         if t[0] == END:
  240             assert nesting[-1] == t[1], 'Got /%s, expected /%s' % (t[1], nesting[-1])
  241             nesting.pop()
  242         elif t[0] == TEXT:
  243             assert isinstance(t[1], str), 'Wrong type for text'
  244             assert not '\n' in t[1][:-1], 'Text token should not cross line break: %r' % (t,)
  245         else:
  246             assert t[1] is None or isinstance(t[1], dict), 'Wrong type for attributes'
  247 
  248             if t[0] in (BULLETLIST, NUMBEREDLIST):
  249                 assert PARAGRAPH not in nesting, 'Lists should not appear inside paragraphs'
  250             elif t[0] == PARAGRAPH:
  251                 assert len(nesting) == 1, 'Paragraphs should only appear in top level - got %r' % nesting
  252             # TODO more semantic rules
  253 
  254             nesting.append(t[0])
  255 
  256     assert len(nesting) == 0, 'Open tags: %r' % nesting