"Fossies" - the Fresh Open Source Software Archive

Member "roundup-2.0.0/roundup/cgi/TAL/HTMLParser.py" (26 Aug 2019, 13853 Bytes) of package /linux/www/roundup-2.0.0.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML format using (guessed) Python source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively, you can view or download the uninterpreted source code file here. See also the latest Fossies "Diffs" side-by-side code changes report for "HTMLParser.py": 1.6.1_vs_2.0.0.

    1 """A parser for HTML and XHTML."""
    2 
    3 # This file is based on sgmllib.py, but the API is slightly different.
    4 
    5 # XXX There should be a way to distinguish between PCDATA (parsed
    6 # character data -- the normal case), RCDATA (replaceable character
    7 # data -- only char and entity references and end tags are special)
    8 # and CDATA (character data -- only end tags are special).
    9 
   10 
   11 from . import markupbase
   12 import re
   13 
# Regular expressions used for parsing

# Scanner patterns: HTMLParser.goahead() searches with whichever of these
# two is currently stored in self.interesting.
# Normal mode: the next '&' or '<'.
interesting_normal = re.compile('[&<]')
# CDATA mode: the next '</', or a '<' that is the last character of the
# buffer.
interesting_cdata = re.compile(r'<(/|\Z)')
# '&' followed by a letter or '#': text that may be the beginning of an
# entity or character reference.
incomplete = re.compile('&[a-zA-Z#]')

# Both reference patterns also consume the one character that follows
# the reference; goahead() steps back over it unless it is ';'.
# Group 1 of entityref is the entity name.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Decimal ('&#123') or hexadecimal ('&#x7b') character reference.
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

# '<' immediately followed by a letter: beginning of a start tag.
starttagopen = re.compile('<[a-zA-Z]')
# Terminator searched for by parse_pi().
piclose = re.compile('>')
# '</': beginning of an end tag.
endtagopen = re.compile('</')
# End of a comment: '--', optional whitespace, '>'.
commentclose = re.compile(r'--\s*>')
# A tag name.
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# One attribute: group 1 is the name, group 2 the optional '=value' part
# (including the '='), group 3 the value with any surrounding quotes.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')

# Matches a start tag from '<' through its attributes and any trailing
# whitespace, but not the closing '>' or '/>'; used by
# check_for_whole_start_tag() to find where the tag should end.
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
# Terminator searched for by parse_endtag().
endendtag = re.compile(r'>')
# A complete end tag; group 1 is the tag name.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
   48 
   49 
   50 class HTMLParseError(BaseException):
   51     """Exception raised for all parse errors."""
   52 
   53     def __init__(self, msg, position=(None, None)):
   54         assert msg
   55         self.msg = msg
   56         self.lineno = position[0]
   57         self.offset = position[1]
   58 
   59     def __str__(self):
   60         result = self.msg
   61         if self.lineno is not None:
   62             result = result + ", at line %d" % self.lineno
   63         if self.offset is not None:
   64             result = result + ", column %d" % (self.offset + 1)
   65         return result
   66 
   67 
   68 def _contains_at(s, sub, pos):
   69     return s[pos:pos+len(sub)] == sub
   70 
   71 
   72 class HTMLParser(markupbase.ParserBase):
   73     """Find tags and other markup and call handler functions.
   74 
   75     Usage:
   76         p = HTMLParser()
   77         p.feed(data)
   78         ...
   79         p.close()
   80 
   81     Start tags are handled by calling self.handle_starttag() or
   82     self.handle_startendtag(); end tags by self.handle_endtag().  The
   83     data between tags is passed from the parser to the derived class
   84     by calling self.handle_data() with the data as argument (the data
   85     may be split up in arbitrary chunks).  Entity references are
   86     passed by calling self.handle_entityref() with the entity
   87     reference as the argument.  Numeric character references are
   88     passed to self.handle_charref() with the string containing the
   89     reference as the argument.
   90     """
   91 
   92     CDATA_CONTENT_ELEMENTS = ("script", "style")
   93 
   94 
   95     def __init__(self):
   96         """Initialize and reset this instance."""
   97         self.reset()
   98 
   99     def reset(self):
  100         """Reset this instance.  Loses all unprocessed data."""
  101         self.rawdata = ''
  102         self.stack = []
  103         self.lasttag = '???'
  104         self.interesting = interesting_normal
  105         markupbase.ParserBase.reset(self)
  106 
  107     def feed(self, data):
  108         """Feed data to the parser.
  109 
  110         Call this as often as you want, with as little or as much text
  111         as you want (may include '\n').
  112         """
  113         self.rawdata = self.rawdata + data
  114         self.goahead(0)
  115 
  116     def close(self):
  117         """Handle any buffered data."""
  118         self.goahead(1)
  119 
  120     def error(self, message):
  121         raise HTMLParseError(message, self.getpos())
  122 
  123     __starttag_text = None
  124 
  125     def get_starttag_text(self):
  126         """Return full source of start tag: '<...>'."""
  127         return self.__starttag_text
  128 
  129     cdata_endtag = None
  130 
  131     def set_cdata_mode(self, endtag=None):
  132         self.cdata_endtag = endtag
  133         self.interesting = interesting_cdata
  134 
  135     def clear_cdata_mode(self):
  136         self.cdata_endtag = None
  137         self.interesting = interesting_normal
  138 
  139     # Internal -- handle data as far as reasonable.  May leave state
  140     # and data to be processed by a subsequent call.  If 'end' is
  141     # true, force handling all data as if followed by EOF marker.
  142     def goahead(self, end):
  143         rawdata = self.rawdata
  144         i = 0
  145         n = len(rawdata)
  146         while i < n:
  147             match = self.interesting.search(rawdata, i) # < or &
  148             if match:
  149                 j = match.start()
  150             else:
  151                 j = n
  152             if i < j: self.handle_data(rawdata[i:j])
  153             i = self.updatepos(i, j)
  154             if i == n: break
  155             if rawdata[i] == '<':
  156                 if starttagopen.match(rawdata, i): # < + letter
  157                     k = self.parse_starttag(i)
  158                 elif endtagopen.match(rawdata, i): # </
  159                     k = self.parse_endtag(i)
  160                 elif _contains_at(rawdata, "<!--", i): # <!--
  161                     k = self.parse_comment(i)
  162                 elif _contains_at(rawdata, "<!", i): # <!
  163                     k = self.parse_declaration(i)
  164                 elif _contains_at(rawdata, "<?", i): # <?
  165                     k = self.parse_pi(i)
  166                 elif _contains_at(rawdata, "<?", i): # <!
  167                     k = self.parse_declaration(i)
  168                 elif (i + 1) < n:
  169                     self.handle_data("<")
  170                     k = i + 1
  171                 else:
  172                     break
  173                 if k < 0:
  174                     if end:
  175                         self.error("EOF in middle of construct")
  176                     break
  177                 i = self.updatepos(i, k)
  178             elif rawdata[i:i+2] == "&#":
  179                 match = charref.match(rawdata, i)
  180                 if match:
  181                     name = match.group()[2:-1]
  182                     self.handle_charref(name)
  183                     k = match.end()
  184                     if rawdata[k-1] != ';':
  185                         k = k - 1
  186                     i = self.updatepos(i, k)
  187                     continue
  188                 else:
  189                     break
  190             elif rawdata[i] == '&':
  191                 match = entityref.match(rawdata, i)
  192                 if match:
  193                     name = match.group(1)
  194                     self.handle_entityref(name)
  195                     k = match.end()
  196                     if rawdata[k-1] != ';':
  197                         k = k - 1
  198                     i = self.updatepos(i, k)
  199                     continue
  200                 match = incomplete.match(rawdata, i)
  201                 if match:
  202                     # match.group() will contain at least 2 chars
  203                     rest = rawdata[i:]
  204                     if end and match.group() == rest:
  205                         self.error("EOF in middle of entity or char ref")
  206                     # incomplete
  207                     break
  208                 elif (i + 1) < n:
  209                     # not the end of the buffer, and can't be confused
  210                     # with some other construct
  211                     self.handle_data("&")
  212                     i = self.updatepos(i, i + 1)
  213                 else:
  214                     break
  215             else:
  216                 assert 0, "interesting.search() lied"
  217         # end while
  218         if end and i < n:
  219             self.handle_data(rawdata[i:n])
  220             i = self.updatepos(i, n)
  221         self.rawdata = rawdata[i:]
  222 
  223     # Internal -- parse comment, return end or -1 if not terminated
  224     def parse_comment(self, i, report=1):
  225         rawdata = self.rawdata
  226         assert rawdata[i:i+4] == '<!--', 'unexpected call to parse_comment()'
  227         match = commentclose.search(rawdata, i+4)
  228         if not match:
  229             return -1
  230         if report:
  231             j = match.start()
  232             self.handle_comment(rawdata[i+4: j])
  233         j = match.end()
  234         return j
  235 
  236     # Internal -- parse processing instr, return end or -1 if not terminated
  237     def parse_pi(self, i):
  238         rawdata = self.rawdata
  239         assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
  240         match = piclose.search(rawdata, i+2) # >
  241         if not match:
  242             return -1
  243         j = match.start()
  244         self.handle_pi(rawdata[i+2: j])
  245         j = match.end()
  246         return j
  247 
  248     # Internal -- handle starttag, return end or -1 if not terminated
  249     def parse_starttag(self, i):
  250         self.__starttag_text = None
  251         endpos = self.check_for_whole_start_tag(i)
  252         if endpos < 0:
  253             return endpos
  254         rawdata = self.rawdata
  255         self.__starttag_text = rawdata[i:endpos]
  256 
  257         # Now parse the data between i+1 and j into a tag and attrs
  258         attrs = []
  259         match = tagfind.match(rawdata, i+1)
  260         assert match, 'unexpected call to parse_starttag()'
  261         k = match.end()
  262         self.lasttag = tag = rawdata[i+1:k].lower()
  263 
  264         while k < endpos:
  265             m = attrfind.match(rawdata, k)
  266             if not m:
  267                 break
  268             attrname, rest, attrvalue = m.group(1, 2, 3)
  269             if not rest:
  270                 attrvalue = None
  271             elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
  272                  attrvalue[:1] == '"' == attrvalue[-1:]:
  273                 attrvalue = attrvalue[1:-1]
  274                 attrvalue = self.unescape(attrvalue)
  275             attrs.append((attrname.lower(), attrvalue))
  276             k = m.end()
  277 
  278         end = rawdata[k:endpos].strip()
  279         if end not in (">", "/>"):
  280             lineno, offset = self.getpos()
  281             if "\n" in self.__starttag_text:
  282                 lineno = lineno + self.__starttag_text.count("\n")
  283                 offset = len(self.__starttag_text) \
  284                          - self.__starttag_text.rfind("\n")
  285             else:
  286                 offset = offset + len(self.__starttag_text)
  287             self.error("junk characters in start tag: %s"
  288                        % repr(rawdata[k:endpos][:20]))
  289         if end[-2:] == '/>':
  290             # XHTML-style empty tag: <span attr="value" />
  291             self.handle_startendtag(tag, attrs)
  292         else:
  293             self.handle_starttag(tag, attrs)
  294             if tag in self.CDATA_CONTENT_ELEMENTS:
  295                 self.set_cdata_mode(tag)
  296         return endpos
  297 
  298     # Internal -- check to see if we have a complete starttag; return end
  299     # or -1 if incomplete.
  300     def check_for_whole_start_tag(self, i):
  301         rawdata = self.rawdata
  302         m = locatestarttagend.match(rawdata, i)
  303         if m:
  304             j = m.end()
  305             next = rawdata[j:j+1]
  306             if next == ">":
  307                 return j + 1
  308             if next == "/":
  309                 s = rawdata[j:j+2]
  310                 if s == "/>":
  311                     return j + 2
  312                 if s == "/":
  313                     # buffer boundary
  314                     return -1
  315                 # else bogus input
  316                 self.updatepos(i, j + 1)
  317                 self.error("malformed empty start tag")
  318             if next == "":
  319                 # end of input
  320                 return -1
  321             if next in ("abcdefghijklmnopqrstuvwxyz=/"
  322                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
  323                 # end of input in or before attribute value, or we have the
  324                 # '/' from a '/>' ending
  325                 return -1
  326             self.updatepos(i, j)
  327             self.error("malformed start tag")
  328         raise AssertionError("we should not get here!")
  329 
  330     # Internal -- parse endtag, return end or -1 if incomplete
  331     def parse_endtag(self, i):
  332         rawdata = self.rawdata
  333         assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
  334         match = endendtag.search(rawdata, i+1) # >
  335         if not match:
  336             return -1
  337         j = match.end()
  338         match = endtagfind.match(rawdata, i) # </ + tag + >
  339         if not match:
  340             self.error("bad end tag: %s" % repr(rawdata[i:j]))
  341         tag = match.group(1).lower()
  342         if (  self.cdata_endtag is not None
  343               and tag != self.cdata_endtag):
  344             # Should be a mismatched end tag, but we'll treat it
  345             # as text anyway, since most HTML authors aren't
  346             # interested in the finer points of syntax.
  347             self.handle_data(match.group(0))
  348         else:
  349             self.handle_endtag(tag)
  350             self.clear_cdata_mode()
  351         return j
  352 
  353     # Overridable -- finish processing of start+end tag: <tag.../>
  354     def handle_startendtag(self, tag, attrs):
  355         self.handle_starttag(tag, attrs)
  356         self.handle_endtag(tag)
  357 
  358     # Overridable -- handle start tag
  359     def handle_starttag(self, tag, attrs):
  360         pass
  361 
  362     # Overridable -- handle end tag
  363     def handle_endtag(self, tag):
  364         pass
  365 
  366     # Overridable -- handle character reference
  367     def handle_charref(self, name):
  368         pass
  369 
  370     # Overridable -- handle entity reference
  371     def handle_entityref(self, name):
  372         pass
  373 
  374     # Overridable -- handle data
  375     def handle_data(self, data):
  376         pass
  377 
  378     # Overridable -- handle comment
  379     def handle_comment(self, data):
  380         pass
  381 
  382     # Overridable -- handle declaration
  383     def handle_decl(self, decl):
  384         pass
  385 
  386     # Overridable -- handle processing instruction
  387     def handle_pi(self, data):
  388         pass
  389 
  390     def unknown_decl(self, data):
  391         self.error("unknown declaration: " + repr(data))
  392 
  393     # Internal -- helper to remove special character quoting
  394     def unescape(self, s):
  395         if '&' not in s:
  396             return s
  397         s = s.replace("&lt;", "<")
  398         s = s.replace("&gt;", ">")
  399         s = s.replace("&apos;", "'")
  400         s = s.replace("&quot;", '"')
  401         s = s.replace("&amp;", "&") # Must be last
  402         return s