"Fossies" - the Fresh Open Source Software Archive

Member "xhtml2pdf-0.2.5/xhtml2pdf/parser.py" (25 Sep 2020, 26117 Bytes) of package /linux/www/xhtml2pdf-0.2.5.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Python source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively, you can view or download the uninterpreted source code file here. For more information about "parser.py", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 0.2.4_vs_0.2.5.

    1 # -*- coding: utf-8 -*-
    2 
    3 # Copyright 2010 Dirk Holtwick, holtwick.it
    4 #
    5 # Licensed under the Apache License, Version 2.0 (the "License");
    6 # you may not use this file except in compliance with the License.
    7 # You may obtain a copy of the License at
    8 #
    9 #     http://www.apache.org/licenses/LICENSE-2.0
   10 #
   11 # Unless required by applicable law or agreed to in writing, software
   12 # distributed under the License is distributed on an "AS IS" BASIS,
   13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14 # See the License for the specific language governing permissions and
   15 # limitations under the License.
   16 from __future__ import print_function, unicode_literals
   17 
   18 import copy
   19 import logging
   20 import re
   21 from xml.dom import Node
   22 import xml.dom.minidom
   23 
   24 from html5lib import treebuilders  # , inputstream
   25 import html5lib
   26 from reportlab.platypus.doctemplate import NextPageTemplate, FrameBreak
   27 from reportlab.platypus.flowables import PageBreak, KeepInFrame
   28 import six
   29 
   30 from xhtml2pdf.default import BOX, POS, MUST, FONT
   31 from xhtml2pdf.default import TAGS, STRING, INT, BOOL, SIZE, COLOR, FILE
   32 from xhtml2pdf.tables import *  # TODO: Kill wild import!
   33 from xhtml2pdf.tags import *  # TODO: Kill wild import!
   34 from xhtml2pdf.util import getBox, getPos, pisaTempFile, transform_attrs
   35 from xhtml2pdf.util import getSize, getBool, toList, getColor, getAlign
   36 import xhtml2pdf.w3c.cssDOMElementInterface as cssDOMElementInterface
   37 from xhtml2pdf.xhtml2pdf_reportlab import PmlRightPageBreak, PmlLeftPageBreak
   38 
   39 
   40 
   41 CSSAttrCache = {}
   42 
   43 log = logging.getLogger("xhtml2pdf")
   44 
   45 rxhttpstrip = re.compile("https?://[^/]+(.*)", re.M | re.I)
   46 
   47 
   48 class AttrContainer(dict):
   49 
   50     def __getattr__(self, name):
   51         try:
   52             return dict.__getattr__(self, name)
   53         except:
   54             return self[name]
   55 
   56 
   57 def pisaGetAttributes(c, tag, attributes):
   58     global TAGS
   59 
   60     attrs = {}
   61     if attributes:
   62         for k, v in attributes.items():
   63             try:
   64                 # XXX no Unicode! Reportlab fails with template names
   65                 attrs[str(k)] = str(v)
   66             except:
   67                 attrs[k] = v
   68 
   69     nattrs = {}
   70     if tag in TAGS:
   71         block, adef = TAGS[tag]
   72         adef["id"] = STRING
   73 
   74         for k, v in six.iteritems(adef):
   75             nattrs[k] = None
   76             # print k, v
   77             # defaults, wenn vorhanden
   78             if type(v) == tuple:
   79                 if v[1] == MUST:
   80                     if k not in attrs:
   81                         log.warning(
   82                             c.warning("Attribute '%s' must be set!", k))
   83                         nattrs[k] = None
   84                         continue
   85                 nv = attrs.get(k, v[1])
   86                 dfl = v[1]
   87                 v = v[0]
   88             else:
   89                 nv = attrs.get(k, None)
   90                 dfl = None
   91 
   92             if nv is not None:
   93                 if type(v) == list:
   94                     nv = nv.strip().lower()
   95                     if nv not in v:
   96                         #~ raise PML_EXCEPTION, "attribute '%s' of wrong value, allowed is one of: %s" % (k, repr(v))
   97                         log.warning(
   98                             c.warning("Attribute '%s' of wrong value, allowed is one of: %s", k, repr(v)))
   99                         nv = dfl
  100 
  101                 elif v == BOOL:
  102                     nv = nv.strip().lower()
  103                     nv = nv in ("1", "y", "yes", "true", str(k))
  104 
  105                 elif v == SIZE:
  106                     try:
  107                         nv = getSize(nv)
  108                     except:
  109                         log.warning(
  110                             c.warning("Attribute '%s' expects a size value", k))
  111 
  112                 elif v == BOX:
  113                     nv = getBox(nv, c.pageSize)
  114 
  115                 elif v == POS:
  116                     nv = getPos(nv, c.pageSize)
  117 
  118                 elif v == INT:
  119                     nv = int(nv)
  120 
  121                 elif v == COLOR:
  122                     nv = getColor(nv)
  123 
  124                 elif v == FILE:
  125                     nv = c.getFile(nv)
  126 
  127                 elif v == FONT:
  128                     nv = c.getFontName(nv)
  129 
  130                 nattrs[k] = nv
  131 
  132     return AttrContainer(nattrs)
  133 
  134 
  135 attrNames = '''
  136     color
  137     font-family
  138     font-size
  139     font-weight
  140     font-style
  141     text-decoration
  142     line-height
  143     letter-spacing
  144     background-color
  145     display
  146     margin-left
  147     margin-right
  148     margin-top
  149     margin-bottom
  150     padding-left
  151     padding-right
  152     padding-top
  153     padding-bottom
  154     border-top-color
  155     border-top-style
  156     border-top-width
  157     border-bottom-color
  158     border-bottom-style
  159     border-bottom-width
  160     border-left-color
  161     border-left-style
  162     border-left-width
  163     border-right-color
  164     border-right-style
  165     border-right-width
  166     text-align
  167     vertical-align
  168     width
  169     height
  170     zoom
  171     page-break-after
  172     page-break-before
  173     list-style-type
  174     list-style-image
  175     white-space
  176     text-indent
  177     -pdf-page-break
  178     -pdf-frame-break
  179     -pdf-next-page
  180     -pdf-keep-with-next
  181     -pdf-outline
  182     -pdf-outline-level
  183     -pdf-outline-open
  184     -pdf-line-spacing
  185     -pdf-keep-in-frame-mode
  186     -pdf-word-wrap
  187     '''.strip().split()
  188 
  189 
  190 def getCSSAttr(self, cssCascade, attrName, default=NotImplemented):
  191     if attrName in self.cssAttrs:
  192         return self.cssAttrs[attrName]
  193 
  194     try:
  195         result = cssCascade.findStyleFor(self.cssElement, attrName, default)
  196     except LookupError:
  197         result = None
  198 
  199     # XXX Workaround for inline styles
  200     try:
  201         style = self.cssStyle
  202     except:
  203         style = self.cssStyle = cssCascade.parser.parseInline(
  204             self.cssElement.getStyleAttr() or '')[0]
  205     if attrName in style:
  206         result = style[attrName]
  207 
  208     if result == 'inherit':
  209         if hasattr(self.parentNode, 'getCSSAttr'):
  210             result = self.parentNode.getCSSAttr(cssCascade, attrName, default)
  211         elif default is not NotImplemented:
  212             return default
  213         raise LookupError(
  214             "Could not find inherited CSS attribute value for '%s'" % (attrName,))
  215 
  216     if result is not None:
  217         self.cssAttrs[attrName] = result
  218     return result
  219 
  220 
  221 # TODO: Monkeypatching standard lib should go away.
  222 xml.dom.minidom.Element.getCSSAttr = getCSSAttr
  223 
  224 # Create an aliasing system.  Many sources use non-standard tags, because browsers allow
  225 # them to.  This allows us to map a nonstandard name to the standard one.
  226 nonStandardAttrNames = {
  227     'bgcolor': 'background-color',
  228 }
  229 
  230 
  231 def mapNonStandardAttrs(c, n, attrList):
  232     for attr in nonStandardAttrNames:
  233         if attr in attrList and nonStandardAttrNames[attr] not in c:
  234             c[nonStandardAttrNames[attr]] = attrList[attr]
  235     return c
  236 
  237 
  238 def getCSSAttrCacheKey(node):
  239     _cl = _id = _st = ''
  240     for k, v in node.attributes.items():
  241         if k == 'class':
  242             _cl = v
  243         elif k == 'id':
  244             _id = v
  245         elif k == 'style':
  246             _st = v
  247     return "%s#%s#%s#%s#%s" % (id(node.parentNode), node.tagName.lower(), _cl, _id, _st)
  248 
  249 
  250 def CSSCollect(node, c):
  251     #node.cssAttrs = {}
  252     # return node.cssAttrs
  253 
  254     if c.css:
  255 
  256         _key = getCSSAttrCacheKey(node)
  257 
  258         if hasattr(node.parentNode, "tagName"):
  259             if node.parentNode.tagName.lower() != "html":
  260                 CachedCSSAttr = CSSAttrCache.get(_key, None)
  261                 if CachedCSSAttr is not None:
  262                     node.cssAttrs = CachedCSSAttr
  263                     return CachedCSSAttr
  264 
  265         node.cssElement = cssDOMElementInterface.CSSDOMElementInterface(node)
  266         node.cssAttrs = {}
  267         # node.cssElement.onCSSParserVisit(c.cssCascade.parser)
  268         cssAttrMap = {}
  269         for cssAttrName in attrNames:
  270             try:
  271                 cssAttrMap[cssAttrName] = node.getCSSAttr(
  272                     c.cssCascade, cssAttrName)
  273             # except LookupError:
  274             #    pass
  275             except Exception:  # TODO: Kill this catch-all!
  276                 log.debug("CSS error '%s'", cssAttrName, exc_info=1)
  277 
  278         CSSAttrCache[_key] = node.cssAttrs
  279     return node.cssAttrs
  280 
  281 
  282 def lower(sequence):
  283     if isinstance(sequence, six.string_types):
  284         return sequence.lower()
  285     else:
  286         return sequence[0].lower()
  287 
  288 
  289 def CSS2Frag(c, kw, isBlock):
  290     # COLORS
  291     if "color" in c.cssAttr:
  292         c.frag.textColor = getColor(c.cssAttr["color"])
  293     if "background-color" in c.cssAttr:
  294         c.frag.backColor = getColor(c.cssAttr["background-color"])
  295         # FONT SIZE, STYLE, WEIGHT
  296     if "font-family" in c.cssAttr:
  297         c.frag.fontName = c.getFontName(c.cssAttr["font-family"])
  298     if "font-size" in c.cssAttr:
  299         # XXX inherit
  300         c.frag.fontSize = max(
  301             getSize("".join(c.cssAttr["font-size"]), c.frag.fontSize, c.baseFontSize), 1.0)
  302     if "line-height" in c.cssAttr:
  303         leading = "".join(c.cssAttr["line-height"])
  304         c.frag.leading = getSize(leading, c.frag.fontSize)
  305         c.frag.leadingSource = leading
  306     else:
  307         c.frag.leading = getSize(c.frag.leadingSource, c.frag.fontSize)
  308     if "letter-spacing" in c.cssAttr:
  309         c.frag.letterSpacing = c.cssAttr["letter-spacing"]
  310     if "-pdf-line-spacing" in c.cssAttr:
  311         c.frag.leadingSpace = getSize("".join(c.cssAttr["-pdf-line-spacing"]))
  312         # print "line-spacing", c.cssAttr["-pdf-line-spacing"], c.frag.leading
  313     if "font-weight" in c.cssAttr:
  314         value = lower(c.cssAttr["font-weight"])
  315         if value in ("bold", "bolder", "500", "600", "700", "800", "900"):
  316             c.frag.bold = 1
  317         else:
  318             c.frag.bold = 0
  319     for value in toList(c.cssAttr.get("text-decoration", "")):
  320         if "underline" in value:
  321             c.frag.underline = 1
  322         if "line-through" in value:
  323             c.frag.strike = 1
  324         if "none" in value:
  325             c.frag.underline = 0
  326             c.frag.strike = 0
  327     if "font-style" in c.cssAttr:
  328         value = lower(c.cssAttr["font-style"])
  329         if value in ("italic", "oblique"):
  330             c.frag.italic = 1
  331         else:
  332             c.frag.italic = 0
  333     if "white-space" in c.cssAttr:
  334         # normal | pre | nowrap
  335         c.frag.whiteSpace = str(c.cssAttr["white-space"]).lower()
  336         # ALIGN & VALIGN
  337     if "text-align" in c.cssAttr:
  338         c.frag.alignment = getAlign(c.cssAttr["text-align"])
  339     if "vertical-align" in c.cssAttr:
  340         c.frag.vAlign = c.cssAttr["vertical-align"]
  341         # HEIGHT & WIDTH
  342     if "height" in c.cssAttr:
  343         try:
  344             # XXX Relative is not correct!
  345             c.frag.height = "".join(toList(c.cssAttr["height"]))
  346         except TypeError:
  347             # sequence item 0: expected string, tuple found
  348             c.frag.height = "".join(toList(c.cssAttr["height"][0]))
  349         if c.frag.height in ("auto",):
  350             c.frag.height = None
  351     if "width" in c.cssAttr:
  352         try:
  353             # XXX Relative is not correct!
  354             c.frag.width = "".join(toList(c.cssAttr["width"]))
  355         except TypeError:
  356             c.frag.width = "".join(toList(c.cssAttr["width"][0]))
  357         if c.frag.width in ("auto",):
  358             c.frag.width = None
  359         # ZOOM
  360     if "zoom" in c.cssAttr:
  361         # XXX Relative is not correct!
  362         zoom = "".join(toList(c.cssAttr["zoom"]))
  363         if zoom.endswith("%"):
  364             zoom = float(zoom[: - 1]) / 100.0
  365         c.frag.zoom = float(zoom)
  366         # MARGINS & LIST INDENT, STYLE
  367     if isBlock:
  368         transform_attrs(c.frag,
  369                         (("spaceBefore", "margin-top"),
  370                          ("spaceAfter", "margin-bottom"),
  371                          ("firstLineIndent", "text-indent"),
  372                          ),
  373                         c.cssAttr,
  374                         getSize,
  375                         extras=c.frag.fontSize
  376                         )
  377 
  378         if "margin-left" in c.cssAttr:
  379             c.frag.bulletIndent = kw["margin-left"]  # For lists
  380             kw["margin-left"] += getSize(c.cssAttr["margin-left"],
  381                                          c.frag.fontSize)
  382             c.frag.leftIndent = kw["margin-left"]
  383         if "margin-right" in c.cssAttr:
  384             kw["margin-right"] += getSize(
  385                 c.cssAttr["margin-right"], c.frag.fontSize)
  386             c.frag.rightIndent = kw["margin-right"]
  387 
  388         if "list-style-type" in c.cssAttr:
  389             c.frag.listStyleType = str(c.cssAttr["list-style-type"]).lower()
  390         if "list-style-image" in c.cssAttr:
  391             c.frag.listStyleImage = c.getFile(c.cssAttr["list-style-image"])
  392         # PADDINGS
  393     if isBlock:
  394         transform_attrs(c.frag,
  395                         (("paddingTop", "padding-top"),
  396                          ("paddingBottom", "padding-bottom"),
  397                          ("paddingLeft", "padding-left"),
  398                          ("paddingRight", "padding-right"),
  399                          ),
  400                         c.cssAttr,
  401                         getSize,
  402                         extras=c.frag.fontSize
  403                         )
  404 
  405         # BORDERS
  406     if isBlock:
  407         transform_attrs(c.frag,
  408                         (("borderTopWidth", "border-top-width"),
  409                          ("borderBottomWidth", "border-bottom-width"),
  410                          ("borderLeftWidth", "border-left-width"),
  411                          ("borderRightWidth", "border-right-width"),
  412                          ),
  413                         c.cssAttr,
  414                         getSize,
  415                         extras=c.frag.fontSize
  416                         )
  417         transform_attrs(c.frag,
  418                         (
  419                             ("borderTopStyle", "border-top-style"),
  420                             ("borderBottomStyle", "border-bottom-style"),
  421                             ("borderLeftStyle", "border-left-style"),
  422                             ("borderRightStyle", "border-right-style")
  423                         ),
  424                         c.cssAttr,
  425                         lambda x: x
  426                         )
  427 
  428         transform_attrs(c.frag,
  429                         (
  430                             ("borderTopColor", "border-top-color"),
  431                             ("borderBottomColor", "border-bottom-color"),
  432                             ("borderLeftColor", "border-left-color"),
  433                             ("borderRightColor",  "border-right-color")
  434                         ),
  435                         c.cssAttr,
  436                         getColor
  437                         )
  438 
  439 
  440 def pisaPreLoop(node, context, collect=False):
  441     """
  442     Collect all CSS definitions
  443     """
  444 
  445     data = u""
  446     if node.nodeType == Node.TEXT_NODE and collect:
  447         data = node.data
  448 
  449     elif node.nodeType == Node.ELEMENT_NODE:
  450         name = node.tagName.lower()
  451 
  452         if name in ("style", "link"):
  453             attr = pisaGetAttributes(context, name, node.attributes)
  454             media = [x.strip()
  455                      for x in attr.media.lower().split(",") if x.strip()]
  456 
  457             if attr.get("type", "").lower() in ("", "text/css") and \
  458                     (not media or "all" in media or "print" in media or "pdf" in media):
  459 
  460                 if name == "style":
  461                     for node in node.childNodes:
  462                         data += pisaPreLoop(node, context, collect=True)
  463                     context.addCSS(data)
  464                     return u""
  465 
  466                 if name == "link" and attr.href and attr.rel.lower() == "stylesheet":
  467                     # print "CSS LINK", attr
  468                     context.addCSS('\n@import "%s" %s;' %
  469                                    (attr.href, ",".join(media)))
  470 
  471     for node in node.childNodes:
  472         result = pisaPreLoop(node, context, collect=collect)
  473         if collect:
  474             data += result
  475 
  476     return data
  477 
  478 
def pisaLoop(node, context, path=None, **kw):
    """
    Recursively walk the DOM below ``node`` and feed its content to
    ``context``.

    Text nodes are added as fragments. Element nodes have their attributes
    and CSS resolved, are mapped onto the current fragment, and are then
    handled by a tag class named ``pisaTag<TAGNAME>`` if one exists in this
    module's globals. Any other node type is skipped, but its children are
    still visited.

    :param node: DOM node to process.
    :param context: context object that receives fragments, paragraphs and
        story elements.
    :param path: list of tag names from the root down to the parent of
        ``node``; a new list is built for each element.
    :param kw: running margin totals (``margin-top``, ``margin-bottom``,
        ``margin-left``, ``margin-right``); copied per call so that changes
        only affect this node and its descendants.
    """
    if path is None:
        path = []

    # Initialize KW
    if not kw:
        kw = {
            "margin-top": 0,
            "margin-bottom": 0,
            "margin-left": 0,
            "margin-right": 0,
        }
    else:
        # Work on a copy so that siblings do not see margins added below.
        kw = copy.copy(kw)

    # indent = len(path) * "  " # only used for debug print statements

    # TEXT
    if node.nodeType == Node.TEXT_NODE:
        # print indent, "#", repr(node.data) #, context.frag
        context.addFrag(node.data)
        # context.text.append(node.value)

    # ELEMENT
    elif node.nodeType == Node.ELEMENT_NODE:

        # Normalise the tag name in place: drop namespace colons, lower-case.
        node.tagName = node.tagName.replace(":", "").lower()

        # <style> and <script> produce no output here.
        if node.tagName in ("style", "script"):
            return

        path = copy.copy(path) + [node.tagName]

        # Prepare attributes
        attr = pisaGetAttributes(context, node.tagName, node.attributes)
        # log.debug(indent + "<%s %s>" % (node.tagName, attr) +
        # repr(node.attributes.items())) #, path

        # Calculate styles
        context.cssAttr = CSSCollect(node, context)
        # Apply aliases such as bgcolor -> background-color.
        context.cssAttr = mapNonStandardAttrs(context.cssAttr, node, attr)
        context.node = node

        # Block?
        # Values stored in pageBreakAfter to remember which kind of break
        # has to be emitted once the element has been processed.
        PAGE_BREAK = 1
        PAGE_BREAK_RIGHT = 2
        PAGE_BREAK_LEFT = 3

        pageBreakAfter = False
        frameBreakAfter = False
        display = lower(context.cssAttr.get("display", "inline"))
        # print indent, node.tagName, display,
        # context.cssAttr.get("background-color", None), attr
        isBlock = (display == "block")

        if isBlock:
            # Flush the pending paragraph before the block starts.
            context.addPara()

            # Page break by CSS
            if "-pdf-next-page" in context.cssAttr:
                context.addStory(
                    NextPageTemplate(str(context.cssAttr["-pdf-next-page"])))
            if "-pdf-page-break" in context.cssAttr:
                if str(context.cssAttr["-pdf-page-break"]).lower() == "before":
                    context.addStory(PageBreak())
            if "-pdf-frame-break" in context.cssAttr:
                if str(context.cssAttr["-pdf-frame-break"]).lower() == "before":
                    context.addStory(FrameBreak())
                if str(context.cssAttr["-pdf-frame-break"]).lower() == "after":
                    frameBreakAfter = True
            if "page-break-before" in context.cssAttr:
                if str(context.cssAttr["page-break-before"]).lower() == "always":
                    context.addStory(PageBreak())
                if str(context.cssAttr["page-break-before"]).lower() == "right":
                    context.addStory(PageBreak())
                    context.addStory(PmlRightPageBreak())
                if str(context.cssAttr["page-break-before"]).lower() == "left":
                    context.addStory(PageBreak())
                    context.addStory(PmlLeftPageBreak())
            if "page-break-after" in context.cssAttr:
                # Only remembered here; the break itself is added after the
                # element's children have been processed.
                if str(context.cssAttr["page-break-after"]).lower() == "always":
                    pageBreakAfter = PAGE_BREAK
                if str(context.cssAttr["page-break-after"]).lower() == "right":
                    pageBreakAfter = PAGE_BREAK_RIGHT
                if str(context.cssAttr["page-break-after"]).lower() == "left":
                    pageBreakAfter = PAGE_BREAK_LEFT

        # Hidden elements and their children are skipped entirely.
        if display == "none":
            # print "none!"
            return

        # Translate CSS to frags

        # Save previous frag styles
        context.pushFrag()

        # Map styles to Reportlab fragment properties
        CSS2Frag(context, kw, isBlock)

        # EXTRAS
        transform_attrs(context.frag,
                        (
                            ("keepWithNext", "-pdf-keep-with-next"),
                            ("outline", "-pdf-outline"),
                            #("borderLeftColor", "-pdf-outline-open"),
                        ),
                        context.cssAttr,
                        getBool
                        )

        if "-pdf-outline-level" in context.cssAttr:
            context.frag.outlineLevel = int(
                context.cssAttr["-pdf-outline-level"])

        if "-pdf-word-wrap" in context.cssAttr:
            context.frag.wordWrap = context.cssAttr["-pdf-word-wrap"]

        # handle keep-in-frame
        keepInFrameMode = None
        keepInFrameMaxWidth = 0
        keepInFrameMaxHeight = 0
        if "-pdf-keep-in-frame-mode" in context.cssAttr:
            value = str(
                context.cssAttr["-pdf-keep-in-frame-mode"]).strip().lower()
            if value in ("shrink", "error", "overflow", "truncate"):
                keepInFrameMode = value
            else:
                keepInFrameMode = "shrink"
            # Added because we need a default value.

        # NOTE(review): these two lookups can only succeed if CSSCollect()
        # resolved the properties, i.e. if they are listed in attrNames.
        if "-pdf-keep-in-frame-max-width" in context.cssAttr:
            keepInFrameMaxWidth = getSize(
                "".join(context.cssAttr["-pdf-keep-in-frame-max-width"]))
        if "-pdf-keep-in-frame-max-height" in context.cssAttr:
            keepInFrameMaxHeight = getSize(
                "".join(context.cssAttr["-pdf-keep-in-frame-max-height"]))

        # ignore nested keep-in-frames, tables have their own KIF handling
        keepInFrame = keepInFrameMode is not None and context.keepInFrameIndex is None
        if keepInFrame:
            # keep track of current story index, so we can wrap everything
            # added after this point in a KeepInFrame
            context.keepInFrameIndex = len(context.story)

        # BEGIN tag
        # Look up the handler class for this tag by name in module globals.
        klass = globals().get("pisaTag%s" %
                              node.tagName.replace(":", "").upper(), None)
        obj = None

        # Static block
        # If a static frame is registered under this element's id, the
        # element's content is built in a separate story (swapped back below).
        elementId = attr.get("id", None)
        staticFrame = context.frameStatic.get(elementId, None)
        if staticFrame:
            context.frag.insideStaticFrame += 1
            oldStory = context.swapStory()

        # Tag specific operations
        if klass is not None:
            obj = klass(node, attr)
            obj.start(context)

        # Visit child nodes
        # Keep a copy of the fragment as it was before the children ran and
        # put it back on context.fragBlock afterwards.
        context.fragBlock = fragBlock = copy.copy(context.frag)
        for nnode in node.childNodes:
            pisaLoop(nnode, context, path, **kw)
        context.fragBlock = fragBlock

        # END tag
        if obj:
            obj.end(context)

        # Block?
        if isBlock:
            # Flush the paragraph that the block's content produced.
            context.addPara()

            # XXX Buggy!

            # Page break by CSS
            if pageBreakAfter:
                context.addStory(PageBreak())
                if pageBreakAfter == PAGE_BREAK_RIGHT:
                    context.addStory(PmlRightPageBreak())
                if pageBreakAfter == PAGE_BREAK_LEFT:
                    context.addStory(PmlLeftPageBreak())
            if frameBreakAfter:
                context.addStory(FrameBreak())

        if keepInFrame:
            # get all content added after start of -pdf-keep-in-frame and wrap
            # it in a KeepInFrame
            substory = context.story[context.keepInFrameIndex:]
            context.story = context.story[:context.keepInFrameIndex]
            context.story.append(
                KeepInFrame(
                    content=substory,
                    maxWidth=keepInFrameMaxWidth,
                    maxHeight=keepInFrameMaxHeight,
                    mode=keepInFrameMode))
            # mode wasn't being used; it is necessary for tables or images at
            # end of page.
            context.keepInFrameIndex = None

        # Static block, END
        if staticFrame:
            context.addPara()
            # Hand the separately built story to every frame registered for
            # this id, then restore the main story.
            for frame in staticFrame:
                frame.pisaStaticStory = context.story
            context.swapStory(oldStory)
            context.frag.insideStaticFrame -= 1

        # context.debug(1, indent, "</%s>" % (node.tagName))

        # Reset frag style
        context.pullFrag()

    # Unknown or not handled
    else:
        # context.debug(1, indent, "???", node, node.nodeType, repr(node))
        # Loop over children
        for node in node.childNodes:
            pisaLoop(node, context, path, **kw)
  700 
  701 
  702 def pisaParser(src, context, default_css="", xhtml=False, encoding=None, xml_output=None):
  703     """
  704     - Parse HTML and get miniDOM
  705     - Extract CSS informations, add default CSS, parse CSS
  706     - Handle the document DOM itself and build reportlab story
  707     - Return Context object
  708     """
  709 
  710     global CSSAttrCache
  711     CSSAttrCache = {}
  712 
  713     if xhtml:
  714         # TODO: XHTMLParser doesn't see to exist...
  715         parser = html5lib.XHTMLParser(tree=treebuilders.getTreeBuilder("dom"))
  716     else:
  717         parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
  718     parser_kwargs = {}
  719     if isinstance(src, six.text_type):
  720         # If an encoding was provided, do not change it.
  721         if not encoding:
  722             encoding = "utf-8"
  723         src = src.encode(encoding)
  724         src = pisaTempFile(src, capacity=context.capacity)
  725         # To pass the encoding used to convert the text_type src to binary_type
  726         # on to html5lib's parser to ensure proper decoding
  727         parser_kwargs['transport_encoding'] = encoding
  728 
  729     # # Test for the restrictions of html5lib
  730     # if encoding:
  731     #     # Workaround for html5lib<0.11.1
  732     #     if hasattr(inputstream, "isValidEncoding"):
  733     #         if encoding.strip().lower() == "utf8":
  734     #             encoding = "utf-8"
  735     #         if not inputstream.isValidEncoding(encoding):
  736     #             log.error("%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!", encoding)
  737     #     else:
  738     #         if inputstream.codecName(encoding) is None:
  739     #             log.error("%r is not a valid encoding", encoding)
  740     document = parser.parse(
  741         src, **parser_kwargs
  742     )  # encoding=encoding)
  743 
  744     if xml_output:
  745         if encoding:
  746             xml_output.write(document.toprettyxml(encoding=encoding))
  747         else:
  748             xml_output.write(document.toprettyxml(encoding="utf8"))
  749 
  750     if default_css:
  751         context.addDefaultCSS(default_css)
  752 
  753     pisaPreLoop(document, context)
  754     # try:
  755     context.parseCSS()
  756     # except:
  757     #    context.cssText = DEFAULT_CSS
  758     #    context.parseCSS()
  759     # context.debug(9, pprint.pformat(context.css))
  760 
  761     pisaLoop(document, context)
  762     return context
  763 
  764 
  765 # Shortcuts
  766 
  767 HTML2PDF = pisaParser
  768 
  769 
  770 def XHTML2PDF(*a, **kw):
  771     kw["xhtml"] = True
  772     return HTML2PDF(*a, **kw)
  773 
  774 
  775 XML2PDF = XHTML2PDF