"Fossies" - the Fresh Open Source Software Archive

Member "Tahchee-1.0.0/Sources/tahchee/plugins/_kiwi/core.py" (22 Oct 2009, 30652 Bytes) of package /linux/privat/old/tahchee-1.0.0.tar.gz:



#!/usr/bin/env python
# Encoding: iso-8859-1
# vim: tw=80 ts=4 sw=4 noet
# -----------------------------------------------------------------------------
# Project           :   Kiwi
# -----------------------------------------------------------------------------
# Author            :   Sebastien Pierre                 <sebastien@type-z.org>
# License           :   Revised BSD License
# -----------------------------------------------------------------------------
# Creation date     :   07-Feb-2006
# Last mod.         :   05-Aug-2008
# -----------------------------------------------------------------------------

import os, sys

import re, string, operator, getopt, codecs

# We use 4Suite domlette
#import Ft.Xml.Domlette
#dom = Ft.Xml.Domlette.implementation
# We use minidom implementation
import xml.dom.minidom
dom = xml.dom.minidom.getDOMImplementation()

from inlines import *
from blocks  import *

#------------------------------------------------------------------------------
#
#  Globals
#
#------------------------------------------------------------------------------

# How many spaces a tab represents.
TAB_SIZE = 4

#------------------------------------------------------------------------------
#
#  Regular expressions
#
#------------------------------------------------------------------------------

RE_BLOCK_SEPARATOR = re.compile(u"[ \t\r]*\n[ \t\r]*\n", re.MULTILINE | re.LOCALE)
RE_SPACES = re.compile(u"[\s\n]+", re.LOCALE|re.MULTILINE)
RE_TABS = re.compile("\t+")
ATTRIBUTE = u"""(\w+)\s*=\s*('[^']*'|"[^"]*")"""
RE_ATTRIBUTE = re.compile(ATTRIBUTE, re.LOCALE|re.MULTILINE)
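
# For example, RE_BLOCK_SEPARATOR matches the whitespace-only line separating
# two blocks in u"first paragraph\n  \nsecond paragraph", while RE_ATTRIBUTE
# matches single attribute assignments such as name="value" or name='value'.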

#------------------------------------------------------------------------------
#
#  Parsing context
#
#------------------------------------------------------------------------------

class Context:
    """The context stores information on the currently processed document. It
    has the following attributes:

        - document: a reference to the current XML document.
        - rootNode: the current XML document root node.
        - header: the XML node corresponding to the header.
        - content: the XML node corresponding to the content.
        - references: the XML node corresponding to the references.
        - appendices: the XML node corresponding to the appendices.
        - currentNode: the XML node to which attributes/nodes are added during
          parsing.
        - blockStartOffset: the offset in the text where the currently parsed
          block starts.
        - blockEndOffset: the offset in the text where the currently parsed
          block ends.
        - parser: a reference to the Kiwi parser instance using the context.
    """

    def __init__( self, documentText, markOffsets=False ):
        self.document = None
        self.rootNode = None
        self.header   = None
        self.content  = None
        self.references = None
        self.appendices = None
        self.currentNode = None
        self._offset = 0
        self.blockStartOffset = 0
        self.blockEndOffset = -1
        self.setDocumentText(documentText)
        self._currentFragment = None
        self.parser = None
        self.markOffsets = markOffsets
        self.sections = []
        # These are convenience attributes used to make it easy to
        # post-verify the links (are they all resolved?)
        self._links   = []
        self._targets = []

    def _getElementsByTagName(self, node, name):
        if node.nodeType == node.ELEMENT_NODE and \
           node.localName == name:
            result = [node]
        else:
            result = []
        for child in node.childNodes:
            result.extend(self._getElementsByTagName(child, name))
        return result

    def ensureElement( self, node, elementName, index=0 ):
        """Ensures that the given element exists in the given node at the given
        index."""
        result = self._getElementsByTagName(node, elementName)
        if len(result)<=index:
            newElement = self.document.createElementNS(None, elementName)
            node.appendChild(newElement)
            return newElement
        else:
            return result[index]

    def ensureParent( self, parentNames, predicate=lambda x:True ):
        """Ensures that the parent node name is one of the given names. This
        is useful for block parsers which want to ensure that their parent is a
        specific node."""
        if self.currentNode!=None:
            while ( self.currentNode.nodeName not in parentNames
                or not predicate(self.currentNode)
                ) and self.currentNode.nodeName!="Document":
                if self.currentNode.parentNode:
                    self.currentNode = self.currentNode.parentNode
                else:
                    return

    def declareSection( self, node, contentNode, depth ):
        """Declares a section node with the given depth (which can be
        negative)."""
        self.sections.append((node, contentNode, depth))

    def getParentSection( self, depth, indent ):
        """Gets the section that would be the parent section for the
        given depth."""
        for i in range(len(self.sections)-1,-1,-1):
            section = self.sections[i]
            section_node = section[0]
            section_content = section[1]
            section_depth = section[2]
            section_indent = int(section_node.getAttributeNS(None, "_indent"))
            if indent > section_indent:
                return section_content
            elif section_indent <= indent and section_depth < depth:
                return section_content
        return self.content

    def getDepthInSection( self, node ):
        """Returns the number of parent sections of the given node."""
        sections = 0
        while node.parentNode:
            if node.nodeName == "Section":
                sections += 1
            node = node.parentNode
        return sections

    def setDocumentText( self, text ):
        """Sets the text of the current document. This should only be called
        at context initialisation."""
        if not type(text) == type(u""):
            text = unicode(text)
        self.documentText = text
        self.documentTextLength = len(text)
        self.blockEndOffset = self.documentTextLength
        self.setOffset(0)

    def setOffset( self, offset ):
        """Sets the current offset."""
        self._offset = offset
        self._currentFragment = None

    def getOffset(self):
        """Returns the current offset."""
        return self._offset

    def increaseOffset( self, increase ):
        """Increases the current offset."""
        # We get the current fragment, because it will be freed by changing the
        # offset
        fragment = self.currentFragment()
        self.setOffset(self.getOffset()+increase)
        # We optimise current fragment access, by restoring it with a proper
        # value when possible
        if self.getOffset()<self.blockEndOffset:
            self._currentFragment = fragment[increase:]

    def decreaseOffset( self, decrease ):
        """Decreases the offset."""
        self.increaseOffset(-decrease)

    def fragment( self, start, end ):
        """Returns the text fragment that starts and ends at the given
        offsets."""
        return self.documentText[start:end]

    def currentFragment( self ):
        """Returns the current text fragment, from the current offset to the
        block end offset."""
        assert self.getOffset()<self.blockEndOffset,\
        "Offset greater than block end: %s >= %s" % (self.getOffset(), self.blockEndOffset)
        if not self._currentFragment:
            self._currentFragment = \
            self.documentText[self.getOffset():self.blockEndOffset]
        return self._currentFragment

    def documentEndReached( self ):
        """Returns true if the current offset is greater than or equal to the
        document length."""
        return self.getOffset() >= self.documentTextLength

    def blockEndReached( self ):
        """Returns true when the current offset has reached or exceeded the
        current block end offset."""
        return self.getOffset() >= self.blockEndOffset

    def offsetInBlock( self, offset ):
        """Tells if the given offset is in the current block."""
        return self.blockStartOffset <= offset <= self.blockEndOffset

    def setCurrentBlock( self, startOffset, endOffset ):
        """Sets the start and end offset of the current block. The current
        offset is set to the current block start."""
        if endOffset <= 0: endOffset += self.documentTextLength
        assert startOffset>=0
        assert endOffset<=self.documentTextLength
        assert startOffset<=endOffset, "Start offset too big: %s > %s" % (startOffset, endOffset)
        self.setOffset(startOffset)
        self.blockStartOffset = startOffset
        self.blockEndOffset = endOffset
        self._currentFragment = None

    def setCurrentBlockEnd( self, endOffset ):
        """Sets the end offset of the current block."""
        assert endOffset >= self.blockStartOffset
        self.blockEndOffset = endOffset
        self._currentFragment = None

    def getBlockIndentation( self ):
        """Returns the indentation of the current block."""
        return self.parser.getIndentation(
            self.documentText[self.blockStartOffset:self.blockEndOffset])

    def saveOffsets( self ):
        """Returns a value that can be later used with the restoreOffsets
        method to restore the offsets as they were."""
        return (self.blockStartOffset, self.getOffset(), self.blockEndOffset)

    def restoreOffsets( self, offsets ):
        """Takes a value returned by saveOffsets and restores the offsets as
        they were."""
        self.blockStartOffset = offsets[0]
        self.setOffset( offsets[1] )
        self.blockEndOffset = offsets[2]
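
    # The save/restore pair above is typically used to temporarily narrow the
    # context to a sub-range and then put it back, as in _delimitXMLMarkupBlock
    # below:
    #
    #   offsets = context.saveOffsets()
    #   context.setCurrentBlock(markup_match.end(), context.documentTextLength)
    #   ...
    #   context.restoreOffsets(offsets)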

    def clone( self ):
        """Returns a clone of the current context, which can be changed safely
        without modifying the current context."""
        clone = Context(self.documentText)
        clone.document    = self.document
        clone.rootNode    = self.rootNode
        clone.header      = self.header
        clone.content     = self.content
        clone.references  = self.references
        clone.appendices  = self.appendices
        clone.currentNode = self.currentNode
        clone.parser      = self.parser
        clone.document    = self.document
        clone.setOffset(self.getOffset())
        clone.setCurrentBlock(self.blockStartOffset, self.blockEndOffset)
        return clone

    def findNextInline( self, inlineParsers ):
        """Finds the next inline in the given context, using the given list of
        inline parsers. This does not modify the context.

        Returns either None or a triple (offset, information, parser), where
        the offset is relative to the context offset and indicates the start
        offset where the parser recognised its tag and information is the
        information returned by the parser."""
        # We look for the inline parser that parses an inline with the lowest
        # offset
        results = []
        for inlineParser in inlineParsers:
            match_offset, result = inlineParser.recognises(self)
            if match_offset!=None:
                assert match_offset >= 0
                results.append((match_offset, result, inlineParser))
        matchedResult = None
        minimumOffset = self.documentTextLength+1
        # We get the closest matching parser
        for result in results:
            if result[0]<minimumOffset:
                minimumOffset = result[0]
                matchedResult = result
        return matchedResult

    def parseAttributes( self, text ):
        """Parses attributes expressed in the given text. Attributes have the
        following form: ATTRIBUTE="VALUE" and are separated by spaces."""
        if not text: return {}
        text = text.strip()
        attributes = {}
        match  = True
        # We parse attributes
        while match and text:
            match = RE_ATTRIBUTE.match(text)
            if not match: break
            attributes[match.group(1)] = match.group(2)[1:-1]
            offset = match.end()
            text = text[match.end():].strip()
        return attributes
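
    # For example (with the RE_ATTRIBUTE defined above), parsing the text
    #   title="Introduction" id='intro'
    # yields {u"title": u"Introduction", u"id": u"intro"}.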

#------------------------------------------------------------------------------
#
#  Kiwi parser
#
#------------------------------------------------------------------------------

class Parser:

    def __init__( self, baseDirectory, inputEncoding="utf8", outputEncoding="utf8" ):
        self.blockParsers  = []
        self.inlineParsers = []
        self.customParsers = {}
        self.baseDirectory = baseDirectory
        self.inputEncoding = inputEncoding
        self.outputEncoding = outputEncoding
        self.createBlockParsers()
        self.createInlineParsers()
        self.createCustomParsers()

    def createBlockParsers( self ):
        self.blockParsers.extend((
            CommentBlockParser(),
            MarkupBlockParser(),
            PreBlockParser(),
            PreBlockParser2(),
            TableBlockParser(),
            ReferenceEntryBlockParser(),
            TitleBlockParser(),
            SectionBlockParser(),
            DefinitionBlockParser(),
            ListItemBlockParser(),
            ReferenceEntryBlockParser(),
            TaggedBlockParser(),
        ))
        self.defaultBlockParser = ParagraphBlockParser()

    def createCustomParsers( self ):
        #self.customParsers["Meta"] = MetaBlockParser()
        self.customParsers["pre"]  = PreBlockParser()
        #self.customParsers["table"]= TableBlockParser()
        pass

    def createInlineParsers( self ):
        # The escaped and markup inline parsers are the most important parsers,
        # because they MUST be invoked before any other.
        self.escapedParser = EscapedInlineParser()
        self.commentParser = CommentInlineParser()
        self.markupParser  = MarkupInlineParser()
        def normal( x,y ): return self.normaliseText(x.group(1))
        def term  ( x,y ): return self.normaliseText(x.group()[1:-1])
        self.inlineParsers.extend((
            self.escapedParser,
            self.commentParser,
            self.markupParser,
            EscapedStringInlineParser(),
            InlineParser("email",       RE_EMAIL),
            InlineParser("url",         RE_URL),
            InlineParser("url",         RE_URL_2),
            EntityInlineParser(),
            LinkInlineParser(),
            PreInlineParser(),
            TargetInlineParser(),
            InlineParser("code",        RE_CODE_2),
            InlineParser("code",        RE_CODE),
            InlineParser("term",        RE_TERM,     normal),
            InlineParser("strong",      RE_STRONG,   normal),
            InlineParser("emphasis",    RE_EMPHASIS, normal),
            InlineParser("quote",       RE_QUOTED,   normal),
            InlineParser("code",        RE_CODE_3, requiresLeadingSpace=True),
            InlineParser("citation",    RE_CITATION, normal),
            # Special characters
            InlineParser("break",       RE_BREAK),
            InlineParser(None,          RE_SWALLOW_BREAK),
            InlineParser("newline",     RE_NEWLINE),
            InlineParser("dots",        RE_DOTS),
            ArrowInlineParser(),
            InlineParser("endash",      RE_LONGDASH),
            InlineParser("emdash",      RE_LONGLONGDASH),
        ))

    def _initialiseContextDocument(self, context):
        """Creates the XML document that will be populated by Kiwi
        parsing."""
        document  = dom.createDocument(None,None,None)
        root_node = document.createElementNS(None, "Document")
        document.appendChild(root_node)
        context.rootNode = root_node
        context.document = document
        context.header   = document.createElementNS(None, "Header")
        context.content  = document.createElementNS(None, "Content")
        context.references = document.createElementNS(None, "References")
        context.appendices = document.createElementNS(None, "Appendices")
        root_node.appendChild(context.header)
        root_node.appendChild(context.content)
        root_node.appendChild(context.references)
        root_node.appendChild(context.appendices)
        context.currentNode = context.content

    # EXCEPTIONS_______________________________________________________________

    def _print( self, message, context ):
        text   = context.documentText[:context.getOffset()]
        line   = len(text.split("\n"))
        offset = context.getOffset() - text.rfind("\n") - 1
        message = unicode(message % (line, offset) + "\n")
        sys.stderr.write(message.encode("iso-8859-1"))

    def warning( self, message, context ):
        self._print( "WARNING at line %4d, character %3d: "+message, context)

    def tip( self, message, context ):
        self._print( "%4d:%3d >> " +message, context)

    def error( self, message, context ):
        self._print( "ERROR at line %4d, character %3d: "+message, context)

    # PARSING__________________________________________________________________

    def parse( self, text, offsets=False ):
        """Parses the given text, and returns an XML document. If `offsets` is
        set to True, then all nodes of the document are annotated with their
        position in the original text as well as with a number. The document
        will also have an `offsets` attribute that will contain a list of
        (start, end) offset tuples for each element."""
        # Text MUST be unicode
        assert type(text) == type(u"")
        context = Context(text, markOffsets=offsets)
        self._initialiseContextDocument(context)
        context.parser = self
        while not context.documentEndReached():
            self._parseNextBlock(context)
        # We remove unnecessary nodes
        for node in ( context.header, context.content, context.references,
        context.appendices ):
            if len(node.childNodes) == 0:
                context.rootNode.removeChild(node)
        if offsets:
            context.offsets = self._updateElementOffsets(context, offsets=[])
        return context.document

    def parseContext( self, context ):
        while not context.documentEndReached():
            self._parseNextBlock(context)

    def _parseNextBlock( self, context, end=None ):
        """Parses the block identified in the given context, ending at the given
        'end' (if 'end' is not None)."""
        assert context!=None
        # This variable indicates if at least one block parser recognised the
        # current block
        recognised = None
        # We find the block start and end
        block_start_offset = context.getOffset()
        block_end_offset, next_block_start_offset = \
            self._findNextBlockSeparator(context)
        # If we specify the end
        if end != None:
            block_end_offset = min(end, block_end_offset)
        # If the block is an empty block (a SEPARATOR), we try to find the
        # parent node
        if block_end_offset == block_start_offset:
            # We rewind until we find a "Content" block
            while context.currentNode.nodeName != "Content" and \
            context.currentNode.parentNode != None:
                context.currentNode = context.currentNode.parentNode
        # Otherwise we set the current block and process it
        else:
            context.setCurrentBlock(block_start_offset, block_end_offset)
            assert block_start_offset < block_end_offset <= next_block_start_offset
            # We first look for a block parser that recognises the current
            # context
            assert len(self.blockParsers)>0
            for blockParser in self.blockParsers:
                context.setOffset(block_start_offset)
                recognised = blockParser.recognises(context)
                context.setOffset(block_start_offset)
                if recognised: break
            # If no block parser recognised the block, we use the default block
            # parser
            if not recognised:
                blockParser = self.defaultBlockParser
                recognised = self.defaultBlockParser.recognises(context)
                context.setOffset(block_start_offset)
                assert recognised
            start_offset = str(context.getOffset())
            blockParser.process(context, recognised)
            # Just in case the parser modified the end offset, we update
            # the next block start offset
            next_block_start_offset  = context.blockEndOffset
            node = context.currentNode
        # In any case, we set the offset to the next block start
        context.setOffset(next_block_start_offset)

    def parseBlock( self, context, node, textProcessor ):
        """Parses the current block, looking for the inlines it may contain."""
        #if context.markOffsets and not node.getAttributeNS(None,"_start"):
        #   node.setAttributeNS(None, "_start", str(context.getOffset()))
        while not context.blockEndReached():
            self._parseNextInline(context, node, textProcessor)
        #if context.markOffsets and not node.getAttributeNS(None,"_end"):
        #   node.setAttributeNS(None, "_end", str(context.getOffset()))

    def _parseNextInline( self, context, node, textProcessor ):
        """Parses the content of the current block, starting at the context
        offset, modifying the given node and updating the context offset.
        This returns a triple (offset, information, parser) where
        information is the result of the parser `recognises' method."""
        assert context and node and textProcessor
        assert not context.blockEndReached()
        parse_offset = context.getOffset()
        matchedResult = context.findNextInline(self.inlineParsers)
        # If an inline parser recognised the block content then we can parse
        # it without problem
        if matchedResult:
            # We append the text between the search start offset and the matched
            # block start
            text = context.currentFragment()[:matchedResult[0]]
            if text:
                text = textProcessor( context, text )
                text_node = context.document.createTextNode(text)
                node.appendChild(text_node)
            new_offset = matchedResult[2].parse(context, node, matchedResult[1])
            # We increase the offset so that the next parsing offset will be
            # the end of the parsed inline.
            context.increaseOffset(new_offset)
        # When we have not found any matched result, this means that we simply
        # have to append the whole block as a text node
        else:
            assert parse_offset < context.blockEndOffset
            text = textProcessor(context,
                context.documentText[parse_offset:context.blockEndOffset]
            )
            text_node = context.document.createTextNode(text)
            node.appendChild(text_node)
            # We set the end to the block end
            context.setOffset(context.blockEndOffset)
        # We make sure the parsers have actually augmented the offset
        assert context.getOffset() >= parse_offset
        return matchedResult

    def _findNextBlockSeparator( self, context ):
        """Returns a couple (block end offset, next block start offset) locating
        the next block separator, taking into account possible custom block
        objects."""
        #FIXME: Should check if the found block separator is contained in a
        #custom block or not.
        block_match = RE_BLOCK_SEPARATOR.search(context.documentText,
        context.getOffset())
        if block_match:
            local_offset = context.getOffset()
            # We look for a markup inline between the current offset and the
            # next block separator
            while local_offset<block_match.start():
                markup_match = RE_MARKUP.search(context.documentText,
                    local_offset, block_match.start())
                # If we have not found a markup, we break
                if not markup_match: break
                if markup_match:
                    # We have specified that markup inlines should not be searched
                    # after the block separator
                    local_offset, result = self._delimitXMLMarkupBlock(context, markup_match, block_match, local_offset)
                    if not result is None: return result
            # We have found a block with no nested markup
            return (block_match.start(), block_match.end())
        # There was no block separator, so we reached the document end
        else:
            return (context.documentTextLength, context.documentTextLength)

    def _delimitXMLMarkupBlock( self, context, markupMatch, blockMatch, localOffset ):
        markup_match = markupMatch
        block_match  = blockMatch
        local_offset = localOffset
        assert markup_match.start()<block_match.start()
        # Case 1: Markup is a start tag
        if Markup_isStartTag(markup_match):
            # We look for the markup end inline
            offsets = context.saveOffsets()
            context.setCurrentBlock(markup_match.end(),context.documentTextLength)
            # There may be no 2nd group, so we have to check this. Old
            # Kiwi documents may have [start:something] instead of
            # [start something]
            markup_end = None
            if markup_match.group(1):
                markup_end = self.markupParser.findEnd(
                    markup_match.group(1).strip(), context)
            context.restoreOffsets(offsets)
            # If we found an end markup
            if markup_end:
                # The returned end is relative to the start markup end
                # offset (markup_end is a couple indicating the range
                # covered by the matched end markup)
                markup_end = markup_match.end() + markup_end[1]
                # If the end is greater than the block end, then we have
                # to recurse to look for a new block separator
                # after the block end
                if markup_end > block_match.start():
                    offsets = context.saveOffsets()
                    context.setOffset(markup_end)
                    result =  self._findNextBlockSeparator(context)
                    context.restoreOffsets(offsets)
                    # NOTE: This is the case where we found
                    # the block
                    return local_offset, result
                # Otherwise we simply increase the offset, and look for
                # other possible markup inlines
                else:
                    local_offset = markup_end
            # If there was no markup end, we skip the markup inline
            else:
                local_offset = markup_match.end()
        # We have a single tag, so we simply increase the offset
        else:
            local_offset = markup_match.end()
        return local_offset, None

    def _nodeHasOffsets( self, node ):
        start, end = self._nodeGetOffsets(node)
        return start != None and end != None

    def _nodeGetOffsets( self, node ):
        start = node.getAttributeNS(None, "_start")
        end   = node.getAttributeNS(None, "_end")
        if start == '': start = None
        if end   == '': end   = None
        if start != None: start = int(start)
        else: start = None
        if end != None: end = int(end)
        else: end = None
        return (start,end)

    def _nodeEnsureOffsets( self, node, start=None, end=None ):
        nstart, nend = self._nodeGetOffsets(node)
        if nstart is None and start != None:
            node.setAttributeNS(None, "_start", str(start))
        if nend is None and end != None:
            node.setAttributeNS(None, "_end", str(end))

    def _updateElementOffsets( self, context, node=None, counter=0, offsets=None ):
        """This function ensures that every element has a _start and _end
        attribute indicating the bit of original data it comes from."""
        if node == None:
            node = context.document.childNodes[0]
            self._nodeEnsureOffsets(node, 0, context.documentTextLength)
        node.setAttributeNS(None, "_number", str(counter))
        # The given offsets parameter is an array with the node number and the
        # offsets. It can be used by embedders to easily access nodes by offset
        if offsets != None:
            assert len(offsets) == counter, "%s != %s" % (len(offsets) , counter)
            this_offsets = [None,None]
            offsets.append(this_offsets)
        # The first step is to fill an array with the child nodes offsets
        # Each child node may or may not have an offset
        child_nodes = tuple([n for n in node.childNodes if n.nodeType == n.ELEMENT_NODE])
        nstart, nend = self._nodeGetOffsets(node)
        if child_nodes:
            self._nodeEnsureOffsets(child_nodes[0], start=nstart)
            self._nodeEnsureOffsets(child_nodes[-1], end=nend)
        child_offsets = []
        start = end = None
        for e in child_nodes:
            counter = self._updateElementOffsets(context, e, counter + 1)
            child_offsets.append(self._nodeGetOffsets(e))
        # Once this list is created, we retrieve the start offset of the earliest
        # child that has a start offset, and likewise the end offset of the latest
        # child
        child_start = None
        child_end   = None
        if child_offsets:
            i = 0
            j = len(child_offsets) - 1
            while i < len(child_offsets) and child_offsets[i][0] == None: i += 1
            while j >= 0 and child_offsets[j][1] == None: j -= 1
            if i < len(child_offsets): child_start = child_offsets[i][0]
            if j >= 0: child_end   = child_offsets[j][1]
        # We update the current node with the child offsets (this allows nodes
        # that have incomplete offsets to be completed)
        self._nodeEnsureOffsets(node, child_start, child_end)
        # And now we update the children offsets again (so that they actually
        # all have offsets), because we have all the information we need to
        # actually update the children offsets
        start, end = self._nodeGetOffsets(node)
        self._propagateElementOffsets(node,start,end)
        # As we now know the current node offsets, we can set the real values in
        # the given offsets array, by simply updating the offsets value in the
        # `this_offsets` list.
        if offsets!=None:
            o = self._nodeGetOffsets(node)
            this_offsets[0] = o[0]
            this_offsets[1] = o[1]
        # And we return the number of this node
        return counter

    def _propagateElementOffsets( self, element, start=None, end=None ):
        """Used by _updateElementOffsets to ensure start and end offsets in
        all children and descendants."""
        #if start is None or end is None: return
        child_nodes = list([n for n in element.childNodes if n.nodeType == n.ELEMENT_NODE])
        self._nodeEnsureOffsets(element, start, end)
        # At first, we set the bounds properly, so that the first child node
        # start is this node start, and the last node end is this node end
        if child_nodes:
            self._nodeEnsureOffsets(child_nodes[0],  start=start)
            self._nodeEnsureOffsets(child_nodes[-1], end=end)
        # Now we propagate the offsets to the children: first forward, so that
        # each child inherits a start offset, then backward (in reverse order)
        # so that each child inherits an end offset
        for child in child_nodes:
            self._propagateElementOffsets(child, start=start)
            if self._nodeGetOffsets(child)[1] != None:
                _,nstart = self._nodeGetOffsets(child)
                if nstart != None: start = nstart
        child_nodes.reverse()
        for child in child_nodes:
            self._propagateElementOffsets(child,end=end)
            if self._nodeGetOffsets(child)[0] != None:
                nend,_ = self._nodeGetOffsets(child)
                if nend != None: end = nend
        # TODO: Maybe update the element offsets


    # TEXT PROCESSING UTILITIES________________________________________________

    def normaliseText( self, text ):
        """Treats tabs, EOLs and runs of spaces as a single space."""
        # We do not strip the text because white spaces are important
        return RE_SPACES.sub(u" ",text)
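
    # For example, normaliseText(u"some\t text\nacross  lines") returns
    # u"some text across lines" (runs of whitespace collapse to single spaces).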

    def expandTabs( self, text, cut=0 ):
        """Expands the tabs in the given text, cutting the first n characters
        of each line, where n is given by the 'cut' argument. This tab
        expansion algorithm works better than Python's built-in line expansion
        algorithms."""
        if not text: return ""
        new_text = ""
        for line in text.split("\n"):
            start = 0
            match = 1
            new_line = ""
            while match!=None:
                match = RE_TABS.search(line, start)
                if match:
                    rest  = TAB_SIZE-operator.mod(match.start(), TAB_SIZE)
                    new_line += line[start:match.start()]
                    # We grow the line with the spaces that replace the tabs
                    value =  rest+(len(match.group())-1)*TAB_SIZE
                    while value>0: new_line+=" " ; value-=1
                    # It is important to mutate the original line
                    line = new_line+line[match.end():]
                    start = len(new_line)
            new_line += line[start:]
            cut_offset = min(len(new_line), cut)
            new_text += new_line[cut_offset:] + "\n"
        # We make sure that we do not add an extra empty line
        if text[-1]!=new_text[-1]:
            return new_text[:-1]
        else:
            return new_text
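
    # For example, with TAB_SIZE == 4, expandTabs(u"\tfoo") returns u"    foo",
    # and expandTabs(u"\tfoo", cut=4) returns u"foo" (the first 4 columns of
    # each line are cut after expansion).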

    @classmethod
    def getIndentation( self, text ):
        """Returns the indentation of a string.

        Basically, if a string has its first three lines at the same
        indentation level, then its indentation will be the number
        of spaces or tabs that lead the first three lines.

        Tabs have the value given by the *TAB_SIZE* variable."""
        lines = filter(lambda x:len(x)>0, string.split(text, '\n', 4))
        indent = map(self.countLeadingSpaces, lines)
        if len(lines) == 0:
            res = 0
        elif len(lines) == 1:
            res = indent[0]
        elif len(lines) == 2:
            res = indent[1]
        else:
            res = max(max(indent[0], indent[1]), indent[2])
        return res
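
    # For example, getIndentation(u"  line1\n  line2\n  line3") returns 2,
    # while for a single line such as u"    x" it returns 4.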

    @classmethod
    def countLeadingSpaces( self, text ):
        """Returns the number of leading spaces in the given line.
        A tab will have the value given by the TAB_SIZE global."""
        count = 0
        for char in text:
            if char==u"\t":
                count += TAB_SIZE-operator.mod(count,TAB_SIZE)
            elif char==u" ":
                count+=1
            else:
                return count
        return count
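
    # For example, with TAB_SIZE == 4, countLeadingSpaces(u"\t  text") returns 6
    # (the tab counts for 4 columns, plus two spaces).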

    @classmethod
    def removeLeadingSpaces( self, text, maximum=None ):
        """Removes leading spaces and tabs from the given text, where a tab
        counts for TAB_SIZE spaces. If 'maximum' is given, removal stops once
        at least that many space-equivalents have been removed."""
        i = 0 ; count = 0
        for char in text:
            if not maximum is None and count >= maximum:
                return text[i:]
            if char==u"\t":
                count += TAB_SIZE-operator.mod(count,TAB_SIZE)
            elif char==u" ":
                count+=1
            else:
                return text[i:]
            i += 1
        return ''

    @classmethod
    def charactersToSpaces( self, text):
        """Returns a string where all characters are converted to spaces.
        Newlines and tabs are preserved."""
        new_text = u""
        for char in text:
            if char in ("\t", "\n", " "):
                new_text += char
            else:
                new_text += " "
        return new_text

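# A minimal usage sketch (illustrative only; it assumes this module's sibling
# `inlines` and `blocks` modules are on the import path, since they provide the
# block and inline parsers instantiated above):
if __name__ == "__main__":
    parser   = Parser(baseDirectory=".")
    document = parser.parse(u"Hello world.\n\nThis is a second paragraph.")
    print document.toprettyxml("  ").encode("utf8")
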
# EOF