"Fossies" - the Fresh Open Source Software Archive 
Member "Tahchee-1.0.0/Sources/tahchee/plugins/_kiwi/core.py" (22 Oct 2009, 30652 Bytes) of package /linux/privat/old/tahchee-1.0.0.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Python source code syntax highlighting (style:
standard) with prefixed line numbers.
Alternatively you can view or download the uninterpreted source code file here.
For more information about "core.py" see the
Fossies "Dox" file reference documentation.
1 #!/usr/bin/env python
2 # Encoding: iso-8859-1
3 # vim: tw=80 ts=4 sw=4 noet
4 # -----------------------------------------------------------------------------
5 # Project : Kiwi
6 # -----------------------------------------------------------------------------
7 # Author : Sebastien Pierre <sebastien@type-z.org>
8 # License : Revised BSD License
9 # -----------------------------------------------------------------------------
10 # Creation date : 07-Fev-2006
11 # Last mod. : 05-Aug-2008
12 # -----------------------------------------------------------------------------
13
14 import os, sys
15
16 import re, string, operator, getopt, codecs
17
18 # We use 4Suite domlette
19 #import Ft.Xml.Domlette
20 #dom = Ft.Xml.Domlette.implementation
21 # We use minidom implementation
22 import xml.dom.minidom
23 dom = xml.dom.minidom.getDOMImplementation()
24
25 from inlines import *
26 from blocks import *
27
28 #------------------------------------------------------------------------------
29 #
30 # Globals
31 #
32 #------------------------------------------------------------------------------
33
# How many spaces a tab represent.
TAB_SIZE = 4

#------------------------------------------------------------------------------
#
# Regular expressions
#
#------------------------------------------------------------------------------

# A block separator is an "empty" line (possibly containing spaces, tabs or
# carriage returns) between two blocks of text.
# NOTE(review): re.LOCALE combined with unicode patterns is Python 2 only;
# Python 3.6+ rejects re.LOCALE for str patterns.
RE_BLOCK_SEPARATOR = re.compile(u"[ \t\r]*\n[ \t\r]*\n", re.MULTILINE | re.LOCALE)
# Any run of whitespace (including newlines); used by Parser.normaliseText.
RE_SPACES = re.compile(u"[\s\n]+", re.LOCALE|re.MULTILINE)
# One or more consecutive tab characters; used by Parser.expandTabs.
RE_TABS = re.compile("\t+")
# Matches a NAME='value' or NAME="value" attribute pair.
ATTRIBUTE = u"""(\w+)\s*=\s*('[^']*'|"[^"]*")"""
RE_ATTRIBUTE = re.compile(ATTRIBUTE, re.LOCALE|re.MULTILINE)
48
49 #------------------------------------------------------------------------------
50 #
51 # Parsing context
52 #
53 #------------------------------------------------------------------------------
54
class Context:
    """The context stores information on the currently processed document. It
    has the following attributes:

    - document: a reference to the current XML document.
    - rootNode: the current XML document root node.
    - header: the XML node corresponding to the header.
    - content: the XML node corresponding to the content.
    - references: the XML node corresponding to the references.
    - appendices: the XML node corresponding to the appendices.
    - currentNode: the XML node to which attributes/nodes are added during
      parsing.
    - blockStartOffset: the offset in the text where the currently parsed block
      starts.
    - blockEndOffset: the offset in the text where the currently parsed block
      ends.
    - parser: a reference to the Kiwi parser instance using the context.
    """

    def __init__( self, documentText, markOffsets=False ):
        # The XML document/node references are filled in later by the parser
        # (see Parser._initialiseContextDocument).
        self.document = None
        self.rootNode = None
        self.header = None
        self.content = None
        self.references = None
        self.appendices = None
        self.currentNode = None
        # Current parsing position within documentText (access via
        # getOffset/setOffset so the fragment cache stays consistent).
        self._offset = 0
        self.blockStartOffset = 0
        self.blockEndOffset = -1
        self.setDocumentText(documentText)
        # Cached documentText[offset:blockEndOffset]; invalidated whenever the
        # offset or the block bounds change.
        self._currentFragment = None
        self.parser = None
        self.markOffsets = markOffsets
        # Stack of (node, contentNode, depth) tuples, see declareSection.
        self.sections = []
        # These are convenience attributes used to make it easy for
        # post-verification of the links (are they all resolved)
        self._links = []
        self._targets = []

    def _getElementsByTagName(self, node, name):
        # Recursively collects `node` itself (when it matches) plus every
        # descendant element whose localName equals `name`.
        if node.nodeType == node.ELEMENT_NODE and \
        node.localName == name:
            result = [node]
        else:
            result = []
        for child in node.childNodes:
            result.extend(self._getElementsByTagName(child, name))
        return result

    def ensureElement( self, node, elementName, index=0 ):
        """Ensures that the given element exists in the given node at the given
        index. Returns the existing element, or a newly created (and appended)
        one when fewer than index+1 matches were found."""
        result = self._getElementsByTagName(node, elementName)
        if len(result)<=index:
            newElement = self.document.createElementNS(None, elementName)
            node.appendChild(newElement)
            return newElement
        else:
            return result[index]

    def ensureParent( self, parentNames, predicate=lambda x:True ):
        """Ensures that the parent node name is one of the following name. This
        is useful for block parsers which want to ensure that their parent is a
        specific node."""
        if self.currentNode!=None:
            # Walk up the tree until an acceptable parent (or the Document
            # root) becomes the current node.
            while ( self.currentNode.nodeName not in parentNames
            or not predicate(self.currentNode)
            ) and self.currentNode.nodeName!="Document":
                if self.currentNode.parentNode:
                    self.currentNode = self.currentNode.parentNode
                else:
                    # Detached node: nothing further to rewind to.
                    return

    def declareSection( self, node, contentNode, depth ):
        """Declares a section node with the given depth (which can be
        negative)."""
        self.sections.append((node, contentNode, depth))

    def getParentSection( self, depth, indent ):
        """Gets the section that would be the parent section for the
        given depth."""
        # Iterate from the most recently declared section outwards.
        for i in range(len(self.sections)-1,-1,-1):
            section = self.sections[i]
            section_node = section[0]
            section_content = section[1]
            section_depth = section[2]
            # "_indent" is set by the section block parser on the node.
            section_indent = int(section_node.getAttributeNS(None, "_indent"))
            if indent > section_indent:
                return section_content
            elif section_indent <= indent and section_depth < depth:
                return section_content
        # No enclosing section found: attach to the document content.
        return self.content

    def getDepthInSection( self, node ):
        """Returns the number of parent sections of the given node."""
        sections = 0
        while node.parentNode:
            # The node itself counts when it is a Section.
            if node.nodeName == "Section":
                sections += 1
            node = node.parentNode
        return sections

    def setDocumentText( self, text ):
        """Sets the text of the current document. This should only be called
        at context initialisation."""
        # Coerce to unicode (Python 2 `unicode` builtin).
        if not type(text) == type(u""):
            text = unicode(text)
        self.documentText = text
        self.documentTextLength = len(text)
        self.blockEndOffset = self.documentTextLength
        self.setOffset(0)

    def setOffset( self, offset ):
        """Sets the current offset."""
        self._offset = offset
        # The cached fragment is no longer valid.
        self._currentFragment = None

    def getOffset(self):
        """Returns the current offset."""
        return self._offset

    def increaseOffset( self, increase ):
        """Increases the current offset"""
        # We get the current fragment, because it will be freed by changing the
        # offset
        fragment = self.currentFragment()
        self.setOffset(self.getOffset()+increase)
        # We optimise current fragment access, by restoring it with a proper
        # value when possible
        if self.getOffset()<self.blockEndOffset:
            self._currentFragment = fragment[increase:]

    def decreaseOffset( self, decrease ):
        """Decreases the offset."""
        self.increaseOffset(-decrease)

    def fragment( self, start, end ):
        """Returns the text fragment that starts and ends at the given
        offsets."""
        return self.documentText[start:end]

    def currentFragment( self ):
        """Returns the current text fragment, from the current offset to the
        block end offset."""
        assert self.getOffset()<self.blockEndOffset,\
        "Offset greater than block end: %s >= %s" % (self.getOffset(), self.blockEndOffset)
        if not self._currentFragment:
            self._currentFragment = \
            self.documentText[self.getOffset():self.blockEndOffset]
        return self._currentFragment

    def documentEndReached( self ):
        """Returns true if the current offset is greater than the document
        length"""
        return self.getOffset() >= self.documentTextLength

    def blockEndReached( self ):
        """Returns true when the current offset has exceeded the current block
        end offset"""
        return self.getOffset() >= self.blockEndOffset

    def offsetInBlock( self, offset ):
        """Tells if the given offset is in the current block."""
        return self.blockStartOffset <= offset <= self.blockEndOffset

    def setCurrentBlock( self, startOffset, endOffset ):
        """Sets the start and end offset of the current block. The current
        offset is set to the current block start."""
        # A non-positive end offset is interpreted relative to the document
        # end.
        if endOffset <= 0: endOffset += self.documentTextLength
        assert startOffset>=0
        assert endOffset<=self.documentTextLength
        assert startOffset<=endOffset, "Start offset too big: %s > %s" % (startOffset, endOffset)
        self.setOffset(startOffset)
        self.blockStartOffset = startOffset
        self.blockEndOffset = endOffset
        self._currentFragment = None

    def setCurrentBlockEnd( self, endOffset ):
        """Moves the end of the current block (the start is unchanged)."""
        assert endOffset >= self.blockStartOffset
        self.blockEndOffset = endOffset
        self._currentFragment = None

    def getBlockIndentation( self ):
        """Returns the indentation of the current block."""
        return self.parser.getIndentation(
        self.documentText[self.blockStartOffset:self.blockEndOffset])

    def saveOffsets( self ):
        """Returns a value that can be later used with the restoreOffsets
        method to restore the offsets as they were."""
        return (self.blockStartOffset, self.getOffset(), self.blockEndOffset)

    def restoreOffsets( self, offsets ):
        """Takes a value returned by saveOffsets and restores the offsets as
        they were."""
        self.blockStartOffset = offsets[0]
        self.setOffset( offsets[1] )
        self.blockEndOffset = offsets[2]

    def clone( self ):
        """Returns a clone of the current context, which can be changed safely
        without modifying the current context."""
        clone = Context(self.documentText)
        clone.document = self.document
        clone.rootNode = self.rootNode
        clone.header = self.header
        clone.content = self.content
        clone.references = self.references
        clone.appendices = self.appendices
        clone.currentNode = self.currentNode
        clone.parser = self.parser
        # NOTE(review): clone.document is assigned twice (also above) — the
        # second assignment looks redundant.
        clone.document = self.document
        clone.setOffset(self.getOffset())
        clone.setCurrentBlock(self.blockStartOffset, self.blockEndOffset)
        return clone

    def findNextInline( self, inlineParsers ):
        """Finds the next inline in the given context, using the given list of
        inline parsers. This does not modifies the context.

        Returns either None or a triple (offset, information, parser), where
        the offset is relative to the context offset and indicates the start
        offset where the parser recognised its tag and information is the
        information returned by the parser."""
        # We look for the inline parser that parses an inline with the lowest
        # offset
        results = []
        for inlineParser in inlineParsers:
            match_offset, result = inlineParser.recognises(self)
            if match_offset!=None:
                assert match_offset >= 0
                results.append((match_offset, result, inlineParser))
        matchedResult = None
        minimumOffset = self.documentTextLength+1
        # We get the closest matching parser
        for result in results:
            if result[0]<minimumOffset:
                minimumOffset = result[0]
                matchedResult = result
        return matchedResult

    def parseAttributes( self, text ):
        """Parses attributes expressed in the given text. Attributes have the
        following form: ATTRIBUTE="VALUE" and are separated by spaces.
        Returns a dict mapping attribute names to their unquoted values."""
        if not text: return {}
        text = text.strip()
        attributes = {}
        match = True
        # We parse attributes
        while match and text:
            match = RE_ATTRIBUTE.match(text)
            if not match: break
            # group(2) includes the surrounding quotes, hence the [1:-1].
            attributes[match.group(1)] = match.group(2)[1:-1]
            # NOTE(review): `offset` is assigned but never used.
            offset = match.end()
            text = text[match.end():].strip()
        return attributes
312
313 #------------------------------------------------------------------------------
314 #
315 # Kiwi parser
316 #
317 #------------------------------------------------------------------------------
318
class Parser:
    """Parses a Kiwi text document into an XML DOM document. Block-level and
    inline-level constructs are recognised by the parser objects provided by
    the `blocks` and `inlines` wildcard imports."""

    def __init__( self, baseDirectory, inputEncoding="utf8", outputEncoding="utf8" ):
        # Ordered lists of parser objects; order expresses priority (the
        # first parser that recognises a block/inline wins).
        self.blockParsers = []
        self.inlineParsers = []
        self.customParsers = {}
        self.baseDirectory = baseDirectory
        self.inputEncoding = inputEncoding
        self.outputEncoding = outputEncoding
        self.createBlockParsers()
        self.createInlineParsers()
        self.createCustomParsers()

    def createBlockParsers( self ):
        """Instantiates the block parsers, in priority order (see
        _parseNextBlock, which picks the first parser that recognises the
        current block)."""
        self.blockParsers.extend((
            CommentBlockParser(),
            MarkupBlockParser(),
            PreBlockParser(),
            PreBlockParser2(),
            TableBlockParser(),
            ReferenceEntryBlockParser(),
            TitleBlockParser(),
            SectionBlockParser(),
            DefinitionBlockParser(),
            ListItemBlockParser(),
            # NOTE(review): ReferenceEntryBlockParser is registered twice
            # (also above) — confirm whether the duplicate is intentional.
            ReferenceEntryBlockParser(),
            TaggedBlockParser(),
        ))
        # Fallback used when no registered parser recognises a block.
        self.defaultBlockParser = ParagraphBlockParser()

    def createCustomParsers( self ):
        """Registers parsers for custom (named) blocks."""
        #self.customParsers["Meta"] = MetaBlockParser()
        self.customParsers["pre"] = PreBlockParser()
        #self.customParsers["table"]= TableBlockParser()
        pass

    def createInlineParsers( self ):
        # Escaped and markup inline parser are the most important parsers,
        # because they MUST be invoked before any other.
        self.escapedParser = EscapedInlineParser()
        self.commentParser = CommentInlineParser()
        self.markupParser = MarkupInlineParser()
        # Text processors handed to individual inline parsers.
        def normal( x,y ): return self.normaliseText(x.group(1))
        def term ( x,y ): return self.normaliseText(x.group()[1:-1])
        self.inlineParsers.extend((
            self.escapedParser,
            self.commentParser,
            self.markupParser,
            EscapedStringInlineParser(),
            InlineParser("email", RE_EMAIL),
            InlineParser("url", RE_URL),
            InlineParser("url", RE_URL_2),
            EntityInlineParser(),
            LinkInlineParser(),
            PreInlineParser(),
            TargetInlineParser(),
            InlineParser("code", RE_CODE_2),
            InlineParser("code", RE_CODE),
            InlineParser("term", RE_TERM, normal),
            InlineParser("strong", RE_STRONG, normal),
            InlineParser("emphasis", RE_EMPHASIS, normal),
            InlineParser("quote", RE_QUOTED, normal),
            InlineParser("code", RE_CODE_3, requiresLeadingSpace=True),
            InlineParser("citation", RE_CITATION, normal),
            # Special characters
            InlineParser("break", RE_BREAK),
            InlineParser(None, RE_SWALLOW_BREAK),
            InlineParser("newline", RE_NEWLINE),
            InlineParser("dots", RE_DOTS),
            ArrowInlineParser(),
            InlineParser("endash", RE_LONGDASH),
            InlineParser("emdash", RE_LONGLONGDASH),
        ))

    def _initialiseContextDocument(self, context):
        """Creates the XML document that will be populated by Kiwi
        parsing."""
        document = dom.createDocument(None,None,None)
        root_node = document.createElementNS(None, "Document")
        document.appendChild(root_node)
        context.rootNode = root_node
        context.document = document
        # The four top-level sections of a Kiwi document.
        context.header = document.createElementNS(None, "Header")
        context.content = document.createElementNS(None, "Content")
        context.references = document.createElementNS(None, "References")
        context.appendices = document.createElementNS(None, "Appendices")
        root_node.appendChild(context.header)
        root_node.appendChild(context.content)
        root_node.appendChild(context.references)
        root_node.appendChild(context.appendices)
        # Parsing appends to the content node by default.
        context.currentNode = context.content

    # EXCEPTIONS_______________________________________________________________

    def _print( self, message, context ):
        """Writes `message` — a format string expecting a (line, character)
        pair — to stderr, locating the current context offset in the source
        text."""
        text = context.documentText[:context.getOffset()]
        line = len(text.split("\n"))
        offset = context.getOffset() - text.rfind("\n") - 1
        message = unicode(message % (line, offset) + "\n")
        # NOTE(review): output encoding is hard-coded to iso-8859-1 rather
        # than self.outputEncoding — confirm whether this is intentional.
        sys.stderr.write(message.encode("iso-8859-1"))

    def warning( self, message, context ):
        """Prints a warning located at the current context offset."""
        self._print( "WARNING at line %4d, character %3d: "+message, context)

    def tip( self, message, context ):
        """Prints an informational tip located at the current context offset."""
        self._print( "%4d:%3d >> " +message, context)

    def error( self, message, context ):
        """Prints an error located at the current context offset."""
        self._print( "ERROR at line %4d, character %3d: "+message, context)

    # PARSING__________________________________________________________________

    def parse( self, text, offsets=False ):
        """Parses the given text, and returns an XML document. If `offsets` is
        set to True, then all nodes of the document are annotated with their
        position in the original text as well with a number. The document will
        also have an `offsets` attribute that will contain a list of (start,
        end) offset tuples for each element."""
        # Text MUST be unicode
        assert type(text) == type(u"")
        context = Context(text, markOffsets=offsets)
        self._initialiseContextDocument(context)
        context.parser = self
        while not context.documentEndReached():
            self._parseNextBlock(context)
        # We remove unnecessary nodes
        for node in ( context.header, context.content, context.references,
        context.appendices ):
            if len(node.childNodes) == 0:
                context.rootNode.removeChild(node)
        if offsets:
            context.offsets = self._updateElementOffsets(context, offsets=[])
        return context.document

    def parseContext( self, context ):
        """Parses all remaining blocks of the given (already initialised)
        context."""
        while not context.documentEndReached():
            self._parseNextBlock(context)

    def _parseNextBlock( self, context, end=None ):
        """Parses the block identified in the given context, ending at the given
        'end' (if 'end' is not None)."""
        assert context!=None
        # This variable indicates if at least one block parser recognised the
        # current block
        recognised = None
        # We find block start and end
        block_start_offset = context.getOffset()
        block_end_offset, next_block_start_offset = \
        self._findNextBlockSeparator(context)
        # If we specify the end
        if end != None:
            block_end_offset = min(end, block_end_offset)
        # If the block is an empty block (a SEPARATOR), we try to find the
        # parent node
        if block_end_offset == block_start_offset:
            # We rewind until we find a "Content" block
            while context.currentNode.nodeName != "Content" and \
            context.currentNode.parentNode != None:
                context.currentNode = context.currentNode.parentNode
        # Otherwise we set the current block and process it
        else:
            context.setCurrentBlock(block_start_offset, block_end_offset)
            assert block_start_offset < block_end_offset <= next_block_start_offset
            # We first look for a block parser that recognises the current
            # context
            assert len(self.blockParsers)>0
            for blockParser in self.blockParsers:
                context.setOffset(block_start_offset)
                recognised = blockParser.recognises(context)
                # The offset is reset so the winning parser processes the
                # block from its start.
                context.setOffset(block_start_offset)
                if recognised: break
            # If no block parser was recognised, we used the default block
            # parser
            if not recognised:
                blockParser = self.defaultBlockParser
                recognised = self.defaultBlockParser.recognises(context)
                context.setOffset(block_start_offset)
                assert recognised
            # NOTE(review): start_offset is computed but never used.
            start_offset = str(context.getOffset())
            blockParser.process(context, recognised)
            # Just in case the parser modified the end offset, we update
            # the next block start offset
            next_block_start_offset = context.blockEndOffset
            # NOTE(review): `node` is assigned but never used.
            node = context.currentNode
        # Anyway, we set the offset to the next block start
        context.setOffset(next_block_start_offset)

    def parseBlock( self, context, node, textProcessor ):
        """Parses the current block, looking for the inlines it may contain."""
        #if context.markOffsets and not node.getAttributeNS(None,"_start"):
        #	node.setAttributeNS(None, "_start", str(context.getOffset()))
        while not context.blockEndReached():
            self._parseNextInline(context, node, textProcessor)
        #if context.markOffsets and not node.getAttributeNS(None,"_end"):
        #	node.setAttributeNS(None, "_end", str(context.getOffset()))

    def _parseNextInline( self, context, node, textProcessor ):
        """Parses the content of the current block, starting at the context
        offset, modifying the given node and updating the context offset.
        This returns the a triple (offset, information, parser) where
        information is the result of the parser `recognises' method."""
        assert context and node and textProcessor
        assert not context.blockEndReached()
        parse_offset = context.getOffset()
        matchedResult = context.findNextInline(self.inlineParsers)
        # If an inline parser recognised the block content then we can parse
        # it without problem
        if matchedResult:
            # We append the text between the search start offset and the matched
            # block start
            text = context.currentFragment()[:matchedResult[0]]
            if text:
                text = textProcessor( context, text )
                text_node = context.document.createTextNode(text)
                node.appendChild(text_node)
            new_offset = matchedResult[2].parse(context, node, matchedResult[1])
            # We increase the offset so that the next parsing offset will be
            # the end of the parsed inline.
            context.increaseOffset(new_offset)
        # When we have not found any matched result, this means that we simply
        # have to append the whole block as a text node
        else:
            assert parse_offset < context.blockEndOffset
            text = textProcessor(context,
            context.documentText[parse_offset:context.blockEndOffset]
            )
            text_node = context.document.createTextNode(text)
            node.appendChild(text_node)
            # We set the end to the block end
            context.setOffset(context.blockEndOffset)
        # We make sure the parsers have actually augmented the offset
        assert context.getOffset() >= parse_offset
        return matchedResult

    def _findNextBlockSeparator( self, context ):
        """Returns a match object that matches the next block separator, taking
        into account possible custom block objects. The result is actually a
        (blockEndOffset, nextBlockStartOffset) couple."""
        #FIXME: Should check if the found block separator is contained in a
        #custom block or not.
        block_match = RE_BLOCK_SEPARATOR.search(context.documentText,
        context.getOffset())
        if block_match:
            local_offset = context.getOffset()
            # We look for a markup inline between the current offset and the
            # next block separator
            while local_offset<block_match.start():
                markup_match = RE_MARKUP.search(context.documentText,
                local_offset, block_match.start())
                # If we have not found a markup, we break
                if not markup_match: break
                # NOTE(review): this check is redundant — the `break` above
                # guarantees markup_match is truthy here.
                if markup_match:
                    # We have specified that markup inlines should not be searched
                    # after the block separator
                    local_offset, result = self._delimitXMLMarkupBlock(context, markup_match, block_match, local_offset)
                    if not result is None: return result
            # We have found a block with no nested markup
            return (block_match.start(), block_match.end())
        # There was no block separator, so we reached the document end
        else:
            return (context.documentTextLength, context.documentTextLength)

    def _delimitXMLMarkupBlock( self, context, markupMatch, blockMatch, localOffset ):
        """Helper for _findNextBlockSeparator: handles a markup inline found
        before the candidate block separator. Returns an (offset, result)
        couple — `result` is non-None when a block separator beyond the markup
        end has been found (and should be returned as-is), otherwise `offset`
        is the new local offset from which to keep scanning."""
        markup_match = markupMatch
        block_match = blockMatch
        local_offset = localOffset
        assert markup_match.start()<block_match.start()
        # Case 1: Markup is a start tag
        if Markup_isStartTag(markup_match):
            # We look for the markup end inline
            offsets = context.saveOffsets()
            context.setCurrentBlock(markup_match.end(),context.documentTextLength)
            # There may be no 2nd group, so we have to check this. Old
            # Kiwi documents may have [start:something] instead of
            # [start something]
            markup_end = None
            if markup_match.group(1):
                markup_end = self.markupParser.findEnd(
                markup_match.group(1).strip(), context)
            context.restoreOffsets(offsets)
            # If we found an end markup
            if markup_end:
                # The returned end is relative to the start markup end
                # offset (markup_end is a couple indicating the range
                # covered by the matched end markup)
                markup_end = markup_match.end() + markup_end[1]
                # If the end is greater than the block end, then we have
                # to recurse to look for a new block separator
                # after the block end
                if markup_end > block_match.start():
                    offsets = context.saveOffsets()
                    context.setOffset(markup_end)
                    result = self._findNextBlockSeparator(context)
                    context.restoreOffsets(offsets)
                    # NOTE: This is the case where we found
                    # the block
                    return local_offset, result
                # Otherwise we simply increase the offset, and look for
                # other possible markup inlines
                else:
                    local_offset = markup_end
            # If there was not markup end, we skip the markup inline
            else:
                local_offset = markup_match.end()
        # We have a single tag, so we simply increase the offset
        else:
            local_offset = markup_match.end()
        return local_offset, None

    def _nodeHasOffsets( self, node ):
        """Tells whether the node has both a _start and an _end offset."""
        start, end = self._nodeGetOffsets(node)
        return start != None and end != None

    def _nodeGetOffsets( self, node ):
        """Returns the (start, end) offsets stored in the node's _start/_end
        attributes, each being None when absent."""
        start = node.getAttributeNS(None, "_start")
        end = node.getAttributeNS(None, "_end")
        # minidom returns '' for missing attributes.
        if start == '': start = None
        if end == '': end = None
        if start != None: start = int(start)
        else: start = None
        if end != None: end = int(end)
        else: end = None
        return (start,end)

    def _nodeEnsureOffsets( self, node, start=None, end=None ):
        """Sets the node's _start/_end attributes when they are missing and a
        value is given; existing offsets are never overwritten."""
        nstart, nend = self._nodeGetOffsets(node)
        if nstart is None and start != None:
            node.setAttributeNS(None, "_start", str(start))
        if nend is None and end != None:
            node.setAttributeNS(None, "_end", str(end))

    def _updateElementOffsets( self, context, node=None, counter=0, offsets=None ):
        """This function ensures that every element has a _start and _end
        attribute indicating the bit of original data it comes from."""
        if node == None:
            # Start from the document root, which spans the whole text.
            node = context.document.childNodes[0]
            self._nodeEnsureOffsets(node, 0, context.documentTextLength)
        node.setAttributeNS(None, "_number", str(counter))
        # The given offsets parameter is an array with the node number and the
        # offsets. It can be used by embedders to easily access nods by offset
        if offsets != None:
            assert len(offsets) == counter, "%s != %s" % (len(offsets) , counter)
            this_offsets = [None,None]
            offsets.append(this_offsets)
        # The first step is to fill an array with the child nodes offsets
        # Each child node may or may not have an offset
        child_nodes = tuple([n for n in node.childNodes if n.nodeType == n.ELEMENT_NODE])
        nstart, nend = self._nodeGetOffsets(node)
        if child_nodes:
            self._nodeEnsureOffsets(child_nodes[0], start=nstart)
            self._nodeEnsureOffsets(child_nodes[-1], end=nend)
        child_offsets = []
        start = end = None
        for e in child_nodes:
            counter = self._updateElementOffsets(context, e, counter + 1)
            child_offsets.append(self._nodeGetOffsets(e))
        # Once this list is created, we retried the start offset of the earliest
        # child that has a start offset, same for the end offset of the latest
        # child
        child_start = None
        child_end = None
        if child_offsets:
            i = 0
            j = len(child_offsets) - 1
            while i < len(child_offsets) and child_offsets[i][0] == None: i += 1
            while j >= 0 and child_offsets[j][1] == None: j -= 1
            if i < len(child_offsets): child_start = child_offsets[i][0]
            if j >= 0: child_end = child_offsets[j][1]
        # We update the current node with the child offsets (this allows node
        # that have incomplete offsets to be completed)
        self._nodeEnsureOffsets(node, child_start, child_end)
        # And now we update the children offsets again (so that they actually
        # all have offsets), because we have all the information we need to
        # actually update the children offsets
        start, end = self._nodeGetOffsets(node)
        self._propagateElementOffsets(node,start,end)
        # As we now the current node offsets, we can set the real values in the
        # fiven offsets array, by simply updating the offsets value in the
        # `this_offsets` list.
        if offsets!=None:
            o = self._nodeGetOffsets(node)
            this_offsets[0] = o[0]
            this_offsets[1] = o[1]
        # And we return the number of this node
        return counter

    def _propagateElementOffsets( self, element, start=None, end=None ):
        """Used by the _updateElementOffsets to ensure start and end offsets in
        all children and descendants."""
        #if start is None or end is None: return
        child_nodes = list([n for n in element.childNodes if n.nodeType == n.ELEMENT_NODE])
        self._nodeEnsureOffsets(element, start, end)
        # At first, we set the bounds properly, so that the first child node
        # start is this node start, and the last node end is this node end
        if child_nodes:
            self._nodeEnsureOffsets(child_nodes[0], start=start)
            self._nodeEnsureOffsets(child_nodes[-1], end=end)
        # Forward sweep: each child's start defaults to the previous child's
        # end (the `_,nstart` unpacking takes the (start, end) couple's END).
        for child in child_nodes:
            self._propagateElementOffsets(child, start=start)
            if self._nodeGetOffsets(child)[1] != None:
                _,nstart = self._nodeGetOffsets(child)
                if nstart != None: start = nstart
        # Backward sweep: each child's end defaults to the following child's
        # start (here `nend` receives the couple's START).
        child_nodes.reverse()
        for child in child_nodes:
            self._propagateElementOffsets(child,end=end)
            if self._nodeGetOffsets(child)[0] != None:
                nend,_ = self._nodeGetOffsets(child)
                if nend != None: end = nend
        # TODO: Maybe update the element offsetsd


    # TEXT PROCESSING UTILITIES________________________________________________

    def normaliseText( self, text ):
        """Treats tabs eols and multiples spaces as single space, plus removes
        leading and trailing spaces."""
        # We do not strip the text because white spaces are important
        return RE_SPACES.sub(u" ",text)

    def expandTabs( self, text, cut=0 ):
        """Expands the tabs in the given text, cutting the n first characters
        of each line, where n is given by the 'cut' argument. This tabs
        expansion algorithm works better than Python line expansion
        algorithms."""
        if not text: return ""
        new_text = ""
        for line in text.split("\n"):
            start = 0
            match = 1
            new_line = ""
            while match!=None:
                match = RE_TABS.search(line, start)
                if match:
                    # Distance to the next tab stop for the first tab.
                    rest = TAB_SIZE-operator.mod(match.start(), TAB_SIZE)
                    new_line += line[start:match.start()]
                    # We grow the line with additional tabs
                    value = rest+(len(match.group())-1)*TAB_SIZE
                    while value>0: new_line+=" " ; value-=1
                    #It is important to mutate the original line
                    line = new_line+line[match.end():]
                    start = len(new_line)
            new_line += line[start:]
            cut_offset = min(len(new_line), cut)
            new_text += new_line[cut_offset:] + "\n"
        # We make sure that we do not add an extra empty line
        if text[-1]!=new_text[-1]:
            return new_text[:-1]
        else:
            return new_text

    @classmethod
    def getIndentation( self, text ):
        """Returns the indentation of a string.

        Basically if a string has the first three lines at the same
        identation level, then it is indentation will be the number
        of spaces or tabs that lead the first three lines.

        Tabs have the value given by the *TAB_SIZE* variable."""
        # Note: `self` is actually the class here (classmethod).
        # Python 2 string.split with maxsplit 4; empty lines are discarded.
        lines = filter(lambda x:len(x)>0, string.split(text, '\n', 4))
        indent = map(self.countLeadingSpaces, lines)
        if len(lines) == 0:
            res = 0
        elif len(lines) == 1:
            res = indent[0]
        elif len(lines) == 2:
            res = indent[1]
        else:
            res = max(max(indent[0], indent[1]), indent[2])
        return res

    @classmethod
    def countLeadingSpaces( self, text ):
        """Returns the number of leading spaces in the given line.
        A tab will have the value given by the TAB_SIZE global."""
        count = 0
        for char in text:
            if char==u"\t":
                # A tab advances to the next multiple of TAB_SIZE.
                count += TAB_SIZE-operator.mod(count,TAB_SIZE)
            elif char==u" ":
                count+=1
            else:
                return count
        return count

    @classmethod
    def removeLeadingSpaces( self, text, maximum=None ):
        """Removes leading whitespace from `text`, counting tabs as TAB_SIZE
        columns; at most `maximum` columns are removed when given. Returns ''
        when the text is entirely whitespace."""
        i = 0 ; count = 0
        for char in text:
            if not maximum is None and count >= maximum:
                return text[i:]
            if char==u"\t":
                count += TAB_SIZE-operator.mod(count,TAB_SIZE)
            elif char==u" ":
                count+=1
            else:
                return text[i:]
            i += 1
        return ''

    @classmethod
    def charactersToSpaces( self, text):
        """Returns a string where all characters are converted to spaces.
        Newlines and tabs are preserved"""
        new_text = u""
        for char in text:
            if char in ("\t", "\n", " "):
                new_text += char
            else:
                new_text += " "
        return new_text
830
831 # EOF