"Fossies" - the Fresh Open Source Software Archive

Member "PURELIB/trac/util/text.py" (27 Aug 2019, 29516 Bytes) of package /windows/misc/Trac-1.4.win32.exe:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Python source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively you can here view or download the uninterpreted source code file. See also the last Fossies "Diffs" side-by-side code changes report for "text.py": 1.3.5_vs_1.3.6.

# -*- coding: utf-8 -*-
#
# Copyright (C) 2003-2019 Edgewall Software
# Copyright (C) 2003-2004 Jonas Borgström <jonas@edgewall.com>
# Copyright (C) 2006 Matthew Good <trac@matt-good.net>
# Copyright (C) 2005-2006 Christian Boos <cboos@edgewall.org>
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at https://trac.edgewall.org/wiki/TracLicense.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at https://trac.edgewall.org/log/.
#
# Author: Jonas Borgström <jonas@edgewall.com>
#         Matthew Good <trac@matt-good.net>
#         Christian Boos <cboos@edgewall.org>

import __builtin__
import locale
import os
import re
import sys
import textwrap
from urllib import quote, quote_plus, unquote
from unicodedata import east_asian_width

import jinja2

CRLF = '\r\n'

class Empty(unicode):
    """A special tag object evaluating to the empty string"""
    __slots__ = []

empty = Empty()

del Empty # shouldn't be used outside of Trac core


# -- Jinja2

def jinja2env(**kwargs):
    """Creates a Jinja2 ``Environment`` configured with Trac conventions.

    All default parameters can optionally be overridden. The ``loader``
    parameter is not set by default, so unless it is set by the
    caller, only inline templates can be created from the environment.

    :rtype: `jinja2.Environment`

    """
    exts = ('html', 'rss', 'xml')
    def filterout_none(v):
        return '' if v is None else v
    def autoescape_extensions(template):
        return template and template.rsplit('.', 1)[1] in exts
    defaults = dict(
        variable_start_string='${',
        variable_end_string='}',
        line_statement_prefix='#',
        line_comment_prefix='##',
        trim_blocks=True,
        lstrip_blocks=True,
        extensions=['jinja2.ext.do', 'jinja2.ext.i18n', 'jinja2.ext.with_'],
        finalize=filterout_none,
        autoescape=autoescape_extensions,
    )
    defaults.update(kwargs)
    jenv = jinja2.Environment(**defaults)
    jenv.globals.update(
        len=len,
    )
    return jenv

def jinja2template(template, text=False):
    """Creates a Jinja2 ``Template`` from inlined source.

    :param template: the template content
    :param text: if set to `False`, the result of the variable
                 expansion will be XML/HTML escaped

    """
    return jinja2env(autoescape=not text).from_string(template)

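# Illustrative usage sketch (not part of the original module): rendering a
# small inline template with the Trac conventions set up above, i.e. the '#'
# line-statement prefix and the '${...}' variable syntax. The template text
# and variable names are made up for the example.
def _demo_jinja2template():
    tmpl = jinja2template(u'\n'.join([
        u'# for item in items',
        u'  * ${item}',
        u'# endfor',
    ]), text=True)  # text=True: no XML/HTML escaping of expanded variables
    return tmpl.render(items=[u'spam', u'eggs'])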

# -- Unicode

def to_unicode(text, charset=None):
    """Convert input to an `unicode` object.

    For a `str` object, we'll first try to decode the bytes using the given
    `charset` encoding (or UTF-8 if none is specified), then we fall back to
    the latin1 encoding which might be correct or not, but at least preserves
    the original byte sequence by mapping each byte to the corresponding
    unicode code point in the range U+0000 to U+00FF.

    For anything else, a simple `unicode()` conversion is attempted,
    with special care taken with `Exception` objects.
    """
    if isinstance(text, str):
        try:
            return unicode(text, charset or 'utf-8')
        except UnicodeDecodeError:
            return unicode(text, 'latin1')
    elif isinstance(text, Exception):
        if os.name == 'nt' and isinstance(text, EnvironmentError):
            strerror = text.strerror
            filename = text.filename
            if isinstance(strerror, basestring) and \
                    isinstance(filename, basestring):
                try:
                    if not isinstance(strerror, unicode):
                        strerror = unicode(strerror, 'mbcs')
                    if not isinstance(filename, unicode):
                        filename = unicode(filename, 'mbcs')
                except UnicodeError:
                    pass
                else:
                    if isinstance(text, WindowsError):
                        return u"[Error %s] %s: '%s'" % (text.winerror,
                                                         strerror, filename)
                    else:
                        return u"[Errno %s] %s: '%s'" % (text.errno, strerror,
                                                         filename)
            # the exception might have a localized error string encoded with
            # ANSI codepage if OSError and IOError on Windows
            try:
                return unicode(str(text), 'mbcs')
            except UnicodeError:
                pass
        # two possibilities for storing unicode strings in exception data:
        try:
            # custom __str__ method on the exception (e.g. PermissionError)
            return unicode(text)
        except UnicodeError:
            # unicode arguments given to the exception (e.g. parse_date)
            return ' '.join(to_unicode(arg) for arg in text.args)
    return unicode(text)

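# Illustrative sketch (not part of the original module): how `to_unicode`
# behaves for the common inputs documented above. Runs under Python 2,
# which is what this module targets.
def _demo_to_unicode():
    assert to_unicode('caf\xc3\xa9') == u'caf\xe9'      # valid UTF-8 bytes
    assert to_unicode('caf\xe9') == u'caf\xe9'          # falls back to latin1
    assert to_unicode(42) == u'42'                      # non-string input
    assert to_unicode(ValueError(u'bad value')) == u'bad value'  # exception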

def exception_to_unicode(e, traceback=False):
    """Convert an `Exception` to an `unicode` object.

    In addition to `to_unicode`, this representation of the exception
    also contains the class name and optionally the traceback.
    """
    message = '%s: %s' % (e.__class__.__name__, to_unicode(e))
    if traceback:
        from trac.util import get_last_traceback
        traceback_only = get_last_traceback().split('\n')[:-2]
        message = '\n%s\n%s' % (to_unicode('\n'.join(traceback_only)), message)
    return message


def path_to_unicode(path):
    """Convert a filesystem path to unicode, using the filesystem encoding."""
    if isinstance(path, str):
        try:
            return unicode(path, sys.getfilesystemencoding())
        except UnicodeDecodeError:
            return unicode(path, 'latin1')
    return unicode(path)


_ws_leading_re = re.compile(u'\\A[\\s\u200b]+', re.UNICODE)
_ws_trailing_re = re.compile(u'[\\s\u200b]+\\Z', re.UNICODE)

def stripws(text, leading=True, trailing=True):
    """Strips unicode white-spaces and ZWSPs from ``text``.

    :param leading: strips leading spaces from ``text`` unless ``leading`` is
                    `False`.
    :param trailing: strips trailing spaces from ``text`` unless ``trailing``
                     is `False`.
    """
    if leading:
        text = _ws_leading_re.sub('', text)
    if trailing:
        text = _ws_trailing_re.sub('', text)
    return text


def strip_line_ws(text, leading=True, trailing=True):
    """Strips unicode white-spaces and ZWSPs from each line of ``text``.

    :param leading: strips leading spaces from ``text`` unless ``leading`` is
                    `False`.
    :param trailing: strips trailing spaces from ``text`` unless ``trailing``
                     is `False`.
    """
    lines = re.compile(r'(\n|\r\n|\r)').split(text)
    if leading:
        lines[::2] = (_ws_leading_re.sub('', line) for line in lines[::2])
    if trailing:
        lines[::2] = (_ws_trailing_re.sub('', line) for line in lines[::2])
    return ''.join(lines)

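# Illustrative sketch (not part of the original module): stripping regular
# whitespace and zero-width spaces (U+200B) with the helpers above.
def _demo_stripws():
    assert stripws(u'\u200b  text  \u200b') == u'text'
    assert stripws(u'  text  ', trailing=False) == u'text  '
    assert strip_line_ws(u' a \n b \u200b') == u'a\nb'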

_js_quote = {'\\': '\\\\', '"': '\\"', '\b': '\\b', '\f': '\\f',
             '\n': '\\n', '\r': '\\r', '\t': '\\t', "'": "\\'"}
for i in list(xrange(0x20)) + [ord(c) for c in u'&<>\u2028\u2029']:
    _js_quote.setdefault(unichr(i), '\\u%04x' % i)
_js_quote_re = re.compile(r'[\x00-\x1f\\"\b\f\n\r\t\'&<>' + u'\u2028\u2029]')
_js_string_re = re.compile(r'[\x00-\x1f\\"\b\f\n\r\t&<>' + u'\u2028\u2029]')


def javascript_quote(text):
    """Quote strings for inclusion in single or double quote delimited
    Javascript strings
    """
    if not text:
        return ''
    def replace(match):
        return _js_quote[match.group(0)]
    return _js_quote_re.sub(replace, text)


def to_js_string(text):
    """Embed the given string in a double quote delimited Javascript string
    (conform to the JSON spec)
    """
    if not text:
        return '""'
    def replace(match):
        return _js_quote[match.group(0)]
    return '"%s"' % _js_string_re.sub(replace, text)

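# Illustrative sketch (not part of the original module): escaping text for
# inclusion in JavaScript source; '<' and '>' are emitted as \uXXXX escapes.
def _demo_js_quoting():
    assert javascript_quote(u'say "hi"\n') == u'say \\"hi\\"\\n'
    assert to_js_string(u'a < b') == u'"a \\u003c b"'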

def unicode_quote(value, safe='/'):
    """A unicode aware version of `urllib.quote`

    :param value: anything that converts to a `str`. If `unicode`
                  input is given, it will be UTF-8 encoded.
    :param safe: as in `quote`, the characters that would otherwise be
                 quoted but shouldn't here (defaults to '/')
    """
    return quote(value.encode('utf-8') if isinstance(value, unicode)
                 else str(value), safe)


def unicode_quote_plus(value, safe=''):
    """A unicode aware version of `urllib.quote_plus`.

    :param value: anything that converts to a `str`. If `unicode`
                  input is given, it will be UTF-8 encoded.
    :param safe: as in `quote_plus`, the characters that would
                 otherwise be quoted but shouldn't here (defaults to
                 '')
    """
    return quote_plus(value.encode('utf-8') if isinstance(value, unicode)
                      else str(value), safe)


def unicode_unquote(value):
    """A unicode aware version of `urllib.unquote`.

    :param value: UTF-8 encoded `str` value (for example, as obtained by
                  `unicode_quote`).
    :rtype: `unicode`
    """
    return unquote(value).decode('utf-8')


def unicode_urlencode(params, safe=''):
    """A unicode aware version of `urllib.urlencode`.

    Values set to `empty` are converted to the key alone, without the
    equal sign.
    """
    if isinstance(params, dict):
        params = params.iteritems()
    l = []
    for k, v in params:
        if v is empty:
            l.append(unicode_quote_plus(k, safe))
        else:
            l.append(unicode_quote_plus(k, safe) + '=' +
                     unicode_quote_plus(v, safe))
    return '&'.join(l)


_qs_quote_safe = ''.join(chr(c) for c in xrange(0x21, 0x7f))

def quote_query_string(text):
    """Quote strings for query string
    """
    return unicode_quote_plus(text, _qs_quote_safe)

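# Illustrative sketch (not part of the original module): URL quoting of
# unicode values and query-string building, including the `empty` marker.
def _demo_unicode_quote():
    assert unicode_quote(u'caf\xe9/menu') == 'caf%C3%A9/menu'
    assert unicode_unquote('caf%C3%A9/menu') == u'caf\xe9/menu'
    # a value of `empty` yields the bare key, without '='
    assert unicode_urlencode([('q', u'caf\xe9'), ('flag', empty)]) == \
           'q=caf%C3%A9&flag'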

def to_utf8(text, charset='latin1'):
    """Convert input to a UTF-8 `str` object.

    If the input is not an `unicode` object, we assume the encoding is
    already UTF-8, ISO Latin-1, or as specified by the optional
    *charset* parameter.
    """
    if isinstance(text, str):
        try:
            u = unicode(text, 'utf-8')
        except UnicodeError:
            try:
                # Use the user supplied charset if possible
                u = unicode(text, charset)
            except UnicodeError:
                # This should always work
                u = unicode(text, 'latin1')
        else:
            # Do nothing if it's already utf-8
            return text
    else:
        u = to_unicode(text)
    return u.encode('utf-8')


class unicode_passwd(unicode):
    """Conceal the actual content of the string when `repr` is called."""
    def __repr__(self):
        return '*******'


def stream_encoding(stream):
    """Return the appropriate encoding for the given stream."""
    encoding = getattr(stream, 'encoding', None)
    # Windows returns 'cp0' to indicate no encoding
    return encoding if encoding not in (None, 'cp0') else 'utf-8'


def console_print(out, *args, **kwargs):
    """Output the given arguments to the console, encoding the output
    as appropriate.

    :param kwargs: ``newline`` controls whether a newline will be appended
                   (defaults to `True`)
    """
    cons_charset = stream_encoding(out)
    out.write(' '.join(to_unicode(a).encode(cons_charset, 'replace')
                       for a in args))
    if kwargs.get('newline', True):
        out.write('\n')


def printout(*args, **kwargs):
    """Do a `console_print` on `sys.stdout`."""
    console_print(sys.stdout, *args, **kwargs)


def printerr(*args, **kwargs):
    """Do a `console_print` on `sys.stderr`."""
    console_print(sys.stderr, *args, **kwargs)


def printfout(message, *args, **kwargs):
    """Format `message`, do a `console_print` on `sys.stdout` and flush
    the buffer.
    """
    if args:
        message %= args
    printout(message, **kwargs)
    sys.stdout.flush()


def printferr(message, *args, **kwargs):
    """Format `message`, do a `console_print` on `sys.stderr` and flush
    the buffer.
    """
    if args:
        message %= args
    printerr(message, **kwargs)
    sys.stderr.flush()


def raw_input(prompt):
    """Input one line from the console and convert it to unicode as
    appropriate.
    """
    printout(prompt, newline=False)
    return to_unicode(__builtin__.raw_input(), sys.stdin.encoding)


_preferredencoding = locale.getpreferredencoding()

def getpreferredencoding():
    """Return the encoding retrieved ahead of time, at module load, according
    to the user's preference.

    We should use this instead of `locale.getpreferredencoding()` which
    is not thread-safe."""
    return _preferredencoding


# -- Plain text formatting

def text_width(text, ambiwidth=1):
    """Determine the column width of `text` in Unicode characters.

    The characters in the East Asian Fullwidth (F) or East Asian Wide (W)
    have a column width of 2. The other characters in the East Asian
    Halfwidth (H) or East Asian Narrow (Na) have a column width of 1.

    The `ambiwidth` parameter is used for the column width of the East
    Asian Ambiguous (A). If `1`, the same width as characters in US-ASCII.
    This is expected by most users. If `2`, twice the width of US-ASCII
    characters. This is expected by CJK users.

    cf. http://www.unicode.org/reports/tr11/.
    """
    twice = 'FWA' if ambiwidth == 2 else 'FW'
    return sum([2 if east_asian_width(chr) in twice else 1
                for chr in to_unicode(text)])

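# Illustrative sketch (not part of the original module): fullwidth CJK
# characters count as two columns, and `ambiwidth` controls the width of
# East Asian Ambiguous characters such as U+00B1 (plus-minus sign).
def _demo_text_width():
    assert text_width(u'abc') == 3
    assert text_width(u'\u6f22\u5b57') == 4       # two fullwidth ideographs
    assert text_width(u'\xb1', ambiwidth=1) == 1  # ambiguous, narrow context
    assert text_width(u'\xb1', ambiwidth=2) == 2  # ambiguous, CJK context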

def _get_default_ambiwidth():
    """Return width of East Asian Ambiguous based on locale environment
    variables or Windows codepage.
    """

    if os.name == 'nt':
        import ctypes
        codepage = ctypes.windll.kernel32.GetConsoleOutputCP()
        if codepage in (932,   # Japanese (Shift-JIS)
                        936,   # Chinese Simplified (GB2312)
                        949,   # Korean (Unified Hangul Code)
                        950):  # Chinese Traditional (Big5)
            return 2
    else:
        for name in ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG'):
            value = os.environ.get(name) or ''
            if value:
                if name == 'LANGUAGE' and ':' in value:
                    value = value.split(':')[0]
                return 2 if value.lower().startswith(('zh', 'ja', 'ko')) else 1

    return 1


_default_ambiwidth = _get_default_ambiwidth()


def print_table(data, headers=None, sep='  ', out=None, ambiwidth=None):
    """Print data according to a tabular layout.

    :param data: a sequence of rows; assume all rows are of equal length.
    :param headers: an optional row containing column headers; must be of
                    the same length as each row in `data`.
    :param sep: column separator
    :param out: output file descriptor (`None` means use `sys.stdout`)
    :param ambiwidth: column width of the East Asian Ambiguous (A). If None,
                      detect ambiwidth with the locale settings. Otherwise,
                      it is passed to the `ambiwidth` parameter of `text_width`.
    """
    if out is None:
        out = sys.stdout
    charset = getattr(out, 'encoding', None) or 'utf-8'
    if ambiwidth is None:
        ambiwidth = _default_ambiwidth
    data = list(data)
    if headers:
        data.insert(0, headers)
    elif not data:
        return

    # Convert to an unicode object with `to_unicode`. If None, convert to an
    # empty string.
    def to_text(val):
        if val is None:
            return u''
        return to_unicode(val)

    def tw(text):
        return text_width(text, ambiwidth=ambiwidth)

    def to_lines(data):
        lines = []
        for row in data:
            row = [to_text(cell) for cell in row]
            if any('\n' in cell for cell in row):
                row = [cell.splitlines() for cell in row]
                max_lines = max(len(cell) for cell in row)
                for cell in row:
                    if len(cell) < max_lines:
                        cell += [''] * (max_lines - len(cell))
                lines.extend([cell[idx] for cell in row]
                             for idx in xrange(max_lines))
            else:
                lines.append(row)
        return lines

    data = to_lines(data)

    num_cols = len(data[0])
    col_width = [max(tw(row[idx]) for row in data)
                 for idx in xrange(num_cols)]

    out.write('\n')
    for ridx, row in enumerate(data):
        for cidx, cell in enumerate(row):
            if cidx + 1 == num_cols:
                line = cell  # No separator after last column
            else:
                if headers and ridx == 0:
                    sp = ' ' * tw(sep)  # No separator in header
                else:
                    sp = sep
                line = u'%-*s%s' % (col_width[cidx] - tw(cell) + len(cell),
                                    cell, sp)
            line = line.encode(charset, 'replace')
            out.write(line)

        out.write('\n')
        if ridx == 0 and headers:
            out.write('-' * (tw(sep) * cidx + sum(col_width)))
            out.write('\n')
    out.write('\n')

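# Illustrative sketch (not part of the original module): writing a small
# table to an in-memory byte stream instead of sys.stdout; the rows and
# headers used here are made up for the example.
def _demo_print_table():
    import io
    buf = io.BytesIO()
    print_table([('ticket', u'new'), ('wiki', u'edited')],
                headers=('Realm', 'Status'), out=buf)
    return buf.getvalue()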

def shorten_line(text, maxlen=75):
    """Truncates `text` to length less than or equal to `maxlen` characters.

    This tries to be (a bit) clever and attempts to find a proper word
    boundary for doing so.
    """
    if len(text or '') <= maxlen:
        return text
    suffix = ' ...'
    maxtextlen = maxlen - len(suffix)
    cut = max(text.rfind(' ', 0, maxtextlen), text.rfind('\n', 0, maxtextlen))
    if cut < 0:
        cut = maxtextlen
    return text[:cut] + suffix

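# Illustrative sketch (not part of the original module): truncation prefers
# a word boundary and appends ' ...'.
def _demo_shorten_line():
    assert shorten_line(u'the quick brown fox jumps', maxlen=15) == \
           u'the quick ...'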

class UnicodeTextWrapper(textwrap.TextWrapper):
    breakable_char_ranges = [
        (0x1100, 0x11FF),   # Hangul Jamo
        (0x2E80, 0x2EFF),   # CJK Radicals Supplement
        (0x3000, 0x303F),   # CJK Symbols and Punctuation
        (0x3040, 0x309F),   # Hiragana
        (0x30A0, 0x30FF),   # Katakana
        (0x3130, 0x318F),   # Hangul Compatibility Jamo
        (0x3190, 0x319F),   # Kanbun
        (0x31C0, 0x31EF),   # CJK Strokes
        (0x3200, 0x32FF),   # Enclosed CJK Letters and Months
        (0x3300, 0x33FF),   # CJK Compatibility
        (0x3400, 0x4DBF),   # CJK Unified Ideographs Extension A
        (0x4E00, 0x9FFF),   # CJK Unified Ideographs
        (0xA960, 0xA97F),   # Hangul Jamo Extended-A
        (0xAC00, 0xD7AF),   # Hangul Syllables
        (0xD7B0, 0xD7FF),   # Hangul Jamo Extended-B
        (0xF900, 0xFAFF),   # CJK Compatibility Ideographs
        (0xFE30, 0xFE4F),   # CJK Compatibility Forms
        (0xFF00, 0xFFEF),   # Halfwidth and Fullwidth Forms
        (0x20000, 0x2FFFF, u'[\uD840-\uD87F][\uDC00-\uDFFF]'), # Plane 2
        (0x30000, 0x3FFFF, u'[\uD880-\uD8BF][\uDC00-\uDFFF]'), # Plane 3
    ]

    split_re = None
    breakable_re = None

    @classmethod
    def _init_patterns(cls):
        char_ranges = []
        surrogate_pairs = []
        for val in cls.breakable_char_ranges:
            try:
                high = unichr(val[0])
                low = unichr(val[1])
                char_ranges.append(u'%s-%s' % (high, low))
            except ValueError:
                # Narrow build, `re` cannot use characters >= 0x10000
                surrogate_pairs.append(val[2])
        char_ranges = u''.join(char_ranges)
        if surrogate_pairs:
            pattern = u'(?:[%s]|%s)+' % (char_ranges,
                                         u'|'.join(surrogate_pairs))
        else:
            pattern = u'[%s]+' % char_ranges

        cls.split_re = re.compile(
            r'(\s+|' +                                  # any whitespace
            pattern + u'|' +                            # breakable text
            r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|' +   # hyphenated words
            r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))',     # em-dash
            re.UNICODE)
        cls.breakable_re = re.compile(r'\A' + pattern, re.UNICODE)

    def __init__(self, cols, replace_whitespace=0, break_long_words=0,
                 initial_indent='', subsequent_indent='', ambiwidth=1):
        textwrap.TextWrapper.__init__(
                self, cols, replace_whitespace=0, break_long_words=0,
                initial_indent=initial_indent,
                subsequent_indent=subsequent_indent)
        self.ambiwidth = ambiwidth
        if self.split_re is None:
            self._init_patterns()

    def _split(self, text):
        chunks = self.split_re.split(to_unicode(text))
        chunks = filter(None, chunks)
        return chunks

    def _text_width(self, text):
        return text_width(text, ambiwidth=self.ambiwidth)

    def _wrap_chunks(self, chunks):
        lines = []
        chunks.reverse()
        text_width = self._text_width

        while chunks:
            cur_line = []
            cur_width = 0

            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent
            width = self.width - text_width(indent)

            if chunks[-1].strip() == '' and lines:
                del chunks[-1]

            while chunks:
                chunk = chunks[-1]
                w = text_width(chunk)
                if cur_width + w <= width:
                    cur_line.append(chunks.pop())
                    cur_width += w
                elif self.breakable_re.match(chunk):
                    left_space = width - cur_width
                    for i in xrange(len(chunk)):
                        w = text_width(chunk[i])
                        if left_space < w:
                            break
                        left_space -= w
                    if i > 0:
                        cur_line.append(chunk[:i])
                        chunk = chunk[i:]
                        chunks[-1] = chunk
                    w = text_width(chunk)
                    break
                else:
                    break

            if chunks and w > width:
                self._handle_long_word(chunks, cur_line, cur_width, width)

            if cur_line and cur_line[-1].strip() == '':
                del cur_line[-1]

            if cur_line:
                lines.append(indent + ''.join(cur_line))

        return lines


def wrap(t, cols=75, initial_indent='', subsequent_indent='',
         linesep=os.linesep, ambiwidth=1):
    """Wraps the single paragraph in `t`, which contains unicode characters.
    Every line is at most `cols` characters long.

    The `ambiwidth` parameter is used for the column width of the East
    Asian Ambiguous (A). If `1`, the same width as characters in US-ASCII.
    This is expected by most users. If `2`, twice the width of US-ASCII
    characters. This is expected by CJK users.
    """
    t = t.strip().replace('\r\n', '\n').replace('\r', '\n')
    wrapper = UnicodeTextWrapper(cols, replace_whitespace=0,
                                 break_long_words=0,
                                 initial_indent=initial_indent,
                                 subsequent_indent=subsequent_indent,
                                 ambiwidth=ambiwidth)
    wrappedLines = []
    for line in t.split('\n'):
        wrappedLines += wrapper.wrap(line.rstrip()) or ['']
    return linesep.join(wrappedLines)

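# Illustrative sketch (not part of the original module): wrapping a paragraph
# to a narrow width with an indent on continuation lines. The sample text is
# made up for the example.
def _demo_wrap():
    text = u'Trac is a minimalistic web-based project management system.'
    return wrap(text, cols=30, subsequent_indent=u'  ', linesep='\n')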

_obfuscation_char = u'@\u2026'

def obfuscate_email_address(address):
    """Replace anything looking like an e-mail address (``'@something'``)
    with a trailing ellipsis (``'@…'``)
    """
    if address:
        at = address.find('@')
        if at != -1:
            return address[:at] + _obfuscation_char + \
                   ('>' if address[-1] == '>' else '')
    return address


def is_obfuscated(word):
    """Returns `True` if the `word` looks like an obfuscated e-mail
    address.

    :since: 1.2
    """
    return _obfuscation_char in word

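# Illustrative sketch (not part of the original module): everything after the
# '@' is replaced by an ellipsis, and a closing '>' is preserved.
def _demo_obfuscate_email_address():
    assert obfuscate_email_address(u'jonas@edgewall.com') == u'jonas@\u2026'
    assert obfuscate_email_address(u'<jonas@edgewall.com>') == u'<jonas@\u2026>'
    assert is_obfuscated(u'jonas@\u2026')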

def breakable_path(path):
    """Make a path breakable after path separators, and conversely, avoid
    breaking at spaces.
    """
    if not path:
        return path
    prefix = ''
    if path.startswith('/'):    # Avoid breaking after a leading /
        prefix = '/'
        path = path[1:]
    return prefix + path.replace('/', u'/\u200b').replace('\\', u'\\\u200b') \
                        .replace(' ', u'\u00a0')


def normalize_whitespace(text, to_space=u'\u00a0', remove=u'\u200b'):
    """Normalize whitespace in a string, by replacing special spaces by normal
    spaces and removing zero-width spaces."""
    if not text:
        return text
    for each in to_space:
        text = text.replace(each, ' ')
    for each in remove:
        text = text.replace(each, '')
    return text

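# Illustrative sketch (not part of the original module): `breakable_path`
# inserts zero-width spaces after separators and turns spaces into
# non-breaking spaces; `normalize_whitespace` reverses both substitutions.
def _demo_breakable_path():
    path = u'/wiki/My Page'
    breakable = breakable_path(path)
    assert breakable == u'/wiki/\u200bMy\xa0Page'
    assert normalize_whitespace(breakable) == path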

def unquote_label(txt):
    """Remove (one level of) enclosing single or double quotes.

    .. versionadded :: 1.0
    """
    return txt[1:-1] if txt and txt[0] in "'\"" and txt[0] == txt[-1] else txt


def cleandoc(message):
    """Removes uniform indentation and leading/trailing whitespace."""
    from inspect import cleandoc
    return cleandoc(message).strip()


# -- Conversion

def pretty_size(size, format='%.1f'):
    """Pretty print content size information with appropriate unit.

    :param size: number of bytes
    :param format: can be used to adjust the precision shown
    """
    if size is None:
        return ''

    jump = 1024
    if size < jump:
        from trac.util.translation import ngettext
        return ngettext("%(num)d byte", "%(num)d bytes", num=size)

    units = ['KB', 'MB', 'GB', 'TB']
    i = 0
    while size >= jump and i < len(units):
        i += 1
        size /= 1024.

    return (format + ' %s') % (size, units[i - 1])

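# Illustrative sketch (not part of the original module): sizes of at least
# 1024 bytes are scaled to the closest binary unit. Sizes below 1024 go
# through trac.util.translation and are not exercised here.
def _demo_pretty_size():
    assert pretty_size(2048) == '2.0 KB'
    assert pretty_size(3 * 1024 * 1024) == '3.0 MB'
    assert pretty_size(1536, format='%.2f') == '1.50 KB'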

def expandtabs(s, tabstop=8, ignoring=None):
    """Expand tab characters `'\\\\t'` into spaces.

    :param tabstop: number of space characters per tab
                    (defaults to the canonical 8)

    :param ignoring: if not `None`, the expansion will be "smart" and
                     go from one tabstop to the next. In addition,
                     this parameter lists characters which can be
                     ignored when computing the indent.
    """
    if '\t' not in s:
        return s
    if ignoring is None:
        return s.expandtabs(tabstop)

    outlines = []
    for line in s.split('\n'):
        if '\t' not in line:
            outlines.append(line)
            continue
        p = 0
        s = []
        for c in line:
            if c == '\t':
                n = tabstop - p % tabstop
                s.append(' ' * n)
                p += n
            elif not ignoring or c not in ignoring:
                p += 1
                s.append(c)
            else:
                s.append(c)
        outlines.append(''.join(s))
    return '\n'.join(outlines)

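# Illustrative sketch (not part of the original module): with `ignoring`,
# the listed characters do not advance the column used to compute the next
# tab stop (useful for markup such as a leading '*').
def _demo_expandtabs():
    assert expandtabs('a\tb') == 'a' + ' ' * 7 + 'b'               # plain
    assert expandtabs('*\tfoo', ignoring='*') == '*' + ' ' * 8 + 'foo'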

def fix_eol(text, eol):
    """Fix end-of-lines in a text."""
    lines = text.splitlines()
    lines.append('')
    return eol.join(lines)

def unicode_to_base64(text, strip_newlines=True):
    """Safe conversion of ``text`` to base64 representation using
    utf-8 bytes.

    Strips newlines from output unless ``strip_newlines`` is `False`.
    """
    text = to_unicode(text)
    if strip_newlines:
        return text.encode('utf-8').encode('base64').replace('\n', '')
    return text.encode('utf-8').encode('base64')

def unicode_from_base64(text):
    """Safe conversion of ``text`` to unicode based on utf-8 bytes."""
    return text.decode('base64').decode('utf-8')

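# Illustrative sketch (not part of the original module): a unicode round-trip
# through the Python 2 'base64' codec used above.
def _demo_unicode_base64():
    encoded = unicode_to_base64(u'caf\xe9')
    assert encoded == 'Y2Fmw6k='
    assert unicode_from_base64(encoded) == u'caf\xe9'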

def levenshtein_distance(lhs, rhs):
    """Return the Levenshtein distance between two strings."""
    if len(lhs) > len(rhs):
        rhs, lhs = lhs, rhs
    if not lhs:
        return len(rhs)

    prev = xrange(len(rhs) + 1)
    for lidx, lch in enumerate(lhs):
        curr = [lidx + 1]
        for ridx, rch in enumerate(rhs):
            cost = (lch != rch) * 2
            curr.append(min(prev[ridx + 1] + 1, # deletion
                            curr[ridx] + 1,     # insertion
                            prev[ridx] + cost)) # substitution
        prev = curr
    return prev[-1]

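# Illustrative sketch (not part of the original module): note that in this
# variant a substitution costs 2 (one deletion plus one insertion).
def _demo_levenshtein_distance():
    assert levenshtein_distance('cat', 'cat') == 0
    assert levenshtein_distance('cat', 'cats') == 1   # one insertion
    assert levenshtein_distance('cat', 'cut') == 2    # one substitution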

sub_vars_re = re.compile("[$]([A-Z_][A-Z0-9_]*)")

def sub_vars(text, args):
    """Substitute $XYZ-style variables in a string with provided values.

    :param text: string containing variables to substitute.
    :param args: dictionary with keys matching the variables to be substituted.
                 The keys should not be prefixed with the $ character."""
    def repl(match):
        key = match.group(1)
        return args[key] if key in args else '$' + key
    return sub_vars_re.sub(repl, text)
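

# Illustrative sketch (not part of the original module): known variables are
# substituted, unknown ones are left untouched.
def _demo_sub_vars():
    assert sub_vars('Hello $NAME, see $TICKET', {'NAME': 'anonymous'}) == \
           'Hello anonymous, see $TICKET'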