"Fossies" - the Fresh Open Source Software Archive

Member "pyzor-1.0.0/pyzor/digest.py" (10 Dec 2014, 5879 Bytes) of package /linux/privat/pyzor-1.0.0.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Python source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively you can here view or download the uninterpreted source code file. For more information about "digest.py" see the Fossies "Dox" file reference documentation.

    1 """Handle digesting the messages."""
    2 
    3 from __future__ import print_function
    4 
    5 import re
    6 import hashlib
    7 
    8 try:
    9     import HTMLParser
   10 except ImportError:
   11     import html.parser as HTMLParser
   12 
   13 # Hard-coded for the moment.
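# Each (offset, length) pair selects `length` consecutive lines starting
# at `offset` percent of the way into the list of normalized lines.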
digest_spec = [(20, 3), (60, 3)]

HASH = hashlib.sha1
HASH_SIZE = len(HASH(b"").hexdigest())


class HTMLStripper(HTMLParser.HTMLParser):
    """Strip all tags from the HTML."""
    def __init__(self, collector):
        HTMLParser.HTMLParser.__init__(self)
        self.reset()
        self.collector = collector
        self.collect = True

    def handle_data(self, data):
        """Keep track of the data."""
        data = data.strip()
        if data and self.collect:
            self.collector.append(data)

    def handle_starttag(self, tag, attrs):
        HTMLParser.HTMLParser.handle_starttag(self, tag, attrs)
        if tag.lower() in ("script", "style"):
            self.collect = False

    def handle_endtag(self, tag):
        HTMLParser.HTMLParser.handle_endtag(self, tag)
        if tag.lower() in ("script", "style"):
            self.collect = True

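# Usage sketch (illustrative, not part of the original module): the
# collector list receives the text nodes, minus <script>/<style> contents.
#
#     collected = []
#     HTMLStripper(collected).feed("<p>Hi <script>x()</script>there</p>")
#     # collected == ["Hi", "there"]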

class DataDigester(object):
    """The major workhorse class."""
    __slots__ = ['value', 'digest']

    # Minimum line length for it to be included as part of the digest.
    min_line_length = 8

    # If a message is this many lines or fewer, then we digest the whole
    # message.
    atomic_num_lines = 4

    # We're not going to try to match email addresses as per the spec
    # because it's too difficult.  Plus, regular expressions don't work well
    # for them. (BNF is better at balanced parens and such).
    email_ptrn = re.compile(r'\S+@\S+')

    # Same goes for URLs.
    url_ptrn = re.compile(r'[a-z]+:\S+', re.IGNORECASE)

    # We also want to remove anything so long that it looks like it could
    # be a unique identifier.
    longstr_ptrn = re.compile(r'\S{10,}')

    ws_ptrn = re.compile(r'\s')

    # String that the above patterns will be replaced with.
    # Note that an empty string will always be used to remove whitespace.
    unwanted_txt_repl = ''

    def __init__(self, msg, spec=None):
        if spec is None:
            spec = digest_spec
        self.value = None
        self.digest = HASH()

        # Need to know the total number of lines in the content.
        lines = []
        for payload in self.digest_payloads(msg):
            for line in payload.splitlines():
                norm = self.normalize(line)
                if self.should_handle_line(norm):
                    try:
                        lines.append(norm.encode("utf8", "ignore"))
                    except UnicodeError:
                        continue

        if len(lines) <= self.atomic_num_lines:
            self.handle_atomic(lines)
        else:
            self.handle_pieced(lines, spec)

        self.value = self.digest.hexdigest()

        assert len(self.value) == HASH_SIZE

    def handle_atomic(self, lines):
        """We digest everything."""
        for line in lines:
            self.handle_line(line)

    def handle_pieced(self, lines, spec):
        """Digest stuff according to the spec."""
        for offset, length in spec:
            for i in range(length):
                try:
                    line = lines[int(offset * len(lines) // 100) + i]
                except IndexError:
                    pass
                else:
                    self.handle_line(line)

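    # For example, with 100 normalized lines and the default spec of
    # [(20, 3), (60, 3)], lines 20-22 and 60-62 (0-based) are digested.
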
    def handle_line(self, line):
        self.digest.update(line.rstrip())

    @classmethod
    def normalize(cls, s):
        repl = cls.unwanted_txt_repl
        s = cls.longstr_ptrn.sub(repl, s)
        s = cls.email_ptrn.sub(repl, s)
        s = cls.url_ptrn.sub(repl, s)
        # Make sure we do the whitespace last because some of the previous
        # patterns rely on whitespace.
        return cls.ws_ptrn.sub('', s).strip()

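    # Worked example (illustrative): normalize("hi a@b.com ok") == "hiok".
    # The address is too short for longstr_ptrn, so email_ptrn removes it,
    # and the final whitespace pass deletes the leftover spaces.
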
    @staticmethod
    def normalize_html_part(s):
        data = []
        stripper = HTMLStripper(data)
        try:
            stripper.feed(s)
        except (UnicodeDecodeError, HTMLParseError):
            # We can't parse the HTML, so just strip it.  This is still
            # better than including generic HTML/CSS text.
            pass
        return " ".join(data)

    @classmethod
    def should_handle_line(cls, s):
        return len(s) >= cls.min_line_length

    @classmethod
    def digest_payloads(cls, msg):
        for part in msg.walk():
            if part.get_content_maintype() == "text":
                payload = part.get_payload(decode=True)

                charset = part.get_content_charset()
                errors = "ignore"
                if not charset:
                    charset = "ascii"
                elif (charset.lower().replace("_", "-") in ("quopri-codec",
                      "quopri", "quoted-printable", "quotedprintable")):
                    errors = "strict"

                try:
                    payload = payload.decode(charset, errors)
                except (LookupError, UnicodeError, AssertionError):
                    try:
                        payload = payload.decode("ascii", "ignore")
                    except UnicodeError:
                        continue
                if part.get_content_subtype() == "html":
                    yield cls.normalize_html_part(payload)
                else:
                    yield payload
            elif part.is_multipart():
                # Skip, because walk() will give us the payload next.
                pass
            else:
                # Non-text parts are passed through as-is.
                yield part.get_payload()


class PrintingDataDigester(DataDigester):
    """Extends DataDigester: prints out what we're digesting."""
    def handle_line(self, line):
        print(line.decode("utf8"))
        super(PrintingDataDigester, self).handle_line(line)
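
A minimal usage sketch (not part of digest.py): build a message with the
standard-library email package and hand it to DataDigester. The addresses
and body text below are invented for illustration, and the snippet assumes
Python 3 with pyzor importable.

import email.message

from pyzor.digest import DataDigester

msg = email.message.EmailMessage()
msg["From"] = "sender@example.com"
msg["To"] = "recipient@example.com"
msg["Subject"] = "test"
msg.set_content("This is a line long enough to survive normalization.\n"
                "Here is another sufficiently long line to be digested.\n")

# Two qualifying lines <= atomic_num_lines, so the whole body is digested.
print(DataDigester(msg).value)  # a 40-character hexadecimal SHA-1 digest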