"Fossies" - the Fresh Open Source Software Archive 
Member "pyzor-1.0.0/pyzor/digest.py" (10 Dec 2014, 5879 Bytes) of package /linux/privat/pyzor-1.0.0.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Python source code syntax highlighting (style:
standard) with prefixed line numbers.
Alternatively you can view or download the uninterpreted source code file here.
For more information about "digest.py" see the
Fossies "Dox" file reference documentation.
1 """Handle digesting the messages."""
2
3 from __future__ import print_function
4
5 import re
6 import hashlib
7
8 try:
9 import HTMLParser
10 except ImportError:
11 import html.parser as HTMLParser
12
# Digest sampling specification: (offset-percent, line-count) pairs.
# Hard-coded for the moment.
digest_spec = [(20, 3), (60, 3)]

# Hash algorithm used for the digest, and the length of its hex form.
HASH = hashlib.sha1
HASH_SIZE = len(HASH(b"").hexdigest())
19
class HTMLStripper(HTMLParser.HTMLParser):
    """HTML parser that keeps only the visible text of a document.

    Every non-blank run of character data is appended to *collector*,
    except text inside <script> or <style> elements, which is dropped.
    """

    def __init__(self, collector):
        HTMLParser.HTMLParser.__init__(self)
        self.reset()
        self.collector = collector
        # Switched off while we are inside a script/style element.
        self.collect = True

    def handle_data(self, data):
        """Record a chunk of visible text, unless collection is paused."""
        text = data.strip()
        if self.collect and text:
            self.collector.append(text)

    def handle_starttag(self, tag, attrs):
        """Pause collection when entering a script/style element."""
        HTMLParser.HTMLParser.handle_starttag(self, tag, attrs)
        if tag.lower() in ("script", "style"):
            self.collect = False

    def handle_endtag(self, tag):
        """Resume collection when leaving a script/style element."""
        HTMLParser.HTMLParser.handle_endtag(self, tag)
        if tag.lower() in ("script", "style"):
            self.collect = True
43
44
class DataDigester(object):
    """The major workhorse class: reduce a mail message to a digest.

    Each text payload is normalized (mail addresses, URLs, long opaque
    tokens and all whitespace removed); short messages are digested
    whole, longer ones are sampled at the offsets given by the spec.
    """
    __slots__ = ['value', 'digest']

    # Minimum line length for it to be included as part of the digest.
    min_line_length = 8

    # If a message is this many lines or less, then we digest the whole
    # message.
    atomic_num_lines = 4

    # We're not going to try to match email addresses as per the spec
    # because it's too difficult.  Plus, regular expressions don't work
    # well for them.  (BNF is better at balanced parens and such).
    email_ptrn = re.compile(r'\S+@\S+')

    # Same goes for URLs.
    url_ptrn = re.compile(r'[a-z]+:\S+', re.IGNORECASE)

    # We also want to remove anything that is so long it looks like
    # possibly a unique identifier.
    longstr_ptrn = re.compile(r'\S{10,}')

    ws_ptrn = re.compile(r'\s')

    # String that the above patterns will be replaced with.
    # Note that an empty string will always be used to remove whitespace.
    unwanted_txt_repl = ''

    def __init__(self, msg, spec=None):
        """Digest *msg* (an email.message.Message).

        spec is a sequence of (offset-percent, line-count) pairs; None
        selects the module-level default digest_spec.  The hex digest is
        available as self.value afterwards.
        """
        if spec is None:
            spec = digest_spec
        self.value = None
        self.digest = HASH()

        # Need to know the total number of lines in the content, so
        # collect every normalized line worth digesting first.
        lines = []
        for payload in self.digest_payloads(msg):
            for line in payload.splitlines():
                norm = self.normalize(line)
                if self.should_handle_line(norm):
                    try:
                        lines.append(norm.encode("utf8", "ignore"))
                    except UnicodeError:
                        continue

        if len(lines) <= self.atomic_num_lines:
            self.handle_atomic(lines)
        else:
            self.handle_pieced(lines, spec)

        self.value = self.digest.hexdigest()

        assert len(self.value) == HASH_SIZE

    def handle_atomic(self, lines):
        """We digest everything."""
        for line in lines:
            self.handle_line(line)

    def handle_pieced(self, lines, spec):
        """Digest stuff according to the spec."""
        for offset, length in spec:
            # BUGFIX: the original used xrange, a NameError on Python 3,
            # which this module explicitly supports (html.parser import).
            for i in range(length):
                try:
                    line = lines[int(offset * len(lines) // 100) + i]
                except IndexError:
                    # Sampling window runs past the end; just skip.
                    pass
                else:
                    self.handle_line(line)

    def handle_line(self, line):
        """Fold one normalized line (bytes) into the running digest."""
        self.digest.update(line.rstrip())

    @classmethod
    def normalize(cls, s):
        """Strip addresses, URLs, long tokens and whitespace from *s*."""
        repl = cls.unwanted_txt_repl
        s = cls.longstr_ptrn.sub(repl, s)
        s = cls.email_ptrn.sub(repl, s)
        s = cls.url_ptrn.sub(repl, s)
        # Make sure we do the whitespace last because some of the previous
        # patterns rely on whitespace.
        return cls.ws_ptrn.sub('', s).strip()

    @staticmethod
    def normalize_html_part(s):
        """Return the visible text of an HTML payload, tags stripped."""
        data = []
        stripper = HTMLStripper(data)
        try:
            stripper.feed(s)
        # BUGFIX: HTMLParseError was removed in Python 3.5, so referring
        # to it directly raises AttributeError there; fall back to
        # Exception so an unparseable part is skipped, not fatal.
        except (UnicodeDecodeError,
                getattr(HTMLParser, "HTMLParseError", Exception)):
            # We can't parse the HTML, so just strip it.  This is still
            # better than including generic HTML/CSS text.
            pass
        return " ".join(data)

    @classmethod
    def should_handle_line(cls, s):
        """True if the normalized line is long enough to digest."""
        # Idiom fix: the original `len(s) and min <= len(s)` returned an
        # int for empty input; a single bool comparison is equivalent.
        return len(s) >= cls.min_line_length

    @classmethod
    def digest_payloads(cls, msg):
        """Yield the decoded text of each digestible part of *msg*."""
        for part in msg.walk():
            if part.get_content_maintype() == "text":
                payload = part.get_payload(decode=True)

                charset = part.get_content_charset()
                errors = "ignore"
                if not charset:
                    charset = "ascii"
                elif (charset.lower().replace("_", "-") in
                        ("quopri-codec", "quopri", "quoted-printable",
                         "quotedprintable")):
                    # These "charsets" are really transfer encodings;
                    # lenient decoding would corrupt them, so be strict.
                    errors = "strict"

                try:
                    payload = payload.decode(charset, errors)
                except (LookupError, UnicodeError, AssertionError):
                    # Unknown or lying charset: fall back to lossy ASCII.
                    try:
                        payload = payload.decode("ascii", "ignore")
                    except UnicodeError:
                        continue
                if part.get_content_subtype() == "html":
                    yield cls.normalize_html_part(payload)
                else:
                    yield payload
            elif part.is_multipart():
                # Skip, because walk() will give us the payload next.
                pass
            else:
                # Non-text parts are passed through as-is.
                yield part.get_payload()
176
177
class PrintingDataDigester(DataDigester):
    """DataDigester variant that echoes every digested line to stdout.

    Useful for debugging what actually goes into the digest.
    """

    def handle_line(self, line):
        """Print the line (decoded as UTF-8), then digest it as usual."""
        text = line.decode("utf8")
        print(text)
        super(PrintingDataDigester, self).handle_line(line)