spambayes  1.1a6
About: SpamBayes - a Bayesian anti-spam filter/classifier (written in Python)
  Fossies Dox: spambayes-1.1a6.zip  ("inofficial" and yet experimental doxygen-generated source code documentation)  

 All Classes Namespaces Files Functions Variables Properties Macros
dnscache.py
Go to the documentation of this file.
1 # Copyright 2004, Matthew Dixon Cowles <matt@mondoinfo.com>.
2 # Distributable under the same terms as the Python programming language.
3 # Inspired by the KevinL's cache included with PyDNS.
4 # Provided with NO WARRANTY.
5 
6 # Version 0.1 2004 06 27
7 # Version 0.11 2004 07 06 Fixed zero division error in __del__
8 
9 # From http://sourceforge.net/projects/pydns/
10 import DNS
11 
12 import sys
13 import os
14 import operator
15 import time
16 import types
17 import socket
18 
19 from spambayes.Options import options
20 from spambayes.safepickle import pickle_read, pickle_write
21 
22 kCheckForPruneEvery = 20
23 kMaxTTL = 60 * 60 * 24 * 7 # One week
24 # Some servers always return a TTL of zero. We'll hold onto data a bit
25 # longer.
26 kMinTTL = 24 * 60 * 60 * 1 # one day
27 kPruneThreshold = 5000 # May go over slightly; numbers chosen at random
28 kPruneDownTo = 2500
29 
30 
31 class lookupResult(object):
32  #__slots__=("qType","answer","question","expiresAt","lastUsed")
33 
34  def __init__(self, qType, answer, question, expiresAt, now):
35  self.qType = qType
36  self.answer = answer
37  self.question = question
38  self.expiresAt = expiresAt
39  self.lastUsed = now
40  return None
41 
42 
43 # From ActiveState's Python cookbook
44 # Yakov Markovitch, Fast sort the list of objects by object's attribute
45 # http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52230
46 def sort_by_attr(seq, attr):
47  """Sort the sequence of objects by object's attribute
48 
49  Arguments:
50  seq - the list or any sequence (including immutable one) of objects to sort.
51  attr - the name of attribute to sort by
52 
53  Returns:
54  the sorted list of objects.
55  """
56  #import operator
57 
58  # Use the "Schwartzian transform"
59  # Create the auxiliary list of tuples where every i-th tuple has form
60  # (seq[i].attr, i, seq[i]) and sort it. The second item of tuple is needed not
61  # only to provide stable sorting, but mainly to eliminate comparison of objects
62  # (which can be expensive or prohibited) in case of equal attribute values.
63  intermed = map(None, map(getattr, seq, (attr,)*len(seq)), xrange(len(seq)), seq)
64  intermed.sort()
65  return map(operator.getitem, intermed, (-1,) * len(intermed))
66 
67 
68 class cache:
69  def __init__(self, dnsServer=None, cachefile=""):
70  # These attributes intended for user setting
71  self.printStatsAtEnd = False
72 
73  # As far as I can tell from the standards,
74  # it's legal to have more than one PTR record
75  # for an address. That is, it's legal to get
76  # more than one name back when you do a
77  # reverse lookup on an IP address. I don't
78  # know of a use for that and I've never seen
79  # it done. And I don't think that most
80  # people would expect it. So forward ("A")
81  # lookups always return a list. Reverse
82  # ("PTR") lookups return a single name unless
83  # this attribute is set to False.
84  self.returnSinglePTR = True
85 
86  # How long to cache an error as no data
87  self.cacheErrorSecs=5*60
88 
89  # How long to wait for the server
90  self.dnsTimeout=10
91 
92  # end of user-settable attributes
93 
94  self.cachefile = os.path.expanduser(cachefile)
95  self.caches = None
96 
97  if self.cachefile and os.path.exists(self.cachefile):
98  try:
99  self.caches = pickle_read(self.cachefile)
100  except:
101  os.unlink(self.cachefile)
102 
103  if self.caches is None:
104  self.caches = {"A": {}, "PTR": {}}
105 
106  if options["globals", "verbose"]:
107  if self.caches["A"] or self.caches["PTR"]:
108  print >> sys.stderr, "opened existing cache with",
109  print >> sys.stderr, len(self.caches["A"]), "A records",
110  print >> sys.stderr, "and", len(self.caches["PTR"]),
111  print >> sys.stderr, "PTR records"
112  else:
113  print >> sys.stderr, "opened new cache"
114 
115  self.hits=0 # These two for statistics
116  self.misses=0
117  self.pruneTicker=0
118 
119  if dnsServer == None:
120  DNS.DiscoverNameServers()
121  self.queryObj = DNS.DnsRequest()
122  else:
123  self.queryObj = DNS.DnsRequest(server=dnsServer)
124  return None
125 
126  def close(self):
127  if self.printStatsAtEnd:
128  self.printStats()
129  if self.cachefile:
130  pickle_write(self.cachefile, self.caches)
131 
132  def printStats(self):
133  for key,val in self.caches.items():
134  totAnswers=0
135  for item in val.values():
136  totAnswers+=len(item)
137  print >> sys.stderr, "cache", key, "has", len(self.caches[key]),
138  print >> sys.stderr, "question(s) and", totAnswers, "answer(s)"
139  if self.hits+self.misses == 0:
140  print >> sys.stderr, "No queries"
141  else:
142  print >> sys.stderr, self.hits, "hits,", self.misses, "misses",
143  print >> sys.stderr, "(%.1f%% hits)" % \
144  (self.hits/float(self.hits+self.misses)*100)
145 
146  def prune(self, now):
147  # I want this to be as fast as reasonably possible.
148  # If I didn't, I'd probably do various things differently
149  # Is there a faster way to do this?
150  allAnswers = []
151  for cache in self.caches.values():
152  for val in cache.values():
153  allAnswers += val
154 
155  allAnswers = sort_by_attr(allAnswers,"expiresAt")
156  allAnswers.reverse()
157 
158  while True:
159  if allAnswers[-1].expiresAt > now:
160  break
161  answer = allAnswers.pop()
162  c = self.caches[answer.qType]
163  c[answer.question].remove(answer)
164  if not c[answer.question]:
165  del c[answer.question]
166 
167  if options["globals", "verbose"]:
168  self.printStats()
169 
170  if len(allAnswers)<=kPruneDownTo:
171  return None
172 
173  # Expiring didn't get us down to the size we want, so delete
174  # some entries least-recently-used-wise. I'm not by any means
175  # sure that this is the best strategy, but as yet I don't have
176  # data to test different strategies.
177  allAnswers = sort_by_attr(allAnswers, "lastUsed")
178  allAnswers.reverse()
179  numToDelete = len(allAnswers)-kPruneDownTo
180  for _count in xrange(numToDelete):
181  answer = allAnswers.pop()
182  c = self.caches[answer.qType]
183  c[answer.question].remove(answer)
184  if not c[answer.question]:
185  del c[answer.question]
186 
187  return None
188 
189 
190  def formatForReturn(self, listOfObjs):
191  if len(listOfObjs) == 1 and listOfObjs[0].answer == None:
192  return []
193 
194  if listOfObjs[0].qType == "PTR" and self.returnSinglePTR:
195  return listOfObjs[0].answer
196 
197  return [ obj.answer for obj in listOfObjs ]
198 
199 
200  def lookup(self,question,qType="A"):
201  qType = qType.upper()
202  if qType not in ("A","PTR"):
203  raise ValueError,"Query type must be one of A, PTR"
204 
205  now = int(time.time())
206 
207  # Finding the len() of a dictionary isn't an expensive operation
208  # but doing it twice for every lookup isn't necessary.
209  self.pruneTicker += 1
210  if self.pruneTicker == kCheckForPruneEvery:
211  self.pruneTicker = 0
212  if len(self.caches["A"])+len(self.caches["PTR"])>kPruneThreshold:
213  self.prune(now)
214 
215  cacheToLookIn = self.caches[qType]
216 
217  try:
218  answers = cacheToLookIn[question]
219  except KeyError:
220  pass
221  else:
222  if answers:
223  ind = 0
224  # No guarantee that expire has already been done
225  while ind<len(answers):
226  thisAnswer = answers[ind]
227  if thisAnswer.expiresAt<now:
228  del answers[ind]
229  else:
230  thisAnswer.lastUsed = now
231  ind += 1
232  else:
233  print >> sys.stderr, "lookup failure:", question
234 
235  if not answers:
236  del cacheToLookIn[question]
237  else:
238  self.hits += 1
239  return self.formatForReturn(answers)
240 
241  # Not in cache or we just expired it
242  self.misses += 1
243 
244  if qType == "PTR":
245  qList = question.split(".")
246  qList.reverse()
247  queryQuestion = ".".join(qList)+".in-addr.arpa"
248  else:
249  queryQuestion = question
250 
251  # where do we get NXDOMAIN?
252  try:
253  reply = self.queryObj.req(queryQuestion, qtype=qType,
254  timeout=self.dnsTimeout)
255  except DNS.Base.DNSError,detail:
256  if detail.args[0] not in ("Timeout", "nothing to lookup"):
257  print >> sys.stderr, detail.args[0]
258  print >> sys.stderr, "Error, fixme", detail
259  print >> sys.stderr, "Question was", queryQuestion
260  print >> sys.stderr, "Original question was", question
261  print >> sys.stderr, "Type was", qType
262  objs = [lookupResult(qType, None, question,
263  self.cacheErrorSecs+now, now)]
264  cacheToLookIn[question] = objs # Add to format for return?
265  return self.formatForReturn(objs)
266  except socket.gaierror,detail:
267  print >> sys.stderr, "DNS connection failure:", self.queryObj.ns, detail
268  print >> sys.stderr, "Defaults:", DNS.defaults
269 
270  objs = []
271  for answer in reply.answers:
272  if answer["typename"] == qType:
273  # PyDNS returns TTLs as longs but RFC 1035 says that the TTL
274  # value is a signed 32-bit value and must be positive, so it
275  # should be safe to coerce it to a Python integer. And
276  # anyone who sets a time to live of more than 2^31-1 seconds
277  # (68 years and change) is drunk. Arguably, I ought to
278  # impose a maximum rather than continuing with longs
279  # (int(long) returns long in recent versions of Python).
280  ttl = max(min(int(answer["ttl"]), kMaxTTL), kMinTTL)
281  # RFC 2308 says that you should cache an NXDOMAIN for the
282  # minimum of the minimum field of the SOA record and the TTL
283  # of the SOA.
284  if ttl > 0:
285  item = lookupResult(qType, answer["data"], question,
286  ttl+now, now)
287  objs.append(item)
288 
289  if objs:
290  cacheToLookIn[question] = objs
291  return self.formatForReturn(objs)
292 
293  # Probably SERVFAIL or the like
294  if not reply.authority:
295  objs = [lookupResult(qType, None, question,
296  self.cacheErrorSecs+now, now)]
297  cacheToLookIn[question] = objs
298  return self.formatForReturn(objs)
299 
300 
301  # No such host
302  #
303  # I don't know in what circumstances you'd have more than one authority,
304  # so I'll just assume that the first is what we want.
305  #
306  # RFC 2308 specifies that this how to decide how long to cache an
307  # NXDOMAIN.
308  auth = reply.authority[0]
309  auTTL = int(auth["ttl"])
310  for item in auth["data"]:
311  if type(item) == types.TupleType and item[0] == "minimum":
312  auMin = int(item[1])
313  cacheNeg = min(auMin,auTTL)
314  break
315  else:
316  cacheNeg = auTTL
317  objs = [lookupResult(qType, None, question, cacheNeg+now, now)]
318 
319  cacheToLookIn[question] = objs
320  return self.formatForReturn(objs)
321 
322 
323 def main():
324  import transaction
325  c = cache(cachefile=os.path.expanduser("~/.dnscache"))
326  c.printStatsAtEnd = True
327  for host in ["www.python.org", "www.timsbloggers.com",
328  "www.seeputofor.com", "www.completegarbage.tv",
329  "www.tradelinkllc.com"]:
330  print >> sys.stderr, "checking", host
331  now = time.time()
332  ips = c.lookup(host)
333  print >> sys.stderr, ips, time.time()-now
334  now = time.time()
335  ips = c.lookup(host)
336  print >> sys.stderr, ips, time.time()-now
337 
338  if ips:
339  ip = ips[0]
340  now = time.time()
341  name = c.lookup(ip, qType="PTR")
342  print >> sys.stderr, name, time.time()-now
343  now = time.time()
344  name = c.lookup(ip, qType="PTR")
345  print >> sys.stderr, name, time.time()-now
346  else:
347  print >> sys.stderr, "unknown"
348 
349  c.close()
350 
351  return None
352 
353 if __name__ == "__main__":
354  main()