spambayes  1.1a6
About: SpamBayes - a Bayesian anti-spam filter/classifier (written in Python)
  Fossies Dox: spambayes-1.1a6.zip  ("inofficial" and yet experimental doxygen-generated source code documentation)  

 All Classes Namespaces Files Functions Variables Properties Macros
storage.py
Go to the documentation of this file.
1 #! /usr/bin/env python
2 
3 '''storage.py - Spambayes database management framework.
4 
5 Classes:
6  PickledClassifier - Classifier that uses a pickle db
7  DBDictClassifier - Classifier that uses a shelve db
8  PGClassifier - Classifier that uses postgres
9  mySQLClassifier - Classifier that uses mySQL
10  CDBClassifier - Classifier that uses CDB
11  ZODBClassifier - Classifier that uses ZODB
12  ZEOClassifier - Classifier that uses ZEO
13  Trainer - Classifier training observer
14  SpamTrainer - Trainer for spam
15  HamTrainer - Trainer for ham
16 
17 Abstract:
18  *Classifier are subclasses of Classifier (classifier.Classifier)
19  that add automatic state store/restore function to the Classifier class.
20  All SQL based classifiers are subclasses of SQLClassifier, which is a
21  subclass of Classifier.
22 
23  PickledClassifier is a Classifier class that uses a cPickle
24  datastore. This database is relatively small, but slower than other
25  databases.
26 
27  DBDictClassifier is a Classifier class that uses a database
28  store.
29 
30  Trainer is a concrete class that observes a Corpus and trains a
31  Classifier object based upon movement of messages between corpora. When
32  an add message notification is received, the trainer trains the
33  database with the message, as spam or ham as appropriate given the
34  type of trainer (spam or ham). When a remove message notification
35  is received, the trainer untrains the database as appropriate.
36 
37  SpamTrainer and HamTrainer are convenience subclasses of Trainer, that
38  initialize as the appropriate type of Trainer.
39 
40 To Do:
41  o Suggestions?
42 
43  '''
44 
45 # This module is part of the spambayes project, which is Copyright 2002-2007
46 # The Python Software Foundation and is covered by the Python Software
47 # Foundation license.
48 
49 ### Note to authors - please direct all prints to sys.stderr. In some
50 ### situations prints to sys.stdout will garble the message (e.g., in
51 ### hammiefilter).
52 
53 __author__ = ("Neale Pickett <neale@woozle.org>,"
54  "Tim Stone <tim@fourstonesExpressions.com>")
55 __credits__ = "All the spambayes contributors."
56 
57 import os
58 import sys
59 import time
60 import types
61 import tempfile
62 from spambayes import classifier
63 from spambayes.Options import options, get_pathname_option
64 import errno
65 import shelve
66 from spambayes import cdb
67 from spambayes import dbmstorage
68 from spambayes.safepickle import pickle_write, pickle_read
69 
# Make shelve use binary pickles by default: keep a handle on the stock
# Pickler and install a thin wrapper whose protocol argument defaults to 1.
oldShelvePickler = shelve.Pickler


def binaryDefaultPickler(f, binary=1):
    '''Return a shelve Pickler for file `f`, binary unless told otherwise.'''
    pickler = oldShelvePickler(f, binary)
    return pickler


shelve.Pickler = binaryDefaultPickler
75 
76 PICKLE_TYPE = 1
77 NO_UPDATEPROBS = False # Probabilities will not be autoupdated with training
78 UPDATEPROBS = True # Probabilities will be autoupdated with training
79 
81  '''Classifier object persisted in a pickle'''
82 
def __init__(self, db_name):
    '''Constructor(pickle file name)

    Sets up the base Classifier, remembers the pickle location and
    loads whatever state can be found there.
    '''
    classifier.Classifier.__init__(self)
    self.db_name = db_name
    self.load()
87 
def load(self):
    '''Load this instance from the pickle.

    If the pickle cannot be read (for whatever reason) the classifier
    starts out empty: no words and zero ham and spam counts.
    '''
    # This is a bit strange, because the loading process creates a
    # temporary instance of PickledClassifier, from which this object's
    # state is copied.  Nothing non-trivial is actually copied, though:
    # assignment merely copies a pointer.  The actual wordinfo etc
    # objects are shared between tempbayes and self, and the tiny
    # tempbayes object is reclaimed when load() returns.
    verbose = options["globals", "verbose"]
    if verbose:
        print >> sys.stderr, 'Loading state from', self.db_name, 'pickle'

    # Reading is deliberately best-effort: any failure means "start
    # fresh".  Catch Exception rather than using a bare except so that
    # SystemExit and KeyboardInterrupt are not swallowed on Pythons
    # where they do not derive from Exception.
    try:
        tempbayes = pickle_read(self.db_name)
    except Exception:
        tempbayes = None

    if tempbayes:
        # Copy state from tempbayes.  The use of our base-class
        # __setstate__ is forced, in case self is of a subclass of
        # PickledClassifier that overrides __setstate__.
        classifier.Classifier.__setstate__(self, tempbayes.__getstate__())
        if verbose:
            print >> sys.stderr, ('%s is an existing pickle,'
                                  ' with %d ham and %d spam') \
                  % (self.db_name, self.nham, self.nspam)
    else:
        # new pickle
        if verbose:
            print >> sys.stderr, self.db_name,'is a new pickle'
        self.wordinfo = {}
        self.nham = 0
        self.nspam = 0
125 
def store(self):
    '''Write this classifier out to its pickle file.'''
    verbose = options["globals", "verbose"]
    if verbose:
        print >> sys.stderr, 'Persisting', self.db_name, 'as a pickle'

    target = self.db_name
    pickle_write(target, self, PICKLE_TYPE)
133 
def close(self):
    '''Nothing to release: no resources are kept open between calls.'''
    return None
137 
138 # Values for our changed words map
139 WORD_DELETED = "D"
140 WORD_CHANGED = "C"
141 
142 STATE_KEY = 'saved state'
143 
145  '''Classifier object persisted in a caching database'''
146 
def __init__(self, db_name, mode='c'):
    '''Constructor(database name, open mode)

    `mode` is handed to dbmstorage.open() when the database is loaded.
    '''
    classifier.Classifier.__init__(self)
    self.statekey = STATE_KEY
    self.mode = mode
    self.db_name = db_name
    self.load()
155 
def close(self):
    '''Close the shelf and the underlying dbm, then drop both handles.

    Safe to call more than once: handles that have already been
    dropped are simply skipped.
    '''
    # Better not assume all databases have close functions!
    def noop():
        pass

    # Look the handles up defensively.  The attributes are deleted
    # below, so an unguarded self.db here would raise AttributeError on
    # a second close().
    for attr in ("db", "dbm"):
        handle = getattr(self, attr, None)
        getattr(handle, "close", noop)()

    # There should not be a need to drop the 'dbm' or 'db' attributes,
    # but we do anyway, because it makes it more clear what has gone
    # wrong if we try to keep using the database after we have closed it.
    for attr in ("db", "dbm"):
        if hasattr(self, attr):
            delattr(self, attr)

    if options["globals", "verbose"]:
        print >> sys.stderr, 'Closed', self.db_name, 'database'
173 
def load(self):
    '''Open the database and read the saved message counts from it.

    Raises ValueError if the stored state was written with a pickle
    version other than classifier.PICKLE_VERSION.
    '''
    verbose = options["globals", "verbose"]
    if verbose:
        print >> sys.stderr, 'Loading state from', self.db_name, 'database'

    self.dbm = dbmstorage.open(self.db_name, self.mode)
    self.db = shelve.Shelf(self.dbm)

    if not self.db.has_key(self.statekey):
        # new database
        if verbose:
            print >> sys.stderr, self.db_name,'is a new database'
        self.nspam = 0
        self.nham = 0
    else:
        saved = self.db[self.statekey]
        version = saved[0]
        if version != classifier.PICKLE_VERSION:
            raise ValueError("Can't unpickle -- version %s unknown" % version)
        self.nspam, self.nham = saved[1:]

        if verbose:
            print >> sys.stderr, ('%s is an existing database,'
                                  ' with %d spam and %d ham') \
                  % (self.db_name, self.nspam, self.nham)

    # The in-memory cache starts out empty; words are pulled in on demand.
    self.wordinfo = {}
    self.changed_words = {} # value may be one of the WORD_ constants
201 
def store(self):
    '''Place state into persistent store.

    Flushes every word recorded in self.changed_words to the database,
    writes the state key and syncs the shelf.  Raises RuntimeError if a
    changed-word flag is neither WORD_CHANGED nor WORD_DELETED.
    '''
    if options["globals", "verbose"]:
        print >> sys.stderr, 'Persisting', self.db_name,
        print >> sys.stderr, 'state in database'

    # Iterate over our changed word list.
    # This is *not* thread-safe - another thread changing our
    # changed_words could mess us up a little.  Possibly a little
    # lock while we copy and reset self.changed_words would be appropriate.
    # For now, just do it the naive way.
    for key, flag in self.changed_words.iteritems():
        # Compare flags by value: identity comparison of strings only
        # works by accident of interning.
        if flag == WORD_CHANGED:
            val = self.wordinfo[key]
            self.db[key] = val.__getstate__()
        elif flag == WORD_DELETED:
            assert key not in self.wordinfo, \
                   "Should not have a wordinfo for words flagged for delete"
            # Word may be deleted before it was ever written.
            try:
                del self.db[key]
            except KeyError:
                pass
        else:
            raise RuntimeError("Unknown flag value")

    # Reset the changed word list.
    self.changed_words = {}
    # Update the global state, then do the actual save.
    self._write_state_key()
    self.db.sync()
234 
def _write_state_key(self):
    '''Record (pickle version, nspam, nham) under the state key.'''
    state = (classifier.PICKLE_VERSION, self.nspam, self.nham)
    self.db[self.statekey] = state
238 
239  def _post_training(self):
240  """This is called after training on a wordstream. We ensure that the
241  database is in a consistent state at this point by writing the state
242  key."""
243  self._write_state_key()
244 
def _wordinfoget(self, word):
    '''Return the record for `word`, or None if the word is unknown.

    The in-memory cache is consulted first; on a miss the record is read
    from the database (unless the word is flagged as deleted) and cached.
    '''
    if isinstance(word, unicode):
        word = word.encode("utf-8")
    try:
        return self.wordinfo[word]
    except KeyError:
        ret = None
        # Compare by value rather than identity: "is" on strings only
        # works by accident of interning.
        if self.changed_words.get(word) != WORD_DELETED:
            r = self.db.get(word)
            if r:
                ret = self.WordInfoClass()
                ret.__setstate__(r)
                self.wordinfo[word] = ret
        return ret
259 
def _wordinfoset(self, word, record):
    '''Store `record` for `word`, by-passing the cache for singletons.

    "Singleton" words (i.e. words that only have a single instance)
    take up more than 1/2 of the database, but are rarely used, so they
    are written directly to the database instead of being kept in the
    wordinfo cache.  If the word occurs again it is brought back in and
    is never a singleton again.  This seems to reduce the memory
    footprint of the DBDictClassifier by as much as 60%, and also
    reduces the time it takes to store the database.
    '''
    if isinstance(word, unicode):
        word = word.encode("utf-8")

    total = record.spamcount + record.hamcount
    if total > 1:
        self.wordinfo[word] = record
        self.changed_words[word] = WORD_CHANGED
        return

    self.db[word] = record.__getstate__()
    # Either entry may legitimately be absent, e.g. when a new word is
    # trained as ham twice, then untrained once, all before a store().
    self.changed_words.pop(word, None)
    self.wordinfo.pop(word, None)
289 
def _wordinfodel(self, word):
    '''Drop `word` from the cache and flag it for deletion on store().'''
    key = word
    if isinstance(key, unicode):
        key = key.encode("utf-8")
    del self.wordinfo[key]
    self.changed_words[key] = WORD_DELETED
295 
296  def _wordinfokeys(self):
297  wordinfokeys = self.db.keys()
298  del wordinfokeys[wordinfokeys.index(self.statekey)]
299  return wordinfokeys
300 
301 
def __init__(self, db_name):
    '''Constructor(database name)

    Loading is delegated to load(), which subclasses must provide.
    '''
    classifier.Classifier.__init__(self)
    self.statekey = STATE_KEY
    self.db_name = db_name
    self.load()
310 
def close(self):
    '''Release all database resources.

    A no-op by default: as we (presumably) aren't as constrained as we
    are by file locking, sub-classes are not forced to override this.
    '''
    return None
316 
def load(self):
    '''Load state from the database (abstract - subclasses implement).'''
    raise NotImplementedError("must be implemented in subclass")
320 
def store(self):
    '''Save state to the database: the counts go in the state-key row.'''
    spam_total, ham_total = self.nspam, self.nham
    self._set_row(self.statekey, spam_total, ham_total)
324 
def cursor(self):
    '''Return a new db cursor (abstract - subclasses implement).'''
    raise NotImplementedError("must be implemented in subclass")
328 
def fetchall(self, c):
    '''Return all rows from cursor `c` (abstract - subclasses implement).'''
    raise NotImplementedError("must be implemented in subclass")
332 
def commit(self, c):
    '''Commit the current transaction - may commit at db or cursor.

    Abstract: subclasses implement.
    '''
    raise NotImplementedError("must be implemented in subclass")
336 
def create_bayes(self):
    '''Create a new bayes table from self.table_definition and commit.'''
    cur = self.cursor()
    ddl = self.table_definition
    cur.execute(ddl)
    self.commit(cur)
342 
343  def _get_row(self, word):
344  '''Return row matching word'''
345  try:
346  c = self.cursor()
347  c.execute("select * from bayes"
348  " where word=%s",
349  (word,))
350  except Exception, e:
351  print >> sys.stderr, "error:", (e, word)
352  raise
353  rows = self.fetchall(c)
354 
355  if rows:
356  return rows[0]
357  else:
358  return {}
359 
360  def _set_row(self, word, nspam, nham):
361  c = self.cursor()
362  if self._has_key(word):
363  c.execute("update bayes"
364  " set nspam=%s,nham=%s"
365  " where word=%s",
366  (nspam, nham, word))
367  else:
368  c.execute("insert into bayes"
369  " (nspam, nham, word)"
370  " values (%s, %s, %s)",
371  (nspam, nham, word))
372  self.commit(c)
373 
374  def _delete_row(self, word):
375  c = self.cursor()
376  c.execute("delete from bayes"
377  " where word=%s",
378  (word,))
379  self.commit(c)
380 
381  def _has_key(self, key):
382  c = self.cursor()
383  c.execute("select word from bayes"
384  " where word=%s",
385  (key,))
386  return len(self.fetchall(c)) > 0
387 
def _wordinfoget(self, word):
    '''Return a word record built from the database row for `word`.

    When there is no row a fresh, empty record is returned.
    '''
    # NOTE(review): the other _wordinfoget implementations in this
    # module return None for an unknown word - confirm which of the two
    # the base Classifier expects.
    if isinstance(word, unicode):
        word = word.encode("utf-8")

    row = self._get_row(word)
    item = self.WordInfoClass()
    if row:
        item.__setstate__((row["nspam"], row["nham"]))
    return item
399 
def _wordinfoset(self, word, record):
    '''Write the spam and ham counts of `record` to the row for `word`.'''
    key = word
    if isinstance(key, unicode):
        key = key.encode("utf-8")
    self._set_row(key, record.spamcount, record.hamcount)
404 
def _wordinfodel(self, word):
    '''Remove the row for `word` from the database.'''
    key = word
    if isinstance(key, unicode):
        key = key.encode("utf-8")
    self._delete_row(key)
409 
410  def _wordinfokeys(self):
411  c = self.cursor()
412  c.execute("select word from bayes")
413  rows = self.fetchall(c)
414  return [r[0] for r in rows]
415 
416 
418  '''Classifier object persisted in a Postgres database'''
def __init__(self, db_name):
    '''Constructor(database name)

    The table definition is set before the base constructor runs, since
    loading may need it to create the bayes table.
    '''
    columns = (" word bytea not null default '',"
               " nspam integer not null default 0,"
               " nham integer not null default 0,"
               " primary key(word)")
    self.table_definition = "create table bayes (" + columns + ")"
    SQLClassifier.__init__(self, db_name)
427 
def cursor(self):
    '''Return a new cursor on the open connection.'''
    connection = self.db
    return connection.cursor()
430 
def fetchall(self, c):
    '''Return everything cursor `c` yields from dictfetchall().'''
    rows = c.dictfetchall()
    return rows
433 
def commit(self, _c):
    '''Commit on the connection; the cursor argument is not used.'''
    connection = self.db
    connection.commit()
436 
def load(self):
    '''Connect to Postgres and read the saved message counts.

    The bayes table is created when querying it fails with a
    ProgrammingError.
    '''
    import psycopg

    verbose = options["globals", "verbose"]
    if verbose:
        print >> sys.stderr, 'Loading state from', self.db_name, 'database'

    self.db = psycopg.connect('dbname=' + self.db_name)

    probe = self.cursor()
    try:
        probe.execute("select count(*) from bayes")
    except psycopg.ProgrammingError:
        # Roll the failed statement back before creating the table.
        self.db.rollback()
        self.create_bayes()

    if not self._has_key(self.statekey):
        # new database
        if verbose:
            print >> sys.stderr, self.db_name,'is a new database'
        self.nspam = 0
        self.nham = 0
        return

    row = self._get_row(self.statekey)
    self.nspam = row["nspam"]
    self.nham = row["nham"]
    if verbose:
        print >> sys.stderr, ('%s is an existing database,'
                              ' with %d spam and %d ham') \
              % (self.db_name, self.nspam, self.nham)
468 
469 
471  '''Classifier object persisted in a mySQL database
472 
473  It is assumed that the database already exists, and that the mySQL
474  server is currently running.'''
475 
def __init__(self, data_source_name):
    '''Constructor(data source name)

    `data_source_name` is a whitespace separated list of settings, each
    starting with one of host, user, pass, dbname or charset followed by
    a one character separator and the value.  Settings that are not
    given keep their defaults.
    '''
    self.table_definition = ("create table bayes ("
                             " word varchar(255) not null default '',"
                             " nspam integer not null default 0,"
                             " nham integer not null default 0,"
                             " primary key(word)"
                             ");")
    settings = {"host": "localhost",
                "user": "root",
                "pass": "",
                "dbname": "spambayes",
                "charset": None}
    prefixes = ("host", "user", "pass", "dbname", "charset")
    for info in data_source_name.split():
        for prefix in prefixes:
            if info.startswith(prefix):
                # Skip the prefix and the separator that follows it.
                settings[prefix] = info[len(prefix) + 1:]
                break
    self.host = settings["host"]
    self.username = settings["user"]
    self.password = settings["pass"]
    self.charset = settings["charset"]
    SQLClassifier.__init__(self, settings["dbname"])
501 
def cursor(self):
    '''Return a new cursor on the open connection.'''
    connection = self.db
    return connection.cursor()
504 
def fetchall(self, c):
    '''Return everything cursor `c` yields from fetchall().'''
    rows = c.fetchall()
    return rows
507 
def commit(self, _c):
    '''Commit on the connection; the cursor argument is not used.'''
    connection = self.db
    connection.commit()
510 
def load(self):
    '''Connect to mySQL and read the saved message counts.

    The bayes table is created when querying it fails with a
    ProgrammingError.
    '''
    import MySQLdb

    verbose = options["globals", "verbose"]
    if verbose:
        print >> sys.stderr, 'Loading state from', self.db_name, 'database'

    self.db = MySQLdb.connect(host=self.host, db=self.db_name,
                              user=self.username, passwd=self.password,
                              charset=self.charset)

    probe = self.cursor()
    try:
        probe.execute("select count(*) from bayes")
    except MySQLdb.ProgrammingError:
        try:
            self.db.rollback()
        except MySQLdb.NotSupportedError:
            # Server doesn't support rollback, so just assume that
            # we can keep going and create the db.  This should only
            # happen once, anyway.
            pass
        self.create_bayes()

    if not self._has_key(self.statekey):
        # new database
        if verbose:
            print >> sys.stderr, self.db_name,'is a new database'
        self.nspam = 0
        self.nham = 0
        return

    # Rows are read positionally: index 1 is nspam, index 2 is nham.
    row = self._get_row(self.statekey)
    self.nspam = int(row[1])
    self.nham = int(row[2])
    if verbose:
        print >> sys.stderr, ('%s is an existing database,'
                              ' with %d spam and %d ham') \
              % (self.db_name, self.nspam, self.nham)
553 
def _wordinfoget(self, word):
    '''Return the record for `word`, or None when it has no row.'''
    if isinstance(word, unicode):
        word = word.encode("utf-8")

    row = self._get_row(word)
    if not row:
        return None

    # Rows are read positionally: index 1 is nspam, index 2 is nham.
    item = self.WordInfoClass()
    item.__setstate__((row[1], row[2]))
    return item
565 
566 
568  """A classifier that uses a CDB database.
569 
570  A CDB wordinfo database is quite small and fast but is slow to update.
571  It is appropriate if training is done rarely (e.g. monthly or weekly
572  using archived ham and spam).
573  """
def __init__(self, db_name):
    '''Constructor(CDB file name)'''
    classifier.Classifier.__init__(self)
    self.db_name = db_name
    self.statekey = STATE_KEY
    self.load()
579 
def _WordInfoFactory(self, counts):
    '''Build a WordInfo from a "ham,spam" comma-delimited string.

    For whatever reason, WordInfo's cannot be created with constructor
    ham/spam counts, so the attributes are filled in here.  The string
    form is accepted because that is what the CDB hands us.
    '''
    ham_text, spam_text = counts.split(',')
    info = classifier.WordInfo()
    info.hamcount = int(ham_text)
    info.spamcount = int(spam_text)
    return info
591 
592  # Stolen from sb_dbexpimp.py
593  # Heaven only knows what encoding non-ASCII stuff will be in
594  # Try a few common western encodings and punt if they all fail
def uunquote(self, s):
    '''Decode `s` with the first candidate encoding that succeeds.

    utf-8, cp1252 and iso-8859-1 are tried in that order; if none of
    them works `s` is returned unchanged.
    '''
    candidates = ("utf-8", "cp1252", "iso-8859-1")
    for codec in candidates:
        try:
            decoded = unicode(s, codec)
        except UnicodeDecodeError:
            continue
        return decoded
    # punt
    return s
603 
def load(self):
    '''Read all state from the CDB file, or start empty if it is absent.

    The whole file is read into memory in one go; the state key holds
    "nham,nspam" and every other key is a word whose value is parsed by
    _WordInfoFactory().
    '''
    verbose = options["globals", "verbose"]
    if not os.path.exists(self.db_name):
        if verbose:
            print >> sys.stderr, self.db_name, 'is a new CDB'
        self.wordinfo = {}
        self.nham = 0
        self.nspam = 0
        return

    db = open(self.db_name, "rb")
    try:
        # Make sure the handle is released even if the file turns out
        # not to be a readable CDB.
        data = dict(cdb.Cdb(db))
    finally:
        db.close()

    self.nham, self.nspam = [int(i) for i in
                             data[self.statekey].split(',')]
    self.wordinfo = dict([(self.uunquote(k), self._WordInfoFactory(v))
                          for k, v in data.iteritems()
                          if k != self.statekey])
    if verbose:
        print >> sys.stderr, ('%s is an existing CDB,'
                              ' with %d ham and %d spam') \
              % (self.db_name, self.nham, self.nspam)
626 
def store(self):
    '''Rewrite the whole CDB file from the in-memory state.

    The state key is written first as "nham,nspam", followed by one
    "hamcount,spamcount" entry per word.  Unicode words are stored
    utf-8 encoded.
    '''
    items = [(self.statekey, "%d,%d" % (self.nham, self.nspam))]
    for word, wi in self.wordinfo.iteritems():
        if isinstance(word, types.UnicodeType):
            word = word.encode("utf-8")
        items.append((word, "%d,%d" % (wi.hamcount, wi.spamcount)))

    db = open(self.db_name, "wb")
    try:
        # Always release the handle, even when writing fails part way.
        cdb.cdb_make(db, items)
    finally:
        db.close()
636 
def close(self):
    '''Nothing to release: no resources are kept open between calls.'''
    return None
640 
641 
642 # If ZODB isn't available, then this class won't be useable, but we
643 # still need to be able to import this module. So we pretend that all
644 # is ok.
645 try:
646  from persistent import Persistent
647 except ImportError:
648  try:
649  from ZODB import Persistent
650  except ImportError:
651  Persistent = object
652 
def __init__(self):
    '''Set up the base Classifier with a BTree-backed wordinfo mapping.'''
    # Imported here rather than at module level so that this module can
    # still be imported when ZODB is not installed.
    import ZODB
    from BTrees.OOBTree import OOBTree

    classifier.Classifier.__init__(self)
    self.wordinfo = OOBTree()
660 
class ZODBClassifier(object):
    '''Classifier facade whose state is persisted in a ZODB FileStorage.

    The real classifier object lives in the root of the ZODB database
    under the base name of the database file; attribute access that
    fails on this object is forwarded to it.
    '''
    # Allow subclasses to override classifier class.
    ClassifierClass = _PersistentClassifier

    def __init__(self, db_name, mode='c'):
        '''Constructor(database file name, mode)

        A mode of 'r' opens the storage read-only.
        '''
        self.db_filename = db_name
        self.db_name = os.path.basename(db_name)
        self.closed = True
        self.mode = mode
        self.load()

    def __getattr__(self, att):
        '''Forward unknown attributes to the wrapped classifier.'''
        # We pretend that we are a classifier subclass.
        # Look in the instance dict directly: hasattr(self, "classifier")
        # would re-enter __getattr__ whenever the classifier is missing
        # (e.g. after close()) and recurse until the recursion limit.
        if "classifier" in self.__dict__:
            wrapped = self.__dict__["classifier"]
            if hasattr(wrapped, att):
                return getattr(wrapped, att)
        raise AttributeError("ZODBClassifier object has no attribute '%s'"
                             % (att,))

    def __setattr__(self, att, value):
        '''Set nham/nspam on the wrapped classifier, all else on self.'''
        # For some attributes, we change the classifier instead.  See
        # __getattr__ for why the instance dict is checked directly.
        if att in ("nham", "nspam") and "classifier" in self.__dict__:
            setattr(self.__dict__["classifier"], att, value)
        else:
            object.__setattr__(self, att, value)

    def create_storage(self):
        '''Open the FileStorage for self.db_filename as self.storage.

        An IOError is reported on stderr and re-raised.
        '''
        from ZODB.FileStorage import FileStorage
        try:
            self.storage = FileStorage(self.db_filename,
                                       read_only=self.mode=='r')
        except IOError:
            # Print the message itself rather than the repr of a tuple.
            print >> sys.stderr, "Could not create FileStorage from", \
                  self.db_filename
            raise

    def load(self):
        '''Load state from database'''
        import ZODB

        if options["globals", "verbose"]:
            print >> sys.stderr, "Loading state from %s (%s) database" % \
                  (self.db_filename, self.db_name)

        # If we are not closed, then we need to close first before we
        # reload.
        if not self.closed:
            self.close()

        self.create_storage()
        self.DB = ZODB.DB(self.storage, cache_size=10000)
        self.conn = self.DB.open()
        root = self.conn.root()

        self.classifier = root.get(self.db_name)
        if self.classifier is None:
            # There is no classifier, so create one.
            if options["globals", "verbose"]:
                print >> sys.stderr, self.db_name, 'is a new ZODB'
            self.classifier = root[self.db_name] = self.ClassifierClass()
        else:
            if options["globals", "verbose"]:
                print >> sys.stderr, '%s is an existing ZODB, with %d ' \
                      'ham and %d spam' % (self.db_name, self.nham,
                                           self.nspam)
        self.closed = False

    def store(self):
        '''Place state into persistent store'''
        # Use ZODB.Transaction when it can be imported, and fall back to
        # the separate transaction module otherwise.
        try:
            import ZODB.Transaction
        except ImportError:
            import transaction
            commit = transaction.commit
            abort = transaction.abort
        else:
            commit = ZODB.Transaction.get_transaction().commit
            abort = ZODB.Transaction.get_transaction().abort
        from ZODB.POSException import ConflictError
        try:
            from ZODB.POSException import TransactionFailedError
        except ImportError:
            # Only a missing name is expected here; anything else is a
            # real problem and should not be hidden.
            from ZODB.POSException import TransactionError as TransactionFailedError
        from ZODB.POSException import ReadOnlyError

        assert not self.closed, "Can't store a closed database"

        if options["globals", "verbose"]:
            print >> sys.stderr, 'Persisting', self.db_name, 'state in database'

        try:
            commit()
        except ConflictError:
            # We'll save it next time, or on close.  It'll be lost if we
            # hard-crash, but that's unlikely, and not a particularly big
            # deal.
            if options["globals", "verbose"]:
                print >> sys.stderr, "Conflict on commit", self.db_name
            abort()
        except TransactionFailedError:
            # Saving isn't working.  Try to abort, but chances are that
            # restarting is needed.
            print >> sys.stderr, "Storing failed. Need to restart.", \
                  self.db_name
            abort()
        except ReadOnlyError:
            print >> sys.stderr, "Can't store transaction to read-only db."
            abort()

    def close(self, pack=True, retain_backup=True):
        '''Save (unless read-only), optionally pack, and close the database.

        `pack` and `retain_backup` are passed on to pack(); packing is
        skipped for read-only databases.
        '''
        # Ensure that the db is saved before closing.  Alternatively, we
        # could abort any waiting transaction.  We need to do *something*
        # with it, though, or it will be still around after the db is
        # closed and cause problems.  For now, saving seems to make sense
        # (and we can always add abort methods if they are ever needed).
        if self.mode != 'r':
            self.store()

        # We don't make any use of the 'undo' capabilities of the
        # FileStorage at the moment, so might as well pack the database
        # each time it is closed, to save as much disk space as possible.
        # Pack it up to where it was 'yesterday'.
        if pack and self.mode != 'r':
            self.pack(time.time()-60*60*24, retain_backup)

        # Do the closing.
        self.DB.close()
        self.storage.close()

        # Ensure that we cannot continue to use this classifier.
        delattr(self, "classifier")

        self.closed = True
        if options["globals", "verbose"]:
            print >> sys.stderr, 'Closed', self.db_name, 'database'

    def pack(self, t, retain_backup=True):
        """Like FileStorage pack(), but optionally remove the .old
        backup file that is created.  Often for our purposes we do
        not care about being able to recover from this.  Also
        ignore the referencesf parameter, which appears to not do
        anything."""
        if hasattr(self.storage, "pack"):
            self.storage.pack(t, None)
        if not retain_backup:
            old_name = self.db_filename + ".old"
            if os.path.exists(old_name):
                os.remove(old_name)
808 
809 
def __init__(self, data_source_name):
    '''Constructor(data source name)

    `data_source_name` is a whitespace separated list of settings, each
    starting with one of host, port, dbname, user, pass, storage_name,
    wait_timeout or wait, followed by a one character separator and the
    value.  Settings that are not given keep their defaults.
    '''
    self.host = "localhost"
    self.port = None
    self.username = ''
    self.password = ''
    self.storage_name = '1'
    self.wait = None
    self.wait_timeout = None
    db_name = "SpamBayes"

    for item in data_source_name.split():
        if item.startswith("host"):
            try:
                # ZEO only accepts strings, not unicode.
                self.host = str(item[5:])
            except UnicodeDecodeError:
                print >> sys.stderr, "Couldn't set host", \
                      item[5:], str(sys.exc_info()[1])
        elif item.startswith("port"):
            self.port = int(item[5:])
        elif item.startswith("dbname"):
            db_name = item[7:]
        elif item.startswith("user"):
            self.username = item[5:]
        elif item.startswith("pass"):
            self.password = item[5:]
        elif item.startswith("storage_name"):
            self.storage_name = item[13:]
        # "wait_timeout" has to be tested before its prefix "wait".
        elif item.startswith("wait_timeout"):
            self.wait_timeout = int(item[13:])
        elif item.startswith("wait"):
            self.wait = item[5:] == "True"

    ZODBClassifier.__init__(self, db_name)
844 
def create_storage(self):
    '''Connect to the ZEO server and keep the result as self.storage.

    A persistent client cache in the temp directory is tried first; if
    that fails with ValueError the cache file is removed and the
    connection is retried without a persistent cache.
    '''
    from ZEO.ClientStorage import ClientStorage

    addr = self.host
    if self.port:
        addr = self.host, self.port

    if options["globals", "verbose"]:
        # NOTE(review): this echoes the password to stderr.
        print >> sys.stderr, "Connecting to ZEO server", addr, \
              self.username, self.password

    # Arguments shared by both connection attempts.
    common = {"name": self.db_name,
              "read_only": self.mode=='r',
              "username": self.username,
              "wait": self.wait,
              "wait_timeout": self.wait_timeout,
              "storage": self.storage_name,
              "password": self.password}

    # Use persistent caches, with the cache in the temp directory.
    # If the temp directory is cleared out, we lose the cache, but
    # that doesn't really matter, and we should always be able to
    # write to it.
    try:
        self.storage = ClientStorage(addr, client=self.db_name,
                                     var=tempfile.gettempdir(), **common)
    except ValueError:
        # Probably bad cache; remove it and try without the cache.
        cache_name = self.db_name + self.storage_name + ".zec"
        try:
            os.remove(os.path.join(tempfile.gettempdir(), cache_name))
        except OSError:
            pass
        self.storage = ClientStorage(addr, **common)
883 
def is_connected(self):
    '''Return whatever the underlying storage reports for is_connected().'''
    storage = self.storage
    return storage.is_connected()
886 
887 
888 # Flags that the Trainer will recognise. These should be or'able integer
889 # values (i.e. 1, 2, 4, 8, etc.).
890 NO_TRAINING_FLAG = 1
891 
class Trainer(object):
    '''Associates a Classifier object and one or more Corpora, \
    is an observer of the corpora'''

    def __init__(self, bayes, is_spam, updateprobs=NO_UPDATEPROBS):
        '''Constructor(Classifier, is_spam(True|False),
        updateprobs(True|False)'''
        self.bayes = bayes
        self.is_spam = is_spam
        # Stored only; nothing in this class reads it back.
        self.updateprobs = updateprobs

    def onAddMessage(self, message, flags=0):
        '''A message is being added to an observed corpus.'''
        if flags & NO_TRAINING_FLAG:
            return
        self.train(message)

    def train(self, message):
        '''Train the database with the message'''
        verbose = options["globals", "verbose"]
        if verbose:
            print >> sys.stderr, 'training with ', message.key()

        tokens = message.tokenize()
        self.bayes.learn(tokens, self.is_spam)
        message.setId(message.key())
        message.RememberTrained(self.is_spam)

    def onRemoveMessage(self, message, flags=0):
        '''A message is being removed from an observed corpus.'''
        # If a message is being expired from the corpus, we do
        # *NOT* want to untrain it, because that's not what's happening.
        # If this is the case, then flags will include NO_TRAINING_FLAG.
        # There are no other flags we currently use.
        if flags & NO_TRAINING_FLAG:
            return
        self.untrain(message)

    def untrain(self, message):
        '''Untrain the database with the message'''
        verbose = options["globals", "verbose"]
        if verbose:
            print >> sys.stderr, 'untraining with', message.key()

        # unlearn() can raise ValueError if database is fouled.  If this
        # is the case, then retraining is the only recovery option.
        tokens = message.tokenize()
        self.bayes.unlearn(tokens, self.is_spam)
        message.RememberTrained(None)

    def trainAll(self, corpus):
        '''Train all the messages in the corpus'''
        train = self.train
        for msg in corpus:
            train(msg)

    def untrainAll(self, corpus):
        '''Untrain all the messages in the corpus'''
        untrain = self.untrain
        for msg in corpus:
            untrain(msg)
949 
950 
952  '''Trainer for spam'''
def __init__(self, bayes, updateprobs=NO_UPDATEPROBS):
    '''Constructor(Classifier, updateprobs(True|False)) - always spam.'''
    is_spam = True
    Trainer.__init__(self, bayes, is_spam, updateprobs)
956 
957 
959  '''Trainer for ham'''
def __init__(self, bayes, updateprobs=NO_UPDATEPROBS):
    '''Constructor(Classifier, updateprobs(True|False)) - always ham.'''
    is_spam = False
    Trainer.__init__(self, bayes, is_spam, updateprobs)
963 
class NoSuchClassifierError(Exception):
    '''Raised when a storage type name is not recognised.

    The offending name is kept in `invalid_name`.
    '''

    def __init__(self, invalid_name):
        Exception.__init__(self, invalid_name)
        self.invalid_name = invalid_name

    def __str__(self):
        return "%r" % (self.invalid_name,)
970 
class MutuallyExclusiveError(Exception):
    '''Raised when more than one type of database is specified.'''

    def __str__(self):
        message = "Only one type of database can be specified"
        return message
974 
975 # values are classifier class, True if it accepts a mode
976 # arg, and True if the argument is a pathname
977 _storage_types = {"dbm" : (DBDictClassifier, True, True),
978  "pickle" : (PickledClassifier, False, True),
979  "pgsql" : (PGClassifier, False, False),
980  "mysql" : (mySQLClassifier, False, False),
981  "cdb" : (CDBClassifier, False, True),
982  "zodb" : (ZODBClassifier, True, True),
983  "zeo" : (ZEOClassifier, False, False),
984  }
985 
def open_storage(data_source_name, db_type="dbm", mode=None):
    """Return a storage object appropriate to the given parameters.

    By centralizing this code here, all the applications will behave
    the same given the same options.

    `mode` is only passed on when the chosen storage class accepts one
    and a mode was actually given.  Raises NoSuchClassifierError for an
    unknown `db_type`.  If no dbm module is available at all, a hint is
    printed and the process exits.
    """
    try:
        storage_info = _storage_types[db_type]
    except KeyError:
        raise NoSuchClassifierError(db_type)
    klass, supports_mode = storage_info[0], storage_info[1]

    try:
        if supports_mode and mode is not None:
            return klass(data_source_name, mode)
        return klass(data_source_name)
    except dbmstorage.error:
        if str(sys.exc_info()[1]) != "No dbm modules available!":
            raise
        # We expect this to hit a fair few people, so warn them nicely,
        # rather than just printing the traceback.
        print >> sys.stderr, "\nYou do not have a dbm module available " \
              "to use. You need to either use a pickle (see the FAQ)" \
              ", use Python 2.3 (or above), or install a dbm module " \
              "such as bsddb (see http://sf.net/projects/pybsddb)."
        sys.exit()
1011 
1012 # The different database types that are available.
1013 # The key should be the command-line switch that is used to select this
1014 # type, and the value should be the name of the type (which
1015 # must be a valid key for the _storage_types dictionary).
1016 _storage_options = { "-p" : "pickle",
1017  "-d" : "dbm",
1018  }
1019 
def database_type(opts, default_type=("Storage", "persistent_use_database"),
                  default_name=("Storage", "persistent_storage_file")):
    """Return a (name, type) pair describing the database to use.

    The pair can be fed straight to open_storage, for example:

        [standard getopts code]
        db_name, db_type = database_type(opts)
        storage = open_storage(db_name, db_type)

    opts is a sequence of (switch, argument) pairs.  If one of them is a
    switch listed in _storage_options, its argument becomes the name and
    the switch decides the type.  Otherwise the type is read from the
    global options object using default_type, and the name is read using
    default_name (through get_pathname_option when the type is registered
    as taking a pathname).

    Currently supports:
        -p : pickle
        -d : dbm

    Raises MutuallyExclusiveError if more than one storage switch is
    present, and NoSuchClassifierError if the configured default type is
    not a key of _storage_types.
    """
    selection = None
    for switch, value in opts:
        if switch not in _storage_options:
            continue
        if selection is not None:
            # A storage switch was already seen; only one is allowed.
            raise MutuallyExclusiveError()
        selection = (value, _storage_options[switch])
    if selection is not None:
        return selection

    # Nothing on the command line: fall back to the configured defaults.
    typ = options[default_type]
    try:
        is_path = _storage_types[typ][2]
    except KeyError:
        raise NoSuchClassifierError(typ)
    if is_path:
        name = get_pathname_option(*default_name)
    else:
        name = options[default_name]
    return name, typ
1056 
def convert(old_name=None, old_type=None, new_name=None, new_type=None):
    """Copy the contents of one classifier database into another.

    old_name and old_type default to "hammie.db" and "dbm".  Whichever of
    new_name and new_type is None is filled in from database_type({}).
    The old storage is opened with mode 'r'; its ham and spam counts
    (0 when the attribute is missing) and every word record are copied to
    the new storage, which is then stored and closed.  Progress messages
    are written to stderr.
    """
    # The expected need is to convert the existing hammie.db dbm
    # database to a hammie.fs ZODB database.
    if old_name is None:
        old_name = "hammie.db"
    if old_type is None:
        old_type = "dbm"
    if new_name is None or new_type is None:
        fallback_name, fallback_type = database_type({})
        if new_name is None:
            new_name = fallback_name
        if new_type is None:
            new_type = fallback_type

    source = open_storage(old_name, old_type, 'r')
    target = open_storage(new_name, new_type)
    words = source._wordinfokeys()

    # A source without a count attribute is treated as having a count of 0.
    target.nham = getattr(source, "nham", 0)
    target.nspam = getattr(source, "nspam", 0)

    report = sys.stderr.write
    report("Converting %s (%s database) to "
           "%s (%s database).\n" % (old_name, old_type, new_name, new_type))
    report("Database has %s ham, %s spam, and %s words.\n" %
           (target.nham, target.nspam, len(words)))

    for word in words:
        target._wordinfoset(word, source._wordinfoget(word))
    source.close()

    report("Storing database, please be patient...\n")
    target.store()
    report("Conversion complete.\n")
    target.close()
1097 
def ensureDir(dirname):
    """Create the directory dirname unless it already exists.

    A single os.mkdir call is attempted.  When it succeeds and the
    ("globals", "verbose") option is set, a note is written to stderr.
    An OSError whose errno is EEXIST is swallowed; any other OSError is
    re-raised.
    """
    try:
        os.mkdir(dirname)
        if options["globals", "verbose"]:
            sys.stderr.write("Creating directory %s\n" % (dirname,))
    except OSError:
        failure = sys.exc_info()[1]
        if failure.errno == errno.EEXIST:
            # Something is already at that path: nothing to create.
            return
        raise
1108 
if __name__ == '__main__':
    # Run directly, the module only writes its own documentation to stderr.
    sys.stderr.write("%s\n" % (__doc__,))