3 '''storage.py - Spambayes database management framework.
6 PickledClassifier - Classifier that uses a pickle db
7 DBDictClassifier - Classifier that uses a shelve db
8 PGClassifier - Classifier that uses postgres
9 mySQLClassifier - Classifier that uses mySQL
10 CDBClassifier - Classifier that uses CDB
11 ZODBClassifier - Classifier that uses ZODB
12 ZEOClassifier - Classifier that uses ZEO
13 Trainer - Classifier training observer
14 SpamTrainer - Trainer for spam
15 HamTrainer - Trainer for ham
18 *Classifier are subclasses of Classifier (classifier.Classifier)
19 that add automatic state store/restore function to the Classifier class.
20 All SQL based classifiers are subclasses of SQLClassifier, which is a
21 subclass of Classifier.
23 PickledClassifier is a Classifier class that uses a cPickle
24 datastore. This database is relatively small, but slower than other
27 DBDictClassifier is a Classifier class that uses a database
30 Trainer is a concrete class that observes a Corpus and trains a
31 Classifier object based upon movement of messages between corpora.  When
32 an add message notification is received, the trainer trains the
33 database with the message, as spam or ham as appropriate given the
34 type of trainer (spam or ham). When a remove message notification
35 is received, the trainer untrains the database as appropriate.
37 SpamTrainer and HamTrainer are convenience subclasses of Trainer, that
38 initialize as the appropriate type of Trainer
53 __author__ = (
"Neale Pickett <neale@woozle.org>,"
54 "Tim Stone <tim@fourstonesExpressions.com>")
55 __credits__ =
"All the spambayes contributors."
62 from spambayes
import classifier
66 from spambayes
import cdb
67 from spambayes
import dbmstorage
71 oldShelvePickler = shelve.Pickler
74 shelve.Pickler = binaryDefaultPickler
77 NO_UPDATEPROBS =
False
81 '''Classifier object persisted in a pickle'''
84 classifier.Classifier.__init__(self)
89 '''Load this instance from the pickle.'''
100 if options[
"globals",
"verbose"]:
101 print >> sys.stderr,
'Loading state from', self.
db_name,
'pickle'
112 classifier.Classifier.__setstate__(self,
113 tempbayes.__getstate__())
114 if options[
"globals",
"verbose"]:
115 print >> sys.stderr, (
'%s is an existing pickle,'
116 ' with %d ham and %d spam') \
120 if options[
"globals",
"verbose"]:
121 print >> sys.stderr, self.
db_name,
'is a new pickle'
127 '''Store self as a pickle'''
129 if options[
"globals",
"verbose"]:
130 print >> sys.stderr,
'Persisting', self.
db_name,
'as a pickle'
142 STATE_KEY =
'saved state'
145 '''Classifier object persisted in a caching database'''
148 '''Constructor(database name)'''
150 classifier.Classifier.__init__(self)
161 getattr(self.db,
"close", noop)()
162 getattr(self.dbm,
"close", noop)()
167 if hasattr(self,
"db"):
169 if hasattr(self,
"dbm"):
171 if options[
"globals",
"verbose"]:
172 print >> sys.stderr,
'Closed', self.db_name,
'database'
175 '''Load state from database'''
177 if options[
"globals",
"verbose"]:
178 print >> sys.stderr,
'Loading state from', self.
db_name,
'database'
185 if t[0] != classifier.PICKLE_VERSION:
186 raise ValueError(
"Can't unpickle -- version %s unknown" % t[0])
189 if options[
"globals",
"verbose"]:
190 print >> sys.stderr, (
'%s is an existing database,'
191 ' with %d spam and %d ham') \
195 if options[
"globals",
"verbose"]:
196 print >> sys.stderr, self.
db_name,
'is a new database'
203 '''Place state into persistent store'''
205 if options[
"globals",
"verbose"]:
206 print >> sys.stderr,
'Persisting', self.
db_name,
207 print >> sys.stderr,
'state in database'
214 for key, flag
in self.changed_words.iteritems():
215 if flag
is WORD_CHANGED:
217 self.
db[key] = val.__getstate__()
218 elif flag
is WORD_DELETED:
220 "Should not have a wordinfo for words flagged for delete"
227 raise RuntimeError,
"Unknown flag value"
235 def _write_state_key(self):
236 self.
db[self.
statekey] = (classifier.PICKLE_VERSION,
239 def _post_training(self):
240 """This is called after training on a wordstream. We ensure that the
241 database is in a consistent state at this point by writing the state
245 def _wordinfoget(self, word):
246 if isinstance(word, unicode):
247 word = word.encode(
"utf-8")
252 if self.changed_words.get(word)
is not WORD_DELETED:
253 r = self.db.get(word)
260 def _wordinfoset(self, word, record):
270 if isinstance(word, unicode):
271 word = word.encode(
"utf-8")
272 if record.spamcount + record.hamcount <= 1:
273 self.
db[word] = record.__getstate__()
290 def _wordinfodel(self, word):
291 if isinstance(word, unicode):
292 word = word.encode(
"utf-8")
296 def _wordinfokeys(self):
297 wordinfokeys = self.db.keys()
298 del wordinfokeys[wordinfokeys.index(self.
statekey)]
304 '''Constructor(database name)'''
306 classifier.Classifier.__init__(self)
312 '''Release all database resources'''
318 '''Load state from the database'''
319 raise NotImplementedError,
"must be implemented in subclass"
322 '''Save state to the database'''
326 '''Return a new db cursor'''
327 raise NotImplementedError,
"must be implemented in subclass"
330 '''Return all rows as a dict'''
331 raise NotImplementedError,
"must be implemented in subclass"
334 '''Commit the current transaction - may commit at db or cursor'''
335 raise NotImplementedError,
"must be implemented in subclass"
338 '''Create a new bayes table'''
340 c.execute(self.table_definition)
343 def _get_row(self, word):
344 '''Return row matching word'''
347 c.execute(
"select * from bayes"
351 print >> sys.stderr,
"error:", (e, word)
360 def _set_row(self, word, nspam, nham):
363 c.execute(
"update bayes"
364 " set nspam=%s,nham=%s"
368 c.execute(
"insert into bayes"
369 " (nspam, nham, word)"
370 " values (%s, %s, %s)",
374 def _delete_row(self, word):
376 c.execute(
"delete from bayes"
381 def _has_key(self, key):
383 c.execute(
"select word from bayes"
388 def _wordinfoget(self, word):
389 if isinstance(word, unicode):
390 word = word.encode(
"utf-8")
395 item.__setstate__((row[
"nspam"], row[
"nham"]))
def _wordinfoset(self, word, record):
    '''Write the spam and ham counts held by record to the row for word.'''
    # Unicode words are converted to UTF-8 byte strings before being
    # handed to _set_row; any other word is passed through unchanged.
    key = word.encode("utf-8") if isinstance(word, unicode) else word
    nspam = record.spamcount
    nham = record.hamcount
    self._set_row(key, nspam, nham)
405 def _wordinfodel(self, word):
406 if isinstance(word, unicode):
407 word = word.encode(
"utf-8")
410 def _wordinfokeys(self):
412 c.execute(
"select word from bayes")
414 return [r[0]
for r
in rows]
418 '''Classifier object persisted in a Postgres database'''
421 " word bytea not null default '',"
422 " nspam integer not null default 0,"
423 " nham integer not null default 0,"
426 SQLClassifier.__init__(self, db_name)
429 return self.db.cursor()
432 return c.dictfetchall()
438 '''Load state from database'''
442 if options[
"globals",
"verbose"]:
443 print >> sys.stderr,
'Loading state from', self.
db_name,
'database'
449 c.execute(
"select count(*) from bayes")
450 except psycopg.ProgrammingError:
458 if options[
"globals",
"verbose"]:
459 print >> sys.stderr, (
'%s is an existing database,'
460 ' with %d spam and %d ham') \
464 if options[
"globals",
"verbose"]:
465 print >> sys.stderr, self.
db_name,
'is a new database'
471 '''Classifier object persisted in a mySQL database
473 It is assumed that the database already exists, and that the mySQL
474 server is currently running.'''
478 " word varchar(255) not null default '',"
479 " nspam integer not null default 0,"
480 " nham integer not null default 0,"
486 db_name =
"spambayes"
488 source_info = data_source_name.split()
489 for info
in source_info:
490 if info.startswith(
"host"):
492 elif info.startswith(
"user"):
494 elif info.startswith(
"pass"):
496 elif info.startswith(
"dbname"):
498 elif info.startswith(
"charset"):
500 SQLClassifier.__init__(self, db_name)
503 return self.db.cursor()
512 '''Load state from database'''
516 if options[
"globals",
"verbose"]:
517 print >> sys.stderr,
'Loading state from', self.
db_name,
'database'
524 self.
db = MySQLdb.connect(**params)
528 c.execute(
"select count(*) from bayes")
529 except MySQLdb.ProgrammingError:
532 except MySQLdb.NotSupportedError:
543 if options[
"globals",
"verbose"]:
544 print >> sys.stderr, (
'%s is an existing database,'
545 ' with %d spam and %d ham') \
549 if options[
"globals",
"verbose"]:
550 print >> sys.stderr, self.
db_name,
'is a new database'
554 def _wordinfoget(self, word):
555 if isinstance(word, unicode):
556 word = word.encode(
"utf-8")
561 item.__setstate__((row[1], row[2]))
568 """A classifier that uses a CDB database.
570 A CDB wordinfo database is quite small and fast but is slow to update.
571 It is appropriate if training is done rarely (e.g. monthly or weekly
572 using archived ham and spam).
575 classifier.Classifier.__init__(self)
580 def _WordInfoFactory(self, counts):
586 ham, spam = counts.split(
',')
588 wi.hamcount = int(ham)
589 wi.spamcount = int(spam)
596 for encoding
in (
"utf-8",
"cp1252",
"iso-8859-1"):
598 return unicode(s, encoding)
599 except UnicodeDecodeError:
605 if os.path.exists(self.
db_name):
613 for k, v
in data.iteritems() \
615 if options[
"globals",
"verbose"]:
616 print >> sys.stderr, (
'%s is an existing CDB,'
617 ' with %d ham and %d spam') \
621 if options[
"globals",
"verbose"]:
622 print >> sys.stderr, self.
db_name,
'is a new CDB'
629 for word, wi
in self.wordinfo.iteritems():
630 if isinstance(word, types.UnicodeType):
631 word = word.encode(
"utf-8")
632 items.append((word,
"%d,%d" % (wi.hamcount, wi.spamcount)))
634 cdb.cdb_make(db, items)
646 from persistent
import Persistent
649 from ZODB
import Persistent
656 from BTrees.OOBTree
import OOBTree
658 classifier.Classifier.__init__(self)
663 ClassifierClass = _PersistentClassifier
674 if hasattr(self,
"classifier")
and hasattr(self.
classifier, att):
676 raise AttributeError(
"ZODBClassifier object has no attribute '%s'"
681 if att
in (
"nham",
"nspam")
and hasattr(self,
"classifier"):
684 object.__setattr__(self, att, value)
687 from ZODB.FileStorage
import FileStorage
690 read_only=self.
mode==
'r')
692 print >> sys.stderr, (
"Could not create FileStorage from",
697 '''Load state from database'''
700 if options[
"globals",
"verbose"]:
701 print >> sys.stderr,
"Loading state from %s (%s) database" % \
712 root = self.conn.root()
717 if options[
"globals",
"verbose"]:
718 print >> sys.stderr, self.
db_name,
'is a new ZODB'
721 if options[
"globals",
"verbose"]:
722 print >> sys.stderr,
'%s is an existing ZODB, with %d ' \
723 'ham and %d spam' % (self.
db_name, self.nham,
728 '''Place state into persistent store'''
730 import ZODB.Transaction
733 commit = transaction.commit
734 abort = transaction.abort
736 commit = ZODB.Transaction.get_transaction().commit
737 abort = ZODB.Transaction.get_transaction().abort
738 from ZODB.POSException
import ConflictError
740 from ZODB.POSException
import TransactionFailedError
742 from ZODB.POSException
import TransactionError
as TransactionFailedError
743 from ZODB.POSException
import ReadOnlyError
745 assert not self.
closed,
"Can't store a closed database"
747 if options[
"globals",
"verbose"]:
748 print >> sys.stderr,
'Persisting', self.
db_name,
'state in database'
752 except ConflictError:
756 if options[
"globals",
"verbose"]:
757 print >> sys.stderr,
"Conflict on commit", self.
db_name
759 except TransactionFailedError:
762 print >> sys.stderr,
"Storing failed. Need to restart.", \
765 except ReadOnlyError:
766 print >> sys.stderr,
"Can't store transaction to read-only db."
769 def close(self, pack=True, retain_backup=True):
782 if pack
and self.
mode !=
'r':
783 self.pack(time.time()-60*60*24, retain_backup)
790 delattr(self,
"classifier")
793 if options[
"globals",
"verbose"]:
794 print >> sys.stderr,
'Closed', self.
db_name,
'database'
796 def pack(self, t, retain_backup=True):
797 """Like FileStorage pack(), but optionally remove the .old
798 backup file that is created. Often for our purposes we do
799 not care about being able to recover from this. Also
800 ignore the referencesf parameter, which appears to not do
802 if hasattr(self.
storage,
"pack"):
803 self.storage.pack(t,
None)
804 if not retain_backup:
806 if os.path.exists(old_name):
812 source_info = data_source_name.split()
815 db_name =
"SpamBayes"
821 for info
in source_info:
822 if info.startswith(
"host"):
825 self.
host = str(info[5:])
826 except UnicodeDecodeError, e:
827 print >> sys.stderr,
"Couldn't set host", \
829 elif info.startswith(
"port"):
830 self.
port = int(info[5:])
831 elif info.startswith(
"dbname"):
833 elif info.startswith(
"user"):
835 elif info.startswith(
"pass"):
837 elif info.startswith(
"storage_name"):
839 elif info.startswith(
"wait_timeout"):
841 elif info.startswith(
"wait"):
842 self.
wait = info[5:] ==
"True"
843 ZODBClassifier.__init__(self, db_name)
846 from ZEO.ClientStorage
import ClientStorage
851 if options[
"globals",
"verbose"]:
852 print >> sys.stderr,
"Connecting to ZEO server", addr, \
866 var=tempfile.gettempdir(),
871 os.remove(os.path.join(tempfile.gettempdir(),
877 read_only=self.
mode==
'r',
885 return self.storage.is_connected()
893 '''Associates a Classifier object and one or more Corpora, \
894 is an observer of the corpora'''
896 def __init__(self, bayes, is_spam, updateprobs=NO_UPDATEPROBS):
897 '''Constructor(Classifier, is_spam(True|False),
898 updateprobs(True|False)'''
905 '''A message is being added to an observed corpus.'''
906 if not (flags & NO_TRAINING_FLAG):
910 '''Train the database with the message'''
912 if options[
"globals",
"verbose"]:
913 print >> sys.stderr,
'training with ', message.key()
915 self.bayes.learn(message.tokenize(), self.
is_spam)
916 message.setId(message.key())
917 message.RememberTrained(self.
is_spam)
920 '''A message is being removed from an observed corpus.'''
925 if not (flags & NO_TRAINING_FLAG):
929 '''Untrain the database with the message'''
931 if options[
"globals",
"verbose"]:
932 print >> sys.stderr,
'untraining with', message.key()
934 self.bayes.unlearn(message.tokenize(), self.
is_spam)
938 message.RememberTrained(
None)
941 '''Train all the messages in the corpus'''
946 '''Untrain all the messages in the corpus'''
952 '''Trainer for spam'''
953 def __init__(self, bayes, updateprobs=NO_UPDATEPROBS):
955 Trainer.__init__(self, bayes,
True, updateprobs)
959 '''Trainer for ham'''
960 def __init__(self, bayes, updateprobs=NO_UPDATEPROBS):
962 Trainer.__init__(self, bayes,
False, updateprobs)
966 Exception.__init__(self, invalid_name)
973 return "Only one type of database can be specified"
977 _storage_types = {
"dbm" : (DBDictClassifier,
True,
True),
978 "pickle" : (PickledClassifier,
False,
True),
979 "pgsql" : (PGClassifier,
False,
False),
980 "mysql" : (mySQLClassifier,
False,
False),
981 "cdb" : (CDBClassifier,
False,
True),
982 "zodb" : (ZODBClassifier,
True,
True),
983 "zeo" : (ZEOClassifier,
False,
False),
987 """Return a storage object appropriate to the given parameters.
989 By centralizing this code here, all the applications will behave
990 the same given the same options.
993 klass, supports_mode, unused = _storage_types[db_type]
997 if supports_mode
and mode
is not None:
998 return klass(data_source_name, mode)
1000 return klass(data_source_name)
1002 if str(e) ==
"No dbm modules available!":
1005 print >> sys.stderr,
"\nYou do not have a dbm module available " \
1006 "to use. You need to either use a pickle (see the FAQ)" \
1007 ", use Python 2.3 (or above), or install a dbm module " \
1008 "such as bsddb (see http://sf.net/projects/pybsddb)."
1016 _storage_options = {
"-p" :
"pickle",
1020 def database_type(opts, default_type=(
"Storage",
"persistent_use_database"),
1021 default_name=(
"Storage",
"persistent_storage_file")):
1022 """Return the name of the database and the type to use. The output of
1023 this function can be used as the db_type parameter for the open_storage
1024 function, for example:
1026 [standard getopts code]
1027 db_name, db_type = database_type(opts)
1028 storage = open_storage(db_name, db_type)
1030 The selection is made based on the options passed, or, if the
1031 appropriate options are not present, the options in the global
1038 nm, typ =
None,
None
1039 for opt, arg
in opts:
1040 if _storage_options.has_key(opt):
1041 if nm
is None and typ
is None:
1042 nm, typ = arg, _storage_options[opt]
1045 if nm
is None and typ
is None:
1046 typ = options[default_type]
1048 unused, unused, is_path = _storage_types[typ]
1054 nm = options[default_name]
1057 def convert(old_name=None, old_type=None, new_name=None, new_type=None):
1060 if old_name
is None:
1061 old_name =
"hammie.db"
1062 if old_type
is None:
1064 if new_name
is None or new_type
is None:
1066 if new_name
is None:
1067 new_name = auto_name
1068 if new_type
is None:
1069 new_type = auto_type
1073 words = old_bayes._wordinfokeys()
1076 new_bayes.nham = old_bayes.nham
1077 except AttributeError:
1080 new_bayes.nspam = old_bayes.nspam
1081 except AttributeError:
1084 print >> sys.stderr,
"Converting %s (%s database) to " \
1085 "%s (%s database)." % (old_name, old_type, new_name, new_type)
1086 print >> sys.stderr,
"Database has %s ham, %s spam, and %s words." % \
1087 (new_bayes.nham, new_bayes.nspam, len(words))
1090 new_bayes._wordinfoset(word, old_bayes._wordinfoget(word))
1093 print >> sys.stderr,
"Storing database, please be patient..."
1095 print >> sys.stderr,
"Conversion complete."
1099 """Ensure that the given directory exists - in other words, if it
1100 does not exist, attempt to create it."""
1103 if options[
"globals",
"verbose"]:
1104 print >> sys.stderr,
"Creating directory", dirname
1106 if e.errno != errno.EEXIST:
1109 if __name__ ==
'__main__':
1110 print >> sys.stderr, __doc__