"Fossies" - the Fresh Open Source Software Archive

Member "spambayes-1.1a6/spambayes/classifier.py" (6 Dec 2008, 31973 Bytes) of archive /windows/mail/spambayes-1.1a6.zip:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Python source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively you can here view or download the uninterpreted source code file. For more information about "classifier.py" see the Fossies "Dox" file reference documentation.

#! /usr/bin/env python

from __future__ import generators

# An implementation of a Bayes-like spam classifier.
#
# Paul Graham's original description:
#
#     http://www.paulgraham.com/spam.html
#
# A highly fiddled version of that can be retrieved from our CVS repository,
# via tag Last-Graham.  This made many demonstrated improvements in error
# rates over Paul's original description.
#
# This code implements Gary Robinson's suggestions, the core of which are
# well explained on his webpage:
#
#    http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
#
# This is theoretically cleaner, and in testing has performed at least as
# well as our highly tuned Graham scheme did, often slightly better, and
# sometimes much better.  It also has "a middle ground", which people like:
# the scores under Paul's scheme were almost always very near 0 or very near
# 1, whether or not the classification was correct.  The false positives
# and false negatives under Gary's basic scheme (use_gary_combining) generally
# score in a narrow range around the corpus's best spam_cutoff value.
# However, it doesn't appear possible to guess the best spam_cutoff value in
# advance, and it's touchy.
#
# The last version of the Gary-combining scheme can be retrieved from our
# CVS repository via tag Last-Gary.
#
# The chi-combining scheme used by default here gets closer to the theoretical
# basis of Gary's combining scheme, and does give extreme scores, but also
# has a very useful middle ground (small # of msgs spread across a large range
# of scores, and good cutoff values aren't touchy).
#
# This implementation is due to Tim Peters et alia.

import math

# XXX At time of writing, these are only necessary for the
# XXX experimental url retrieving/slurping code.  If that
# XXX gets ripped out, either rip these out, or run
# XXX PyChecker over the code.
import re
import os
import sys
import socket
import urllib2
from email import message_from_string

DOMAIN_AND_PORT_RE = re.compile(r"([^:/\\]+)(:([\d]+))?")
HTTP_ERROR_RE = re.compile(r"HTTP Error ([\d]+)")
URL_KEY_RE = re.compile(r"[\W]")
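# A few illustrative matches, added for clarity (the URLs here are made-up
# examples, not from the original code):
#   DOMAIN_AND_PORT_RE.match("example.com:8080").group(1, 3)
#       -> ("example.com", "8080")
#   DOMAIN_AND_PORT_RE.match("example.com/index.html").group(1, 3)
#       -> ("example.com", None)
#   URL_KEY_RE.sub('_', "example.com/page?id=1")
#       -> "example_com_page_id_1"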
# XXX ---- ends ----

from spambayes.Options import options
from spambayes.chi2 import chi2Q
from spambayes.safepickle import pickle_read, pickle_write

LN2 = math.log(2)       # used frequently by chi-combining

slurp_wordstream = None

PICKLE_VERSION = 5

class WordInfo(object):
    # A WordInfo is created for each distinct word.  spamcount is the
    # number of trained spam msgs in which the word appears, and hamcount
    # the number of trained ham msgs.
    #
    # Invariant:  For use in a classifier database, at least one of
    # spamcount and hamcount must be non-zero.
    #
    # Important:  This is a tiny object.  Use of __slots__ is essential
    # to conserve memory.
    __slots__ = 'spamcount', 'hamcount'

    def __init__(self):
        self.__setstate__((0, 0))

    def __repr__(self):
        return "WordInfo" + repr((self.spamcount, self.hamcount))

    def __getstate__(self):
        return self.spamcount, self.hamcount

    def __setstate__(self, t):
        self.spamcount, self.hamcount = t


class Classifier:
    # Defining __slots__ here made Jeremy's life needlessly difficult when
    # trying to hook this all up to ZODB as a persistent object.  There's
    # no space benefit worth getting from slots in this class; slots were
    # used solely to help catch errors earlier, when this code was changing
    # rapidly.

    #__slots__ = ('wordinfo',  # map word to WordInfo record
    #             'nspam',     # number of spam messages learn() has seen
    #             'nham',      # number of non-spam messages learn() has seen
    #            )

    # allow a subclass to use a different class for WordInfo
    WordInfoClass = WordInfo

    def __init__(self):
        self.wordinfo = {}
        self.probcache = {}
        self.nspam = self.nham = 0

    def __getstate__(self):
        return (PICKLE_VERSION, self.wordinfo, self.nspam, self.nham)

    def __setstate__(self, t):
        if t[0] != PICKLE_VERSION:
            raise ValueError("Can't unpickle -- version %s unknown" % t[0])
        (self.wordinfo, self.nspam, self.nham) = t[1:]
        self.probcache = {}

    # spamprob() implementations.  One of the following is aliased to
    # spamprob, depending on option settings.
    # Currently only chi-squared is available, but maybe there will be
    # an alternative again someday.

    # Across vectors of length n, containing random uniformly-distributed
    # probabilities, -2*sum(ln(p_i)) follows the chi-squared distribution
    # with 2*n degrees of freedom.  This has been proven (in some
    # appropriate sense) to be the most sensitive possible test for
    # rejecting the hypothesis that a vector of probabilities is uniformly
    # distributed.  Gary Robinson's original scheme was monotonic *with*
    # this test, but skipped the details.  Turns out that getting closer
    # to the theoretical roots gives a much sharper classification, with
    # a very small (in # of msgs), but also very broad (in range of scores),
    # "middle ground", where most of the mistakes live.  In particular,
    # this scheme seems immune to all forms of "cancellation disease":  if
    # there are many strong ham *and* spam clues, this reliably scores
    # close to 0.5.  Most other schemes are extremely certain then -- and
    # often wrong.
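    #
    # (Illustrative note, added for clarity:  chi2Q(x2, v) below is used as
    # the upper-tail probability of a chi-squared variate with v degrees of
    # freedom.  For a single uniformly-distributed p, -2*ln(p) is chi-squared
    # with 2 degrees of freedom, so chi2Q(-2.0 * math.log(0.5), 2) comes out
    # to 0.5:  an unremarkable clue probability yields an unremarkable tail
    # probability.)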
    def chi2_spamprob(self, wordstream, evidence=False):
        """Return best-guess probability that wordstream is spam.

        wordstream is an iterable object producing words.
        The return value is a float in [0.0, 1.0].

        If optional arg evidence is True, the return value is a pair
            probability, evidence
        where evidence is a list of (word, probability) pairs.
        """

        from math import frexp, log as ln

        # We compute two chi-squared statistics, one for ham and one for
        # spam.  The sum-of-the-logs business is more sensitive to probs
        # near 0 than to probs near 1, so the spam measure uses 1-p (so
        # that high-spamprob words have greatest effect), and the ham
        # measure uses p directly (so that lo-spamprob words have greatest
        # effect).
        #
        # For optimization, sum-of-logs == log-of-product, and f.p.
        # multiplication is a lot cheaper than calling ln().  It's easy
        # to underflow to 0.0, though, so we simulate unbounded dynamic
        # range via frexp.  The real product H = this H * 2**Hexp, and
        # likewise the real product S = this S * 2**Sexp.
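        #
        # (Added note for clarity:  math.frexp(x) splits a positive float x
        # into a mantissa m in [0.5, 1.0) and an integer exponent e with
        # x == m * 2**e, e.g. frexp(3.0) == (0.75, 2); accumulating the
        # exponents in Hexp/Sexp keeps the running products from ever
        # underflowing to 0.0.)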
        H = S = 1.0
        Hexp = Sexp = 0

        clues = self._getclues(wordstream)
        for prob, word, record in clues:
            S *= 1.0 - prob
            H *= prob
            if S < 1e-200:  # prevent underflow
                S, e = frexp(S)
                Sexp += e
            if H < 1e-200:  # prevent underflow
                H, e = frexp(H)
                Hexp += e

        # Compute the natural log of the product = sum of the logs:
        # ln(x * 2**i) = ln(x) + i * ln(2).
        S = ln(S) + Sexp * LN2
        H = ln(H) + Hexp * LN2

        n = len(clues)
        if n:
            S = 1.0 - chi2Q(-2.0 * S, 2*n)
            H = 1.0 - chi2Q(-2.0 * H, 2*n)

            # How to combine these into a single spam score?  We originally
            # used (S-H)/(S+H) scaled into [0., 1.], which equals S/(S+H).  A
            # systematic problem is that we could end up being near-certain
            # a thing was (for example) spam, even if S was small, provided
            # that H was much smaller.
            # Rob Hooft stared at these problems and invented the measure
            # we use now, the simpler S-H, scaled into [0., 1.].
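            # (Added worked example:  with strong spam evidence S = 0.9 and
            # weak ham evidence H = 0.1 this gives (0.9 - 0.1 + 1.0)/2.0 =
            # 0.9; with strong evidence both ways, say S = 0.99 and
            # H = 0.98, it gives 0.505, i.e. "unsure" rather than falsely
            # certain.)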
            prob = (S-H + 1.0) / 2.0
        else:
            prob = 0.5

        if evidence:
            clues = [(w, p) for p, w, _r in clues]
            clues.sort(lambda a, b: cmp(a[1], b[1]))
            clues.insert(0, ('*S*', S))
            clues.insert(0, ('*H*', H))
            return prob, clues
        else:
            return prob

    def slurping_spamprob(self, wordstream, evidence=False):
        """Do the standard chi-squared spamprob, but if the evidence
        leaves the score in the unsure range, and we have fewer tokens
        than max_discriminators, also generate tokens from the text
        obtained by following http URLs in the message."""
        h_cut = options["Categorization", "ham_cutoff"]
        s_cut = options["Categorization", "spam_cutoff"]

        # Get the raw score.
        prob, clues = self.chi2_spamprob(wordstream, True)

        # If necessary, enhance it with the tokens from whatever is
        # at the URL's destination.
        if len(clues) < options["Classifier", "max_discriminators"] and \
           prob > h_cut and prob < s_cut and slurp_wordstream:
            slurp_tokens = list(self._generate_slurp())
            slurp_tokens.extend([w for (w, _p) in clues])
            sprob, sclues = self.chi2_spamprob(slurp_tokens, True)
            if sprob < h_cut or sprob > s_cut:
                prob = sprob
                clues = sclues
        if evidence:
            return prob, clues
        return prob

    if options["Classifier", "use_chi_squared_combining"]:
        if options["URLRetriever", "x-slurp_urls"]:
            spamprob = slurping_spamprob
        else:
            spamprob = chi2_spamprob

    def learn(self, wordstream, is_spam):
        """Teach the classifier by example.

        wordstream is a word stream representing a message.  If is_spam is
        True, you're telling the classifier this message is definitely spam,
        else that it's definitely not spam.
        """
        if options["Classifier", "use_bigrams"]:
            wordstream = self._enhance_wordstream(wordstream)
        if options["URLRetriever", "x-slurp_urls"]:
            wordstream = self._add_slurped(wordstream)
        self._add_msg(wordstream, is_spam)

    def unlearn(self, wordstream, is_spam):
        """In case of pilot error, call unlearn ASAP after screwing up.

        Pass the same arguments you passed to learn().
        """
        if options["Classifier", "use_bigrams"]:
            wordstream = self._enhance_wordstream(wordstream)
        if options["URLRetriever", "x-slurp_urls"]:
            wordstream = self._add_slurped(wordstream)
        self._remove_msg(wordstream, is_spam)

    def probability(self, record):
        """Compute, store, and return prob(msg is spam | msg contains word).

        This is the Graham calculation, but stripped of biases, and
        stripped of clamping into 0.01 thru 0.99.  The Bayesian
        adjustment following keeps them in a sane range, and one
        that naturally grows the more evidence there is to back up
        a probability.
        """

        spamcount = record.spamcount
        hamcount = record.hamcount

        # Try the cache first
        try:
            return self.probcache[spamcount][hamcount]
        except KeyError:
            pass

        nham = float(self.nham or 1)
        nspam = float(self.nspam or 1)

        assert hamcount <= nham, "Token seen in more ham than ham trained."
        hamratio = hamcount / nham

        assert spamcount <= nspam, "Token seen in more spam than spam trained."
        spamratio = spamcount / nspam

        prob = spamratio / (hamratio + spamratio)

        S = options["Classifier", "unknown_word_strength"]
        StimesX = S * options["Classifier", "unknown_word_prob"]

        # Now do Robinson's Bayesian adjustment.
        #
        #         s*x + n*p(w)
        # f(w) = --------------
        #           s + n
        #
        # I find this easier to reason about like so (equivalent when
        # s != 0):
        #
        #        x - p
        #  p +  -------
        #       1 + n/s
        #
        # IOW, it moves p a fraction of the distance from p to x, and
        # less so the larger n is, or the smaller s is.
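        #
        # (Added worked example, with illustrative values rather than the
        # configured ones:  for s = 0.45 and x = 0.5, a word seen n = 3
        # times with raw p = 0.8 gets
        #     (0.45*0.5 + 3*0.8) / (0.45 + 3) = 2.625 / 3.45 ~= 0.761,
        # while the same word seen only once (n = 1) gets about 0.707,
        # i.e. it is pulled further back toward x.)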

        n = hamcount + spamcount
        prob = (StimesX + n * prob) / (S + n)

        # Update the cache
        try:
            self.probcache[spamcount][hamcount] = prob
        except KeyError:
            self.probcache[spamcount] = {hamcount: prob}

        return prob

    # NOTE:  Graham's scheme had a strange asymmetry:  when a word appeared
    # n>1 times in a single message, training added n to the word's hamcount
    # or spamcount, but predicting scored words only once.  Tests showed
    # that adding only 1 in training, or scoring more than once when
    # predicting, hurt under the Graham scheme.
    # This isn't so under Robinson's scheme, though:  results improve
    # if training also counts a word only once.  The mean ham score decreases
    # significantly and consistently, ham score variance decreases likewise,
    # mean spam score decreases (but less than mean ham score, so the spread
    # increases), and spam score variance increases.
    # I (Tim) speculate that adding n times under the Graham scheme helped
    # because it acted against the various ham biases, giving frequently
    # repeated spam words (like "Viagra") a quick ramp-up in spamprob; else,
    # adding only once in training, a word like that was simply ignored until
    # it appeared in 5 distinct training spams.  Without the ham-favoring
    # biases, though, and never ignoring words, counting n times introduces
    # a subtle and unhelpful bias.
    # There does appear to be some useful info in how many times a word
    # appears in a msg, but distorting spamprob doesn't appear a correct way
    # to exploit it.
    def _add_msg(self, wordstream, is_spam):
        self.probcache = {}    # nuke the prob cache
        if is_spam:
            self.nspam += 1
        else:
            self.nham += 1

        for word in set(wordstream):
            record = self._wordinfoget(word)
            if record is None:
                record = self.WordInfoClass()

            if is_spam:
                record.spamcount += 1
            else:
                record.hamcount += 1

            self._wordinfoset(word, record)

        self._post_training()

    def _remove_msg(self, wordstream, is_spam):
        self.probcache = {}    # nuke the prob cache
        if is_spam:
            if self.nspam <= 0:
                raise ValueError("spam count would go negative!")
            self.nspam -= 1
        else:
            if self.nham <= 0:
                raise ValueError("non-spam count would go negative!")
            self.nham -= 1

        for word in set(wordstream):
            record = self._wordinfoget(word)
            if record is not None:
                if is_spam:
                    if record.spamcount > 0:
                        record.spamcount -= 1
                else:
                    if record.hamcount > 0:
                        record.hamcount -= 1
                if record.hamcount == 0 == record.spamcount:
                    self._wordinfodel(word)
                else:
                    self._wordinfoset(word, record)

        self._post_training()

    def _post_training(self):
        """This is called after training on a wordstream.  Subclasses might
        want to ensure that their databases are in a consistent state at
        this point.  Introduced to fix bug #797890."""
        pass

    # Return a list of (prob, word, record) triples, sorted by increasing
    # prob.  "word" is a token from wordstream; "prob" is its spamprob (a
    # float in 0.0 through 1.0); and "record" is word's associated
    # WordInfo record if word is in the training database, or None if it's
    # not.  No more than max_discriminators items are returned, and they
    # have the strongest (farthest from 0.5) spamprobs of all tokens in
    # wordstream.  Tokens with spamprobs less than minimum_prob_strength
    # away from 0.5 aren't returned.
    def _getclues(self, wordstream):
        mindist = options["Classifier", "minimum_prob_strength"]

        if options["Classifier", "use_bigrams"]:
            # This scheme mixes single tokens with pairs of adjacent tokens.
            # wordstream is "tiled" into non-overlapping unigrams and
            # bigrams.  Non-overlap is important to prevent a single original
            # token from contributing to more than one spamprob returned
            # (systematic correlation probably isn't a good thing).

            # First fill list raw with
            #     (distance, prob, word, record), indices
            # pairs, one for each unigram and bigram in wordstream.
            # indices is a tuple containing the indices (0-based relative to
            # the start of wordstream) of the tokens that went into word.
            # indices is a 1-tuple for an original token, and a 2-tuple for
            # a synthesized bigram token.  The indices are needed to detect
            # overlap later.
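            #
            # (Added example for clarity:  for wordstream ["free", "cash",
            # "now"] the candidates are the unigrams "free", "cash", "now"
            # and the bigrams "bi:free cash" and "bi:cash now".  If
            # "bi:free cash" turns out to be the strongest clue, then
            # "free", "cash" and "bi:cash now" all overlap it and are
            # skipped, leaving "now" as the only other usable candidate.)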
            raw = []
            push = raw.append
            pair = None
            # Keep track of which tokens we've already seen.
            # Don't use a set here!  This is an innermost loop, so speed is
            # important here (direct dict fiddling is much quicker than
            # invoking Python-level set methods; in Python 2.4 that will
            # change).
            seen = {pair: 1} # so the bigram token is skipped on 1st loop trip
            for i, token in enumerate(wordstream):
                if i:   # not the 1st loop trip, so there is a preceding token
                    # This string interpolation must match the one in
                    # _enhance_wordstream().
                    pair = "bi:%s %s" % (last_token, token)
                last_token = token
                for clue, indices in (token, (i,)), (pair, (i-1, i)):
                    if clue not in seen:    # as always, skip duplicates
                        seen[clue] = 1
                        tup = self._worddistanceget(clue)
                        if tup[0] >= mindist:
                            push((tup, indices))

            # Sort raw, strongest to weakest spamprob.
            raw.sort()
            raw.reverse()
            # Fill clues with the strongest non-overlapping clues.
            clues = []
            push = clues.append
            # Keep track of which indices have already contributed to a
            # clue in clues.
            seen = {}
            for tup, indices in raw:
                overlap = [i for i in indices if i in seen]
                if not overlap: # no overlap with anything already in clues
                    for i in indices:
                        seen[i] = 1
                    push(tup)
            # Leave sorted from smallest to largest spamprob.
            clues.reverse()

        else:
            # The all-unigram scheme just scores the tokens as-is.  A set()
            # is used to weed out duplicates at high speed.
            clues = []
            push = clues.append
            for word in set(wordstream):
                tup = self._worddistanceget(word)
                if tup[0] >= mindist:
                    push(tup)
            clues.sort()

        if len(clues) > options["Classifier", "max_discriminators"]:
            del clues[0 : -options["Classifier", "max_discriminators"]]
        # Return (prob, word, record).
        return [t[1:] for t in clues]

    def _worddistanceget(self, word):
        record = self._wordinfoget(word)
        if record is None:
            prob = options["Classifier", "unknown_word_prob"]
        else:
            prob = self.probability(record)
        distance = abs(prob - 0.5)
        return distance, prob, word, record

    def _wordinfoget(self, word):
        return self.wordinfo.get(word)

    def _wordinfoset(self, word, record):
        self.wordinfo[word] = record

    def _wordinfodel(self, word):
        del self.wordinfo[word]

    def _enhance_wordstream(self, wordstream):
        """Add bigrams to the wordstream.

        For example, a b c -> a b "a b" c "b c"

        Note that these are *token* bigrams, and not *word* bigrams - i.e.
        'synthetic' tokens get bigram'ed, too.

        The bigram token is simply "bi:unigram1 unigram2" - a space should
        be sufficient as a separator, since spaces aren't in any other
        tokens, apart from 'synthetic' ones.  The "bi:" prefix is added
        to avoid conflict with tokens we generate (like "subject: word",
        which could be "word" in a subject, or a bigram of "subject:" and
        "word").

        If the "Classifier":"use_bigrams" option is removed, this function
        can be removed, too.
        """

        last = None
        for token in wordstream:
            yield token
            if last:
                # This string interpolation must match the one in
                # _getclues().
                yield "bi:%s %s" % (last, token)
            last = token

    def _generate_slurp(self):
        # We don't want to do this recursively and check URLs
        # on webpages, so we have this little cheat.
        if not hasattr(self, "setup_done"):
            self.setup()
            self.setup_done = True
        if not hasattr(self, "do_slurp") or self.do_slurp:
            if slurp_wordstream:
                self.do_slurp = False

                tokens = self.slurp(*slurp_wordstream)
                self.do_slurp = True
                self._save_caches()
                return tokens
        return []

    def setup(self):
        # Can't import this at the top because it's circular.
        # XXX Someone smarter than me, please figure out the right
        # XXX way to do this.
        from spambayes.FileCorpus import ExpiryFileCorpus, FileMessageFactory

        username = options["globals", "proxy_username"]
        password = options["globals", "proxy_password"]
        server = options["globals", "proxy_server"]
        if server.find(":") != -1:
            server, port = server.split(':', 1)
        else:
            port = 8080
        if server:
            # Build a new opener that uses a proxy requiring authorization
            proxy_support = urllib2.ProxyHandler({"http" : \
                                                  "http://%s:%s@%s:%s" % \
                                                  (username, password,
                                                   server, port)})
            opener = urllib2.build_opener(proxy_support,
                                          urllib2.HTTPHandler)
        else:
            # Build a new opener without any proxy information.
            opener = urllib2.build_opener(urllib2.HTTPHandler)

        # Install it
        urllib2.install_opener(opener)

        # Setup the cache for retrieved urls
        age = options["URLRetriever", "x-cache_expiry_days"]*24*60*60
        dir = options["URLRetriever", "x-cache_directory"]
        if not os.path.exists(dir):
            # Create the directory.
            if options["globals", "verbose"]:
                print >> sys.stderr, "Creating URL cache directory"
            os.makedirs(dir)

        self.urlCorpus = ExpiryFileCorpus(age, FileMessageFactory(),
                                          dir, cacheSize=20)
        # Kill any old information in the cache
        self.urlCorpus.removeExpiredMessages()

        # Setup caches for unretrievable urls
        self.bad_url_cache_name = os.path.join(dir, "bad_urls.pck")
        self.http_error_cache_name = os.path.join(dir, "http_error_urls.pck")
        if os.path.exists(self.bad_url_cache_name):
            try:
                self.bad_urls = pickle_read(self.bad_url_cache_name)
            except (IOError, ValueError):
                # Something went wrong loading it (bad pickle,
                # probably).  Start afresh.
                if options["globals", "verbose"]:
                    print >> sys.stderr, "Bad URL pickle, using new."
                self.bad_urls = {"url:non_resolving": (),
                                 "url:non_html": (),
                                 "url:unknown_error": ()}
        else:
            if options["globals", "verbose"]:
                print >> sys.stderr, "URL caches don't exist: creating"
            self.bad_urls = {"url:non_resolving": (),
                             "url:non_html": (),
                             "url:unknown_error": ()}
        if os.path.exists(self.http_error_cache_name):
            try:
                self.http_error_urls = pickle_read(self.http_error_cache_name)
            except (IOError, ValueError):
                # Something went wrong loading it (bad pickle,
                # probably).  Start afresh.
                if options["globals", "verbose"]:
                    print >> sys.stderr, "Bad HTTP error pickle, using new."
                self.http_error_urls = {}
        else:
            self.http_error_urls = {}

    def _save_caches(self):
        # XXX Note that these caches are never refreshed, which might not
        # XXX be a good thing long-term (if a previously invalid URL
        # XXX becomes valid, for example).
        for name, data in [(self.bad_url_cache_name, self.bad_urls),
                           (self.http_error_cache_name, self.http_error_urls),]:
            pickle_write(name, data)

    def slurp(self, proto, url):
        # We generate these tokens:
        #  url:non_resolving
        #  url:non_html
        #  url:http_XXX (for each type of http error encountered,
        #                for example 404, 403, ...)
        # And tokenise the received page (but we do not slurp this).
        # Actually, the special url: tokens barely showed up in my testing,
        # although I would have thought that they would show up more - this
        # might be due to an error, although they do turn up on occasion.
        # In any case, we have to do the test, so generating an extra token
        # doesn't cost us anything apart from another entry in the db, and
        # it's only two entries, plus one for each type of http error
        # encountered, so it's pretty negligible.
        # If there is no content in the URL, then just return immediately.
        # "http://)" will trigger this.
        if not url:
            return ["url:non_resolving"]

        from spambayes.tokenizer import Tokenizer

        if options["URLRetriever", "x-only_slurp_base"]:
            url = self._base_url(url)

        # Check the unretrievable caches
        for err in self.bad_urls.keys():
            if url in self.bad_urls[err]:
                return [err]
        if self.http_error_urls.has_key(url):
            return self.http_error_urls[url]

        # We check if the url will resolve first
        mo = DOMAIN_AND_PORT_RE.match(url)
        domain = mo.group(1)
        if mo.group(3) is None:
            port = 80
        else:
            port = mo.group(3)
        try:
            _unused = socket.getaddrinfo(domain, port)
        except socket.error:
            self.bad_urls["url:non_resolving"] += (url,)
            return ["url:non_resolving"]

        # If the message is in our cache, then we can just skip over
        # retrieving it from the network, and get it from there, instead.
        url_key = URL_KEY_RE.sub('_', url)
        cached_message = self.urlCorpus.get(url_key)

        if cached_message is None:
            # We're going to ignore everything that isn't text/html,
            # so we might as well not bother retrieving anything with
            # these extensions.
            parts = url.split('.')
            if parts[-1] in ('jpg', 'gif', 'png', 'css', 'js'):
                self.bad_urls["url:non_html"] += (url,)
                return ["url:non_html"]

            # Waiting for the default timeout period slows everything
            # down far too much, so try and reduce it for just this
            # call (this will only work with Python 2.3 and above).
            try:
                timeout = socket.getdefaulttimeout()
                socket.setdefaulttimeout(5)
            except AttributeError:
                # Probably Python 2.2.
                pass
            try:
                if options["globals", "verbose"]:
                    print >> sys.stderr, "Slurping", url
                f = urllib2.urlopen("%s://%s" % (proto, url))
            except (urllib2.URLError, socket.error), details:
                mo = HTTP_ERROR_RE.match(str(details))
                if mo:
                    self.http_error_urls[url] = "url:http_" + mo.group(1)
                    return ["url:http_" + mo.group(1)]
                self.bad_urls["url:unknown_error"] += (url,)
                return ["url:unknown_error"]
            # Restore the timeout
            try:
                socket.setdefaulttimeout(timeout)
            except AttributeError:
                # Probably Python 2.2.
                pass

            try:
                # Anything that isn't text/html is ignored
                content_type = f.info().get('content-type')
                if content_type is None or \
                   not content_type.startswith("text/html"):
                    self.bad_urls["url:non_html"] += (url,)
                    return ["url:non_html"]

                page = f.read()
                headers = str(f.info())
                f.close()
            except socket.error:
                # This is probably a temporary error, like a timeout.
                # For now, just bail out.
                return []

            fake_message_string = headers + "\r\n" + page

            # Retrieving the same messages over and over again will tire
            # us out, so we store them in our own wee cache.
            message = self.urlCorpus.makeMessage(url_key,
                                                 fake_message_string)
            self.urlCorpus.addMessage(message)
        else:
            fake_message_string = cached_message.as_string()

        msg = message_from_string(fake_message_string)

        # We don't want to do full header tokenising, as this is
        # optimised for messages, not webpages, so we just do the
        # basic stuff.
        bht = options["Tokenizer", "basic_header_tokenize"]
        bhto = options["Tokenizer", "basic_header_tokenize_only"]
        options["Tokenizer", "basic_header_tokenize"] = True
        options["Tokenizer", "basic_header_tokenize_only"] = True

        tokens = Tokenizer().tokenize(msg)
        pf = options["URLRetriever", "x-web_prefix"]
        tokens = ["%s%s" % (pf, tok) for tok in tokens]

        # Undo the changes
        options["Tokenizer", "basic_header_tokenize"] = bht
        options["Tokenizer", "basic_header_tokenize_only"] = bhto
        return tokens

    def _base_url(self, url):
        # To try and speed things up, and to avoid following
        # unique URLs, we convert the URL to as basic a form
        # as we can - so http://www.massey.ac.nz/~tameyer/index.html?you=me
        # would become http://massey.ac.nz and http://id.example.com
        # would become http://example.com
        url += '/'
        domain = url.split('/', 1)[0]
        parts = domain.split('.')
        if len(parts) > 2:
            base_domain = parts[-2] + '.' + parts[-1]
            if len(parts[-1]) < 3:
                base_domain = parts[-3] + '.' + base_domain
        else:
            base_domain = domain
        return base_domain

    def _add_slurped(self, wordstream):
        """Add tokens generated by 'slurping' (i.e. tokenizing
        the text at the web pages pointed to by URLs in messages)
        to the wordstream."""
        for token in wordstream:
            yield token
        slurped_tokens = self._generate_slurp()
        for token in slurped_tokens:
            yield token

    def _wordinfokeys(self):
        return self.wordinfo.keys()


Bayes = Classifier
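
# A minimal usage sketch (added for illustration; the token lists are made
# up, and spamprob() is the alias bound above when chi-squared combining is
# enabled):
#
#     from spambayes.classifier import Bayes
#
#     bayes = Bayes()
#     bayes.learn(["free", "cash", "now"], True)             # train on spam
#     bayes.learn(["meeting", "agenda", "tuesday"], False)   # train on ham
#     score = bayes.spamprob(["free", "cash", "meeting"])
#     # score is a float in [0.0, 1.0]; compare it against the configured
#     # ham_cutoff and spam_cutoff to decide ham / unsure / spam.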