"Fossies" - the Fresh Open Source Software Archive  

Source code changes of the file "lib/Mail/SpamAssassin/Plugin/Bayes.pm" between
Mail-SpamAssassin-3.4.3.tar.bz2 and Mail-SpamAssassin-3.4.4-rc1.tar.bz2

About: SpamAssassin is a mail filter that uses a wide range of heuristic tests on mail headers and body text to identify "spam" (also known as unsolicited commercial email), including a Bayesian (statistical) spam filter and several internet-based realtime blacklists. The newer version compared here (3.4.4-rc1) is a release candidate.

Bayes.pm  (Mail-SpamAssassin-3.4.3.tar.bz2):Bayes.pm  (Mail-SpamAssassin-3.4.4-rc1.tar.bz2)
skipping to change at line 1056 skipping to change at line 1056
return $msgdata; return $msgdata;
} }
########################################################################### ###########################################################################
# The calling functions expect a uniq'ed array of tokens ... # The calling functions expect a uniq'ed array of tokens ...
sub tokenize { sub tokenize {
my ($self, $msg, $msgdata) = @_; my ($self, $msg, $msgdata) = @_;
my $t_src = $self->{conf}->{bayes_token_sources}; my $t_src = $self->{conf}->{bayes_token_sources};
my @tokens;
# visible tokens from the body # visible tokens from the body
my @tokens_body;
if ($msgdata->{bayes_token_body}) { if ($msgdata->{bayes_token_body}) {
my(@t) = map($self->_tokenize_line ($_, '', 1), foreach (@{$msgdata->{bayes_token_body}}) {
@{$msgdata->{bayes_token_body}} ); push(@tokens_body, $self->_tokenize_line ($_, '', 1));
dbg("bayes: tokenized body: %d tokens", scalar @t); last if scalar @tokens_body >= 50000;
push(@tokens, @t); }
dbg("bayes: tokenized body: %d tokens", scalar @tokens_body);
} }
# the URI list # the URI list
my @tokens_uri;
if ($msgdata->{bayes_token_uris}) { if ($msgdata->{bayes_token_uris}) {
my(@t) = map($self->_tokenize_line ($_, '', 2), foreach (@{$msgdata->{bayes_token_uris}}) {
@{$msgdata->{bayes_token_uris}} ); push(@tokens_uri, $self->_tokenize_line ($_, '', 2));
dbg("bayes: tokenized uri: %d tokens", scalar @t); last if scalar @tokens_uri >= 10000;
push(@tokens, @t); }
dbg("bayes: tokenized uri: %d tokens", scalar @tokens_uri);
} }
# add invisible tokens # add invisible tokens
my @tokens_inviz;
if ($msgdata->{bayes_token_inviz}) { if ($msgdata->{bayes_token_inviz}) {
my $tokprefix; my $tokprefix;
if (ADD_INVIZ_TOKENS_I_PREFIX) { $tokprefix = 'I*:' } if (ADD_INVIZ_TOKENS_I_PREFIX) { $tokprefix = 'I*:' }
if (ADD_INVIZ_TOKENS_NO_PREFIX) { $tokprefix = '' } if (ADD_INVIZ_TOKENS_NO_PREFIX) { $tokprefix = '' }
if (defined $tokprefix) { if (defined $tokprefix) {
my(@t) = map($self->_tokenize_line ($_, $tokprefix, 1), foreach (@{$msgdata->{bayes_token_inviz}}) {
@{$msgdata->{bayes_token_inviz}} ); push(@tokens_inviz, $self->_tokenize_line ($_, $tokprefix, 1));
dbg("bayes: tokenized invisible: %d tokens", scalar @t); last if scalar @tokens_inviz >= 50000;
push(@tokens, @t); }
} }
dbg("bayes: tokenized invisible: %d tokens", scalar @tokens_inviz);
} }
# add digests and Content-Type of all MIME parts # add digests and Content-Type of all MIME parts
my @tokens_mimepart;
if ($msgdata->{bayes_mimepart_digests}) { if ($msgdata->{bayes_mimepart_digests}) {
my %shorthand = ( # some frequent MIME part contents for human readability my %shorthand = ( # some frequent MIME part contents for human readability
'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/plain'=> 'Empty-Plaintext', 'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/plain'=> 'Empty-Plaintext',
'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/html' => 'Empty-HTML', 'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/html' => 'Empty-HTML',
'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/xml' => 'Empty-XML', 'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/xml' => 'Empty-XML',
'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/plain'=> 'OneNL-Plaintext', 'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/plain'=> 'OneNL-Plaintext',
'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/html' => 'OneNL-HTML', 'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/html' => 'OneNL-HTML',
'71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/plain'=> 'TwoNL-Plaintext', '71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/plain'=> 'TwoNL-Plaintext',
'71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/html' => 'TwoNL-HTML', '71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/html' => 'TwoNL-HTML',
); );
my(@t) = map('MIME:' . ($shorthand{$_} || $_), @tokens_mimepart = map('MIME:' . ($shorthand{$_} || $_),
@{ $msgdata->{bayes_mimepart_digests} }); @{ $msgdata->{bayes_mimepart_digests} });
dbg("bayes: tokenized mime parts: %d tokens", scalar @t); dbg("bayes: tokenized mime parts: %d tokens", scalar @tokens_mimepart);
dbg("bayes: mime-part token %s", $_) for @t; dbg("bayes: mime-part token %s", $_) for @tokens_mimepart;
push(@tokens, @t);
} }
# Tokenize the headers # Tokenize the headers
my @tokens_header;
if ($t_src->{header}) { if ($t_src->{header}) {
my(@t);
my %hdrs = $self->_tokenize_headers ($msg); my %hdrs = $self->_tokenize_headers ($msg);
while( my($prefix, $value) = each %hdrs ) { while( my($prefix, $value) = each %hdrs ) {
push(@t, $self->_tokenize_line ($value, "H$prefix:", 0)); push(@tokens_header, $self->_tokenize_line ($value, "H$prefix:", 0));
last if scalar @tokens_header >= 10000;
} }
dbg("bayes: tokenized header: %d tokens", scalar @t); dbg("bayes: tokenized header: %d tokens", scalar @tokens_header);
push(@tokens, @t);
} }
# Go ahead and uniq the array, skip null tokens (can happen sometimes) # Go ahead and uniq the array, skip null tokens (can happen sometimes)
# generate an SHA1 hash and take the lower 40 bits as our token # generate an SHA1 hash and take the lower 40 bits as our token
my %tokens; my %tokens;
foreach my $token (@tokens) { foreach my $token
# dbg("bayes: token: %s", $token); (@tokens_body, @tokens_uri, @tokens_inviz, @tokens_mimepart, @tokens_header)
{
# dbg("bayes: token: %s", $token);
$tokens{substr(sha1($token), -5)} = $token if $token ne ''; $tokens{substr(sha1($token), -5)} = $token if $token ne '';
} }
# return the keys == tokens ... # return the keys == tokens ...
return \%tokens; return \%tokens;
} }
sub _tokenize_line { sub _tokenize_line {
my $self = $_[0]; my $self = $_[0];
my $tokprefix = $_[2]; my $tokprefix = $_[2];
 End of changes. 16 change blocks. 
23 lines changed or deleted 30 lines changed or added

Home  |  About  |  Features  |  All  |  Newest  |  Dox  |  Diffs  |  RSS Feeds  |  Screenshots  |  Comments  |  Imprint  |  Privacy  |  HTTP(S)