"Fossies" - the Fresh Open Source Software Archive  

Source code changes of the file "lib/Mail/SpamAssassin/Plugin/TextCat.pm" between
Mail-SpamAssassin-3.4.4.tar.bz2 and Mail-SpamAssassin-3.4.5.tar.bz2

About: SpamAssassin is a mail filter that uses a wide range of heuristic tests on mail headers and body text to identify "spam" (also known as unsolicited commercial email) incl. Bayesian (statistical) spam filter and several internet-based realtime blacklists.

TextCat.pm  (Mail-SpamAssassin-3.4.4.tar.bz2):TextCat.pm  (Mail-SpamAssassin-3.4.5.tar.bz2)
skipping to change at line 83 skipping to change at line 83
my $mailsaobject = shift; my $mailsaobject = shift;
$class = ref($class) || $class; $class = ref($class) || $class;
my $self = $class->SUPER::new($mailsaobject); my $self = $class->SUPER::new($mailsaobject);
bless ($self, $class); bless ($self, $class);
# load language models once # load language models once
if (! @nm) { if (! @nm) {
if (!defined $mailsaobject->{languages_filename}) { if (!defined $mailsaobject->{languages_filename}) {
warn "textcat: languages filename not defined\n"; warn "textcat: languages filename not defined\n";
$self->{textcat_disabled} = 1;
} }
else { else {
load_models($mailsaobject->{languages_filename}); load_models($mailsaobject->{languages_filename});
} }
} }
$self->register_eval_rule("check_language"); $self->register_eval_rule("check_language");
$self->register_eval_rule("check_body_8bits"); $self->register_eval_rule("check_body_8bits");
$self->set_config($mailsaobject->{conf}); $self->set_config($mailsaobject->{conf});
skipping to change at line 487 skipping to change at line 488
} }
sub create_lm { sub create_lm {
my ($inputptr, $conf) = @_; my ($inputptr, $conf) = @_;
my %ngram; my %ngram;
my @sorted; my @sorted;
# Note that $$inputptr may or may not be in perl characters (utf8 flag set) # Note that $$inputptr may or may not be in perl characters (utf8 flag set)
my $is_unicode = utf8::is_utf8($$inputptr); my $is_unicode = utf8::is_utf8($$inputptr);
# my $non_word_characters = qr/[0-9\s]/; # "Split the text into separate tokens consisting only of letters and
for my $word (split(/[0-9\s]+/, $$inputptr)) # apostrophes. Digits and punctuation are discarded."
while ($$inputptr =~ /([^0-9\s\-!"#\$\%\&()*+,.\/:;<=>?\@\[\\\]\^_`{|}~]+)/gs)
{ {
my $word = $1;
# Bug 6229: Current TextCat database only works well with lowercase input # Bug 6229: Current TextCat database only works well with lowercase input
if ($is_unicode) { if ($is_unicode) {
# Unicode rules are used for the case change # Unicode rules are used for the case change
$word = lc $word if $word =~ /\w{4}/; $word = lc $word if $word =~ /\w{4}/;
utf8::encode($word); # encode Unicode characters to UTF-8 octets utf8::encode($word); # encode Unicode characters to UTF-8 octets
} elsif ($word =~ /[A-Z]/ && } elsif ($word =~ /[A-Z]/ &&
$word =~ /[a-zA-Z\xc0-\xd6\xd8-\xde\xe0-\xf6\xf8-\xfe]{4}/) { $word =~ /[a-zA-Z\xc0-\xd6\xd8-\xde\xe0-\xf6\xf8-\xfe]{4}/) {
# assume ISO 8859-1 / Windows-1252 # assume ISO 8859-1 / Windows-1252
$word =~ tr/A-Z\xc0-\xd6\xd8-\xde/a-z\xe0-\xf6\xf8-\xfe/; $word =~ tr/A-Z\xc0-\xd6\xd8-\xde/a-z\xe0-\xf6\xf8-\xfe/;
} }
$word = "\000" . $word . "\000"; $word = "\000" . $word . "\000";
my $len = length($word); my $len = length($word);
my $flen = $len; my $flen = $len;
my $i; for (my $i = 0; $i < $flen; $i++) {
for ($i = 0; $i < $flen; $i++) {
$len--; $len--;
$ngram{substr($word, $i, 1)}++; $ngram{substr($word, $i, 1)}++;
($len < 1) ? next : $ngram{substr($word, $i, 2)}++; ($len < 1) ? next : $ngram{substr($word, $i, 2)}++;
($len < 2) ? next : $ngram{substr($word, $i, 3)}++; ($len < 2) ? next : $ngram{substr($word, $i, 3)}++;
($len < 3) ? next : $ngram{substr($word, $i, 4)}++; ($len < 3) ? next : $ngram{substr($word, $i, 4)}++;
if ($len > 3) { $ngram{substr($word, $i, 5)}++ }; if ($len > 3) { $ngram{substr($word, $i, 5)}++ };
} }
} }
if ($conf->{textcat_optimal_ngrams} > 0) { if ($conf->{textcat_optimal_ngrams} > 0) {
# as suggested by Karel P. de Vos <k.vos@elsevier.nl> we speed # as suggested by Karel P. de Vos <k.vos@elsevier.nl> we speed
# up sorting by removing singletons, however I have very bad # up sorting by removing singletons, however I have very bad
# results for short inputs, this way # results for short inputs, this way
@sorted = sort { $ngram{$b} <=> $ngram{$a} } @sorted = sort { $ngram{$b} <=> $ngram{$a} }
(grep { $ngram{$_} > $conf->{textcat_optimal_ngrams} } keys %n gram); (grep { $ngram{$_} > $conf->{textcat_optimal_ngrams} } sort keys %ngram);
} }
else { else {
@sorted = sort { $ngram{$b} <=> $ngram{$a} } keys %ngram; @sorted = sort { $ngram{$b} <=> $ngram{$a} } sort keys %ngram;
} }
splice(@sorted, $conf->{textcat_max_ngrams}) if (@sorted > $conf->{textcat_max _ngrams}); splice(@sorted, $conf->{textcat_max_ngrams}) if (@sorted > $conf->{textcat_max _ngrams});
return @sorted; return @sorted;
} }
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
sub extract_metadata { sub extract_metadata {
my ($self, $opts) = @_; my ($self, $opts) = @_;
return if $self->{textcat_disabled};
my $msg = $opts->{msg}; my $msg = $opts->{msg};
my $body = $msg->get_rendered_body_text_array(); my $body = $msg->get_rendered_body_text_array();
$body = join("\n", @{$body}); $body = join("\n", @{$body});
$body =~ s/^Subject://i; $body =~ s/^Subject://i;
# Strip anything that looks like url or email, enhances results
$body =~ s{https?://\S+}{ }gs;
$body =~ s{\S+?\@[a-zA-Z]\S+}{ }gs;
my $len = length($body); my $len = length($body);
# truncate after 10k; that should be plenty to classify it # truncate after 10k; that should be plenty to classify it
if ($len > 10000) { if ($len > 10000) {
substr($body, 10000) = ''; substr($body, 10000) = '';
$len = 10000; $len = 10000;
} }
# note input length since the check_languages() eval rule also uses it # note input length since the check_languages() eval rule also uses it
$msg->put_metadata("X-Languages-Length", $len); $msg->put_metadata("X-Languages-Length", $len);
# need about 256 bytes for reasonably accurate match (experimentally derived) # need about 256 bytes for reasonably accurate match (experimentally derived)
skipping to change at line 575 skipping to change at line 583
my $matches_str = join(' ', @matches); my $matches_str = join(' ', @matches);
$msg->put_metadata("X-Languages", $matches_str); $msg->put_metadata("X-Languages", $matches_str);
dbg("textcat: X-Languages: \"$matches_str\", X-Languages-Length: $len"); dbg("textcat: X-Languages: \"$matches_str\", X-Languages-Length: $len");
} }
# UNWANTED_LANGUAGE_BODY # UNWANTED_LANGUAGE_BODY
sub check_language { sub check_language {
my ($self, $scan) = @_; my ($self, $scan) = @_;
return 0 if $self->{textcat_disabled};
my $msg = $scan->{msg}; my $msg = $scan->{msg};
my @languages = split(/\s+/, $scan->{conf}->{ok_languages}); my @languages = split(/\s+/, $scan->{conf}->{ok_languages});
if (grep { $_ eq "all" } @languages) { if (grep { $_ eq "all" } @languages) {
return 0; return 0;
} }
my $len = $msg->get_metadata("X-Languages-Length"); my $len = $msg->get_metadata("X-Languages-Length");
my @matches = split(' ', $msg->get_metadata("X-Languages")); my @matches = split(' ', $msg->get_metadata("X-Languages"));
skipping to change at line 615 skipping to change at line 625
} }
} }
} }
return 1; return 1;
} }
sub check_body_8bits { sub check_body_8bits {
my ($self, $scan, $body) = @_; my ($self, $scan, $body) = @_;
return 0 if $self->{textcat_disabled};
my @languages = split(/\s+/, $scan->{conf}->{ok_languages}); my @languages = split(/\s+/, $scan->{conf}->{ok_languages});
for (@languages) { for (@languages) {
return 0 if $_ eq "all"; return 0 if $_ eq "all";
# this list is initially conservative, it includes any language with # this list is initially conservative, it includes any language with
# a common n-gram sequence of 2+ consecutive bytes matching [\x80-\xff] # a common n-gram sequence of 2+ consecutive bytes matching [\x80-\xff]
# here are the one more likely to be removed: cs=czech, et=estonian, # here are the one more likely to be removed: cs=czech, et=estonian,
# fi=finnish, hi=hindi, is=icelandic, pt=portuguese, tr=turkish, # fi=finnish, hi=hindi, is=icelandic, pt=portuguese, tr=turkish,
# uk=ukrainian, vi=vietnamese # uk=ukrainian, vi=vietnamese
return 0 if /^(?:am|ar|be|bg|cs|el|et|fa|fi|he|hi|hy|is|ja|ka|ko|mr|pt|ru|ta |th|tr|uk|vi|yi|zh)$/; return 0 if /^(?:am|ar|be|bg|cs|el|et|fa|fi|he|hi|hy|is|ja|ka|ko|mr|pt|ru|ta |th|tr|uk|vi|yi|zh)$/;
 End of changes. 10 change blocks. 
6 lines changed or deleted 18 lines changed or added

Home  |  About  |  Features  |  All  |  Newest  |  Dox  |  Diffs  |  RSS Feeds  |  Screenshots  |  Comments  |  Imprint  |  Privacy  |  HTTP(S)