"Fossies" - the Fresh Open Source Software Archive

Member "websec-1.9.0/webdiff" (14 Mar 2005, 12963 Bytes) of package /linux/www/old/websec-1.9.0.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Perl source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file.

    1 #!/usr/bin/perl -w
    2 
    3 #################################################################################
    4 #
    5 # Webdiff
    6 #
    7 # Compares two HTML pages (current and archive) and outputs a new page based
    8 # on the current page but with the differences between the two pages highlighted.
    9 #
   10 # Copyright (C) 1998  Chew Wei Yih
   11 # Copyright (C) 2004,2005 Baruch Even <baruch@ev-en.org>
   12 #
   13 # This program is free software; you can redistribute it and/or
   14 # modify it under the terms of the GNU General Public License
   15 # as published by the Free Software Foundation; either version 2
   16 # of the License, or (at your option) any later version.
   17 #
   18 # This program is distributed in the hope that it will be useful,
   19 # but WITHOUT ANY WARRANTY; without even the implied warranty of
   20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   21 # GNU General Public License for more details.
   22 #
   23 # You should have received a copy of the GNU General Public License
   24 # along with this program; if not, write to the Free Software
   25 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
   26 #
   27 #################################################################################
   28 
   29 use Getopt::Long;
   30 use Pod::Usage;
   31 
   32 # Initialize parameters
   33 $oldpage    = "";
   34 $curpage    = "";
   35 $outpage    = "";
   36 $hicolor    = "blue";
   37 $asciimarker = 0;
   38 $ignore     = "none";
   39 $ignoreurl  = "none";
   40 $tmin       = 0;
   41 $tmax       = 99999;
   42 $debug      = 0;
   43 $ignoreFile = "ignore.list";
   44 $basedir    = $ENV{HOME} . "/.websec/";
   45 
   46 # Parse options
   47 $help = 0;
   48 $man  = 0;
   49 GetOptions(
   50     "help|?"       => \$help,
   51     "man"          => \$man,
   52     "basedir=s"    => \$basedir,
   53     "archive=s"    => \$oldpage,
   54     "current=s"    => \$curpage,
   55     "out=s"        => \$outpage,
   56     "hicolor=s"    => \$hicolor,
   57     "asciimarker"  => \$asciimarker,
   58     "ignore=s"     => \$ignore,
   59     "ignoreurl=s"  => \$ignoreurl,
   60     "tmin=i"       => \$tmin,
   61     "tmax=i"       => \$tmax,
   62     "debug"        => \$debug,
   63     "ignorefile=s" => \$ignoreFile
   64   )
   65   or pod2usage(0);
   66 
   67 pod2usage(1) if ($help);
   68 pod2usage( -exitstatus => 0, -verbose => 2 ) if $man;
   69 
   70 # Remove trailing slash from basedir, we will add it ourself everywhere needed
   71 $basedir =~ s/\/$//;
   72 
   73 # Make sure some essential option values are supplied
   74 if ( $oldpage eq "" ) {
   75     print
   76       "You did not supply the archive HTML file via the --archive option.\n";
   77     exit -1;
   78 }
   79 if ( $curpage eq "" ) {
   80     print
   81       "You did not supply the current HTML file via the --current option.\n";
   82     exit -1;
   83 }
   84 if ( $outpage eq "" ) {
   85     print "You did not supply the output HTML file via the --out option.\n";
   86     exit -1;
   87 }
   88 
   89 # Choose highlighting color
   90 %colorList = (
   91     yellow => "#ffff99",
   92     blue   => "#66ccff",
   93     pink   => "#ffcccc",
   94     grey   => "#4c4c4c"
   95 );
   96 if ( defined $colorList{$hicolor} ) { $hicolor = $colorList{$hicolor}; }
   97 if ( $hicolor eq "" ) { $hicolor = $colorList{"blue"}; }
   98 
   99 # Other global variables
  100 $changeStatus = 0;
  101 @tags         = (
  102     "CODE",     "B",   "I",   "U",     "TT",     "EM",
  103     "FONT*",    "SUP", "SUB", "SMALL", "STRIKE", "STRONG",
  104     "CAPTION*", "A*"
  105 );
  106 
  107 # Read ignore keywords
  108 if ( $ignore ne "none" ) {
  109     $ignore          = "," . $ignore . ",";
  110     $ignorelist      = "";
  111     $ignoreStartRead = 0;
  112     open( IGNORE, "< $basedir/$ignoreFile" )
  113       or die "Cannot open $basedir/$ignoreFile: $!\n";
  114     while (<IGNORE>) {
  115         chomp;
  116         s/^\s*//;
  117         s/\s*$//;
  118 
  119         # Ignore comments
  120         next if (m/^#/);
  121         # Stop with a finish marker
  122         last if (m/^__END__/);
  123 
  124         if ( $ignoreStartRead && $_ eq "" ) { $ignoreStartRead = 0; next; }
  125         if ($ignoreStartRead) { $ignorelist .= $_ . "\r"; next; }
  126         ( $section = $_ ) =~ s:\[\s*(.*?)\s*\]:$1:sig;
  127         if ( $ignore =~ m:,$section,:i ) { $ignoreStartRead = 1; }
  128     }
  129     close( IGNORE );
  130     @ignore = split /[\r\n]/, $ignorelist;
  131 }
  132 if ($debug) {
  133     foreach (@ignore) { print "Ignore: $_\n"; }
  134 }
  135 
  136 # Read ignore urls
  137 if ( $ignoreurl ne "none" ) {
  138     $ignoreurl       = "," . $ignoreurl . ",";
  139     $ignorelist      = "";
  140     $ignoreStartRead = 0;
  141     open( IGNORE, "< $basedir/$ignoreFile" )
  142       or die "Cannot open $basedir/$ignoreFile: $!\n";
  143     while (<IGNORE>) {
  144         chomp;
  145         s/^\s*//;
  146         s/\s*$//;
  147 
  148         # Ignore comments
  149         next if (m/^#/);
  150         # Stop with a finish marker
  151         last if (m/^__END__/);
  152 
  153         if ( $ignoreStartRead && $_ eq "" ) { $ignoreStartRead = 0; next; }
  154         if ($ignoreStartRead) { $ignorelist .= $_ . "\r"; next; }
  155         ( $section = $_ ) =~ s:\[\s*(.*?)\s*\]:$1:sig;
  156         if ( $ignoreurl =~ m:,$section,:i ) { $ignoreStartRead = 1; }
  157     }
  158     close( IGNORE );
  159     @ignoreurl = split /[\r\n]/, $ignorelist;
  160 }
  161 if ($debug) {
  162     foreach (@ignoreurl) { print "IgnoreURL: $_\n"; }
  163 }
  164 
  165 # Undefine line separator so that we can read entire file at one go from now on
  166 undef $/;
  167 
  168 # Open input pages for comparing
  169 open( OLDPAGE, "< $oldpage" ) or die "Cannot open $oldpage: $!\n";
  170 open( CURPAGE, "< $curpage" ) or die "Cannot open $curpage: $!\n";
  171 
  172 # Read input pages
  173 $oldpage = <OLDPAGE>;
  174 $newpage = <CURPAGE>;
  175 
  176 # Close input pages
  177 close(OLDPAGE);
  178 close(CURPAGE);
  179 
  180 # Mangle some HTML tags to a form suitable for analysis
  181 $oldpage = &MangleHTML($oldpage, @tags);
  182 $newpage = &MangleHTML($newpage, @tags);
  183 
  184 # Parse old and new page
  185 &TokenizePage($oldpage);
  186 @oldtokens = @tokens;
  187 $#tokens   = -1;
  188 if ($debug) {
  189     foreach (@oldtokens) { print ">>>> $_\n"; }
  190 }
  191 &TokenizePage($newpage);
  192 @newtokens = @tokens;
  193 $#tokens   = -1;
  194 
  195 # Parse new page
  196 &PerformDiff();
  197 
  198 # Restore tags which we have previously mangled
  199 foreach $token (@newtokens) {
  200     $token =~ s/\@\@\@\@&nbsp;~~~~/&nbsp;/sig;
  201     foreach $tag (@tags) { $token =~ s/~~~~(\/*.*?)\@\@\@\@/<$1>/sig; }
  202 }
  203 
  204 # Open output file for writing
  205 open( OUTPAGE, "> $outpage" ) or die "Cannot open $outpage: $!\n";
  206 foreach (@newtokens) { print OUTPAGE "$_\n"; }
  207 close(OUTPAGE);
  208 
  209 # End of program
  210 if ( !$changeStatus ) {
  211     if ($debug) { print "No changes were detected.\n"; }
  212 }
  213 exit $changeStatus;
  214 
  215 # Convert page to tokens
  216 sub TokenizePage() {
  217     my $page = shift (@_);
  218     @tokens = split /(<.*?>)/s, $page;
  219     foreach (@tokens) { s/^\s+//sig; }
  220     foreach (@tokens) { s/\s+$//sig; }
  221 }
  222 
  223 # Perform diff between two pages
  224 sub PerformDiff() {
  225     my $commentOn   = 0;
  226     my $scriptOn    = 0;
  227     my $styleOn     = 0;
  228     my $titleOn     = 0;
  229     my $ignoreUrlOn = 0;
  230 
  231     foreach $token (@newtokens) {
  232         if ( $token eq "" ) { next; }
  233         if ($debug) { print "<<<< $token\n"; }
  234 
  235         if ( $token =~ m|^.*?<!-.*?$| ) { $commentOn = 1; }
  236         if ( $token =~ m|^.*?->.*?| )   { $commentOn = 0; next; }
  237 
  238         if ( $token =~ m|^.*?<TITLE.*?>$|i )  { $titleOn = 1; }
  239         if ( $token =~ m|^.*?</TITLE.*?>$|i ) { $titleOn = 0; next; }
  240 
  241         if ( $token =~ m|^.*?<SCRIPT.*?>$|i )  { $scriptOn = 1; }
  242         if ( $token =~ m|^.*?</SCRIPT.*?>$|i ) { $scriptOn = 0; next; }
  243 
  244         if ( $token =~ m|^.*?<STYLE.*?>$|i )  { $styleOn = 1; }
  245         if ( $token =~ m|^.*?</STYLE.*?>$|i ) { $styleOn = 0; next; }
  246 
  247         if ( TokenContainsIgnoreURL($token) ) { $ignoreUrlOn = 1; }
  248         if ( $ignoreUrlOn && TokenContainsHlinkEnd($token) ) {
  249             $ignoreUrlOn = 0;
  250             next;
  251         }
  252 
  253         if ($commentOn) {
  254             if ($debug) { print "#### Token is within comment block.\n"; }
  255         }
  256         elsif ($titleOn) {
  257             if ($debug) { print "#### Token is within title block.\n"; }
  258         }
  259         elsif ($scriptOn) {
  260             if ($debug) { print "#### Token is within Javascript block.\n"; }
  261         }
  262         elsif ($styleOn) {
  263             if ($debug) { print "#### Token is within stylesheet block.\n"; }
  264         }
  265         elsif ($ignoreUrlOn) {
  266             if ($debug) {
  267                 print "#### Token contains ignore URL - $lastIgnoreURL\n";
  268             }
  269         }
  270         elsif ( $token =~ m/<.*?>/sig ) {
  271             if ($debug) { print "#### Token is a HTML tag.\n"; }
  272         }
  273         elsif ( TokenIsMangledHTMLTag($token) ) {
  274             if ($debug) { print "#### Token is a mangled HTML tag.\n"; }
  275         }
  276         elsif ( TokenContainsIgnoreKeyword($token) ) {
  277             if ($debug) {
  278                 print
  279                   "#### Token contains ignore keyword - $lastIgnoreKeyword\n";
  280             }
  281         }
  282         elsif ( TokenExists($token) ) {
  283             if ($debug) { print "#### Token exists in old page.\n"; }
  284         }
  285         else {
  286             if ($debug) { print "#### Token has been highlighted!\n"; }
  287             if ($asciimarker) {
  288                 $token = "###>>>". $token ."<<<###";
  289             }
  290             $token =
  291                     "<span style=\"background-color: $hicolor\">"
  292                     . $token . "</span>";
  293             $changeStatus = 1;
  294         }
  295     }
  296 }
  297 
  298 # Check if token is a mangled HTML tag
  299 sub TokenIsMangledHTMLTag() {
  300     my $token = shift (@_);
  301 
  302     while ( $token ne "" ) {
  303         if ( $token =~ m/^\s*(.*?)\s*~~~~.*?\@\@\@\@\s*(.*?)\s*$/i ) {
  304             $token = $2;
  305             if ( !$1 =~ m/^\s*$/ ) { return 0; }
  306         }
  307         else { return 0; }
  308     }
  309     return 1;
  310 }
  311 
  312 # Check if token contains any keyword in ignore list
  313 sub TokenContainsIgnoreKeyword() {
  314     my $token = shift (@_);
  315     $token =~ s/\s{2,}/ /sig;
  316 
  317     # If this token contains >= tmax no. of words, do not ignore
  318     $tokdup = &ReduceSpaces($token);
  319     @words = split /\s/, $tokdup;
  320     if ($debug) { print "#### C" . ( $#words + 1 ) . ": $tokdup\n"; }
  321     if ( $#words + 1 > $tmax ) { return 0; }
  322 
  323     foreach $keyword (@ignore) {
  324         if ( $token =~ m/^.*?(\b$keyword\b).*?$/i
  325             || $tokdup =~ m/^.*?(\b$keyword\b).*?$/i )
  326         {
  327             $lastIgnoreKeyword = $keyword;
  328             return 1;
  329         }
  330     }
  331     return 0;
  332 }
  333 
  334 # Check if token already exists
  335 sub TokenExists() {
  336     my $token = shift (@_);
  337     $token =~ s/\s{2,}/ /sig;
  338 
  339     # If this token contains <= tmin no. of words, don't check
  340     $tokdup = &ReduceSpaces($token);
  341     @words = split /\s/, $tokdup;
  342     if ( $#words + 1 <= $tmin ) { return 1; }
  343 
  344     foreach $oldtok (@oldtokens) {
  345         $oldtok =~ s/\s{2,}/ /sig;
  346         if ( $token eq $oldtok ) { return 1; }
  347     }
  348     return 0;
  349 }
  350 
  351 # Check if token contains ignore URL
  352 sub TokenContainsIgnoreURL() {
  353     my $token = shift (@_);
  354     $token =~ s/\s{2,}/ /sig;
  355 
  356     foreach $url (@ignoreurl) {
  357         if ( $token =~ m/~~~~A.*?HREF=.*?$url.*?\@\@\@\@/i ) {
  358             $lastIgnoreURL = $url;
  359             return 1;
  360         }
  361     }
  362     return 0;
  363 }
  364 
  365 # Check if token contains end of hyperlink
  366 sub TokenContainsHlinkEnd() {
  367     my $token = shift (@_);
  368     $token =~ s/\s{2,}/ /sig;
  369     return 1 if $token =~ m/~~~~\/A\@\@\@\@/i;
  370     return 0;
  371 }
  372 
  373 sub MangleHTML() {
  374     my $page = shift(@_);
  375     my @tags = shift(@_);
  376 
  377     $page =~ s/[\r\n]|\s\s/ /sig;    # Handle MSDOS-style line separators
  378     $page =~ s/&nbsp;/\@\@\@\@&nbsp;~~~~/sig;   # Handle non-breaking white space
  379     $page =~ s/<A(\s+[^>]*)<([^>]*)>([^>])*>/~~~~A$1~~~~$2\@\@\@\@$3\@\@\@\@/sig;    # Handle nested brackets
  380     foreach (@tags) {
  381         $tag = $_;
  382         $page =~ s/<(\/*$tag)>/~~~~$1\@\@\@\@/sig;
  383         if ( $tag =~ s/\*/ / ) { # XXX WTF is going here with the re?
  384             $page =~ s/<(\/*$tag.*?)>/~~~~$1\@\@\@\@/sig;
  385         }
  386     }
  387 
  388     return $page;
  389 }
  390 
  391 sub ReduceSpaces() {
  392     my $token = shift(@_);
  393     
  394     $token =~ s/\@\@\@\@&nbsp;~~~~/ /sig;
  395     $token =~ s/~~~~/</sig;
  396     $token =~ s/\@\@\@\@/>/sig;
  397     $token =~ s/<A(\s+[^>]*)<([^>]*)>([^>])*>//sig;
  398     $token =~ s/<[^>]*>//sig;
  399     $token =~ s/^\s*//sig;
  400     $token =~ s/\s*$//sig;
  401     $token =~ s/\s+/ /sig;
  402 
  403     return $token;
  404 }
  405 
  406 __END__
  407 
  408 =head1 NAME
  409 
  410 webdiff - Find and Highlight Differences Between Webpages
  411 
  412 =head1 SYNOPSIS
  413 
  414 webdiff [options]
  415 
  416 
  417 =head1 OPTIONS
  418 
  419 =over 8
  420 
  421 =item B<--help>
  422 
  423 Prints a brief help message and exits.
  424 
  425 =item B<--man>
  426 
  427 Prints the manual page and exits.
  428 
  429 =item B<--archive>=<pathname>
  430 
  431 Archive HTML file
  432 
  433 =item B<--current>=<pathname>
  434 
  435 Current HTML file
  436 
  437 =item B<--out>=<pathname>
  438 
  439 Output HTML file (with highlighting)
  440 
  441 =item B<--basedir>=<pathname>
  442 
  443 Base directory for files
  444 
  445 =item B<--hicolor>=<color>
  446 
  447 Highlight color (Def: blue, yellow, pink, grey or #rrggbb)
  448 
  449 =item B<--ignore>=<filelist>
  450 
  451 Comma-delimited list of named sections containing ignore keywords
  452 
  453 =item B<--ignoreurl>=<filelist>
  454 
  455 Comma-delimited list of named sections containing ignore urls
  456 
  457 =item B<--tmin>=<number>
  458 
  459 Don't check if token contains <= given no. of words
  460 
  461 =item B<--tmax>=<number>
  462 
  463 Don't ignore if token contains >= given no. of words
  464 
  465 =item B<--debug>
  466 
  467 Debug messages
  468 
  469 =back
  470 
  471 =head1 DESCRIPTION
  472 
  473 B<webdiff> will compare two webpages and create an output file with the changes
  474 highlighted.
  475 
  476 
  477 B<webdiff> is internal to B<websec> and isn't well documented.
  478 
  479 
  480 =head1 SEE ALSO
  481 
  482 L<websec(1)>
  483 
  484 
  485 =head1 AUTHOR
  486 
  487 Victor Chew is the original author of this software and
  488 Baruch Even is continuing the maintenance.
  489 
  490 =cut
  491 
  492 vim:set et ts=4: