"Fossies" - the Fresh Open Source Software Archive 
Member "websec-1.9.0/webdiff" (14 Mar 2005, 12963 Bytes) of package /linux/www/old/websec-1.9.0.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Perl source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively you can here
view or
download the uninterpreted source code file.
1 #!/usr/bin/perl -w
2
3 #################################################################################
4 #
5 # Webdiff
6 #
7 # Compares two HTML pages (current and archive) and outputs a new page based
8 # on the current page but with the differences between the two pages highlighted.
9 #
10 # Copyright (C) 1998 Chew Wei Yih
11 # Copyright (C) 2004,2005 Baruch Even <baruch@ev-en.org>
12 #
13 # This program is free software; you can redistribute it and/or
14 # modify it under the terms of the GNU General Public License
15 # as published by the Free Software Foundation; either version 2
16 # of the License, or (at your option) any later version.
17 #
18 # This program is distributed in the hope that it will be useful,
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 # GNU General Public License for more details.
22 #
23 # You should have received a copy of the GNU General Public License
24 # along with this program; if not, write to the Free Software
25 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
26 #
27 #################################################################################
28
29 use Getopt::Long;
30 use Pod::Usage;
31
# Default settings.  All of these are package globals; most of them can be
# overridden from the command line further down.

# Input/output file names (mandatory, checked after option parsing)
( $oldpage, $curpage, $outpage ) = ( "", "", "" );

# Highlighting: colour name (or literal colour value) and optional ASCII markers
$hicolor     = "blue";
$asciimarker = 0;

# Names of the ignore-list sections to load; "none" disables the feature
$ignore    = "none";
$ignoreurl = "none";

# Word-count limits used when deciding whether a token is compared / ignored
$tmin = 0;
$tmax = 99999;

$debug = 0;

# Location of the ignore list: $basedir/$ignoreFile
$ignoreFile = "ignore.list";
$basedir    = $ENV{HOME} . "/.websec/";
45
# Command-line handling.  Every option stores its value directly in one of
# the package globals holding the program settings.
$help = 0;
$man  = 0;

my %optionTargets = (
    "help|?"       => \$help,
    "man"          => \$man,
    "basedir=s"    => \$basedir,
    "archive=s"    => \$oldpage,
    "current=s"    => \$curpage,
    "out=s"        => \$outpage,
    "hicolor=s"    => \$hicolor,
    "asciimarker"  => \$asciimarker,
    "ignore=s"     => \$ignore,
    "ignoreurl=s"  => \$ignoreurl,
    "tmin=i"       => \$tmin,
    "tmax=i"       => \$tmax,
    "debug"        => \$debug,
    "ignorefile=s" => \$ignoreFile,
);

# Unparsable options: show the usage text
GetOptions(%optionTargets) or pod2usage(0);

# --help shows the short usage text, --man the complete manual page
if ($help) { pod2usage(1); }
if ($man)  { pod2usage( -exitstatus => 0, -verbose => 2 ); }

# Keep the base directory without a trailing slash; the slash is added
# explicitly wherever a path is built from it
$basedir =~ s/\/$//;
72
# The three file names are mandatory.  They are checked in the order
# archive, current, out; the first one found missing is reported and the
# program exits with status -1.
foreach my $required (
    [
        $oldpage,
        "You did not supply the archive HTML file via the --archive option.\n"
    ],
    [
        $curpage,
        "You did not supply the current HTML file via the --current option.\n"
    ],
    [
        $outpage,
        "You did not supply the output HTML file via the --out option.\n"
    ],
  )
{
    my ( $value, $complaint ) = @$required;
    next unless $value eq "";
    print $complaint;
    exit -1;
}
88
# Resolve the highlight colour.  A known colour name is translated to its
# hex value, anything else is passed through untouched, and an empty value
# falls back to blue.
%colorList = (
    yellow => "#ffff99",
    blue   => "#66ccff",
    pink   => "#ffcccc",
    grey   => "#4c4c4c"
);
$hicolor = $colorList{$hicolor} if defined $colorList{$hicolor};
$hicolor = $colorList{"blue"}   if $hicolor eq "";
98
# Remaining global state.
# $changeStatus becomes 1 once a token has been highlighted and is used as
# the program's exit status.
$changeStatus = 0;

# Inline tags that are mangled before tokenizing.  A trailing "*" is part of
# the entry as stored here and is interpreted by MangleHTML.
@tags = qw(
  CODE B I U TT EM
  FONT* SUP SUB SMALL STRIKE STRONG
  CAPTION* A*
);
106
# Load the ignore lists.
#
# --ignore and --ignoreurl each name one or more sections (comma separated)
# of the ignore file $basedir/$ignoreFile.  The entries of the selected
# sections end up in @ignore (keywords) and @ignoreurl (URLs); "none" leaves
# the corresponding list empty.
if ( $ignore ne "none" ) {
    @ignore = ReadIgnoreSections( "$basedir/$ignoreFile", $ignore );
}
if ($debug) {
    foreach (@ignore) { print "Ignore: $_\n"; }
}

if ( $ignoreurl ne "none" ) {
    @ignoreurl = ReadIgnoreSections( "$basedir/$ignoreFile", $ignoreurl );
}
if ($debug) {
    foreach (@ignoreurl) { print "IgnoreURL: $_\n"; }
}

# ReadIgnoreSections($path, $wanted)
#
# Returns the entries of the requested sections of an ignore file.
#
#   $path   - path of the ignore file
#   $wanted - comma separated list of section names (compared
#             case-insensitively)
#
# File format, as implemented below:
#   * leading and trailing whitespace of every line is dropped;
#   * lines starting with "#" are comments;
#   * a line starting with "__END__" ends processing;
#   * a line naming a wanted section (optionally written as "[name]") starts
#     that section, every following line is an entry of it, and the next
#     empty line ends the section.
#
# Dies with "Cannot open ..." if the file cannot be opened.
sub ReadIgnoreSections {
    my ( $path, $wanted ) = @_;

    # Surrounding commas let a single pattern match whole list elements only
    my $selector = "," . $wanted . ",";
    my @entries;
    my $reading = 0;

    # Three-argument open: the path is built from user-supplied options and
    # must never be interpreted as part of the open mode
    open( my $fh, "<", $path ) or die "Cannot open $path: $!\n";
    while ( my $line = <$fh> ) {
        chomp $line;
        $line =~ s/^\s*//;
        $line =~ s/\s*$//;

        # Ignore comments
        next if $line =~ m/^#/;

        # Stop with a finish marker
        last if $line =~ m/^__END__/;

        if ($reading) {
            if ( $line eq "" ) { $reading = 0; }
            else { push @entries, split( /[\r\n]/, $line ); }
            next;
        }

        # Outside a section every line is a candidate section header.  The
        # name is quoted because these lines are often entries of sections
        # that were not asked for, and such entries may contain regex
        # metacharacters.  An empty name never selects anything, so a stray
        # comma in $wanted cannot turn blank lines into section starts.
        ( my $section = $line ) =~ s:\[\s*(.*?)\s*\]:$1:sig;
        if ( $section ne "" && $selector =~ m:,\Q$section\E,:i ) {
            $reading = 1;
        }
    }
    close($fh);

    return @entries;
}
164
# Read both input pages.  The record separator is undefined only inside this
# block, so that each read returns a whole file in one go without changing
# $/ for the rest of the program.
{
    local $/;

    # Three-argument open: the names come from the command line and must
    # never be interpreted as part of the open mode
    open( my $oldFh, "<", $oldpage ) or die "Cannot open $oldpage: $!\n";
    open( my $curFh, "<", $curpage ) or die "Cannot open $curpage: $!\n";

    # From here on $oldpage holds the archived page's content, not its name
    $oldpage = <$oldFh>;
    $newpage = <$curFh>;

    close($oldFh);
    close($curFh);
}

# Mangle some HTML tags to a form suitable for analysis
$oldpage = &MangleHTML( $oldpage, @tags );
$newpage = &MangleHTML( $newpage, @tags );

# Tokenize both pages.  TokenizePage leaves its result in @tokens, which is
# copied and then emptied after each call.
&TokenizePage($oldpage);
@oldtokens = @tokens;
$#tokens   = -1;
if ($debug) {
    foreach (@oldtokens) { print ">>>> $_\n"; }
}
&TokenizePage($newpage);
@newtokens = @tokens;
$#tokens   = -1;

# Highlight the tokens of the new page that differ from the old page
&PerformDiff();

# Restore tags which we have previously mangled
foreach my $token (@newtokens) {
    $token =~ s/\@\@\@\@ ~~~~/ /sig;

    # Markers can be nested (a tag mangled inside another mangled tag); one
    # pass only restores the outermost opening marker of such a group, so
    # repeat until nothing is left to restore.  Every replacement shortens
    # the token, so this terminates.
    1 while $token =~ s/~~~~(\/*.*?)\@\@\@\@/<$1>/sig;
}

# Write the highlighted page, one token per line.  Write errors may only
# show up when the buffer is flushed, hence the check on close as well.
open( my $outFh, ">", $outpage ) or die "Cannot open $outpage: $!\n";
foreach (@newtokens) {
    print {$outFh} "$_\n" or die "Cannot write $outpage: $!\n";
}
close($outFh) or die "Cannot write $outpage: $!\n";

# End of program: the exit status is 1 if anything was highlighted, else 0
if ( !$changeStatus && $debug ) { print "No changes were detected.\n"; }
exit $changeStatus;
214
# TokenizePage($page)
#
# Splits a page into tokens and stores them in the global @tokens.  Every
# "<...>" tag becomes a token of its own and so does the text between two
# tags; leading and trailing whitespace is removed from each token.
sub TokenizePage() {
    my ($page) = @_;

    # The capturing group makes split return the tags as well as the text
    @tokens = split /(<.*?>)/s, $page;

    for my $piece (@tokens) {
        $piece =~ s/^\s+//sig;
        $piece =~ s/\s+$//sig;
    }
}
222
# PerformDiff()
#
# Walks over the tokens of the new page (@newtokens) and highlights, in
# place, every token that counts as changed.  $changeStatus is set to 1 as
# soon as one token has been highlighted.
#
# A token is highlighted only if none of the following applies, tested in
# this order: it lies within a comment, title, script or style block or
# within a hyperlink to an ignored URL; it is an HTML tag; it consists of
# mangled tags only; it contains an ignore keyword; it exists in the old
# page.  With $debug set, the decision for every token is printed.
sub PerformDiff() {
    my ( $inComment, $inTitle, $inScript, $inStyle, $inIgnoredLink ) =
      ( 0, 0, 0, 0, 0 );

    for my $token (@newtokens) {
        next if $token eq "";
        print "<<<< $token\n" if $debug;

        # Region tracking.  A token that ends a region switches the region
        # off and is skipped without any further checks.
        $inComment = 1 if $token =~ m|^.*?<!-.*?$|;
        if ( $token =~ m|^.*?->.*?| ) { $inComment = 0; next; }

        $inTitle = 1 if $token =~ m|^.*?<TITLE.*?>$|i;
        if ( $token =~ m|^.*?</TITLE.*?>$|i ) { $inTitle = 0; next; }

        $inScript = 1 if $token =~ m|^.*?<SCRIPT.*?>$|i;
        if ( $token =~ m|^.*?</SCRIPT.*?>$|i ) { $inScript = 0; next; }

        $inStyle = 1 if $token =~ m|^.*?<STYLE.*?>$|i;
        if ( $token =~ m|^.*?</STYLE.*?>$|i ) { $inStyle = 0; next; }

        $inIgnoredLink = 1 if TokenContainsIgnoreURL($token);
        if ( $inIgnoredLink && TokenContainsHlinkEnd($token) ) {
            $inIgnoredLink = 0;
            next;
        }

        # Classification.  The order matters: the first test that applies
        # decides, and the later helper subs are not called at all.
        if ($inComment) {
            print "#### Token is within comment block.\n" if $debug;
        }
        elsif ($inTitle) {
            print "#### Token is within title block.\n" if $debug;
        }
        elsif ($inScript) {
            print "#### Token is within Javascript block.\n" if $debug;
        }
        elsif ($inStyle) {
            print "#### Token is within stylesheet block.\n" if $debug;
        }
        elsif ($inIgnoredLink) {
            print "#### Token contains ignore URL - $lastIgnoreURL\n"
              if $debug;
        }
        elsif ( $token =~ m/<.*?>/sig ) {
            print "#### Token is a HTML tag.\n" if $debug;
        }
        elsif ( TokenIsMangledHTMLTag($token) ) {
            print "#### Token is a mangled HTML tag.\n" if $debug;
        }
        elsif ( TokenContainsIgnoreKeyword($token) ) {
            print "#### Token contains ignore keyword - $lastIgnoreKeyword\n"
              if $debug;
        }
        elsif ( TokenExists($token) ) {
            print "#### Token exists in old page.\n" if $debug;
        }
        else {
            # A changed token: mark it up.  $token aliases the element of
            # @newtokens, so the assignment changes the page itself.
            print "#### Token has been highlighted!\n" if $debug;
            $token = "###>>>" . $token . "<<<###" if $asciimarker;
            $token = "<span style=\"background-color: $hicolor\">" . $token
              . "</span>";
            $changeStatus = 1;
        }
    }
}
297
# TokenIsMangledHTMLTag($token)
#
# Returns 1 if the token consists of nothing but mangled HTML tags
# ("~~~~...@@@@" groups) and whitespace, 0 if it contains any other text.
# An empty token yields 1.
sub TokenIsMangledHTMLTag {
    my ($token) = @_;

    while ( $token ne "" ) {

        # Peel off the first mangled tag: $1 is whatever precedes it, $2
        # whatever follows it.  No mangled tag left means plain text.
        return 0
          unless $token =~ m/^\s*(.*?)\s*~~~~.*?\@\@\@\@\s*(.*?)\s*$/i;
        my ( $leading, $rest ) = ( $1, $2 );

        # Text in front of the tag.  This has to be a pattern test on the
        # captured text itself: "!$1 =~ ..." would negate $1 first and so
        # treat the text "0" like no text at all.
        return 0 if $leading !~ m/^\s*$/;

        # Continue with what follows the tag
        $token = $rest;
    }
    return 1;
}
311
# TokenContainsIgnoreKeyword($token)
#
# Returns 1 if the token matches one of the ignore keywords in @ignore (the
# keyword is then stored in $lastIgnoreKeyword), otherwise 0.  Tokens with
# more than $tmax words are never ignored.
#
# Each keyword is used as a case-insensitive pattern between word
# boundaries and is tried against the token both as it is and in its
# reduced form (see ReduceSpaces).
sub TokenContainsIgnoreKeyword() {
    my ($token) = @_;
    $token =~ s/\s{2,}/ /sig;

    # $tokdup and @words are package globals, like most state in this script
    $tokdup = &ReduceSpaces($token);
    @words  = split /\s/, $tokdup;

    my $wordCount = $#words + 1;
    print "#### C" . $wordCount . ": $tokdup\n" if $debug;

    # Long tokens are always checked for changes
    return 0 if $wordCount > $tmax;

    foreach my $keyword (@ignore) {
        next
          unless $token =~ m/^.*?(\b$keyword\b).*?$/i
          || $tokdup =~ m/^.*?(\b$keyword\b).*?$/i;
        $lastIgnoreKeyword = $keyword;
        return 1;
    }
    return 0;
}
333
# TokenExists($token)
#
# Returns 1 if the token is to be treated as present in the old page, else
# 0.  That is the case when the token has at most $tmin words (such tokens
# are not checked at all) or when it equals one of @oldtokens after runs of
# whitespace have been collapsed on both sides.
#
# Side effect: the elements of @oldtokens that get compared are
# whitespace-normalized in place.
sub TokenExists() {
    my ($token) = @_;
    $token =~ s/\s{2,}/ /sig;

    # $tokdup and @words are package globals, like most state in this script
    $tokdup = &ReduceSpaces($token);
    @words  = split /\s/, $tokdup;
    return 1 if $#words + 1 <= $tmin;

    foreach my $oldtok (@oldtokens) {

        # $oldtok aliases the array element, so this rewrites @oldtokens
        $oldtok =~ s/\s{2,}/ /sig;
        return 1 if $token eq $oldtok;
    }
    return 0;
}
350
# TokenContainsIgnoreURL($token)
#
# Returns 1 if the token contains a mangled <A ... HREF=...> tag whose
# attributes match one of the patterns in @ignoreurl; the matching pattern
# is then stored in $lastIgnoreURL.  Returns 0 otherwise.
sub TokenContainsIgnoreURL() {
    my ($token) = @_;
    $token =~ s/\s{2,}/ /sig;

    foreach my $url (@ignoreurl) {
        next unless $token =~ m/~~~~A.*?HREF=.*?$url.*?\@\@\@\@/i;
        $lastIgnoreURL = $url;
        return 1;
    }
    return 0;
}
364
# TokenContainsHlinkEnd($token)
#
# Returns 1 if the token contains a mangled closing hyperlink tag, i.e. the
# mangled form of </A> (in either case), and 0 otherwise.
sub TokenContainsHlinkEnd() {
    my ($token) = @_;
    $token =~ s/\s{2,}/ /sig;
    return ( $token =~ m/~~~~\/A\@\@\@\@/i ) ? 1 : 0;
}
372
# MangleHTML($page, @tags)
#
# Returns a copy of $page in which the inline tags listed in @tags are
# rewritten from "<...>" to "~~~~...@@@@".  Tags rewritten this way are no
# longer seen as tags by the tokenizer, so a piece of text keeps its inline
# markup and is compared as a whole.
#
#   $page - the HTML page as one string
#   @tags - tag names; a name followed by "*" (e.g. "A*") stands for a tag
#           that may carry attributes
#
# Tag names are matched case-insensitively; opening and closing tags are
# both rewritten.
sub MangleHTML {
    my ( $page, @tags ) = @_;

    # Turn line breaks (including MSDOS-style ones) and pairs of whitespace
    # characters into single spaces
    $page =~ s/[\r\n]|\s\s/ /sig;

    # Non-breaking spaces: replace the entity by a marker pair around a space
    $page =~ s/&nbsp;/\@\@\@\@ ~~~~/sig;

    # Hyperlinks with a nested <...> inside the tag.  The text after the
    # nested part is captured as a whole, so none of it is lost.
    $page =~
      s/<A(\s+[^>]*)<([^>]*)>([^>]*)>/~~~~A$1~~~~$2\@\@\@\@$3\@\@\@\@/sig;

    foreach my $spec (@tags) {

        # Split the entry into the bare tag name and the "*" flag
        ( my $name = $spec ) =~ s/\*$//;
        my $takesAttributes = ( $name ne $spec );

        # <NAME> and </NAME>
        $page =~ s/<(\/*\Q$name\E)>/~~~~$1\@\@\@\@/sig;

        # <NAME attr=...>; the space after the name keeps longer tag names
        # that merely start with NAME from matching
        if ($takesAttributes) {
            $page =~ s/<(\/*\Q$name\E .*?)>/~~~~$1\@\@\@\@/sig;
        }
    }

    return $page;
}
390
# ReduceSpaces($token)
#
# Returns the bare text of a token: the mangling is undone, all tags are
# removed, leading and trailing whitespace is dropped and every run of
# whitespace is collapsed into a single space.
sub ReduceSpaces() {
    my ($text) = @_;

    for ($text) {

        # Undo the mangling: first the marker pair that stands for a space,
        # then the tag markers
        s/\@\@\@\@ ~~~~/ /sig;
        s/~~~~/</sig;
        s/\@\@\@\@/>/sig;

        # Drop hyperlinks with a nested <...>, then every remaining tag
        s/<A(\s+[^>]*)<([^>]*)>([^>])*>//sig;
        s/<[^>]*>//sig;

        # Trim both ends and collapse whitespace
        s/^\s*//sig;
        s/\s*$//sig;
        s/\s+/ /sig;
    }

    return $text;
}
405
406 __END__
407
408 =head1 NAME
409
410 webdiff - Find and Highlight Differences Between Webpages
411
412 =head1 SYNOPSIS
413
414 webdiff [options]
415
416
417 =head1 OPTIONS
418
419 =over 8
420
421 =item B<--help>
422
423 Prints a brief help message and exits.
424
425 =item B<--man>
426
427 Prints the manual page and exits.
428
429 =item B<--archive>=<pathname>
430
431 Archive HTML file
432
433 =item B<--current>=<pathname>
434
435 Current HTML file
436
437 =item B<--out>=<pathname>
438
439 Output HTML file (with highlighting)
440
441 =item B<--basedir>=<pathname>
442
443 Base directory for files
444
445 =item B<--hicolor>=<color>
446
447 Highlight color: blue (the default), yellow, pink, grey or #rrggbb
448
449 =item B<--ignore>=<filelist>
450
451 Comma-delimited list of named sections containing ignore keywords
452
453 =item B<--ignoreurl>=<filelist>
454
455 Comma-delimited list of named sections containing ignore urls
456
457 =item B<--tmin>=<number>
458
459 Don't check if token contains <= given no. of words
460
461 =item B<--tmax>=<number>
462
463 Don't ignore if token contains more than the given no. of words
464
465 =item B<--debug>
466
467 Debug messages
468
469 =back
470
471 =head1 DESCRIPTION
472
473 B<webdiff> compares two webpages and creates an output file with the changes
474 highlighted.
475
476
477 B<webdiff> is internal to B<websec> and isn't well documented.
478
479
480 =head1 SEE ALSO
481
482 L<websec(1)>
483
484
485 =head1 AUTHOR
486
487 Victor Chew is the original author of this software and
488 Baruch Even is continuing the maintenance.
489
490 =cut
491
492 vim:set et ts=4: