A hint: This file contains one or more very long lines, so maybe it is better readable using the pure text view mode that shows the contents as wrapped lines within the browser window.
1 #!/usr/bin/perl -w 2 # 3 # checkbot - A perl5 script to check validity of links in www document trees 4 # 5 # Hans de Graaff <hans@degraaff.org>, 1994-2005. 6 # Based on Dimitri Tischenko, Delft University of Technology, 1994 7 # Based on the testlinks script by Roy Fielding 8 # With contributions from Bruce Speyer <bruce.speyer@elecomm.com> 9 # 10 # This application is free software; you can redistribute it and/or 11 # modify it under the same terms as Perl itself. 12 # 13 # Info-URL: http://degraaff.org/checkbot/ 14 # 15 # $Id: checkbot 238 2008-10-15 12:55:00Z graaff $ 16 # (Log information can be found at the end of the script) 17 18 require 5.004; 19 use strict; 20 21 require LWP; 22 use File::Basename; 23 24 BEGIN { 25 eval "use Time::Duration qw(duration)"; 26 $main::useduration = ($@ ? 0 : 1); 27 } 28 29 # Version information 30 my 31 $VERSION = '1.80'; 32 33 34 =head1 NAME 35 36 Checkbot - WWW Link Verifier 37 38 =head1 SYNOPSIS 39 40 checkbot [B<--cookies>] [B<--debug>] [B<--file> file name] [B<--help>] 41 [B<--mailto> email addresses] [B<--noproxy> list of domains] 42 [B<--verbose>] 43 [B<--url> start URL] 44 [B<--match> match string] [B<--exclude> exclude string] 45 [B<--proxy> proxy URL] [B<--internal-only>] 46 [B<--ignore> ignore string] 47 [B<--filter> substitution regular expression] 48 [B<--style> style file URL] 49 [B<--note> note] [B<--sleep> seconds] [B<--timeout> timeout] 50 [B<--interval> seconds] [B<--dontwarn> HTTP responde codes] 51 [B<--enable-virtual>] 52 [B<--language> language code] 53 [B<--suppress> suppression file] 54 [start URLs] 55 56 =head1 DESCRIPTION 57 58 Checkbot verifies the links in a specific portion of the World Wide 59 Web. It creates HTML pages with diagnostics. 60 61 Checkbot uses LWP to find URLs on pages and to check them. It supports 62 the same schemes as LWP does, and finds the same links that 63 HTML::LinkExtor will find. 64 65 Checkbot considers links to be either 'internal' or 66 'external'. Internal links are links within the web space that needs 67 to be checked. If an internal link points to a web document this 68 document is retrieved, and its links are extracted and 69 processed. External links are only checked to be working. Checkbot 70 checks links as it finds them, so internal and external links are 71 checked at the same time, even though they are treated differently. 72 73 Options for Checkbot are: 74 75 =over 4 76 77 =item --cookies 78 79 Accept cookies from the server and offer them again at later 80 requests. This may be useful for servers that use cookies to handle 81 sessions. By default Checkbot does not accept any cookies. 82 83 =item --debug 84 85 Enable debugging mode. Not really supported anymore, but it will keep 86 some files around that otherwise would be deleted. 87 88 =item --file <file name> 89 90 Use the file I<file name> as the basis for the summary file names. The 91 summary page will get the I<file name> given, and the server pages are 92 based on the I<file name> without the .html extension. For example, 93 setting this option to C<index.html> will create a summary page called 94 index.html and server pages called index-server1.html and 95 index-server2.html. 96 97 The default value for this option is C<checkbot.html>. 98 99 =item --help 100 101 Shows brief help message on the standard output. 102 103 =item --mailto <email address>[,<email address>] 104 105 Send mail to the I<email address> when Checkbot is done checking. You 106 can give more than one address separated by commas. The notification 107 email includes a small summary of the results. As of Checkbot 1.76 108 email is only sent if problems have been found during the Checkbot 109 run. 110 111 =item --noproxy <list of domains> 112 113 Do not proxy requests to the given domains. The list of domains must 114 be a comma-separated list. For example, so avoid using the proxy for 115 the localhost and someserver.xyz, you can use C<--noproxy 116 localhost,someserver.xyz>. 117 118 =item --verbose 119 120 Show verbose output while running. Includes all links checked, results 121 from the checks, etc. 122 123 124 125 126 127 =item --url <start URL> 128 129 Set the start URL. Checkbot starts checking at this URL, and then 130 recursively checks all links found on this page. The start URL takes 131 precedence over additional URLs specified on the command line. 132 133 If no scheme is specified for the URL, the file protocol is assumed. 134 135 =item --match <match string> 136 137 This option selects which pages Checkbot considers local. If the 138 I<match string> is contained within the URL, then Checkbot considers 139 the page local, retrieves it, and will check all the links contained 140 on it. Otherwise the page is considered external and it is only 141 checked with a HEAD request. 142 143 If no explicit I<match string> is given, the start URLs (See option 144 C<--url>) will be used as a match string instead. In this case the 145 last page name, if any, will be trimmed. For example, a start URL like 146 C<http://some.site/index.html> will result in a default I<match 147 string> of C<http://some.site/>. 148 149 The I<match string> can be a perl regular expression. For example, to 150 check the main server page and all HTML pages directly underneath it, 151 but not the HTML pages in the subdirectories of the server, the 152 I<match string> would be C<www.someserver.xyz/($|[^/]+.html)>. 153 154 =item --exclude <exclude string> 155 156 URLs matching the I<exclude string> are considered to be external, 157 even if they happen to match the I<match string> (See option 158 C<--match>). URLs matching the --exclude string are still being 159 checked and will be reported if problems are found, but they will not 160 be checked for further links into the site. 161 162 The I<exclude string> can be a perl regular expression. For example, 163 to consider all URLs with a query string external, use C<[=\?]>. This 164 can be useful when a URL with a query string unlocks the path to a 165 huge database which will be checked. 166 167 =item --filter <filter string> 168 169 This option defines a I<filter string>, which is a perl regular 170 expression. This filter is run on each URL found, thus rewriting the 171 URL before it enters the queue to be checked. It can be used to remove 172 elements from a URL. This option can be useful when symbolic links 173 point to the same directory, or when a content management system adds 174 session IDs to URLs. 175 176 For example C</old/new/> would replace occurrences of 'old' with 'new' 177 in each URL. 178 179 =item --ignore <ignore string> 180 181 URLs matching the I<ignore string> are not checked at all, they are 182 completely ignored by Checkbot. This can be useful to ignore known 183 problem links, or to ignore links leading into databases. The I<ignore 184 string> is matched after the I<filter string> has been applied. 185 186 The I<ignore string> can be a perl regular expression. 187 188 For example C<www.server.com\/(one|two)> would match all URLs starting 189 with either www.server.com/one or www.server.com/two. 190 191 192 =item --proxy <proxy URL> 193 194 This attribute specifies the URL of a proxy server. Only the HTTP and 195 FTP requests will be sent to that proxy server. 196 197 =item --internal-only 198 199 Skip the checking of external links at the end of the Checkbot 200 run. Only matching links are checked. Note that some redirections may 201 still cause external links to be checked. 202 203 =item --note <note> 204 205 The I<note> is included verbatim in the mail message (See option 206 C<--mailto>). This can be useful to include the URL of the summary HTML page 207 for easy reference, for instance. 208 209 Only meaningful in combination with the C<--mailto> option. 210 211 =item --sleep <seconds> 212 213 Number of I<seconds> to sleep in between requests. Default is 0 214 seconds, i.e. do not sleep at all between requests. Setting this 215 option can be useful to keep the load on the web server down while 216 running Checkbot. This option can also be set to a fractional number, 217 i.e. a value of 0.1 will sleep one tenth of a second between requests. 218 219 =item --timeout <timeout> 220 221 Default timeout for the requests, specified in seconds. The default is 222 2 minutes. 223 224 =item --interval <seconds> 225 226 The maximum interval between updates of the results web pages in 227 seconds. Default is 3 hours (10800 seconds). Checkbot will start the 228 interval at one minute, and gradually extend it towards the maximum 229 interval. 230 231 =item --style <URL of style file> 232 233 When this option is used, Checkbot embeds this URL as a link to a 234 style file on each page it writes. This makes it easy to customize the 235 layout of pages generated by Checkbot. 236 237 =item --dontwarn <HTTP response codes regular expression> 238 239 Do not include warnings on the result pages for those HTTP response 240 codes which match the regular expression. For instance, --dontwarn 241 "(301|404)" would not include 301 and 404 response codes. 242 243 Checkbot uses the response codes generated by the server, even if this 244 response code is not defined in RFC 2616 (HTTP/1.1). In addition to 245 the normal HTTP response code, Checkbot defines a few response codes 246 for situations which are not technically a problem, but which causes 247 problems in many cases anyway. These codes are: 248 249 901 Host name expected but not found 250 In this case the URL supports a host name, but non was found 251 in the URL. This usually indicates a mistake in the URL. An 252 exception is that this check is not applied to news: URLs. 253 254 902 Unqualified host name found 255 In this case the host name does not contain the domain part. 256 This usually means that the pages work fine when viewed within 257 the original domain, but not when viewed from outside it. 258 259 903 Double slash in URL path 260 The URL has a double slash in it. This is legal, but some web 261 servers cannot handle it very well and may cause Checkbot to 262 run away. See also the comments below. 263 264 904 Unknown scheme in URL 265 The URL starts with a scheme that Checkbot does not know 266 about. This is often caused by mistyping the scheme of the URL, 267 but the scheme can also be a legal one. In that case please let 268 me know so that it can be added to Checkbot. 269 270 =item --enable-virtual 271 272 This option enables dealing with virtual servers. Checkbot then 273 assumes that all hostnames for internal servers are unique, even 274 though their IP addresses may be the same. Normally Checkbot uses the 275 IP address to distinguish servers. This has the advantage that if a 276 server has two names (e.g. www and bamboozle) its pages only get 277 checked once. When you want to check multiple virtual servers this 278 causes problems, which this feature works around by using the hostname 279 to distinguish the server. 280 281 =item --language 282 283 The argument for this option is a two-letter language code. Checkbot 284 will use language negotiation to request files in that language. The 285 default is to request English language (language code 'en'). 286 287 =item --suppress 288 289 The argument for this option is a file which contains combinations of 290 error codes and URLs for which to suppress warnings. This can be used 291 to avoid reporting of known and unfixable URL errors or warnings. 292 293 The format of the suppression file is a simple whitespace delimited 294 format, first listing the error code followed by the URL. Each error 295 code and URL combination is listed on a new line. Comments can be 296 added to the file by starting the line with a C<#> character. 297 298 # 301 Moved Permanently 299 301 http://www.w3.org/P3P 300 301 # 403 Forbidden 302 403 http://www.herring.com/ 303 304 For further flexibility a regular expression can be used instead of a 305 normal URL. The regular expression must be enclosed with forward 306 slashes. For example, to suppress all 403 errors on wikipedia: 307 308 403 /http:\/\/wikipedia.org\/.*/ 309 310 =back 311 312 Deprecated options which will disappear in a future release: 313 314 =over 315 316 =item --allow-simple-hosts (deprecated) 317 318 This option turns off warnings about URLs which contain unqualified 319 host names. This is useful for intranet sites which often use just a 320 simple host name or even C<localhost> in their links. 321 322 Use of this option is deprecated. Please use the --dontwarn mechanism 323 for error 902 instead. 324 325 =back 326 327 328 =head1 HINTS AND TIPS 329 330 =over 331 332 =item Problems with checking FTP links 333 334 Some users may experience consistent problems with checking FTP 335 links. In these cases it may be useful to instruct Net::FTP to use 336 passive FTP mode to check files. This can be done by setting the 337 environment variable FTP_PASSIVE to 1. For example, using the bash 338 shell: C<FTP_PASSIVE=1 checkbot ...>. See the Net::FTP documentation 339 for more details. 340 341 =item Run-away Checkbot 342 343 In some cases Checkbot literally takes forever to finish. There are two 344 common causes for this problem. 345 346 First, there might be a database application as part of the web site 347 which generates a new page based on links on another page. Since 348 Checkbot tries to travel through all links this will create an 349 infinite number of pages. This kind of run-away effect is usually predictable. It can be avoided by using the --exclude option. 350 351 Second, a server configuration problem can cause a loop in generating 352 URLs for pages that really do not exist. This will result in URLs of 353 the form http://some.server/images/images/images/logo.png, with ever 354 more 'images' included. Checkbot cannot check for this because the 355 server should have indicated that the requested pages do not 356 exist. There is no easy way to solve this other than fixing the 357 offending web server or the broken links. 358 359 =item Problems with https:// links 360 361 The error message 362 363 Can't locate object method "new" via package "LWP::Protocol::https::Socket" 364 365 usually means that the current installation of LWP does not support 366 checking of SSL links (i.e. links starting with https://). This 367 problem can be solved by installing the Crypt::SSLeay module. 368 369 =back 370 371 =head1 EXAMPLES 372 373 The most simple use of Checkbot is to check a set of pages on a 374 server. To check my checkbot pages I would use: 375 376 checkbot http://degraaff.org/checkbot/ 377 378 Checkbot runs can take some time so Checkbot can send a notification 379 mail when the run is done: 380 381 checkbot --mailto hans@degraaff.org http://degraaff.org/checkbot/ 382 383 It is possible to check a set of local file without using a web 384 server. This only works for static files but may be useful in some 385 cases. 386 387 checkbot file:///var/www/documents/ 388 389 =head1 PREREQUISITES 390 391 This script uses the C<LWP> modules. 392 393 =head1 COREQUISITES 394 395 This script can send mail when C<Mail::Send> is present. 396 397 =head1 AUTHOR 398 399 Hans de Graaff <hans@degraaff.org> 400 401 =pod OSNAMES 402 403 any 404 405 =cut 406 407 # Declare some global variables, avoids ugly use of main:: all around 408 my %checkbot_errors = ('901' => 'Host name expected but not found', 409 '902' => 'Unqualified host name in URL', 410 '903' => 'URL contains double slash in URL', 411 '904' => 'Unknown scheme in URL', 412 ); 413 414 my @starturls = (); 415 416 # Two hashes to store the response to a URL, and all the parents of the URL 417 my %url_error = (); 418 my %url_parent = (); 419 420 # Hash for storing the title of a URL for use in reports. TODO: remove 421 # this and store title as part of queue. 422 my %url_title = (); 423 424 # Hash for suppressions, which are defined as a combination of code and URL 425 my %suppression = (); 426 427 # Hash to store statistics on link checking 428 my %stats = ('todo' => 0, 429 'link' => 0, 430 'problem' => 0 ); 431 432 # Options hash (to be filled by GetOptions) 433 my %options = (); 434 435 # Keep track of start time so that we can use it in reports 436 my $start_time = time(); 437 438 # If on a Mac we should ask for the arguments through some MacPerl stuff 439 if ($^O eq 'MacOS') { 440 $main::mac_answer = eval "MacPerl::Ask('Enter Command-Line Options')"; 441 push(@ARGV, split(' ', $main::mac_answer)); 442 } 443 444 # Prepare 445 check_options(); 446 init_modules(); 447 init_globals(); 448 init_suppression(); 449 450 # Start actual application 451 check_links(); 452 453 # Finish up 454 create_page(1); 455 send_mail() if defined $main::opt_mailto and $stats{problem} > 0; 456 457 exit 0; 458 459 # output prints stuff on stderr if --verbose, and takes care of proper 460 # indentation 461 sub output { 462 my ($line, $level) = @_; 463 464 return unless $main::opt_verbose; 465 466 chomp $line; 467 468 my $indent = ''; 469 470 if (defined $level) { 471 while ($level-- > 0) { 472 $indent .= ' '; 473 } 474 } 475 476 print STDERR $indent, $line, "\n"; 477 } 478 479 ### Initialization and setup routines 480 481 sub check_options { 482 483 # Get command-line arguments 484 use Getopt::Long; 485 my $result = GetOptions(qw(cookies debug help noproxy=s verbose url=s match=s exclude|x=s file=s filter=s style=s ignore|z=s mailto|M=s note|N=s proxy=s internal-only sleep=f timeout=i interval=i dontwarn=s enable-virtual language=s allow-simple-hosts suppress=s)); 486 487 # Handle arguments, some are mandatory, some have defaults 488 &print_help if (($main::opt_help && $main::opt_help) 489 || (!$main::opt_url && $#ARGV == -1)); 490 $main::opt_timeout = 120 unless defined($main::opt_timeout) && length($main::opt_timeout); 491 $main::opt_verbose = 0 unless $main::opt_verbose; 492 $main::opt_sleep = 0 unless defined($main::opt_sleep) && length($main::opt_sleep); 493 $main::opt_interval = 10800 unless defined $main::opt_interval and length $main::opt_interval; 494 $main::opt_dontwarn = "xxx" unless defined $main::opt_dontwarn and length $main::opt_dontwarn; 495 $main::opt_enable_virtual = 0 unless defined $main::opt_enable_virtual; 496 # Set the default language and make sure it is a two letter, lowercase code 497 $main::opt_language = 'en' unless defined $main::opt_language; 498 $main::opt_language = lc(substr($main::opt_language, 0, 2)); 499 $main::opt_language =~ tr/a-z//cd; 500 if ($main::opt_language !~ /[a-z][a-z]/) { 501 warn "Argument --language $main::opt_language is not a valid language code\nUsing English as a default.\n"; 502 $main::opt_language = 'en'; 503 } 504 $main::opt_allow_simple_hosts = 0 505 unless $main::opt_allow_simple_hosts; 506 output "--allow-simple-hosts is deprecated, please use the --dontwarn mechanism", 0 if $main::opt_allow_simple_hosts; 507 508 # The default for opt_match will be set later, because we might want 509 # to muck with opt_url first. 510 511 # Display messages about the options 512 output "*** Starting Checkbot $VERSION in verbose mode"; 513 output 'Will skip checking of external links', 1 514 if $main::opt_internal_only; 515 output "Allowing unqualified host names", 1 516 if $main::opt_allow_simple_hosts; 517 output "Not using optional Time::Duration module: not found", 1 518 unless $main::useduration; 519 } 520 521 sub init_modules { 522 523 use URI; 524 # Prepare the user agent to be used: 525 use LWP::UserAgent; 526 use LWP::MediaTypes; 527 #use LWP::Debug qw(- +debug); 528 use HTML::LinkExtor; 529 $main::ua = new LWP::UserAgent; 530 $main::ua->agent("Checkbot/$VERSION LWP/" . LWP::Version); 531 $main::ua->timeout($main::opt_timeout); 532 # Add a proxy to the user agent, if defined 533 $main::ua->proxy(['http', 'ftp'], $main::opt_proxy) 534 if defined($main::opt_proxy); 535 $main::ua->no_proxy(split(',', $main::opt_noproxy)) 536 if defined $main::opt_noproxy; 537 # Add a cookie jar to the UA if requested by the user 538 $main::ua->cookie_jar( {} ) 539 if defined $main::opt_cookies or $main::opt_cookies; 540 541 require Mail::Send if defined $main::opt_mailto; 542 543 use HTTP::Status; 544 } 545 546 sub init_globals { 547 my $url; 548 549 # Directory and files for output 550 if ($main::opt_file) { 551 $main::file = $main::opt_file; 552 $main::file =~ /(.*)\./; 553 $main::server_prefix = $1; 554 } else { 555 $main::file = "checkbot.html"; 556 $main::server_prefix = "checkbot"; 557 } 558 $main::tmpdir = ($ENV{'TMPDIR'} or $ENV{'TMP'} or $ENV{'TEMP'} or "/tmp") . "/Checkbot.$$"; 559 560 $main::cur_queue = $main::tmpdir . "/queue"; 561 $main::new_queue = $main::tmpdir . "/queue-new"; 562 563 # Make sure we catch signals so that we can clean up temporary files 564 $SIG{'INT'} = $SIG{'TERM'} = $SIG{'HUP'} = $SIG{'QUIT'} = \&got_signal; 565 566 # Set up hashes to be used 567 %main::checked = (); 568 %main::servers = (); 569 %main::servers_get_only = (); 570 571 # Initialize the start URLs. --url takes precedence. Otherwise 572 # just process URLs in order as they appear on the command line. 573 unshift(@ARGV, $main::opt_url) if $main::opt_url; 574 foreach (@ARGV) { 575 $url = URI->new($_); 576 # If no scheme is defined we will assume file is used, so that 577 # it becomes easy to check a single file. 578 $url->scheme('file') unless defined $url->scheme; 579 $url->host('localhost') if $url->scheme eq 'file'; 580 if (!defined $url->host) { 581 warn "No host specified in URL $url, ignoring it.\n"; 582 next; 583 } 584 push(@starturls, $url); 585 } 586 die "There are no valid starting URLs to begin checking with!\n" 587 if scalar(@starturls) == -1; 588 589 # Set the automatic matching expression to a concatenation of the starturls 590 if (!defined $main::opt_match) { 591 my @matchurls; 592 foreach my $url (@starturls) { 593 # Remove trailing files from the match, e.g. remove index.html 594 # stuff so that we match on the host and/or directory instead, 595 # but only if there is a path component in the first place. 596 my $matchurl = $url->as_string; 597 $matchurl =~ s!/[^/]+$!/! unless $url->path eq ''; 598 push(@matchurls, quotemeta $matchurl); 599 } 600 $main::opt_match = '^(' . join('|', @matchurls) . ')'; 601 output "--match defaults to $main::opt_match"; 602 } 603 604 # Initialize statistics hash with number of start URLs 605 $stats{'todo'} = scalar(@starturls); 606 607 # We write out our status every now and then. 608 $main::cp_int = 1; 609 $main::cp_last = 0; 610 } 611 612 sub init_suppression { 613 return if not defined $main::opt_suppress; 614 615 die "Suppression file \"$main::opt_suppress\" is in fact a directory" 616 if -d $main::opt_suppress; 617 618 open(SUPPRESSIONS, $main::opt_suppress) 619 or die "Unable to open $main::opt_suppress for reading: $!\n"; 620 while (my $line = <SUPPRESSIONS>) { 621 chomp $line; 622 next if $line =~ /^#/ or $line =~ /^\s*$/; 623 624 if ($line !~ /^\s*(\d+)\s+(\S+)/) { 625 output "WARNING: Unable to parse line in suppression file $main::opt_suppress:\n $line\n"; 626 } else { 627 output "Suppressed: $1 $2\n" if $main::opt_verbose; 628 $suppression{$1}{$2} = $2; 629 } 630 } 631 close SUPPRESSIONS; 632 } 633 634 635 636 637 ### Main application code 638 639 sub check_links { 640 my $line; 641 642 mkdir $main::tmpdir, 0755 643 || die "$0: unable to create directory $main::tmpdir: $!\n"; 644 645 # Explicitly set the record separator. I had the problem that this 646 # was not defined under my perl 5.00502. This should fix that, and 647 # not cause problems for older versions of perl. 648 $/ = "\n"; 649 650 open(CURRENT, ">$main::cur_queue") 651 || die "$0: Unable to open CURRENT $main::cur_queue for writing: $!\n"; 652 open(QUEUE, ">$main::new_queue") 653 || die "$0: Unable to open QUEUE $main::new_queue for writing: $!\n"; 654 655 # Prepare CURRENT queue with starting URLs 656 foreach (@starturls) { 657 print CURRENT $_->as_string . "|\n"; 658 } 659 close CURRENT; 660 661 open(CURRENT, $main::cur_queue) 662 || die "$0: Unable to open CURRENT $main::cur_queue for reading: $!\n"; 663 664 do { 665 # Read a line from the queue, and process it 666 while (defined ($line = <CURRENT>) ) { 667 chomp($line); 668 &handle_url($line); 669 &check_point(); 670 } 671 672 # Move queues around, and try again, but only if there are still 673 # things to do 674 output "*** Moving queues around, " . $stats{'todo'} . " links to do."; 675 close CURRENT 676 or warn "Error while closing CURRENT filehandle: $!\n"; 677 close QUEUE; 678 679 # TODO: should check whether these succeed 680 unlink($main::cur_queue); 681 rename($main::new_queue, $main::cur_queue); 682 683 open(CURRENT, "$main::cur_queue") 684 || die "$0: Unable to open $main::cur_queue for reading: $!\n"; 685 open(QUEUE, ">$main::new_queue") 686 || die "$0: Unable to open $main::new_queue for writing: $!\n"; 687 688 } while (not -z $main::cur_queue); 689 690 close CURRENT; 691 close QUEUE; 692 693 unless (defined($main::opt_debug)) { 694 clean_up(); 695 } 696 } 697 698 sub clean_up { 699 unlink $main::cur_queue, $main::new_queue; 700 rmdir $main::tmpdir; 701 output "Removed temporary directory $main::tmpdir and its contents.\n", 1; 702 } 703 704 sub got_signal { 705 my ($signalname) = @_; 706 707 clean_up() unless defined $main::opt_debug; 708 709 print STDERR "Caught SIG$signalname.\n"; 710 exit 1; 711 } 712 713 # Whether URL is 'internal' or 'external' 714 sub is_internal ($) { 715 my ($url) = @_; 716 717 return ( $url =~ /$main::opt_match/o 718 and not (defined $main::opt_exclude and $url =~ /$main::opt_exclude/o)); 719 } 720 721 722 sub handle_url { 723 my ($line) = @_; 724 my ($urlstr, $urlparent) = split(/\|/, $line); 725 726 my $reqtype; 727 my $response; 728 my $type; 729 730 $stats{'todo'}--; 731 732 # Add this URL to the ones we've seen already, return if it is a 733 # duplicate. 734 return if add_checked($urlstr); 735 736 $stats{'link'}++; 737 738 # Is this an external URL and we only check internal stuff? 739 return if defined $main::opt_internal_only 740 and not is_internal($urlstr); 741 742 my $url = URI->new($urlstr); 743 744 # Perhaps this is a URL we are not interested in checking... 745 if (not defined($url->scheme) 746 or $url->scheme !~ /^(https?|file|ftp|gopher|nntp)$/o ) { 747 # Ignore URLs which we know we can ignore, create error for others 748 if ($url->scheme =~ /^(news|mailto|javascript|mms)$/o) { 749 output "Ignore $url", 1; 750 } else { 751 add_error($urlstr, $urlparent, 904, "Unknown scheme in URL: " 752 . $url->scheme); 753 } 754 return; 755 } 756 757 # Guess/determine the type of document we might retrieve from this 758 # URL. We do this because we only want to use a full GET for HTML 759 # document. No need to retrieve images, etc. 760 if ($url->path =~ /\/$/o || $url->path eq "") { 761 $type = 'text/html'; 762 } else { 763 $type = guess_media_type($url->path); 764 } 765 # application/octet-stream is the fallback of LWP's guess stuff, so 766 # if we get this then we ask the server what we got just to be sure. 767 if ($type eq 'application/octet-stream') { 768 $response = performRequest('HEAD', $url, $urlparent, $type, $main::opt_language); 769 $type = $response->content_type; 770 } 771 772 # Determine if this is a URL we should GET fully or partially (using HEAD) 773 if ($type =~ /html/o 774 && $url->scheme =~ /^(https?|file|ftp|gopher)$/o 775 and is_internal($url->as_string) 776 && (!defined $main::opt_exclude || $url !~ /$main::opt_exclude/o)) { 777 $reqtype = 'GET'; 778 } else { 779 $reqtype = 'HEAD'; 780 } 781 782 # Get the document, unless we already did while determining the type 783 $response = performRequest($reqtype, $url, $urlparent, $type, $main::opt_language) 784 unless defined($response) and $reqtype eq 'HEAD'; 785 786 # Ok, we got something back from checking, let's see what it is 787 if ($response->is_success) { 788 select(undef, undef, undef, $main::opt_sleep) 789 unless $main::opt_debug || $url->scheme eq 'file'; 790 791 # Internal HTML documents need to be given to handle_doc for processing 792 if ($reqtype eq 'GET' and is_internal($url->as_string)) { 793 handle_doc($response, $urlstr); 794 } 795 } else { 796 797 # Right, so it wasn't the smashing succes we hoped for, so bring 798 # the bad news and store the pertinent information for later 799 add_error($url, $urlparent, $response->code, $response->message); 800 801 if ($response->is_redirect and is_internal($url->as_string)) { 802 if ($response->code == 300) { # multiple choices, but no redirection available 803 output 'Multiple choices', 2; 804 } else { 805 my $baseURI = URI->new($url); 806 if (defined $response->header('Location')) { 807 my $redir_url = URI->new_abs($response->header('Location'), $baseURI); 808 output "Redirected to $redir_url", 2; 809 add_to_queue($redir_url, $urlparent); 810 $stats{'todo'}++; 811 } else { 812 output 'Location header missing from redirect response', 2; 813 } 814 } 815 } 816 } 817 # Done with this URL 818 } 819 820 sub performRequest { 821 my ($reqtype, $url, $urlparent, $type, $language) = @_; 822 823 my ($response); 824 825 # A better solution here would be to use GET exclusively. Here is how 826 # to do that. We would have to set this max_size thing in 827 # check_external, I guess... 828 # Set $ua->max_size(1) and then try a normal GET request. However, 829 # that doesn't always work as evidenced by an FTP server that just 830 # hangs in this case... Needs more testing to see if the timeout 831 # catches this. 832 833 # Normally, we would only need to do a HEAD, but given the way LWP 834 # handles gopher requests, we need to do a GET on those to get at 835 # least a 500 and 501 error. We would need to parse the document 836 # returned by LWP to find out if we had problems finding the 837 # file. -- Patch by Bruce Speyer <bspeyer@texas-one.org> 838 839 # We also need to do GET instead of HEAD if we know the remote 840 # server won't accept it. The standard way for an HTTP server to 841 # indicate this is by returning a 405 ("Method Not Allowed") or 501 842 # ("Not Implemented"). Other circumstances may also require sending 843 # GETs instead of HEADs to a server. Details are documented below. 844 # -- Larry Gilbert <larry@n2h2.com> 845 846 # Normally we try a HEAD request first, then a GET request if 847 # needed. There may be circumstances in which we skip doing a HEAD 848 # (e.g. when we should be getting the whole document). 849 foreach my $try ('HEAD', 'GET') { 850 851 # Skip trying HEAD when we know we need to do a GET or when we 852 # know only a GET will work anyway. 853 next if $try eq 'HEAD' and 854 ($reqtype eq 'GET' 855 or $url->scheme eq 'gopher' 856 or (defined $url->authority and $main::servers_get_only{$url->authority})); 857 858 # Output what we are going to do with this link 859 output(sprintf("%4s %s (%s)\n", $try, $url, $type), 1); 860 861 # Create the request with all appropriate headers 862 my %header_hash = ( 'Referer' => $urlparent ); 863 if (defined($language) && ($language ne '')) { 864 $header_hash{'Accept-Language'} = $language; 865 } 866 my $ref_header = new HTTP::Headers(%header_hash); 867 my $request = new HTTP::Request($try, $url, $ref_header); 868 $response = $main::ua->simple_request($request); 869 870 # If we are doing a HEAD request we need to make sure nothing 871 # fishy happened. we use some heuristics to see if we are ok, or 872 # if we should try again with a GET request. 873 if ($try eq 'HEAD') { 874 875 # 400, 405, 406 and 501 are standard indications that HEAD 876 # shouldn't be used 877 # We used to check for 403 here also, but according to the HTTP spec 878 # a 403 indicates that the server understood us fine but really does 879 # not want us to see the page, so we SHOULD NOT retry. 880 if ($response->code =~ /^(400|405|406|501)$/o) { 881 output "Server does not seem to like HEAD requests; retrying", 2; 882 $main::servers_get_only{$url->authority}++; 883 next; 884 }; 885 886 # There are many servers out there that have real trouble with 887 # HEAD, so if we get a 500 Internal Server error just retry with 888 # a GET request to get an authoritive answer. We used to do this 889 # only for special cases, but the list got big and some 890 # combinations (e.g. Zope server behind Apache proxy) can't 891 # easily be detected from the headers. 892 if ($response->code =~ /^500$/o) { 893 output "Internal server error on HEAD request; retrying with GET", 2; 894 $main::servers_get_only{$url->authority}++ if defined $url->authority; 895 next; 896 } 897 898 # If we know the server we can try some specific heuristics 899 if (defined $response->server) { 900 901 # Netscape Enterprise has been seen returning 500 and even 404 902 # (yes, 404!!) in response to HEAD requests 903 if ($response->server =~ /^Netscape-Enterprise/o 904 and $response->code =~ /^404$/o) { 905 output "Unreliable Netscape-Enterprise response to HEAD request; retrying", 2; 906 $main::servers_get_only{$url->authority}++; 907 next; 908 }; 909 } 910 911 # If a HEAD request resulted in nothing noteworthy, no need for 912 # any further attempts using GET, we are done. 913 last; 914 } 915 } 916 917 return $response; 918 } 919 920 921 # This routine creates a (temporary) WWW page based on the current 922 # findings This allows somebody to monitor the process, but is also 923 # convenient when this program crashes or waits because of diskspace 924 # or memory problems 925 926 sub create_page { 927 my($final_page) = @_; 928 929 my $path = ""; 930 my $prevpath = ""; 931 my $prevcode = 0; 932 my $prevmessage = ""; 933 934 output "*** Start writing results page"; 935 936 open(OUT, ">$main::file.new") 937 || die "$0: Unable to open $main::file.new for writing:\n"; 938 print OUT "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n"; 939 print OUT "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n"; 940 print OUT "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">\n"; 941 print OUT "<head>\n"; 942 if (!$final_page) { 943 printf OUT "<meta http-equiv=\"Refresh\" content=\"%d\" />\n", 944 int($main::cp_int * 60 / 2 - 5); 945 } 946 947 print OUT "<title>Checkbot report</title>\n"; 948 print OUT "<link rel=\"stylesheet\" type=\"text/css\" href=\"$main::opt_style\" />\n" if defined $main::opt_style; 949 print OUT "</head>\n"; 950 print OUT "<body>\n"; 951 print OUT "<h1><em>Checkbot</em>: main report</h1>\n"; 952 953 # Show the status of this checkbot session 954 print OUT "<table summary=\"Status of this Checkbot session\" class='status'><tr><th>Status:</th><td>"; 955 if ($final_page) { 956 print OUT "Done.<br />\n"; 957 print OUT 'Run started on ' . localtime($start_time) . ".<br />\n"; 958 print OUT 'Run duration ', duration(time() - $start_time), ".\n" 959 if $main::useduration; 960 } else { 961 print OUT "Running since " . localtime($start_time) . ".<br />\n"; 962 print OUT "Last update at ". localtime() . ".<br />\n"; 963 print OUT "Next update in <strong>", int($main::cp_int), "</strong> minutes.\n"; 964 } 965 print OUT "</td></tr></table>\n\n"; 966 967 # Summary (very brief overview of key statistics) 968 print OUT "<hr /><h2 class='summary'>Report summary</h2>\n"; 969 970 print OUT "<table summary=\"Report summary\" class='summary'>\n"; 971 print OUT "<tr id='checked'><th>Links checked</th><td class='value'>", $stats{'link'}, "</td></tr>\n"; 972 print OUT "<tr id='problems'><th>Problems so far</th><td class='value'>", $stats{'problem'}, "</td></tr>\n"; 973 print OUT "<tr id='todo'><th>Links to do</th><td class='value'>", $stats{'todo'}, "</td></tr>\n"; 974 print OUT "</table>\n"; 975 976 # Server information 977 printAllServers($final_page); 978 979 # Checkbot session parameters 980 print OUT "<hr /><h2 class='params'>Checkbot session parameters</h2>\n"; 981 print OUT "<table summary=\"Checkbot session parameters\" class='params'>\n"; 982 print OUT "<tr><th align=\"left\">--url &<br/> <command line urls></th><td class='text'>Start URL(s)</td><td class='value' id='url'>", 983 join(',', @starturls), "</td></tr>\n"; 984 print OUT "<tr><th align=\"left\">--match</th><td class='text'>Match regular expression</td><td class='value' id='match'>$main::opt_match</td></tr>\n"; 985 print OUT "<tr><th align=\"left\">--exclude</th><td class='text'>Exclude regular expression</td><td class='value' id='exclude'>$main::opt_exclude</td></tr>\n" if defined $main::opt_exclude; 986 print OUT "<tr><th align=\"left\">--filter</th><td class='text'>Filter regular expression</td><td class='value' id='filter'>$main::opt_filter</td></tr>\n" if defined $main::opt_filter; 987 print OUT "<tr><th align=\"left\">--noproxy</th><td class='text'>No Proxy for the following domains</td><td class='value' id='noproxy'>$main::opt_noproxy</td></tr>\n" if defined $main::opt_noproxy; 988 print OUT "<tr><th align=\"left\">--ignore</th><td class='text'>Ignore regular expression</td><td class='value' id='ignore'>$main::opt_ignore</td></tr>\n" if defined $main::opt_ignore; 989 print OUT "<tr><th align=\"left\">--suppress</th><td class='text'>Suppress error code and URL specified by</td><td class='value' id='suppress'>$main::opt_suppress</td></tr>\n" if defined $main::opt_suppress; 990 print OUT "<tr><th align=\"left\">--dontwarn</th><td class='text'>Don't warn for these codes</td><td class='value' id='dontwarn'>$main::opt_dontwarn</td></tr>\n" if $main::opt_dontwarn ne 'xxx'; 991 print OUT "<tr><th align=\"left\">--enable-virtual</th><td class='text'>Use virtual names only</td><td class='value' id='enable_virtual'>yes</td></tr>\n" if $main::opt_enable_virtual; 992 print OUT "<tr><th align=\"left\">--internal-only</th><td class='text'>Check only internal links</td><td class='value' id='internal_only'>yes</td></tr>\n" if defined $main::opt_internal_only; 993 print OUT "<tr><th align=\"left\">--cookies</th><td class='text'>Accept cookies</td><td class='value' id='cookies'>yes</td></tr>\n" if defined $main::opt_cookies; 994 print OUT "<tr><th align=\"left\">--sleep</th><td class='text'>Sleep seconds between requests</td><td class='value' id='sleep'>$main::opt_sleep</td></tr>\n" if ($main::opt_sleep != 0); 995 print OUT "<tr><th align=\"left\">--timeout</th><td class='text'>Request timeout seconds</td><td class='value' id='timeout'>$main::opt_timeout</td></tr>\n"; 996 print OUT "</table>\n"; 997 998 # Statistics for types of links 999 1000 print OUT signature(); 1001 1002 close(OUT); 1003 1004 rename($main::file, $main::file . ".bak"); 1005 rename($main::file . ".new", $main::file); 1006 1007 unlink $main::file . ".bak" unless $main::opt_debug; 1008 1009 output "*** Done writing result page"; 1010 } 1011 1012 # Create a list of all the servers, and create the corresponding table 1013 # and subpages. We use the servers overview for this. This can result 1014 # in strange effects when the same server (e.g. IP address) has 1015 # several names, because several entries will appear. However, when 1016 # using the IP address there are also a number of tricky situations, 1017 # e.g. with virtual hosting. Given that likely the servers have 1018 # different names for a reasons, I think it is better to have 1019 # duplicate entries in some cases, instead of working off of the IP 1020 # addresses. 1021 1022 sub printAllServers { 1023 my ($finalPage) = @_; 1024 1025 my $server; 1026 print OUT "<hr /><h2 class='overview'>Overview per server</h2>\n"; 1027 print OUT "<table summary=\"Overview per server\" class='overview'><tr><th>Server</th><th>Server<br />Type</th><th>Documents<br />scanned</th><th>Problem<br />links</th><th>Ratio</th></tr>\n"; 1028 1029 foreach $server (sort keys %main::servers) { 1030 print_server($server, $finalPage); 1031 } 1032 print OUT "</table>\n\n"; 1033 } 1034 1035 sub get_server_type { 1036 my($server) = @_; 1037 1038 my $result; 1039 1040 if ( ! defined($main::server_type{$server})) { 1041 if ($server eq 'localhost') { 1042 $result = 'Direct access through filesystem'; 1043 } else { 1044 my $request = new HTTP::Request('HEAD', "http://$server/"); 1045 my $response = $main::ua->simple_request($request); 1046 $result = $response->header('Server'); 1047 } 1048 $result = "Unknown server type" if ! defined $result or $result eq ""; 1049 output "=== Server $server is a $result"; 1050 $main::server_type{$server} = $result; 1051 } 1052 $main::server_type{$server}; 1053 } 1054 1055 sub add_checked { 1056 my($urlstr) = @_; 1057 my $item; 1058 my $result = 0; 1059 1060 if (is_internal($urlstr) and not $main::opt_enable_virtual) { 1061 # Substitute hostname with IP-address. This keeps us from checking 1062 # the same pages for each name of the server, wasting time & resources. 1063 # Only do this if we are not dealing with virtual servers. Also, we 1064 # only do this for internal servers, because it makes no sense for 1065 # external links. 1066 my $url = URI->new($urlstr); 1067 $url->host(ip_address($url->host)) if $url->can('host'); 1068 $urlstr = $url->as_string; 1069 } 1070 1071 if (defined $main::checked{$urlstr}) { 1072 $result = 1; 1073 $main::checked{$urlstr}++; 1074 } else { 1075 $main::checked{$urlstr} = 1; 1076 } 1077 1078 return $result; 1079 } 1080 1081 # Has this URL already been checked? 1082 sub is_checked { 1083 my ($urlstr) = @_; 1084 1085 if (is_internal($urlstr) and not $main::opt_enable_virtual) { 1086 # Substitute hostname with IP-address. This keeps us from checking 1087 # the same pages for each name of the server, wasting time & resources. 1088 # Only do this if we are not dealing with virtual servers. Also, we 1089 # only do this for internal servers, because it makes no sense for 1090 # external links. 1091 my $url = URI->new($urlstr); 1092 $url->host(ip_address($url->host)) if $url->can('host'); 1093 $urlstr = $url->as_string; 1094 } 1095 1096 return defined $main::checked{$urlstr}; 1097 } 1098 1099 sub add_error ($$$$) { 1100 my ($url, $urlparent, $code, $status) = @_; 1101 1102 # Check for the quick eliminations first 1103 return if $code =~ /$main::opt_dontwarn/o 1104 or defined $suppression{$code}{$url}; 1105 1106 # Check for matches on the regular expressions in the supression file 1107 if (defined $suppression{$code}) { 1108 foreach my $item ( %{$suppression{$code}} ) { 1109 if ($item =~ /^\/(.*)\/$/) { 1110 my $regexp = $1; 1111 if ($url =~ $regexp) { 1112 output "Supressing error $code for $url due to regular expression match on $regexp", 2; 1113 return; 1114 } 1115 } 1116 } 1117 } 1118 1119 $status = checkbot_status_message($code) if not defined $status; 1120 1121 output "$code $status", 2; 1122 1123 $url_error{$url}{'code'} = $code; 1124 $url_error{$url}{'status'} = $status; 1125 push @{$url_parent{$url}}, $urlparent; 1126 $stats{'problem'}++; 1127 } 1128 1129 # Parse document, and get the links 1130 sub handle_doc { 1131 my ($response, $urlstr) = @_; 1132 1133 my $num_links = 0; 1134 my $new_links = 0; 1135 1136 # TODO: we are making an assumption here that the $reponse->base is 1137 # valid, which might not always be true! This needs to be fixed, but 1138 # first let's try to find out why this stuff is sometimes not 1139 # valid... Aha. a simple <base href="news:"> will do the trick. It is 1140 # not clear what the right fix for this is. 1141 1142 # We use the URL we used to retrieve this document as the URL to 1143 # attach the problem reports to, even though this may not be the 1144 # proper base url. 1145 my $baseurl = URI->new($urlstr); 1146 1147 # When we received the document we can add a notch to its server 1148 $main::servers{$baseurl->authority}++; 1149 1150 # Retrieve useful information from this document. 1151 # TODO: using a regexp is NOT how this should be done, but it is 1152 # easy. The right way would be to write a HTML::Parser or to use 1153 # XPath on the document DOM provided that the document is easily 1154 # parsed as XML. Either method is a lot of overhead. 1155 if ($response->content =~ /title\>(.*?)\<\/title/si) { 1156 1157 # TODO: using a general hash that stores titles for all pages may 1158 # consume too much memory. It would be better to only store the 1159 # titles for requests that had problems. That requires passing them 1160 # down to the queue. Take the easy way out for now. 1161 $url_title{$baseurl} = $1; 1162 } 1163 1164 # Check if this document has a Robots META tag. If so, check if 1165 # Checkbot is allowed to FOLLOW the links on this page. Note that we 1166 # ignore the INDEX directive because Checkbot is not an indexing 1167 # robot. See http://www.robotstxt.org/wc/meta-user.html 1168 # TODO: one more reason (see title) to properly parse this document... 1169 if ($response->content =~ /\<meta[^\>]*?robots[^\>]*?nofollow[^\>]*?\>/si) { 1170 output "Obeying robots meta tag $&, skipping document", 2; 1171 return; 1172 } 1173 1174 1175 # Parse the document just downloaded, using the base url as defined 1176 # in the response, otherwise we won't get the same behavior as 1177 # browsers and miss things like a BASE url in pages. 1178 my $p = HTML::LinkExtor->new(undef, $response->base); 1179 1180 # If charset information is missing then decoded_content doesn't 1181 # work. Fall back to content in this case, even though that may lead 1182 # to charset warnings. See bug 1665075 for reference. 1183 my $content = $response->decoded_content || $response->content; 1184 $p->parse($content); 1185 $p->eof; 1186 1187 # Deal with the links we found in this document 1188 my @links = $p->links(); 1189 foreach (@links) { 1190 my ($tag, %l) = @{$_}; 1191 foreach (keys %l) { 1192 # Get the canonical URL, so we don't need to worry about base, case, etc. 1193 my $url = $l{$_}->canonical; 1194 1195 # Remove fragments, if any 1196 $url->fragment(undef); 1197 1198 # Determine in which tag this URL was found 1199 # Ignore <base> tags because they need not point to a valid URL 1200 # in order to work (e.g. when directory indexing is turned off). 1201 next if $tag eq 'base'; 1202 1203 # Skip some 'links' that are not required to link to an actual 1204 # live link but which LinkExtor returns as links anyway. 1205 next if $tag eq 'applet' and $_ eq 'code'; 1206 next if $tag eq 'object' and $_ eq 'classid'; 1207 1208 # Run filter on the URL if defined 1209 if (defined $main::opt_filter) { 1210 die "Filter supplied with --filter option contains errors!\n$@\n" 1211 unless defined eval '$url =~ s' . $main::opt_filter 1212 } 1213 1214 # Should we ignore this URL? 1215 if (defined $main::opt_ignore and $url =~ /$main::opt_ignore/o) { 1216 output "--ignore: $url", 1; 1217 next; 1218 } 1219 1220 # Check whether URL has fully-qualified hostname 1221 if ($url->can('host') and $url->scheme ne 'news') { 1222 if (! defined $url->host) { 1223 add_error($url, $baseurl->as_string, '901', 1224 $checkbot_errors{'901'}); 1225 } elsif (!$main::opt_allow_simple_hosts && $url->host !~ /\./) { 1226 add_error($url, $baseurl->as_string, '902', 1227 $checkbot_errors{'902'}); 1228 } 1229 } 1230 1231 # Some servers do not process // correctly in requests for relative 1232 # URLs. We should flag them here. Note that // in a URL path is 1233 # actually valid per RFC 2396, and that they should not be removed 1234 # when processing relative URLs as per RFC 1808. See 1235 # e.g. <http://deesse.univ-lemans.fr:8003/Connected/RFC/1808/18.html>. 1236 # Thanks to Randal Schwartz and Reinier Post for their explanations. 1237 if ($url =~ /^http:\/\/.*\/\//) { 1238 add_error($url, $baseurl->as_string, '903', 1239 $checkbot_errors{'903'}); 1240 } 1241 1242 # We add all URLs found to the queue, unless we already checked 1243 # it earlier 1244 if (is_checked($url)) { 1245 1246 # If an error has already been logged for this URL we add the 1247 # current parent to the list of parents on which this URL 1248 # appears. 1249 if (defined $url_error{$url}) { 1250 push @{$url_parent{$url}}, $baseurl->as_string; 1251 $stats{'problem'}++; 1252 } 1253 1254 $stats{'link'}++; 1255 } else { 1256 add_to_queue($url, $baseurl); 1257 $stats{'todo'}++; 1258 $new_links++; 1259 } 1260 $num_links++; 1261 } 1262 } 1263 output "Got $num_links links ($new_links new) from document", 2; 1264 } 1265 1266 1267 sub add_to_queue { 1268 my ($url, $parent) = @_; 1269 1270 print QUEUE $url . '|' . $parent . "\n"; 1271 } 1272 1273 sub checkbot_status_message ($) { 1274 my ($code) = @_; 1275 1276 my $result = status_message($code) || $checkbot_errors{$code} 1277 || '(Undefined status)'; 1278 } 1279 1280 sub print_server ($$) { 1281 my($server, $final_page) = @_; 1282 1283 my $host = $server; 1284 $host =~ s/(.*):\d+/$1/; 1285 1286 output "Writing server $server (really " . ip_address($host) . ")", 1; 1287 1288 my $server_problem = count_problems($server); 1289 my $filename = "$main::server_prefix-$server.html"; 1290 $filename =~ s/:/-/o; 1291 1292 print OUT "<tr><td class='server'>"; 1293 print OUT "<a href=\"@{[ (fileparse($filename))[0] ]}\">" if $server_problem > 0; 1294 print OUT "$server"; 1295 print OUT "</a>" if $server_problem > 0; 1296 print OUT "</td>"; 1297 print OUT "<td class='servertype'>" . get_server_type($server) . "</td>"; 1298 printf OUT "<td class='unique' align=\"right\">%d</td>", 1299 $main::servers{$server} + $server_problem; 1300 if ($server_problem) { 1301 printf OUT "<td class='problems' id='oops' align=\"right\">%d</td>", 1302 $server_problem; 1303 } else { 1304 printf OUT "<td class='problems' id='zero_defects' align=\"right\">%d</td>", 1305 $server_problem; 1306 } 1307 1308 my $ratio = $server_problem / ($main::servers{$server} + $server_problem) * 100; 1309 print OUT "<td class='ratio' align=\"right\">"; 1310 print OUT "<strong>" unless $ratio < 0.5; 1311 printf OUT "%4d%%", $ratio; 1312 print OUT "</strong>" unless $ratio < 0.5; 1313 print OUT "</td>"; 1314 print OUT "</tr>\n"; 1315 1316 # Create this server file 1317 open(SERVER, ">$filename") 1318 || die "Unable to open server file $filename for writing: $!"; 1319 print SERVER "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n"; 1320 print SERVER "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n"; 1321 print SERVER "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">\n"; 1322 print SERVER "<head>\n"; 1323 if (!$final_page) { 1324 printf SERVER "<meta http-equiv=\"Refresh\" content=\"%d\" />\n", 1325 int($main::cp_int * 60 / 2 - 5); 1326 } 1327 print SERVER "<link rel=\"stylesheet\" type=\"text/css\" href=\"$main::opt_style\" />\n" if defined $main::opt_style; 1328 print SERVER "<title>Checkbot: output for server $server</title></head>\n"; 1329 print SERVER "<body><h2><em>Checkbot</em>: report for server <tt>$server</tt></h2>\n"; 1330 print SERVER "<p>Go To: <a href=\"@{[ (fileparse($main::file))[0] ]}\">Main report page</a>"; 1331 1332 printServerProblems($server, $final_page); 1333 1334 print SERVER "\n"; 1335 print SERVER signature(); 1336 1337 close SERVER; 1338 } 1339 1340 # Return a string containing Checkbot's signature for HTML pages 1341 sub signature { 1342 return "<hr />\n<p class='signature'>Page created by <a href=\"http://degraaff.org/checkbot/\">Checkbot $VERSION</a> on <em>" . localtime() . "</em>.</p>\n". 1343 "<p><a href=\"http://validator.w3.org/check/?uri=referer\"><img src=\"http://www.w3.org/Icons/valid-xhtml11\" alt=\"Valid XHTML 1.1\" height=\"31\" width=\"88\" /></a></p>". 1344 "</body></html>"; 1345 } 1346 1347 # Loop through all possible problems, select relevant ones for this server 1348 # and display them in a meaningful way. 1349 sub printServerProblems ($$) { 1350 my ($server, $final_page) = @_; 1351 $server = quotemeta $server; 1352 1353 my $separator = "<hr />\n"; 1354 1355 my %thisServerList = (); 1356 1357 # First we find all the problems for this particular server 1358 foreach my $url (keys %url_parent) { 1359 foreach my $parent (@{$url_parent{$url}}) { 1360 next if $parent !~ $server; 1361 chomp $parent; 1362 $thisServerList{$url_error{$url}{'code'}}{$parent}{$url} 1363 = $url_error{$url}{'status'}; 1364 } 1365 } 1366 1367 # Do a run to find all error codes on this page, and include a table 1368 # of contents to the actual report 1369 foreach my $code (sort keys %thisServerList) { 1370 print SERVER ", <a href=\"#rc$code\">$code "; 1371 print SERVER checkbot_status_message($code); 1372 print SERVER "</a>"; 1373 } 1374 print SERVER ".</p>\n"; 1375 1376 1377 # Now run through this list and print the errors 1378 foreach my $code (sort keys %thisServerList) { 1379 my $codeOut = ''; 1380 1381 foreach my $parent (sort keys %{ $thisServerList{$code} }) { 1382 my $urlOut = ''; 1383 foreach my $url (sort keys %{ $thisServerList{$code}{$parent} }) { 1384 my $status = $thisServerList{$code}{$parent}{$url}; 1385 $urlOut .= "<li><a href=\"$url\">$url</a><br/>\n"; 1386 $urlOut .= "$status" 1387 if defined $status and $status ne checkbot_status_message($code); 1388 $urlOut .= "</li>\n"; 1389 } 1390 if ($urlOut ne '') { 1391 $codeOut .= "<dt><a href=\"$parent\">$parent</a>"; 1392 $codeOut .= "<br />$url_title{$parent}\n" if defined $url_title{$parent}; 1393 $codeOut .= "<dd><ul>\n$urlOut\n</ul>\n\n"; 1394 } 1395 } 1396 1397 if ($codeOut ne '') { 1398 print SERVER $separator if $separator; 1399 $separator = ''; 1400 print SERVER "<h4 id=\"rc$code\">$code "; 1401 print SERVER checkbot_status_message($code); 1402 print SERVER "</h4>\n<dl>\n$codeOut\n</dl>\n"; 1403 } 1404 } 1405 } 1406 1407 sub check_point { 1408 if ( ($main::cp_last + 60 * $main::cp_int < time()) 1409 || ($main::opt_debug && $main::opt_verbose)) { 1410 &create_page(0); 1411 $main::cp_last = time(); 1412 # Increase the intervall from one snapshot to the next by 25% 1413 # until we have reached the maximum. 1414 $main::cp_int *= 1.25 unless $main::opt_debug; 1415 $main::cp_int = $main::opt_interval if $main::cp_int > $main::opt_interval; 1416 } 1417 } 1418 1419 sub send_mail { 1420 my $msg = new Mail::Send; 1421 my $sub = 'Checkbot results for '; 1422 $sub .= join(', ', @starturls); 1423 $sub .= ': ' . $stats{'problem'} . ' errors'; 1424 1425 $msg->to($main::opt_mailto); 1426 $msg->subject($sub); 1427 1428 my $fh = $msg->open; 1429 1430 print $fh "Checkbot results for:\n " . join("\n ", @starturls) . "\n\n"; 1431 print $fh "User-supplied note: $main::opt_note\n\n" 1432 if defined $main::opt_note; 1433 1434 print $fh $stats{'link'}, " links were checked, and "; 1435 print $fh $stats{'problem'}, " problems were detected.\n"; 1436 1437 print $fh 'Run started on ' . localtime($start_time) . "\n"; 1438 print $fh 'Run duration ', duration(time() - $start_time), "\n" 1439 if $main::useduration; 1440 1441 1442 print $fh "\n-- \nCheckbot $VERSION\n"; 1443 print $fh "<URL:http://degraaff.org/checkbot/>\n"; 1444 1445 $fh->close; 1446 } 1447 1448 sub print_help { 1449 print <<"__EOT__"; 1450 Checkbot $VERSION command line options: 1451 1452 --cookies Accept cookies from the server 1453 --debug Debugging mode: No pauses, stop after 25 links. 1454 --file file Use file as basis for output file names. 1455 --help Provide this message. 1456 --mailto address Mail brief synopsis to address when done. 1457 --noproxy domains Do not proxy requests to given domains. 1458 --verbose Verbose mode: display many messages about progress. 1459 --url url Start URL 1460 --match match Check pages only if URL matches `match' 1461 If no match is given, the start URL is used as a match 1462 --exclude exclude Exclude pages if the URL matches 'exclude' 1463 --filter regexp Run regexp on each URL found 1464 --ignore ignore Ignore URLs matching 'ignore' 1465 --suppress file Use contents of 'file' to suppress errors in output 1466 --note note Include Note (e.g. URL to report) along with Mail message. 1467 --proxy URL URL of proxy server for HTTP and FTP requests. 1468 --internal-only Only check internal links, skip checking external links. 1469 --sleep seconds Sleep this many seconds between requests (default 0) 1470 --style url Reference the style sheet at this URL. 1471 --timeout seconds Timeout for http requests in seconds (default 120) 1472 --interval seconds Maximum time interval between updates (default 10800) 1473 --dontwarn codes Do not write warnings for these HTTP response codes 1474 --enable-virtual Use only virtual names, not IP numbers for servers 1475 --language Specify 2-letter language code for language negotiation 1476 1477 Options --match, --exclude, and --ignore can take a perl regular expression 1478 as their argument\n 1479 Use 'perldoc checkbot' for more verbose documentation. 1480 Checkbot WWW page : http://degraaff.org/checkbot/ 1481 Mail bugs and problems: checkbot\@degraaff.org 1482 __EOT__ 1483 1484 exit 0; 1485 } 1486 1487 sub ip_address { 1488 my($host) = @_; 1489 1490 return $main::ip_cache{$host} if defined $main::ip_cache{$host}; 1491 1492 my($name,$aliases,$adrtype,$length,@addrs) = gethostbyname($host); 1493 if (defined $addrs[0]) { 1494 my($n1,$n2,$n3,$n4) = unpack ('C4',$addrs[0]); 1495 $main::ip_cache{$host} = "$n1.$n2.$n3.$n4"; 1496 } else { 1497 # Whee! No IP-address found for this host. Just keep whatever we 1498 # got for the host. If this really is some kind of error it will 1499 # be found later on. 1500 $main::ip_cache{$host} = $host; 1501 } 1502 } 1503 1504 sub count_problems { 1505 my ($server) = @_; 1506 $server = quotemeta $server; 1507 my $count = 0; 1508 1509 foreach my $url (sort keys %url_parent) { 1510 foreach my $parent (@{ $url_parent{$url} }) { 1511 $count++ if $parent =~ m/$server/; 1512 } 1513 } 1514 return $count; 1515 } 1516