"Fossies" - the Fresh Open Source Software Archive

Member "automake-1.16.3/contrib/checklinkx" (19 Nov 2020, 111795 Bytes) of package /linux/misc/automake-1.16.3.tar.xz:


The requested HTML page contains a <FORM> tag that is unusable on "Fossies" in "automatic" (rendered) mode so that page is shown as HTML source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file.

    1 #!/usr/local/bin/perl -wT
    2 #
    3 # W3C Link Checker
    4 # by Hugo Haas <hugo@w3.org>
    5 # (c) 1999-2011 World Wide Web Consortium
    6 # based on Renaud Bruyeron's checklink.pl
    7 #
    8 # This program is licensed under the W3C(r) Software License:
    9 #       http://www.w3.org/Consortium/Legal/copyright-software
   10 #
   11 # The documentation is at:
   12 #       http://validator.w3.org/docs/checklink.html
   13 #
   14 # See the Mercurial interface at:
   15 #       http://dvcs.w3.org/hg/link-checker/
   16 #
   17 # An online version is available at:
   18 #       http://validator.w3.org/checklink
   19 #
   20 # Comments and suggestions should be sent to the www-validator mailing list:
   21 #       www-validator@w3.org (with 'checklink' in the subject)
   22 #       http://lists.w3.org/Archives/Public/www-validator/ (archives)
   23 #
   24 # Small modifications in March 2020 by Karl Berry <karl@freefriends.org>
   25 # (contributed under the same license, or public domain if you prefer).
   26 # I started from https://metacpan.org/release/W3C-LinkChecker, version 4.81.
   27 # - (&simple_request)   ignore "Argument isn't numeric" warnings.
   28 # - (%Opts, &check_uri) new option --exclude-url-file; see --help message.
   29 # - (&parse_arguments)  allow multiple -X options.
   30 # - (&check_uri)        missing argument to hprintf.
   31 # - (&hprintf)          avoid useless warnings when undef is returned.
   32 # The ideas are (1) to avoid rechecking every url during development,
   33 # and (2) to make the exclude list easier to maintain,
   34 # and (3) to eliminate useless warnings from the code,
   35 #
   36 # For GNU Automake, this program is used by the checklinkx target
   37 # in doc/local.mk to check the (html output of) automake manual.
   38 
   39 use strict;
   40 use 5.008;
   41 
   42 # Get rid of potentially unsafe and unneeded environment variables.
   43 delete(@ENV{qw(IFS CDPATH ENV BASH_ENV)});
   44 $ENV{PATH} = undef;
   45 
   46 # ...but we want PERL5?LIB honored even in taint mode, see perlsec, perl5lib,
   47 # http://www.mail-archive.com/cpan-testers-discuss%40perl.org/msg01064.html
   48 use Config qw(%Config);
   49 use lib map { /(.*)/ }
   50     defined($ENV{PERL5LIB}) ? split(/$Config{path_sep}/, $ENV{PERL5LIB}) :
   51     defined($ENV{PERLLIB})  ? split(/$Config{path_sep}/, $ENV{PERLLIB}) :
   52                               ();
   53 
   54 # -----------------------------------------------------------------------------
   55 
   56 package W3C::UserAgent;
   57 
   58 use LWP::RobotUA 1.19 qw();
   59 use LWP::UserAgent qw();
   60 use Net::HTTP::Methods 5.833 qw();    # >= 5.833 for 4kB cookies (#6678)
   61 
   62 # if 0, ignore robots exclusion (useful for testing)
   63 use constant USE_ROBOT_UA => 1;
   64 
   65 if (USE_ROBOT_UA) {
   66     @W3C::UserAgent::ISA = qw(LWP::RobotUA);
   67 }
   68 else {
   69     @W3C::UserAgent::ISA = qw(LWP::UserAgent);
   70 }
   71 
   72 sub new
   73 {
   74     my $proto = shift;
   75     my $class = ref($proto) || $proto;
   76     my ($name, $from, $rules) = @_;
   77 
   78     # For security/privacy reasons, if $from was not given, do not send it.
   79     # Cheat by defining something for the constructor, and resetting it later.
   80     my $from_ok = $from;
   81     $from ||= 'www-validator@w3.org';
   82 
   83     my $self;
   84     if (USE_ROBOT_UA) {
   85         $self = $class->SUPER::new($name, $from, $rules);
   86     }
   87     else {
   88         my %cnf;
   89         @cnf{qw(agent from)} = ($name, $from);
   90         $self = LWP::UserAgent->new(%cnf);
   91         $self = bless $self, $class;
   92     }
   93 
   94     $self->from(undef) unless $from_ok;
   95 
   96     $self->env_proxy();
   97 
   98     $self->allow_private_ips(1);
   99 
  100     $self->protocols_forbidden([qw(mailto javascript)]);
  101 
  102     return $self;
  103 }
  104 
  105 sub allow_private_ips
  106 {
  107     my $self = shift;
  108     if (@_) {
  109         $self->{Checklink_allow_private_ips} = shift;
  110         if (!$self->{Checklink_allow_private_ips}) {
  111 
  112             # Pull in dependencies
  113             require Net::IP;
  114             require Socket;
  115             require Net::hostent;
  116         }
  117     }
  118     return $self->{Checklink_allow_private_ips};
  119 }
  120 
  121 sub redirect_progress_callback
  122 {
  123     my $self = shift;
  124     $self->{Checklink_redirect_callback} = shift if @_;
  125     return $self->{Checklink_redirect_callback};
  126 }
  127 
  128 sub simple_request
  129 {
  130     my $self = shift;
  131 
  132     my $response = $self->ip_disallowed($_[0]->uri());
  133 
  134     # RFC 2616, section 15.1.3
  135     $_[0]->remove_header("Referer")
  136         if ($_[0]->referer() &&
  137         (!$_[0]->uri()->secure() && URI->new($_[0]->referer())->secure()));
  138 
  139     $response ||= do {
  140         local $SIG{__WARN__} =
  141             sub {    # Suppress RobotRules warnings, rt.cpan.org #18902
  142                      # Suppress "Argument isn't numeric" warnings, see below.
  143             warn($_[0])
  144               if ($_[0]
  145                   && $_[0] !~ /^RobotRules/
  146                   && $_[0] !~ /^Argument .* isn't numeric.*Response\.pm/
  147                  );
  148             };
  149 
  150         # @@@ Why not just $self->SUPER::simple_request? [--unknown]
  151         # --- Indeed. Further, why use simple_request in the first place?
  152         # It is not part of the UserAgent UI. I believe this can result
  153         # in warnings like:
  154         #   Argument "0, 0, 0, 0" isn't numeric in numeric gt (>) at
  155         #   /usr/local/lib/perl5/site_perl/5.30.2/HTTP/Response.pm line 261.
  156         # when checking, e.g.,
  157         #   https://metacpan.org/pod/distribution/Test-Harness/bin/prove
  158         # For testing, here is a three-line html file to check that url:
  159         #   <html><head><title>X</title></head><body>
  160         #   <p><a href="https://metacpan.org/pod/release/MSCHWERN/Test-Simple-0.98_05/lib/Test/More.pm">prove</a></p>
  161         #   </body></html>
  162         # I have been unable to reproduce the warning with a test program
  163         # checking that url using $ua->request(), or other UserAgent
  164         # functions, even after carefully reproducing all the headers
  165         # that checklink sends in the request. --karl@freefriends.org.
  166 
  167         $self->W3C::UserAgent::SUPER::simple_request(@_);
  168     };
  169 
  170     if (!defined($self->{FirstResponse})) {
  171         $self->{FirstResponse} = $response->code();
  172         $self->{FirstMessage} = $response->message() || '(no message)';
  173     }
  174 
  175     return $response;
  176 }
  177 
  178 sub redirect_ok
  179 {
  180     my ($self, $request, $response) = @_;
  181 
  182     if (my $callback = $self->redirect_progress_callback()) {
  183 
  184         # @@@ TODO: when an LWP internal robots.txt request gets redirected,
  185         # this will a bit confusingly fire for it too.  Would need a robust
  186         # way to determine whether the request is such a LWP "internal
  187         # robots.txt" one.
  188         &$callback($request->method(), $request->uri());
  189     }
  190 
  191     return 0 unless $self->SUPER::redirect_ok($request, $response);
  192 
  193     if (my $res = $self->ip_disallowed($request->uri())) {
  194         $response->previous($response->clone());
  195         $response->request($request);
  196         $response->code($res->code());
  197         $response->message($res->message());
  198         return 0;
  199     }
  200 
  201     return 1;
  202 }
  203 
  204 #
  205 # Checks whether we're allowed to retrieve the document based on its IP
  206 # address.  Takes an URI object and returns a HTTP::Response containing the
  207 # appropriate status and error message if the IP was disallowed, 0
  208 # otherwise.  URIs without hostname or IP address are always allowed,
  209 # including schemes where those make no sense (eg. data:, often javascript:).
  210 #
  211 sub ip_disallowed
  212 {
  213     my ($self, $uri) = @_;
  214     return 0 if $self->allow_private_ips();    # Short-circuit
  215 
  216     my $hostname = undef;
  217     eval { $hostname = $uri->host() };    # Not all URIs implement host()...
  218     return 0 unless $hostname;
  219 
  220     my $addr = my $iptype = my $resp = undef;
  221     if (my $host = Net::hostent::gethostbyname($hostname)) {
  222         $addr = Socket::inet_ntoa($host->addr()) if $host->addr();
  223         if ($addr && (my $ip = Net::IP->new($addr))) {
  224             $iptype = $ip->iptype();
  225         }
  226     }
  227     if ($iptype && $iptype ne 'PUBLIC') {
  228         $resp = HTTP::Response->new(403,
  229             'Checking non-public IP address disallowed by link checker configuration'
  230         );
  231         $resp->header('Client-Warning', 'Internal response');
  232     }
  233     return $resp;
  234 }
  235 
  236 # -----------------------------------------------------------------------------
  237 
  238 package W3C::LinkChecker;
  239 
  240 use vars qw($AGENT $PACKAGE $PROGRAM $VERSION $REVISION
  241     $DocType $Head $Accept $ContentTypes %Cfg $CssUrl);
  242 
  243 use CSS::DOM 0.09 qw();    # >= 0.09 for many bugfixes
  244 use CSS::DOM::Constants qw(:rule);
  245 use CSS::DOM::Style qw();
  246 use CSS::DOM::Util qw();
  247 use Encode qw();
  248 use HTML::Entities qw();
  249 use HTML::Parser 3.40 qw();    # >= 3.40 for utf8_mode()
  250 use HTTP::Headers::Util qw();
  251 use HTTP::Message 5.827 qw();    # >= 5.827 for content_charset()
  252 use HTTP::Request 5.814 qw();    # >= 5.814 for accept_decodable()
  253 use HTTP::Response 1.50 qw();    # >= 1.50 for decoded_content()
  254 use Time::HiRes qw();
  255 use URI 1.53 qw();               # >= 1.53 for secure()
  256 use URI::Escape qw();
  257 use URI::Heuristic qw();
  258 
  259 # @@@ Needs also W3C::UserAgent but can't use() it here.
  260 
  261 use constant RC_ROBOTS_TXT          => -1;
  262 use constant RC_DNS_ERROR           => -2;
  263 use constant RC_IP_DISALLOWED       => -3;
  264 use constant RC_PROTOCOL_DISALLOWED => -4;
  265 
  266 use constant LINE_UNKNOWN => -1;
  267 
  268 use constant MP2 =>
  269     (exists($ENV{MOD_PERL_API_VERSION}) && $ENV{MOD_PERL_API_VERSION} >= 2);
  270 
  271 # Tag=>attribute mapping of things we treat as links.
  272 # Note: meta/@http-equiv gets special treatment, see start() for details.
  273 use constant LINK_ATTRS => {
  274     a => ['href'],
  275 
  276     # base/@href intentionally not checked
  277     # http://www.w3.org/mid/200802091439.27764.ville.skytta%40iki.fi
  278     area       => ['href'],
  279     audio      => ['src'],
  280     blockquote => ['cite'],
  281     body       => ['background'],
  282     command    => ['icon'],
  283 
  284     # button/@formaction not checked (side effects)
  285     del => ['cite'],
  286 
  287     # @pluginspage, @pluginurl, @href: pre-HTML5 proprietary
  288     embed => ['href', 'pluginspage', 'pluginurl', 'src'],
  289 
  290     # form/@action not checked (side effects)
  291     frame  => ['longdesc', 'src'],
  292     html   => ['manifest'],
  293     iframe => ['longdesc', 'src'],
  294     img    => ['longdesc', 'src'],
  295 
  296     # input/@action, input/@formaction not checked (side effects)
  297     input  => ['src'],
  298     ins    => ['cite'],
  299     link   => ['href'],
  300     object => ['data'],
  301     q      => ['cite'],
  302     script => ['src'],
  303     source => ['src'],
  304     track  => ['src'],
  305     video  => ['src', 'poster'],
  306 };
  307 
  308 # Tag=>[separator, attributes] mapping of things we treat as lists of links.
  309 use constant LINK_LIST_ATTRS => {
  310     a      => [qr/\s+/,    ['ping']],
  311     applet => [qr/[\s,]+/, ['archive']],
  312     area   => [qr/\s+/,    ['ping']],
  313     head   => [qr/\s+/,    ['profile']],
  314     object => [qr/\s+/,    ['archive']],
  315 };
  316 
  317 # TBD/TODO:
  318 # - applet/@code?
  319 # - bgsound/@src?
  320 # - object/@classid?
  321 # - isindex/@action?
  322 # - layer/@background,@src?
  323 # - ilayer/@background?
  324 # - table,tr,td,th/@background?
  325 # - xmp/@href?
  326 
  327 @W3C::LinkChecker::ISA = qw(HTML::Parser);
  328 
  329 BEGIN {
  330 
  331     # Version info
  332     $PACKAGE  = 'W3C Link Checker';
  333     $PROGRAM  = 'W3C-checklink';
  334     $VERSION  = '4.81';
  335     $REVISION = sprintf('version %s (c) 1999-2011 W3C', $VERSION);
  336     $AGENT    = sprintf(
  337         '%s/%s %s',
  338         $PROGRAM, $VERSION,
  339         (   W3C::UserAgent::USE_ROBOT_UA ? LWP::RobotUA->_agent() :
  340                 LWP::UserAgent->_agent()
  341         )
  342     );
  343 
  344     # Pull in mod_perl modules if applicable.
  345     eval {
  346         local $SIG{__DIE__} = undef;
  347         require Apache2::RequestUtil;
  348     } if MP2();
  349 
  350     my @content_types = qw(
  351         text/html
  352         application/xhtml+xml;q=0.9
  353         application/vnd.wap.xhtml+xml;q=0.6
  354     );
  355     $Accept = join(', ', @content_types, '*/*;q=0.5');
  356     push(@content_types, 'text/css', 'text/html-sandboxed');
  357     my $re = join('|', map { s/;.*//; quotemeta } @content_types);
  358     $ContentTypes = qr{\b(?:$re)\b}io;
  359 
  360     # Regexp for matching URL values in CSS.
  361     $CssUrl = qr/(?:\s|^)url\(\s*(['"]?)(.*?)\1\s*\)(?=\s|$)/;
  362 
  363     #
  364     # Read configuration.  If the W3C_CHECKLINK_CFG environment variable has
  365     # been set or the default contains a non-empty file, read it.  Otherwise,
  366     # skip silently.
  367     #
  368     my $defaultconfig = '/etc/w3c/checklink.conf';
  369     if ($ENV{W3C_CHECKLINK_CFG} || -s $defaultconfig) {
  370 
  371         require Config::General;
  372         Config::General->require_version(2.06);    # Need 2.06 for -SplitPolicy
  373 
  374         my $conffile = $ENV{W3C_CHECKLINK_CFG} || $defaultconfig;
  375         eval {
  376             my %config_opts = (
  377                 -ConfigFile        => $conffile,
  378                 -SplitPolicy       => 'equalsign',
  379                 -AllowMultiOptions => 'no',
  380             );
  381             %Cfg = Config::General->new(%config_opts)->getall();
  382         };
  383         if ($@) {
  384             die <<"EOF";
  385 Failed to read configuration from '$conffile':
  386 $@
  387 EOF
  388         }
  389     }
  390     $Cfg{Markup_Validator_URI} ||= 'http://validator.w3.org/check?uri=%s';
  391     $Cfg{CSS_Validator_URI} ||=
  392         'http://jigsaw.w3.org/css-validator/validator?uri=%s';
  393     $Cfg{Doc_URI} ||= 'http://validator.w3.org/docs/checklink.html';
  394 
  395     # Untaint config params that are used as the format argument to (s)printf(),
  396     # Perl 5.10 does not want to see that in taint mode.
  397     ($Cfg{Markup_Validator_URI}) = ($Cfg{Markup_Validator_URI} =~ /^(.*)$/);
  398     ($Cfg{CSS_Validator_URI})    = ($Cfg{CSS_Validator_URI}    =~ /^(.*)$/);
  399 
  400     $DocType =
  401         '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">';
  402     my $css_url = URI->new_abs('linkchecker.css', $Cfg{Doc_URI});
  403     my $js_url  = URI->new_abs('linkchecker.js',  $Cfg{Doc_URI});
  404     $Head =
  405         sprintf(<<'EOF', HTML::Entities::encode($AGENT), $css_url, $js_url);
  406 <meta http-equiv="Content-Script-Type" content="text/javascript" />
  407 <meta name="generator" content="%s" />
  408 <link rel="stylesheet" type="text/css" href="%s" />
  409 <script type="text/javascript" src="%s"></script>
  410 EOF
  411 
  412     # Trusted environment variables that need laundering in taint mode.
  413     for (qw(NNTPSERVER NEWSHOST)) {
  414         ($ENV{$_}) = ($ENV{$_} =~ /^(.*)$/) if $ENV{$_};
  415     }
  416 
  417     # Use passive FTP by default, see Net::FTP(3).
  418     $ENV{FTP_PASSIVE} = 1 unless exists($ENV{FTP_PASSIVE});
  419 }
  420 
  421 # Autoflush
  422 $| = 1;
  423 
  424 # Different options specified by the user
  425 my $cmdline = !($ENV{GATEWAY_INTERFACE} && $ENV{GATEWAY_INTERFACE} =~ /^CGI/);
  426 my %Opts = (
  427     Command_Line    => $cmdline,
  428     Quiet           => 0,
  429     Summary_Only    => 0,
  430     Verbose         => 0,
  431     Progress        => 0,
  432     HTML            => 0,
  433     Timeout         => 30,
  434     Redirects       => 1,
  435     Dir_Redirects   => 1,
  436     Accept_Language => $cmdline ? undef : $ENV{HTTP_ACCEPT_LANGUAGE},
  437     Cookies         => undef,
  438     No_Referer      => 0,
  439     Hide_Same_Realm => 0,
  440     Depth                    => 0,             # < 0 means unlimited recursion.
  441     Sleep_Time               => 1,
  442     Connection_Cache_Size    => 2,
  443     Max_Documents            => 150,           # For the online version.
  444     User                     => undef,
  445     Password                 => undef,
  446     Base_Locations           => [],
  447     Exclude                  => undef,
  448     Exclude_Docs             => undef,
  449     Exclude_Url_File         => undef,
  450     Suppress_Redirect        => [],
  451     Suppress_Redirect_Prefix => [],
  452     Suppress_Redirect_Regexp => [],
  453     Suppress_Temp_Redirects  => 1,
  454     Suppress_Broken          => [],
  455     Suppress_Fragment        => [],
  456     Masquerade               => 0,
  457     Masquerade_From          => '',
  458     Masquerade_To            => '',
  459     Trusted                  => $Cfg{Trusted},
  460     Allow_Private_IPs => defined($Cfg{Allow_Private_IPs}) ?
  461         $Cfg{Allow_Private_IPs} :
  462         $cmdline,
  463 );
  464 undef $cmdline;
  465 
  466 # Global variables
  467 # What URI's did we process? (used for recursive mode)
  468 my %processed;
  469 
  470 # Result of the HTTP query
  471 my %results;
  472 
  473 # List of redirects
  474 my %redirects;
  475 
  476 # Count of the number of documents checked
  477 my $doc_count = 0;
  478 
  479 # Time stamp
  480 my $timestamp = &get_timestamp();
  481 
  482 # Per-document header; undefined if already printed.  See print_doc_header().
  483 my $doc_header;
  484 
  485 &parse_arguments() if $Opts{Command_Line};
  486 
  487 my $ua = W3C::UserAgent->new($AGENT);    # @@@ TODO: admin address
  488 
  489 $ua->conn_cache({total_capacity => $Opts{Connection_Cache_Size}});
  490 if ($ua->can('delay')) {
  491     $ua->delay($Opts{Sleep_Time} / 60);
  492 }
  493 $ua->timeout($Opts{Timeout});
  494 
  495 # Set up cookie stash if requested
  496 if (defined($Opts{Cookies})) {
  497     require HTTP::Cookies;
  498     my $cookie_file = $Opts{Cookies};
  499     if ($cookie_file eq 'tmp') {
  500         $cookie_file = undef;
  501     }
  502     elsif ($cookie_file =~ /^(.*)$/) {
  503         $cookie_file = $1;    # untaint
  504     }
  505     $ua->cookie_jar(HTTP::Cookies->new(file => $cookie_file, autosave => 1));
  506 }
  507 eval { $ua->allow_private_ips($Opts{Allow_Private_IPs}); };
  508 if ($@) {
  509     die <<"EOF";
  510 Allow_Private_IPs is false; this feature requires the Net::IP, Socket, and
  511 Net::hostent modules:
  512 $@
  513 EOF
  514 }
  515 
  516 # Add configured forbidden protocols
  517 if ($Cfg{Forbidden_Protocols}) {
  518     my $forbidden = $ua->protocols_forbidden();
  519     push(@$forbidden, split(/[,\s]+/, lc($Cfg{Forbidden_Protocols})));
  520     $ua->protocols_forbidden($forbidden);
  521 }
  522 
  523 if ($Opts{Command_Line}) {
  524 
  525     require Text::Wrap;
  526     Text::Wrap->import('wrap');
  527 
  528     require URI::file;
  529 
  530     &usage(1) unless scalar(@ARGV);
  531 
  532     $Opts{_Self_URI} = 'http://validator.w3.org/checklink';   # For HTML output
  533 
  534     &ask_password() if ($Opts{User} && !$Opts{Password});
  535 
  536     if (!$Opts{Summary_Only}) {
  537         printf("%s %s\n", $PACKAGE, $REVISION) unless $Opts{HTML};
  538     }
  539     else {
  540         $Opts{Verbose}  = 0;
  541         $Opts{Progress} = 0;
  542     }
  543 
  544     # Populate data for print_form()
  545     my %params = (
  546         summary            => $Opts{Summary_Only},
  547         hide_redirects     => !$Opts{Redirects},
  548         hide_type          => $Opts{Dir_Redirects} ? 'dir' : 'all',
  549         no_accept_language => !(
  550             defined($Opts{Accept_Language}) && $Opts{Accept_Language} eq 'auto'
  551         ),
  552         no_referer => $Opts{No_Referer},
  553         recursive  => ($Opts{Depth} != 0),
  554         depth      => $Opts{Depth},
  555     );
  556 
  557     my $check_num = 1;
  558     my @bases     = @{$Opts{Base_Locations}};
  559     for my $uri (@ARGV) {
  560 
  561         # Reset base locations so that previous URI's given on the command line
  562         # won't affect the recursion scope for this URI (see check_uri())
  563         @{$Opts{Base_Locations}} = @bases;
  564 
  565         # Transform the parameter into a URI
  566         $uri = &urize($uri);
  567         $params{uri} = $uri;
  568         &check_uri(\%params, $uri, $check_num, $Opts{Depth}, undef, undef, 1);
  569         $check_num++;
  570     }
  571     undef $check_num;
  572 
  573     if ($Opts{HTML}) {
  574         &html_footer();
  575     }
  576     elsif ($doc_count > 0 && !$Opts{Summary_Only}) {
  577         printf("\n%s\n", &global_stats());
  578     }
  579 
  580 }
  581 else {
  582 
  583     require CGI;
  584     require CGI::Carp;
  585     CGI::Carp->import(qw(fatalsToBrowser));
  586     require CGI::Cookie;
  587 
  588     # file: URIs are not allowed in CGI mode
  589     my $forbidden = $ua->protocols_forbidden();
  590     push(@$forbidden, 'file');
  591     $ua->protocols_forbidden($forbidden);
  592 
  593     my $query = CGI->new();
  594 
  595     for my $param ($query->param()) {
  596         my @values = map { Encode::decode_utf8($_) } $query->param($param);
  597         $query->param($param, @values);
  598     }
  599 
  600     # Set a few parameters in CGI mode
  601     $Opts{Verbose}   = 0;
  602     $Opts{Progress}  = 0;
  603     $Opts{HTML}      = 1;
  604     $Opts{_Self_URI} = $query->url(-relative => 1);
  605 
  606     # Backwards compatibility
  607     my $uri = undef;
  608     if ($uri = $query->param('url')) {
  609         $query->param('uri', $uri) unless $query->param('uri');
  610         $query->delete('url');
  611     }
  612     $uri = $query->param('uri');
  613 
  614     if (!$uri) {
  615         &html_header('', undef);    # Set cookie only from results page.
  616         my %cookies = CGI::Cookie->fetch();
  617         &print_form(scalar($query->Vars()), $cookies{$PROGRAM}, 1);
  618         &html_footer();
  619         exit;
  620     }
  621 
  622     # Backwards compatibility
  623     if ($query->param('hide_dir_redirects')) {
  624         $query->param('hide_redirects', 'on');
  625         $query->param('hide_type',      'dir');
  626         $query->delete('hide_dir_redirects');
  627     }
  628 
  629     $Opts{Summary_Only} = 1 if $query->param('summary');
  630 
  631     if ($query->param('hide_redirects')) {
  632         $Opts{Dir_Redirects} = 0;
  633         if (my $type = $query->param('hide_type')) {
  634             $Opts{Redirects} = 0 if ($type ne 'dir');
  635         }
  636         else {
  637             $Opts{Redirects} = 0;
  638         }
  639     }
  640 
  641     $Opts{Accept_Language} = undef if $query->param('no_accept_language');
  642     $Opts{No_Referer} = $query->param('no_referer');
  643 
  644     $Opts{Depth} = -1 if ($query->param('recursive') && $Opts{Depth} == 0);
  645     if (my $depth = $query->param('depth')) {
  646 
  647         # @@@ Ignore invalid depth silently for now.
  648         $Opts{Depth} = $1 if ($depth =~ /(-?\d+)/);
  649     }
  650 
  651     # Save, clear or leave cookie as is.
  652     my $cookie = undef;
  653     if (my $action = $query->param('cookie')) {
  654         if ($action eq 'clear') {
  655 
  656             # Clear the cookie.
  657             $cookie = CGI::Cookie->new(-name => $PROGRAM);
  658             $cookie->value({clear => 1});
  659             $cookie->expires('-1M');
  660         }
  661         elsif ($action eq 'set') {
  662 
  663             # Set the options.
  664             $cookie = CGI::Cookie->new(-name => $PROGRAM);
  665             my %options = $query->Vars();
  666             delete($options{$_})
  667                 for qw(url uri check cookie);    # Non-persistent.
  668             $cookie->value(\%options);
  669         }
  670     }
  671     if (!$cookie) {
  672         my %cookies = CGI::Cookie->fetch();
  673         $cookie = $cookies{$PROGRAM};
  674     }
  675 
  676     # Always refresh cookie expiration time.
  677     $cookie->expires('+1M') if ($cookie && !$cookie->expires());
  678 
  679     # All Apache configurations don't set HTTP_AUTHORIZATION for CGI scripts.
  680     # If we're under mod_perl, there is a way around it...
  681     eval {
  682         local $SIG{__DIE__} = undef;
  683         my $auth =
  684             Apache2::RequestUtil->request()->headers_in()->{Authorization};
  685         $ENV{HTTP_AUTHORIZATION} = $auth if $auth;
  686     } if (MP2() && !$ENV{HTTP_AUTHORIZATION});
  687 
  688     $uri =~ s/^\s+//g;
  689     if ($uri =~ /:/) {
  690         $uri = URI->new($uri);
  691     }
  692     else {
  693         if ($uri =~ m|^//|) {
  694             $uri = URI->new("http:$uri");
  695         }
  696         else {
  697             local $ENV{URL_GUESS_PATTERN} = '';
  698             my $guess = URI::Heuristic::uf_uri($uri);
  699             if ($guess->scheme() && $ua->is_protocol_supported($guess)) {
  700                 $uri = $guess;
  701             }
  702             else {
  703                 $uri = URI->new("http://$uri");
  704             }
  705         }
  706     }
  707     $uri = $uri->canonical();
  708     $query->param("uri", $uri);
  709 
  710     &check_uri(scalar($query->Vars()), $uri, 1, $Opts{Depth}, $cookie);
  711     undef $query;    # Not needed any more.
  712     &html_footer();
  713 }
  714 
  715 ###############################################################################
  716 
  717 ################################
  718 # Command line and usage stuff #
  719 ################################
  720 
  721 sub parse_arguments ()
  722 {
  723     require Encode::Locale;
  724     Encode::Locale::decode_argv();
  725 
  726     require Getopt::Long;
  727     Getopt::Long->require_version(2.17);
  728     Getopt::Long->import('GetOptions');
  729     Getopt::Long::Configure('bundling', 'no_ignore_case');
  730     my $masq = '';
  731     my @locs = ();
  732 
  733     GetOptions(
  734         'help|h|?' => sub { usage(0) },
  735         'q|quiet'  => sub {
  736             $Opts{Quiet}        = 1;
  737             $Opts{Summary_Only} = 1;
  738         },
  739         's|summary' => \$Opts{Summary_Only},
  740         'b|broken'  => sub {
  741             $Opts{Redirects}     = 0;
  742             $Opts{Dir_Redirects} = 0;
  743         },
  744         'e|dir-redirects' => sub { $Opts{Dir_Redirects} = 0; },
  745         'v|verbose'       => \$Opts{Verbose},
  746         'i|indicator'     => \$Opts{Progress},
  747         'H|html'          => \$Opts{HTML},
  748         'r|recursive'     => sub {
  749             $Opts{Depth} = -1
  750                 if $Opts{Depth} == 0;
  751         },
  752         'l|location=s'                => \@locs,
  753         'X|exclude=s@'                => \@{$Opts{Exclude}},
  754         'exclude-docs=s@'             => \@{$Opts{Exclude_Docs}},
  755         'exclude-url-file=s'          => \$Opts{Exclude_Url_File},
  756         'suppress-redirect=s@'        => \@{$Opts{Suppress_Redirect}},
  757         'suppress-redirect-prefix=s@' => \@{$Opts{Suppress_Redirect_Prefix}},
  758         'suppress-temp-redirects'     => \$Opts{Suppress_Temp_Redirects},
  759         'suppress-broken=s@'          => \@{$Opts{Suppress_Broken}},
  760         'suppress-fragment=s@'        => \@{$Opts{Suppress_Fragment}},
  761         'u|user=s'                    => \$Opts{User},
  762         'p|password=s'                => \$Opts{Password},
  763         't|timeout=i'                 => \$Opts{Timeout},
  764         'C|connection-cache=i'        => \$Opts{Connection_Cache_Size},
  765         'S|sleep=i'                   => \$Opts{Sleep_Time},
  766         'L|languages=s'               => \$Opts{Accept_Language},
  767         'c|cookies=s'                 => \$Opts{Cookies},
  768         'R|no-referer'                => \$Opts{No_Referer},
  769         'D|depth=i'                   => sub {
  770             $Opts{Depth} = $_[1]
  771                 unless $_[1] == 0;
  772         },
  773         'd|domain=s'      => \$Opts{Trusted},
  774         'masquerade=s'    => \$masq,
  775         'hide-same-realm' => \$Opts{Hide_Same_Realm},
  776         'V|version'       => \&version,
  777         ) ||
  778         usage(1);
  779 
  780     if ($masq) {
  781         $Opts{Masquerade} = 1;
  782         my @masq = split(/\s+/, $masq);
  783         if (scalar(@masq) != 2 ||
  784             !defined($masq[0]) ||
  785             $masq[0] !~ /\S/ ||
  786             !defined($masq[1]) ||
  787             $masq[1] !~ /\S/)
  788         {
  789             usage(1,
  790                 "Error: --masquerade takes two whitespace separated URIs.");
  791         }
  792         else {
  793             require URI::file;
  794             $Opts{Masquerade_From} = $masq[0];
  795             my $u = URI->new($masq[1]);
  796             $Opts{Masquerade_To} =
  797                 $u->scheme() ? $u : URI::file->new_abs($masq[1]);
  798         }
  799     }
  800 
  801     if ($Opts{Accept_Language} && $Opts{Accept_Language} eq 'auto') {
  802         $Opts{Accept_Language} = &guess_language();
  803     }
  804 
  805     if (($Opts{Sleep_Time} || 0) < 1) {
  806         warn(
  807             "*** Warning: minimum allowed sleep time is 1 second, resetting.\n"
  808         );
  809         $Opts{Sleep_Time} = 1;
  810     }
  811 
  812     push(@{$Opts{Base_Locations}}, map { URI->new($_)->canonical() } @locs);
  813 
  814     $Opts{Depth} = -1 if ($Opts{Depth} == 0 && @locs);
  815 
  816     for my $i (0 .. $#{$Opts{Exclude_Docs}}) {
  817         eval { $Opts{Exclude_Docs}->[$i] = qr/$Opts{Exclude_Docs}->[$i]/; };
  818         &usage(1, "Error in exclude-docs regexp: $@") if $@;
  819     }
  820     if (defined($Opts{Trusted})) {
  821         eval { $Opts{Trusted} = qr/$Opts{Trusted}/io; };
  822         &usage(1, "Error in trusted domains regexp: $@") if $@;
  823     }
  824 
  825     # Sanity-check error-suppression arguments
  826     for my $i (0 .. $#{$Opts{Suppress_Redirect}}) {
  827         ${$Opts{Suppress_Redirect}}[$i] =~ s/ /->/;
  828         my $sr_arg = ${$Opts{Suppress_Redirect}}[$i];
  829         if ($sr_arg !~ /.->./) {
  830             &usage(1,
  831                 "Bad suppress-redirect argument, should contain \"->\": $sr_arg"
  832             );
  833         }
  834     }
  835     for my $i (0 .. $#{$Opts{Suppress_Redirect_Prefix}}) {
  836         my $srp_arg = ${$Opts{Suppress_Redirect_Prefix}}[$i];
  837         $srp_arg =~ s/ /->/;
  838         if ($srp_arg !~ /^(.*)->(.*)$/) {
  839             &usage(1,
  840                 "Bad suppress-redirect-prefix argument, should contain \"->\": $srp_arg"
  841             );
  842         }
  843 
  844         # Turn prefixes into a regexp.
  845         ${$Opts{Suppress_Redirect_Prefix}}[$i] = qr/^\Q$1\E(.*)->\Q$2\E\1$/ism;
  846     }
  847     for my $i (0 .. $#{$Opts{Suppress_Broken}}) {
  848         ${$Opts{Suppress_Broken}}[$i] =~ s/ /:/;
  849         my $sb_arg = ${$Opts{Suppress_Broken}}[$i];
  850         if ($sb_arg !~ /^(-1|[0-9]+):./) {
  851             &usage(1,
  852                 "Bad suppress-broken argument, should be prefixed by a numeric response code: $sb_arg"
  853             );
  854         }
  855     }
  856     for my $sf_arg (@{$Opts{Suppress_Fragment}}) {
  857         if ($sf_arg !~ /.#./) {
  858             &usage(1,
  859                 "Bad suppress-fragment argument, should contain \"#\": $sf_arg"
  860             );
  861         }
  862     }
  863 
  864     if ($#{$Opts{Exclude}} > 0) {
  865       # convert $Opts{Exclude} array into regexp by parenthesizing
  866       # each and inserting alternations between.
  867       my $exclude_rx = join("|", map { "($_)" } @{$Opts{Exclude}});
  868       #
  869       # For the sake of the rest of the program, pretend the option
  870       # was that string all along.
  871       $Opts{Exclude} = $exclude_rx;
  872     }
  873     
  874     if ($Opts{Exclude_Url_File}) {
  875         # The idea is that if the specified file exists, we read it and
  876         # treat it as a list of excludes. If the file doesn't exist, we
  877         # write it with all the urls that were successful. That way, we
  878         # can avoid re-checking them on every run, and it can be removed
  879         # externally (from cron) to get re-updated.
  880         # 
  881         # We distinguish the cases here, and either add to
  882         # $Opts{Exclude} if reading, or setting Exclude_File_Write in
  883         # %Opts if writing (even though it is not really an option,
  884         # but it's the most convenient place).
  885         if (-s $Opts{Exclude_Url_File}) {
  886             open (my $xf, "$Opts{Exclude_Url_File}")
  887             || &usage(1, "Could not open $Opts{Exclude_Url_File}"
  888                          . " for reading: $!");
  889             my @xf = ();
  890             while (<$xf>) {
  891                 chomp;
  892                 # the file is urls, not regexps, so quotemeta.
  893                 push (@xf, "(" . quotemeta($_) . ")");
  894             }
  895             my $xf_rx = join ("|", @xf);
  896             if ($Opts{Exclude}) {
  897                 $Opts{Exclude} .= "|$xf_rx";
  898             } else {
  899                 $Opts{Exclude} = $xf_rx;
  900             }
  901         } else {
  902             open ($Opts{Exclude_File_Write}, ">$Opts{Exclude_Url_File}")
  903             || &usage(1,
  904                      "Could not open $Opts{Exclude_Url_File} for writing: $!");
  905             # we write on a successful retrieve, and don't bother closing.
  906         }
  907     }
  908 
  909     # Precompile/error-check final list of regular expressions
  910     if (defined($Opts{Exclude})) {
  911         eval { $Opts{Exclude} = qr/$Opts{Exclude}/o; };
  912         &usage(1, "Error in exclude regexp $Opts{Exclude}: $@") if $@;
  913     }
  914 
  915     return;
  916 }
  917 
  918 sub version ()
  919 {
  920     print "$PACKAGE $REVISION\n";
  921     exit 0;
  922 }
  923 
  924 sub usage ()
  925 {
  926     my ($exitval, $msg) = @_;
  927     $exitval = 0 unless defined($exitval);
  928     $msg ||= '';
  929     $msg =~ s/[\r\n]*$/\n\n/ if $msg;
  930 
  931     die($msg) unless $Opts{Command_Line};
  932 
  933     my $trust = defined($Cfg{Trusted}) ? $Cfg{Trusted} : 'same host only';
  934 
  935     select(STDERR) if $exitval;
  936     print "$msg$PACKAGE $REVISION
  937 
  938 Usage: checklink <options> <uris>
  939 Options:
  940  -s, --summary              Result summary only.
  941  -b, --broken               Show only the broken links, not the redirects.
  942  -e, --directory            Hide directory redirects, for example
  943                             http://www.w3.org/TR -> http://www.w3.org/TR/
  944  -r, --recursive            Check the documents linked from the first one.
  945  -D, --depth N              Check the documents linked from the first one to
  946                             depth N (implies --recursive).
  947  -l, --location URI         Scope of the documents checked in recursive mode
  948                             (implies --recursive).  Can be specified multiple
  949                             times.  If not specified, the default eg. for
  950                             http://www.w3.org/TR/html4/Overview.html
  951                             would be http://www.w3.org/TR/html4/
  952  -X, --exclude REGEXP       Do not check links whose full, canonical URIs
  953                             match REGEXP; also limits recursion the same way
  954                             as --exclude-docs with the same regexp would.
  955                             This option may be specified multiple times.
  956  --exclude-docs REGEXP      In recursive mode, do not check links in documents
  957                             whose full, canonical URIs match REGEXP.  This
  958                             option may be specified multiple times.
  959  --exclude-url-file FILE    If FILE exists, treat each line as a string
  960                             specifying another exclude; quotemeta is called
  961                             to make them regexps. If FILE does not exist,
  962                             open it for writing and write each checked url
  963                             which gets a 200 response to it.
  964  --suppress-redirect URI->URI  Do not report a redirect from the first to the
  965                             second URI.  This option may be specified multiple
  966                             times.
  967  --suppress-redirect-prefix URI->URI  Do not report a redirect from a child of
  968                             the first URI to the same child of the second URI.
  969                             This option may be specified multiple times.
  970  --suppress-temp-redirects  Suppress warnings about temporary redirects.
  971  --suppress-broken CODE:URI  Do not report a broken link with the given CODE.
  972                             CODE is HTTP response, or -1 for robots exclusion.
  973                             This option may be specified multiple times.
  974  --suppress-fragment URI    Do not report the given broken fragment URI.
  975                             A fragment URI contains \"#\".  This option may be
  976                             specified multiple times.
  977  -L, --languages LANGS      Accept-Language header to send.  The special value
  978                             'auto' causes autodetection from the environment.
  979  -c, --cookies FILE         Use cookies, load/save them in FILE.  The special
  980                             value 'tmp' causes non-persistent use of cookies.
  981  -R, --no-referer           Do not send the Referer HTTP header.
  982  -q, --quiet                No output if no errors are found (implies -s).
  983  -v, --verbose              Verbose mode.
  984  -i, --indicator            Show percentage of lines processed while parsing.
  985  -u, --user USERNAME        Specify a username for authentication.
  986  -p, --password PASSWORD    Specify a password.
  987  --hide-same-realm          Hide 401's that are in the same realm as the
  988                             document checked.
  989  -S, --sleep SECS           Sleep SECS seconds between requests to each server
  990                             (default and minimum: 1 second).
  991  -t, --timeout SECS         Timeout for requests in seconds (default: 30).
  992  -d, --domain DOMAIN        Regular expression describing the domain to which
  993                             authentication information will be sent
  994                             (default: $trust).
  995  --masquerade \"BASE1 BASE2\" Masquerade base URI BASE1 as BASE2.  See the
  996                             manual page for more information.
  997  -H, --html                 HTML output.
  998  -?, -h, --help             Show this message and exit.
  999  -V, --version              Output version information and exit.
 1000 
 1001 See \"perldoc LWP\" for information about proxy server support,
 1002 \"perldoc Net::FTP\" for information about various environment variables
 1003 affecting FTP connections and \"perldoc Net::NNTP\" for setting a default
 1004 NNTP server for news: URIs.
 1005 
 1006 The W3C_CHECKLINK_CFG environment variable can be used to set the
 1007 configuration file to use.  See details in the full manual page, it can
 1008 be displayed with: perldoc checklink
 1009 
 1010 More documentation at: $Cfg{Doc_URI}
 1011 Please send bug reports and comments to the www-validator mailing list:
 1012   www-validator\@w3.org (with 'checklink' in the subject)
 1013   Archives are at: http://lists.w3.org/Archives/Public/www-validator/
 1014 ";
 1015     exit $exitval;
 1016 }
 1017 
 1018 sub ask_password ()
 1019 {
 1020     eval {
 1021         local $SIG{__DIE__} = undef;
 1022         require Term::ReadKey;
 1023         Term::ReadKey->require_version(2.00);
 1024         Term::ReadKey->import(qw(ReadMode));
 1025     };
 1026     if ($@) {
 1027         warn('Warning: Term::ReadKey 2.00 or newer not available, ' .
 1028                 "password input disabled.\n");
 1029         return;
 1030     }
 1031     printf(STDERR 'Enter the password for user %s: ', $Opts{User});
 1032     ReadMode('noecho', *STDIN);
 1033     chomp($Opts{Password} = <STDIN>);
 1034     ReadMode('restore', *STDIN);
 1035     print(STDERR "ok.\n");
 1036     return;
 1037 }
 1038 
 1039 ###############################################################################
 1040 
 1041 ###########################################################################
 1042 # Guess an Accept-Language header based on the $LANG environment variable #
 1043 ###########################################################################
 1044 
 1045 sub guess_language ()
 1046 {
 1047     my $lang = $ENV{LANG} or return;
 1048 
 1049     $lang =~ s/[\.@].*$//;    # en_US.UTF-8, fi_FI@euro...
 1050 
 1051     return 'en' if ($lang eq 'C' || $lang eq 'POSIX');
 1052 
 1053     my $res = undef;
 1054     eval {
 1055         require Locale::Language;
 1056         if (my $tmp = Locale::Language::language2code($lang)) {
 1057             $lang = $tmp;
 1058         }
 1059         if (my ($l, $c) = (lc($lang) =~ /^([a-z]+)(?:[-_]([a-z]+))?/)) {
 1060             if (Locale::Language::code2language($l)) {
 1061                 $res = $l;
 1062                 if ($c) {
 1063                     require Locale::Country;
 1064                     $res .= "-$c" if Locale::Country::code2country($c);
 1065                 }
 1066             }
 1067         }
 1068     };
 1069     return $res;
 1070 }
 1071 
 1072 ############################
 1073 # Transform foo into a URI #
 1074 ############################
 1075 
 1076 sub urize ($)
 1077 {
 1078     my $arg  = shift;
 1079     my $uarg = URI::Escape::uri_unescape($arg);
 1080     my $uri;
 1081     if (-d $uarg) {
 1082 
 1083         # look for an "index" file in dir, return it if found
 1084         require File::Spec;
 1085         for my $index (map { File::Spec->catfile($uarg, $_) }
 1086             qw(index.html index.xhtml index.htm index.xhtm))
 1087         {
 1088             if (-e $index) {
 1089                 $uri = URI::file->new_abs($index);
 1090                 last;
 1091             }
 1092         }
 1093 
 1094         # return dir itself if an index file was not found
 1095         $uri ||= URI::file->new_abs($uarg);
 1096     }
 1097     elsif ($uarg =~ /^[.\/\\]/ || -e $uarg) {
 1098         $uri = URI::file->new_abs($uarg);
 1099     }
 1100     else {
 1101         my $newuri = URI->new($arg);
 1102         if ($newuri->scheme()) {
 1103             $uri = $newuri;
 1104         }
 1105         else {
 1106             local $ENV{URL_GUESS_PATTERN} = '';
 1107             $uri = URI::Heuristic::uf_uri($arg);
 1108             $uri = URI::file->new_abs($uri) unless $uri->scheme();
 1109         }
 1110     }
 1111     return $uri->canonical();
 1112 }
 1113 
 1114 ########################################
 1115 # Check for broken links in a resource #
 1116 ########################################
 1117 
 1118 sub check_uri (\%\$$$$;\$$)
 1119 {
 1120     my ($params, $uri, $check_num, $depth, $cookie, $referer, $is_start) = @_;
 1121     $is_start ||= ($check_num == 1);
 1122 
 1123     my $start = $Opts{Summary_Only} ? 0 : &get_timestamp();
 1124 
 1125     # Get and parse the document
 1126     my $response = &get_document(
 1127         'GET',   $uri,    $doc_count, \%redirects, $referer,
 1128         $cookie, $params, $check_num, $is_start
 1129     );
 1130 
 1131     # Can we check the resource? If not, we exit here...
 1132     return if defined($response->{Stop});
 1133 
 1134     if ($Opts{HTML}) {
 1135         &html_header($uri, $cookie) if ($check_num == 1);
 1136         &print_form($params, $cookie, $check_num) if $is_start;
 1137     }
 1138 
 1139     if ($is_start) { # Starting point of a new check, eg. from the command line
 1140           # Use the first URI as the recursion base unless specified otherwise.
 1141         push(@{$Opts{Base_Locations}}, $response->{absolute_uri}->canonical())
 1142             unless @{$Opts{Base_Locations}};
 1143     }
 1144     else {
 1145 
 1146         # Before fetching the document, we don't know if we'll be within the
 1147         # recursion scope or not (think redirects).
 1148         if (!&in_recursion_scope($response->{absolute_uri})) {
 1149             hprintf("Not in recursion scope: %s\n", $response->{absolute_uri})
 1150                 if ($Opts{Verbose});
 1151             $response->content("");
 1152             return;
 1153         }
 1154     }
 1155 
 1156     # Define the document header, and perhaps print it.
 1157     # (It might still be defined if the previous document had no errors;
 1158     # just redefine it in that case.)
 1159 
 1160     if ($check_num != 1) {
 1161         if ($Opts{HTML}) {
 1162             $doc_header = "\n<hr />\n";
 1163         }
 1164         else {
 1165             $doc_header = "\n" . ('-' x 40) . "\n";
 1166         }
 1167     }
 1168 
 1169     if ($Opts{HTML}) {
 1170         $doc_header .=
 1171             ("<h2>\nProcessing\t" . &show_url($response->{absolute_uri}) .
 1172                 "\n</h2>\n\n");
 1173     }
 1174     else {
 1175         $doc_header .= "\nProcessing\t$response->{absolute_uri}\n\n";
 1176     }
 1177 
 1178     if (!$Opts{Quiet}) {
 1179         print_doc_header();
 1180     }
 1181 
 1182     # We are checking a new document
 1183     $doc_count++;
 1184 
 1185     my $result_anchor = 'results' . $doc_count;
 1186 
 1187     if ($check_num == 1 && !$Opts{HTML} && !$Opts{Summary_Only}) {
 1188         my $s = $Opts{Sleep_Time} == 1 ? '' : 's';
 1189         my $acclang = $Opts{Accept_Language} || '(not sent)';
 1190         my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending';
 1191         my $cookies = 'not used';
 1192         if (defined($Opts{Cookies})) {
 1193             $cookies = 'used, ';
 1194             if ($Opts{Cookies} eq 'tmp') {
 1195                 $cookies .= 'non-persistent';
 1196             }
 1197             else {
 1198                 $cookies .= "file $Opts{Cookies}";
 1199             }
 1200         }
 1201         printf(
 1202             <<'EOF', $Accept, $acclang, $send_referer, $cookies, $Opts{Sleep_Time}, $s);
 1203 
 1204 Settings used:
 1205 - Accept: %s
 1206 - Accept-Language: %s
 1207 - Referer: %s
 1208 - Cookies: %s
 1209 - Sleeping %d second%s between requests to each server
 1210 EOF
 1211         printf("- Excluding links matching %s\n", $Opts{Exclude})
 1212             if defined($Opts{Exclude});
 1213         printf("- Excluding links in documents whose URIs match %s\n",
 1214             join(', ', @{$Opts{Exclude_Docs}}))
 1215             if @{$Opts{Exclude_Docs}};
 1216     }
 1217 
 1218     if ($Opts{HTML}) {
 1219         if (!$Opts{Summary_Only}) {
 1220             my $accept       = &encode($Accept);
 1221             my $acclang      = &encode($Opts{Accept_Language} || '(not sent)');
 1222             my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending';
 1223             my $s            = $Opts{Sleep_Time} == 1 ? '' : 's';
 1224             printf(
 1225                 <<'EOF', $accept, $acclang, $send_referer, $Opts{Sleep_Time}, $s);
 1226 <div class="settings">
 1227 Settings used:
 1228  <ul>
 1229   <li><tt><a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.1">Accept</a></tt>: %s</li>
 1230   <li><tt><a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4">Accept-Language</a></tt>: %s</li>
 1231   <li><tt><a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.36">Referer</a></tt>: %s</li>
 1232   <li>Sleeping %d second%s between requests to each server</li>
 1233  </ul>
 1234 </div>
 1235 EOF
 1236             printf("<p>Go to <a href=\"#%s\">the results</a>.</p>\n",
 1237                 $result_anchor);
 1238             my $esc_uri = URI::Escape::uri_escape($response->{absolute_uri},
 1239                 "^A-Za-z0-9.");
 1240             print "<p>For reliable link checking results, check ";
 1241 
 1242             if (!$response->{IsCss}) {
 1243                 printf("<a href=\"%s\">HTML validity</a> and ",
 1244                     &encode(sprintf($Cfg{Markup_Validator_URI}, $esc_uri)));
 1245             }
 1246             printf(
 1247                 "<a href=\"%s\">CSS validity</a> first.</p>
 1248 <p>Back to the <a accesskey=\"1\" href=\"%s\">link checker</a>.</p>\n",
 1249                 &encode(sprintf($Cfg{CSS_Validator_URI}, $esc_uri)),
 1250                 &encode($Opts{_Self_URI})
 1251             );
 1252 
 1253             printf(<<'EOF', $result_anchor);
 1254 <div class="progress" id="progress%s">
 1255 <h3>Status: <span></span></h3>
 1256 <div class="progressbar"><div></div></div>
 1257 <pre>
 1258 EOF
 1259         }
 1260     }
 1261 
 1262     if ($Opts{Summary_Only} && !$Opts{Quiet}) {
 1263         print '<p>' if $Opts{HTML};
 1264         print 'This may take some time';
 1265         print "... (<a href=\"$Cfg{Doc_URI}#wait\">why?</a>)</p>"
 1266             if $Opts{HTML};
 1267         print " if the document has many links to check.\n" unless $Opts{HTML};
 1268     }
 1269 
 1270     # Record that we have processed this resource
 1271     $processed{$response->{absolute_uri}} = 1;
 1272 
 1273     # Parse the document
 1274     my $p =
 1275         &parse_document($uri, $response->base(), $response, 1, ($depth != 0));
 1276     my $base = URI->new($p->{base});
 1277 
 1278     # Check anchors
 1279     ###############
 1280 
 1281     print "Checking anchors...\n" unless $Opts{Summary_Only};
 1282 
 1283     my %errors;
 1284     while (my ($anchor, $lines) = each(%{$p->{Anchors}})) {
 1285         if (!length($anchor)) {
 1286 
 1287             # Empty IDREF's are not allowed
 1288             $errors{$anchor} = 1;
 1289         }
 1290         else {
 1291             my $times = 0;
 1292             $times += $_ for values(%$lines);
 1293 
 1294             # They should appear only once
 1295             $errors{$anchor} = 1 if ($times > 1);
 1296         }
 1297     }
 1298     print " done.\n" unless $Opts{Summary_Only};
 1299 
 1300     # Check links
 1301     #############
 1302 
 1303     &hprintf("Recording all the links found: %d\n",
 1304         scalar(keys %{$p->{Links}}))
 1305         if ($Opts{Verbose});
 1306     my %links;
 1307     my %hostlinks;
 1308 
 1309     # Record all the links found
 1310     while (my ($link, $lines) = each(%{$p->{Links}})) {
 1311         my $link_uri = URI->new($link);
 1312         my $abs_link_uri = URI->new_abs($link_uri, $base);
 1313 
 1314         if ($Opts{Masquerade}) {
 1315             if ($abs_link_uri =~ m|^\Q$Opts{Masquerade_From}\E|) {
 1316                 print_doc_header();
 1317                 printf("processing %s in base %s\n",
 1318                     $abs_link_uri, $Opts{Masquerade_To});
 1319                 my $nlink = $abs_link_uri;
 1320                 $nlink =~ s|^\Q$Opts{Masquerade_From}\E|$Opts{Masquerade_To}|;
 1321                 $abs_link_uri = URI->new($nlink);
 1322             }
 1323         }
 1324 
 1325         my $canon_uri = URI->new($abs_link_uri->canonical());
 1326         my $fragment  = $canon_uri->fragment(undef);
 1327         if (!defined($Opts{Exclude}) || $canon_uri !~ $Opts{Exclude}) {
 1328             if (!exists($links{$canon_uri})) {
 1329                 my $hostport;
 1330                 $hostport = $canon_uri->host_port()
 1331                     if $canon_uri->can('host_port');
 1332                 $hostport = '' unless defined $hostport;
 1333                 push(@{$hostlinks{$hostport}}, $canon_uri);
 1334             }
 1335             for my $line_num (keys(%$lines)) {
 1336                 if (!defined($fragment) || !length($fragment)) {
 1337 
 1338                     # Document without fragment
 1339                     $links{$canon_uri}{location}{$line_num} = 1;
 1340                 }
 1341                 else {
 1342 
 1343                     # Resource with a fragment
 1344                     $links{$canon_uri}{fragments}{$fragment}{$line_num} = 1;
 1345                 }
 1346             }
 1347         } else {
 1348           hprintf("excluded via options: %s\n", $canon_uri)
 1349               if ($Opts{Verbose});
 1350         }
 1351     }
 1352 
 1353     my @order = &distribute_links(\%hostlinks);
 1354     undef %hostlinks;
 1355 
 1356     # Build the list of broken URI's
 1357 
 1358     my $nlinks = scalar(@order);
 1359 
 1360     &hprintf("Checking %d links to build list of broken URI's\n", $nlinks)
 1361         if ($Opts{Verbose});
 1362 
 1363     my %broken;
 1364     my $link_num = 0;
 1365     for my $u (@order) {
 1366         my $ulinks = $links{$u};
 1367 
 1368         if ($Opts{Summary_Only}) {
 1369 
 1370             # Hack: avoid browser/server timeouts in summary only CGI mode, bug 896
 1371             print ' ' if ($Opts{HTML} && !$Opts{Command_Line});
 1372         }
 1373         else {
 1374             &hprintf("\nChecking link %s\n", $u);
 1375             my $progress = ($link_num / $nlinks) * 100;
 1376             printf(
 1377                 '<script type="text/javascript">show_progress("%s", "Checking link %s", "%.1f%%");</script>',
 1378                 $result_anchor, &encode($u), $progress)
 1379                 if (!$Opts{Command_Line} &&
 1380                 $Opts{HTML} &&
 1381                 !$Opts{Summary_Only});
 1382         }
 1383         $link_num++;
 1384 
 1385         # Check that a link is valid
 1386         &check_validity($uri, $u, ($depth != 0 && &in_recursion_scope($u)),
 1387             \%links, \%redirects);
 1388         &hprintf("\tReturn code: %s\n", $results{$u}{location}{code})
 1389             if ($Opts{Verbose});
 1390         if ($Opts{Exclude_File_Write} && $results{$u}{location}{code} == 200) {
 1391             my $fh = $Opts{Exclude_File_Write};
 1392             print $fh ("$u\n");
 1393         }
 1394         if ($results{$u}{location}{success}) {
 1395 
 1396             # Even though it was not broken, we might want to display it
 1397             # on the results page (e.g. because it required authentication)
 1398             $broken{$u}{location} = 1
 1399                 if ($results{$u}{location}{display} >= 400);
 1400 
 1401             # List the broken fragments
 1402             while (my ($fragment, $lines) = each(%{$ulinks->{fragments}})) {
 1403 
 1404                 my $fragment_ok = $results{$u}{fragments}{$fragment};
 1405 
 1406                 if ($Opts{Verbose}) {
 1407                     my @line_nums = sort { $a <=> $b } keys(%$lines);
 1408                     &hprintf(
 1409                         "\t\t%s %s - Line%s: %s\n",
 1410                         $fragment,
 1411                         $fragment_ok             ? 'OK' : 'Not found',
 1412                         (scalar(@line_nums) > 1) ? 's'  : '',
 1413                         join(', ', @line_nums)
 1414                     );
 1415                 }
 1416 
 1417                 # A broken fragment?
 1418                 $broken{$u}{fragments}{$fragment} += 2 unless $fragment_ok;
 1419             }
 1420         }
 1421         elsif (!($Opts{Quiet} && &informational($results{$u}{location}{code})))
 1422         {
 1423 
 1424             # Couldn't find the document
 1425             $broken{$u}{location} = 1;
 1426 
 1427             # All the fragments associated are hence broken
 1428             for my $fragment (keys %{$ulinks->{fragments}}) {
 1429                 $broken{$u}{fragments}{$fragment}++;
 1430             }
 1431         }
 1432     }
 1433     &hprintf(
 1434         "\nProcessed in %s seconds.\n",
 1435         &time_diff($start, &get_timestamp())
 1436     ) unless $Opts{Summary_Only};
 1437     printf(
 1438         '<script type="text/javascript">show_progress("%s", "Done. Document processed in %s seconds.", "100%%");</script>',
 1439         $result_anchor, &time_diff($start, &get_timestamp()))
 1440         if ($Opts{HTML} && !$Opts{Summary_Only});
 1441 
 1442     # Display results
 1443     if ($Opts{HTML} && !$Opts{Summary_Only}) {
 1444         print("</pre>\n</div>\n");
 1445         printf("<h2><a name=\"%s\">Results</a></h2>\n", $result_anchor);
 1446     }
 1447     print "\n" unless $Opts{Quiet};
 1448 
 1449     &links_summary(\%links, \%results, \%broken, \%redirects);
 1450     &anchors_summary($p->{Anchors}, \%errors);
 1451 
 1452     # Do we want to process other documents?
 1453     if ($depth != 0) {
 1454 
 1455         for my $u (map { URI->new($_) } keys %links) {
 1456 
 1457             next unless $results{$u}{location}{success};    # Broken link?
 1458 
 1459             next unless &in_recursion_scope($u);
 1460 
 1461             # Do we understand its content type?
 1462             next unless ($results{$u}{location}{type} =~ $ContentTypes);
 1463 
 1464             # Have we already processed this URI?
 1465             next if &already_processed($u, $uri);
 1466 
 1467             # Do the job
 1468             print "\n" unless $Opts{Quiet};
 1469             if ($Opts{HTML}) {
 1470                 if (!$Opts{Command_Line}) {
 1471                     if ($doc_count == $Opts{Max_Documents}) {
 1472                         print(
 1473                             "<hr />\n<p><strong>Maximum number of documents ($Opts{Max_Documents}) reached!</strong></p>\n"
 1474                         );
 1475                     }
 1476                     if ($doc_count >= $Opts{Max_Documents}) {
 1477                         $doc_count++;
 1478                         print("<p>Not checking <strong>$u</strong></p>\n");
 1479                         $processed{$u} = 1;
 1480                         next;
 1481                     }
 1482                 }
 1483             }
 1484 
 1485             # This is an inherently recursive algorithm, so Perl's warning is not
 1486             # helpful.  You may wish to comment this out when debugging, though.
 1487             no warnings 'recursion';
 1488 
 1489             if ($depth < 0) {
 1490                 &check_uri($params, $u, 0, -1, $cookie, $uri);
 1491             }
 1492             else {
 1493                 &check_uri($params, $u, 0, $depth - 1, $cookie, $uri);
 1494             }
 1495         }
 1496     }
 1497     return;
 1498 }
 1499 
 1500 ###############################################################
 1501 # Distribute links based on host:port to avoid RobotUA delays #
 1502 ###############################################################
 1503 
 1504 sub distribute_links(\%)
 1505 {
 1506     my $hostlinks = shift;
 1507 
 1508     # Hosts ordered by weight (number of links), descending
 1509     my @order =
 1510         sort { scalar(@{$hostlinks->{$b}}) <=> scalar(@{$hostlinks->{$a}}) }
 1511         keys %$hostlinks;
 1512 
 1513     # All link list flattened into one, in host weight order
 1514     my @all;
 1515     push(@all, @{$hostlinks->{$_}}) for @order;
 1516 
 1517     return @all if (scalar(@order) < 2);
 1518 
 1519     # Indexes and chunk size for "zipping" the end result list
 1520     my $num = scalar(@{$hostlinks->{$order[0]}});
 1521     my @indexes = map { $_ * $num } (0 .. $num - 1);
 1522 
 1523     # Distribute them
 1524     my @result;
 1525     while (my @chunk = splice(@all, 0, $num)) {
 1526         @result[@indexes] = @chunk;
 1527         @indexes = map { $_ + 1 } @indexes;
 1528     }
 1529 
 1530     # Weed out undefs
 1531     @result = grep(defined, @result);
 1532 
 1533     return @result;
 1534 }
 1535 
 1536 ##########################################
 1537 # Decode Content-Encodings in a response #
 1538 ##########################################
 1539 
 1540 sub decode_content ($)
 1541 {
 1542     my $response = shift;
 1543     my $error    = undef;
 1544 
 1545     my $docref = $response->decoded_content(ref => 1);
 1546     if (defined($docref)) {
 1547         utf8::encode($$docref);
 1548         $response->content_ref($docref);
 1549 
 1550         # Remove Content-Encoding so it won't be decoded again later.
 1551         $response->remove_header('Content-Encoding');
 1552     }
 1553     else {
 1554         my $ce = $response->header('Content-Encoding');
 1555         $ce = defined($ce) ? "'$ce'" : 'undefined';
 1556         my $ct = $response->header('Content-Type');
 1557         $ct = defined($ct) ? "'$ct'" : 'undefined';
 1558         my $request_uri = $response->request->url;
 1559 
 1560         my $cs = $response->content_charset();
 1561         $cs = defined($cs) ? "'$cs'" : 'unknown';
 1562         $error =
 1563             "Error decoding document at <$request_uri>, Content-Type $ct, " .
 1564             "Content-Encoding $ce, content charset $cs: '$@'";
 1565     }
 1566     return $error;
 1567 }
 1568 
 1569 #######################################
 1570 # Get and parse a resource to process #
 1571 #######################################
 1572 
 1573 sub get_document ($\$$;\%\$$$$$)
 1574 {
 1575     my ($method, $uri,    $in_recursion, $redirects, $referer,
 1576         $cookie, $params, $check_num,    $is_start
 1577     ) = @_;
 1578 
 1579     # $method contains the HTTP method the use (GET or HEAD)
 1580     # $uri object contains the identifier of the resource
 1581     # $in_recursion is > 0 if we are in recursion mode (i.e. it is at least
 1582     #                        the second resource checked)
 1583     # $redirects is a pointer to the hash containing the map of the redirects
 1584     # $referer is the URI object of the referring document
 1585     # $cookie, $params, $check_num, and $is_start are for printing HTTP headers
 1586     #                  and the form if $in_recursion == 0 and not authenticating
 1587 
 1588     # Get the resource
 1589     my $response;
 1590     if (defined($results{$uri}{response}) &&
 1591         !($method eq 'GET' && $results{$uri}{method} eq 'HEAD'))
 1592     {
 1593         $response = $results{$uri}{response};
 1594     }
 1595     else {
 1596         $response = &get_uri($method, $uri, $referer);
 1597         &record_results($uri, $method, $response, $referer);
 1598         &record_redirects($redirects, $response);
 1599     }
 1600     if (!$response->is_success()) {
 1601         if (!$in_recursion) {
 1602 
 1603             # Is it too late to request authentication?
 1604             if ($response->code() == 401) {
 1605                 &authentication($response, $cookie, $params, $check_num,
 1606                     $is_start);
 1607             }
 1608             else {
 1609                 if ($Opts{HTML}) {
 1610                     &html_header($uri, $cookie) if ($check_num == 1);
 1611                     &print_form($params, $cookie, $check_num) if $is_start;
 1612                     print "<p>", &status_icon($response->code());
 1613                 }
 1614                 &hprintf("\nError: %d %s\n",
 1615                     $response->code(), $response->message() || '(no message)');
 1616                 print "</p>\n" if $Opts{HTML};
 1617             }
 1618         }
 1619         $response->{Stop} = 1;
 1620         $response->content("");
 1621         return ($response);
 1622     }
 1623 
 1624     # What is the URI of the resource that we are processing by the way?
 1625     my $base_uri    = $response->base();
 1626     my $request_uri = URI->new($response->request->url);
 1627     $response->{absolute_uri} = $request_uri->abs($base_uri);
 1628 
 1629     # Can we parse the document?
 1630     my $failed_reason;
 1631     my $ct = $response->header('Content-Type');
 1632     if (!$ct || $ct !~ $ContentTypes) {
 1633         $failed_reason = "Content-Type for <$request_uri> is " .
 1634             (defined($ct) ? "'$ct'" : 'undefined');
 1635     }
 1636     else {
 1637         $failed_reason = decode_content($response);
 1638     }
 1639     if ($failed_reason) {
 1640 
 1641         # No, there is a problem...
 1642         if (!$in_recursion) {
 1643             if ($Opts{HTML}) {
 1644                 &html_header($uri, $cookie) if ($check_num == 1);
 1645                 &print_form($params, $cookie, $check_num) if $is_start;
 1646                 print "<p>", &status_icon(406);
 1647 
 1648             }
 1649             &hprintf("Can't check links: %s.\n", $failed_reason);
 1650             print "</p>\n" if $Opts{HTML};
 1651         }
 1652         $response->{Stop} = 1;
 1653         $response->content("");
 1654     }
 1655 
 1656     # Ok, return the information
 1657     return ($response);
 1658 }
 1659 
 1660 #########################################################
 1661 # Check whether a URI is within the scope of recursion. #
 1662 #########################################################
 1663 
 1664 sub in_recursion_scope (\$)
 1665 {
 1666     my ($uri) = @_;
 1667     return 0 unless $uri;
 1668 
 1669     my $candidate = $uri->canonical();
 1670 
 1671     return 0 if (defined($Opts{Exclude}) && $candidate =~ $Opts{Exclude});
 1672 
 1673     for my $excluded_doc (@{$Opts{Exclude_Docs}}) {
 1674         return 0 if ($candidate =~ $excluded_doc);
 1675     }
 1676 
 1677     for my $base (@{$Opts{Base_Locations}}) {
 1678         my $rel = $candidate->rel($base);
 1679         next if ($candidate eq $rel);    # Relative path not possible?
 1680         next if ($rel =~ m|^(\.\.)?/|);  # Relative path upwards?
 1681         return 1;
 1682     }
 1683 
 1684     return 0;    # We always have at least one base location, but none matched.
 1685 }
 1686 
 1687 #################################
 1688 # Check for content type match. #
 1689 #################################
 1690 
 1691 sub is_content_type ($$)
 1692 {
 1693     my ($candidate, $type) = @_;
 1694     return 0 unless ($candidate && $type);
 1695     my @v = HTTP::Headers::Util::split_header_words($candidate);
 1696     return scalar(@v) ? $type eq lc($v[0]->[0]) : 0;
 1697 }
 1698 
 1699 ##################################################
 1700 # Check whether a URI has already been processed #
 1701 ##################################################
 1702 
 1703 sub already_processed (\$\$)
 1704 {
 1705     my ($uri, $referer) = @_;
 1706 
 1707     # Don't be verbose for that part...
 1708     my $summary_value = $Opts{Summary_Only};
 1709     $Opts{Summary_Only} = 1;
 1710 
 1711     # Do a GET: if it fails, we stop, if not, the results are cached
 1712     my $response = &get_document('GET', $uri, 1, undef, $referer);
 1713 
 1714     # ... but just for that part
 1715     $Opts{Summary_Only} = $summary_value;
 1716 
 1717     # Can we process the resource?
 1718     return -1 if defined($response->{Stop});
 1719 
 1720     # Have we already processed it?
 1721     return 1 if defined($processed{$response->{absolute_uri}->as_string()});
 1722 
 1723     # It's not processed yet and it is processable: return 0
 1724     return 0;
 1725 }
 1726 
 1727 ############################
 1728 # Get the content of a URI #
 1729 ############################
 1730 
 1731 sub get_uri ($\$;\$$\%$$$$)
 1732 {
 1733 
 1734     # Here we have a lot of extra parameters in order not to lose information
 1735     # if the function is called several times (401's)
 1736     my ($method, $uri,   $referer, $start, $redirects,
 1737         $code,   $realm, $message, $auth
 1738     ) = @_;
 1739 
 1740     # $method contains the method used
 1741     # $uri object contains the target of the request
 1742     # $referer is the URI object of the referring document
 1743     # $start is a timestamp (not defined the first time the function is called)
 1744     # $redirects is a map of redirects
 1745     # $code is the first HTTP return code
 1746     # $realm is the realm of the request
 1747     # $message is the HTTP message received
 1748     # $auth equals 1 if we want to send out authentication information
 1749 
 1750     # For timing purposes
 1751     $start = &get_timestamp() unless defined($start);
 1752 
 1753     # Prepare the query
 1754 
 1755     # Do we want printouts of progress?
 1756     my $verbose_progress =
 1757         !($Opts{Summary_Only} || (!$doc_count && $Opts{HTML}));
 1758 
 1759     &hprintf("%s %s ", $method, $uri) if $verbose_progress;
 1760 
 1761     my $request = HTTP::Request->new($method, $uri);
 1762 
 1763     $request->header('Accept-Language' => $Opts{Accept_Language})
 1764         if $Opts{Accept_Language};
 1765     $request->header('Accept', $Accept);
 1766     $request->accept_decodable();
 1767 
 1768     # Are we providing authentication info?
 1769     if ($auth && $request->url()->host() =~ $Opts{Trusted}) {
 1770         if (defined($ENV{HTTP_AUTHORIZATION})) {
 1771             $request->header(Authorization => $ENV{HTTP_AUTHORIZATION});
 1772         }
 1773         elsif (defined($Opts{User}) && defined($Opts{Password})) {
 1774             $request->authorization_basic($Opts{User}, $Opts{Password});
 1775         }
 1776     }
 1777 
 1778     # Tell the user agent if we want progress reports for redirects or not.
 1779     $ua->redirect_progress_callback(sub { &hprintf("\n-> %s %s ", @_); })
 1780         if $verbose_progress;
 1781 
 1782     # Set referer
 1783     $request->referer($referer) if (!$Opts{No_Referer} && $referer);
 1784 
 1785     # Telling caches in the middle we want a fresh copy (Bug 4998)
 1786     $request->header(Cache_Control => "max-age=0");
 1787 
 1788     # Do the query
 1789     my $response = $ua->request($request);
 1790 
 1791     # Get the results
 1792     # Record the very first response
 1793     if (!defined($code)) {
 1794         ($code, $message) = delete(@$ua{qw(FirstResponse FirstMessage)});
 1795     }
 1796 
 1797     # Authentication requested?
 1798     if ($response->code() == 401 &&
 1799         !defined($auth) &&
 1800         (defined($ENV{HTTP_AUTHORIZATION}) ||
 1801             (defined($Opts{User}) && defined($Opts{Password})))
 1802         )
 1803     {
 1804 
 1805         # Set host as trusted domain unless we already have one.
 1806         if (!$Opts{Trusted}) {
 1807             my $re = sprintf('^%s$', quotemeta($response->base()->host()));
 1808             $Opts{Trusted} = qr/$re/io;
 1809         }
 1810 
 1811         # Deal with authentication and avoid loops
 1812         if (!defined($realm) &&
 1813             $response->www_authenticate() =~ /Basic realm=\"([^\"]+)\"/)
 1814         {
 1815             $realm = $1;
 1816         }
 1817 
 1818         print "\n" if $verbose_progress;
 1819         return &get_uri($method, $response->request()->url(),
 1820             $referer, $start, $redirects, $code, $realm, $message, 1);
 1821     }
 1822 
 1823     # @@@ subtract robot delay from the "fetched in" time?
 1824     &hprintf(" fetched in %s seconds\n", &time_diff($start, &get_timestamp()))
 1825         if $verbose_progress;
 1826 
 1827     $response->{IsCss} =
 1828         is_content_type($response->content_type(), "text/css");
 1829     $response->{Realm} = $realm if defined($realm);
 1830 
 1831     return $response;
 1832 }
 1833 
 1834 #########################################
 1835 # Record the results of an HTTP request #
 1836 #########################################
 1837 
 1838 sub record_results (\$$$$)
 1839 {
 1840     my ($uri, $method, $response, $referer) = @_;
 1841     $results{$uri}{referer}        = $referer;
 1842     $results{$uri}{response}       = $response;
 1843     $results{$uri}{method}         = $method;
 1844     $results{$uri}{location}{code} = $response->code();
 1845     $results{$uri}{location}{code} = RC_ROBOTS_TXT()
 1846         if ($results{$uri}{location}{code} == 403 &&
 1847         $response->message() =~ /Forbidden by robots\.txt/);
 1848     $results{$uri}{location}{code} = RC_IP_DISALLOWED()
 1849         if ($results{$uri}{location}{code} == 403 &&
 1850         $response->message() =~ /non-public IP/);
 1851     $results{$uri}{location}{code} = RC_DNS_ERROR()
 1852         if ($results{$uri}{location}{code} == 500 &&
 1853         $response->message() =~ /Bad hostname '[^\']*'/);
 1854     $results{$uri}{location}{code} = RC_PROTOCOL_DISALLOWED()
 1855         if ($results{$uri}{location}{code} == 500 &&
 1856         $response->message() =~ /Access to '[^\']*' URIs has been disabled/);
 1857     $results{$uri}{location}{type}    = $response->header('Content-type');
 1858     $results{$uri}{location}{display} = $results{$uri}{location}{code};
 1859 
 1860     # Rewind, check for the original code and message.
 1861     for (my $tmp = $response->previous(); $tmp; $tmp = $tmp->previous()) {
 1862         $results{$uri}{location}{orig}         = $tmp->code();
 1863         $results{$uri}{location}{orig_message} = $tmp->message() ||
 1864             '(no message)';
 1865     }
 1866     $results{$uri}{location}{success} = $response->is_success();
 1867 
 1868     # If a suppressed broken link, fill the data structure like a typical success.
 1869     # print STDERR "success? " . $results{$uri}{location}{success} . ": $uri\n";
 1870     if (!$results{$uri}{location}{success}) {
 1871         my $code = $results{$uri}{location}{code};
 1872         my $match = grep { $_ eq "$code:$uri" } @{$Opts{Suppress_Broken}};
 1873         if ($match) {
 1874             $results{$uri}{location}{success} = 1;
 1875             $results{$uri}{location}{code}    = 100;
 1876             $results{$uri}{location}{display} = 100;
 1877         }
 1878     }
 1879 
 1880     # Stores the authentication information
 1881     if (defined($response->{Realm})) {
 1882         $results{$uri}{location}{realm} = $response->{Realm};
 1883         $results{$uri}{location}{display} = 401 unless $Opts{Hide_Same_Realm};
 1884     }
 1885 
 1886     # What type of broken link is it? (stored in {record} - the {display}
 1887     #              information is just for visual use only)
 1888     if ($results{$uri}{location}{display} == 401 &&
 1889         $results{$uri}{location}{code} == 404)
 1890     {
 1891         $results{$uri}{location}{record} = 404;
 1892     }
 1893     else {
 1894         $results{$uri}{location}{record} = $results{$uri}{location}{display};
 1895     }
 1896 
 1897     # Did it fail?
 1898     $results{$uri}{location}{message} = $response->message() || '(no message)';
 1899     if (!$results{$uri}{location}{success}) {
 1900         &hprintf(
 1901             "Error: %d %s\n",
 1902             $results{$uri}{location}{code},
 1903             $results{$uri}{location}{message}
 1904         ) if ($Opts{Verbose});
 1905     }
 1906     return;
 1907 }
 1908 
 1909 ####################
 1910 # Parse a document #
 1911 ####################
 1912 
 1913 sub parse_document (\$\$$$$)
 1914 {
 1915     my ($uri, $base_uri, $response, $links, $rec_needs_links) = @_;
 1916 
 1917     print("parse_document($uri, $base_uri, ..., $links, $rec_needs_links)\n")
 1918         if $Opts{Verbose};
 1919 
 1920     my $p;
 1921 
 1922     if (defined($results{$uri}{parsing})) {
 1923 
 1924         # We have already done the job. Woohoo!
 1925         $p->{base}    = $results{$uri}{parsing}{base};
 1926         $p->{Anchors} = $results{$uri}{parsing}{Anchors};
 1927         $p->{Links}   = $results{$uri}{parsing}{Links};
 1928         return $p;
 1929     }
 1930 
 1931     $p = W3C::LinkChecker->new();
 1932     $p->{base} = $base_uri;
 1933 
 1934     my $stype = $response->header("Content-Style-Type");
 1935     $p->{style_is_css} = !$stype || is_content_type($stype, "text/css");
 1936 
 1937     my $start;
 1938     if (!$Opts{Summary_Only}) {
 1939         $start = &get_timestamp();
 1940         print("Parsing...\n");
 1941     }
 1942 
 1943     # Content-Encoding etc already decoded in get_document().
 1944     my $docref = $response->content_ref();
 1945 
 1946     # Count lines beforehand if needed (for progress indicator, or CSS while
 1947     # we don't get any line context out of the parser).  In case of HTML, the
 1948     # actual final number of lines processed shown is populated by our
 1949     # end_document handler.
 1950     $p->{Total} = ($$docref =~ tr/\n//)
 1951         if ($response->{IsCss} || $Opts{Progress});
 1952 
 1953     # We only look for anchors if we are not interested in the links
 1954     # obviously, or if we are running a recursive checking because we
 1955     # might need this information later
 1956     $p->{only_anchors} = !($links || $rec_needs_links);
 1957 
 1958     if ($response->{IsCss}) {
 1959 
 1960         # Parse as CSS
 1961 
 1962         $p->parse_css($$docref, LINE_UNKNOWN());
 1963     }
 1964     else {
 1965 
 1966         # Parse as HTML
 1967 
 1968         # Transform <?xml:stylesheet ...?> into <xml:stylesheet ...> for parsing
 1969         # Processing instructions are not parsed by process, but in this case
 1970         # it should be. It's expensive, it's horrible, but it's the easiest way
 1971         # for right now.
 1972         $$docref =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/
 1973             unless $p->{only_anchors};
 1974 
 1975         $p->xml_mode(1) if ($response->content_type() =~ /\+xml$/);
 1976 
 1977         $p->parse($$docref)->eof();
 1978     }
 1979 
 1980     $response->content("");
 1981 
 1982     if (!$Opts{Summary_Only}) {
 1983         my $stop = &get_timestamp();
 1984         print "\r" if $Opts{Progress};
 1985         &hprintf(" done (%d lines in %s seconds).\n",
 1986             $p->{Total}, &time_diff($start, $stop));
 1987     }
 1988 
 1989     # Save the results before exiting
 1990     $results{$uri}{parsing}{base}    = $p->{base};
 1991     $results{$uri}{parsing}{Anchors} = $p->{Anchors};
 1992     $results{$uri}{parsing}{Links}   = $p->{Links};
 1993 
 1994     return $p;
 1995 }
 1996 
 1997 ####################################
 1998 # Constructor for W3C::LinkChecker #
 1999 ####################################
 2000 
 2001 sub new
 2002 {
 2003     my $p = HTML::Parser::new(@_, api_version => 3);
 2004     $p->utf8_mode(1);
 2005 
 2006     # Set up handlers
 2007 
 2008     $p->handler(start => 'start', 'self, tagname, attr, line');
 2009     $p->handler(end   => 'end',   'self, tagname, line');
 2010     $p->handler(text  => 'text',  'self, dtext, line');
 2011     $p->handler(
 2012         declaration => sub {
 2013             my $self = shift;
 2014             $self->declaration(substr($_[0], 2, -1));
 2015         },
 2016         'self, text, line'
 2017     );
 2018     $p->handler(end_document => 'end_document', 'self, line');
 2019     if ($Opts{Progress}) {
 2020         $p->handler(default => 'parse_progress', 'self, line');
 2021         $p->{last_percentage} = 0;
 2022     }
 2023 
 2024     # Check <a [..] name="...">?
 2025     $p->{check_name} = 1;
 2026 
 2027     # Check <[..] id="..">?
 2028     $p->{check_id} = 1;
 2029 
 2030     # Don't interpret comment loosely
 2031     $p->strict_comment(1);
 2032 
 2033     return $p;
 2034 }
 2035 
 2036 #################################################
 2037 # Record or return  the doctype of the document #
 2038 #################################################
 2039 
 2040 sub doctype
 2041 {
 2042     my ($self, $dc) = @_;
 2043     return $self->{doctype} unless $dc;
 2044     $_ = $self->{doctype} = $dc;
 2045 
 2046     # What to look for depending on the doctype
 2047 
 2048     # Check for <a name="...">?
 2049     $self->{check_name} = 0
 2050         if m%^-//(W3C|WAPFORUM)//DTD XHTML (Basic|Mobile) %;
 2051 
 2052     # Check for <* id="...">?
 2053     $self->{check_id} = 0
 2054         if (m%^-//IETF//DTD HTML [23]\.0//% || m%^-//W3C//DTD HTML 3\.2//%);
 2055 
 2056     # Enable XML mode (XHTML, XHTML Mobile, XHTML-Print, XHTML+RDFa, ...)
 2057     $self->xml_mode(1) if (m%^-//(W3C|WAPFORUM)//DTD XHTML[ \-\+]%);
 2058 
 2059     return;
 2060 }
 2061 
 2062 ###################################
 2063 # Print parse progress indication #
 2064 ###################################
 2065 
 2066 sub parse_progress
 2067 {
 2068     my ($self, $line) = @_;
 2069     return unless defined($line) && $line > 0 && $self->{Total} > 0;
 2070 
 2071     my $percentage = int($line / $self->{Total} * 100);
 2072     if ($percentage != $self->{last_percentage}) {
 2073         printf("\r%4d%%", $percentage);
 2074         $self->{last_percentage} = $percentage;
 2075     }
 2076 
 2077     return;
 2078 }
 2079 
 2080 #############################
 2081 # Extraction of the anchors #
 2082 #############################
 2083 
 2084 sub get_anchor
 2085 {
 2086     my ($self, $tag, $attr) = @_;
 2087 
 2088     my $anchor = $self->{check_id} ? $attr->{id} : undef;
 2089     if ($self->{check_name} && ($tag eq 'a')) {
 2090 
 2091         # @@@@ In XHTML, <a name="foo" id="foo"> is mandatory
 2092         # Force an error if it's not the case (or if id's and name's values
 2093         #                                      are different)
 2094         # If id is defined, name if defined must have the same value
 2095         $anchor ||= $attr->{name};
 2096     }
 2097 
 2098     return $anchor;
 2099 }
 2100 
 2101 #############################
 2102 # W3C::LinkChecker handlers #
 2103 #############################
 2104 
 2105 sub add_link
 2106 {
 2107     my ($self, $uri, $base, $line) = @_;
 2108     if (defined($uri)) {
 2109 
 2110         # Remove repeated slashes after the . or .. in relative links, to avoid
 2111         # duplicated checking or infinite recursion.
 2112         $uri =~ s|^(\.\.?/)/+|$1|o;
 2113         $uri = Encode::decode_utf8($uri);
 2114         $uri = URI->new_abs($uri, $base) if defined($base);
 2115         $self->{Links}{$uri}{defined($line) ? $line : LINE_UNKNOWN()}++;
 2116     }
 2117     return;
 2118 }
 2119 
 2120 sub start
 2121 {
 2122     my ($self, $tag, $attr, $line) = @_;
 2123     $line = LINE_UNKNOWN() unless defined($line);
 2124 
 2125     # Anchors
 2126     my $anchor = $self->get_anchor($tag, $attr);
 2127     $self->{Anchors}{$anchor}{$line}++ if defined($anchor);
 2128 
 2129     # Links
 2130     if (!$self->{only_anchors}) {
 2131 
 2132         my $tag_local_base = undef;
 2133 
 2134         # Special case: base/@href
 2135         # @@@TODO: The reason for handling <base href> ourselves is that LWP's
 2136         # head parsing magic fails at least for responses that have
 2137         # Content-Encodings: https://rt.cpan.org/Ticket/Display.html?id=54361
 2138         if ($tag eq 'base') {
 2139 
 2140             # Ignore <base> with missing/empty href.
 2141             $self->{base} = $attr->{href}
 2142                 if (defined($attr->{href}) && length($attr->{href}));
 2143         }
 2144 
 2145         # Special case: meta[@http-equiv=Refresh]/@content
 2146         elsif ($tag eq 'meta') {
 2147             if ($attr->{'http-equiv'} &&
 2148                 lc($attr->{'http-equiv'}) eq 'refresh')
 2149             {
 2150                 my $content = $attr->{content};
 2151                 if ($content && $content =~ /.*?;\s*(?:url=)?(.+)/i) {
 2152                     $self->add_link($1, undef, $line);
 2153                 }
 2154             }
 2155         }
 2156 
 2157         # Special case: tags that have "local base"
 2158         elsif ($tag eq 'applet' || $tag eq 'object') {
 2159             if (my $codebase = $attr->{codebase}) {
 2160 
 2161                 # Applet codebases are directories, append trailing slash
 2162                 # if it's not there so that new_abs does the right thing.
 2163                 $codebase .= "/" if ($tag eq 'applet' && $codebase !~ m|/$|);
 2164 
 2165                 # TODO: HTML 4 spec says applet/@codebase may only point to
 2166                 # subdirs of the directory containing the current document.
 2167                 # Should we do something about that?
 2168                 $tag_local_base = URI->new_abs($codebase, $self->{base});
 2169             }
 2170         }
 2171 
 2172         # Link attributes:
 2173         if (my $link_attrs = LINK_ATTRS()->{$tag}) {
 2174             for my $la (@$link_attrs) {
 2175                 $self->add_link($attr->{$la}, $tag_local_base, $line);
 2176             }
 2177         }
 2178 
 2179         # List of links attributes:
 2180         if (my $link_attrs = LINK_LIST_ATTRS()->{$tag}) {
 2181             my ($sep, $attrs) = @$link_attrs;
 2182             for my $la (@$attrs) {
 2183                 if (defined(my $value = $attr->{$la})) {
 2184                     for my $link (split($sep, $value)) {
 2185                         $self->add_link($link, $tag_local_base, $line);
 2186                     }
 2187                 }
 2188             }
 2189         }
 2190 
 2191         # Inline CSS:
 2192         delete $self->{csstext};
 2193         if ($tag eq 'style') {
 2194             $self->{csstext} = ''
 2195                 if ((!$attr->{type} && $self->{style_is_css}) ||
 2196                 is_content_type($attr->{type}, "text/css"));
 2197         }
 2198         elsif ($self->{style_is_css} && (my $style = $attr->{style})) {
 2199             $style = CSS::DOM::Style::parse($style);
 2200             $self->parse_style($style, $line);
 2201         }
 2202     }
 2203 
 2204     $self->parse_progress($line) if $Opts{Progress};
 2205     return;
 2206 }
 2207 
 2208 sub end
 2209 {
 2210     my ($self, $tagname, $line) = @_;
 2211 
 2212     $self->parse_css($self->{csstext}, $line) if ($tagname eq 'style');
 2213     delete $self->{csstext};
 2214 
 2215     $self->parse_progress($line) if $Opts{Progress};
 2216     return;
 2217 }
 2218 
 2219 sub parse_css
 2220 {
 2221     my ($self, $css, $line) = @_;
 2222     return unless $css;
 2223 
 2224     my $sheet = CSS::DOM::parse($css);
 2225     for my $rule (@{$sheet->cssRules()}) {
 2226         if ($rule->type() == IMPORT_RULE()) {
 2227             $self->add_link($rule->href(), $self->{base}, $line);
 2228         }
 2229         elsif ($rule->type == STYLE_RULE()) {
 2230             $self->parse_style($rule->style(), $line);
 2231         }
 2232     }
 2233     return;
 2234 }
 2235 
 2236 sub parse_style
 2237 {
 2238     my ($self, $style, $line) = @_;
 2239     return unless $style;
 2240 
 2241     for (my $i = 0, my $len = $style->length(); $i < $len; $i++) {
 2242         my $prop = $style->item($i);
 2243         my $val  = $style->getPropertyValue($prop);
 2244 
 2245         while ($val =~ /$CssUrl/go) {
 2246             my $url = CSS::DOM::Util::unescape($2);
 2247             $self->add_link($url, $self->{base}, $line);
 2248         }
 2249     }
 2250 
 2251     return;
 2252 }
 2253 
 2254 sub declaration
 2255 {
 2256     my ($self, $text, $line) = @_;
 2257 
 2258     # Extract the doctype
 2259     my @declaration = split(/\s+/, $text, 4);
 2260     if ($#declaration >= 3 &&
 2261         $declaration[0] eq 'DOCTYPE' &&
 2262         lc($declaration[1]) eq 'html')
 2263     {
 2264 
 2265         # Parse the doctype declaration
 2266         if ($text =~
 2267             m/^DOCTYPE\s+html\s+(?:PUBLIC\s+"([^"]+)"|SYSTEM)(\s+"([^"]+)")?\s*$/i
 2268             )
 2269         {
 2270 
 2271             # Store the doctype
 2272             $self->doctype($1) if $1;
 2273 
 2274             # If there is a link to the DTD, record it
 2275             $self->add_link($3, undef, $line)
 2276                 if (!$self->{only_anchors} && $3);
 2277         }
 2278     }
 2279 
 2280     $self->text($text) unless $self->{only_anchors};
 2281 
 2282     return;
 2283 }
 2284 
 2285 sub text
 2286 {
 2287     my ($self, $text, $line) = @_;
 2288     $self->{csstext} .= $text if defined($self->{csstext});
 2289     $self->parse_progress($line) if $Opts{Progress};
 2290     return;
 2291 }
 2292 
 2293 sub end_document
 2294 {
 2295     my ($self, $line) = @_;
 2296     $self->{Total} = $line;
 2297     delete $self->{csstext};
 2298     return;
 2299 }
 2300 
 2301 ################################
 2302 # Check the validity of a link #
 2303 ################################
 2304 
 2305 sub check_validity (\$\$$\%\%)
 2306 {
 2307     my ($referer, $uri, $want_links, $links, $redirects) = @_;
 2308 
 2309     # $referer is the URI object of the document checked
 2310     # $uri is the URI object of the target that we are verifying
 2311     # $want_links is true if we're interested in links in the target doc
 2312     # $links is a hash of the links in the documents checked
 2313     # $redirects is a map of the redirects encountered
 2314 
 2315     # Get the document with the appropriate method: GET if there are
 2316     # fragments to check or links are wanted, HEAD is enough otherwise.
 2317     my $fragments = $links->{$uri}{fragments} || {};
 2318     my $method = ($want_links || %$fragments) ? 'GET' : 'HEAD';
 2319 
 2320     my $response;
 2321     my $being_processed = 0;
 2322     if (!defined($results{$uri}) ||
 2323         ($method eq 'GET' && $results{$uri}{method} eq 'HEAD'))
 2324     {
 2325         $being_processed = 1;
 2326         $response = &get_uri($method, $uri, $referer);
 2327 
 2328         # Get the information back from get_uri()
 2329         &record_results($uri, $method, $response, $referer);
 2330 
 2331         # Record the redirects
 2332         &record_redirects($redirects, $response);
 2333     }
 2334     elsif (!($Opts{Summary_Only} || (!$doc_count && $Opts{HTML}))) {
 2335         my $ref = $results{$uri}{referer};
 2336         &hprintf("Already checked%s\n", $ref ? ", referrer $ref" : ".");
 2337     }
 2338 
 2339     # We got the response of the HTTP request. Stop here if it was a HEAD.
 2340     return if ($method eq 'HEAD');
 2341 
 2342     # There are fragments. Parse the document.
 2343     my $p;
 2344     if ($being_processed) {
 2345 
 2346         # Can we really parse the document?
 2347         if (!defined($results{$uri}{location}{type}) ||
 2348             $results{$uri}{location}{type} !~ $ContentTypes)
 2349         {
 2350             &hprintf("Can't check content: Content-Type for '%s' is '%s'.\n",
 2351                 $uri, $results{$uri}{location}{type})
 2352                 if ($Opts{Verbose});
 2353             $response->content("");
 2354             return;
 2355         }
 2356 
 2357         # Do it then
 2358         if (my $error = decode_content($response)) {
 2359             &hprintf("%s\n.", $error);
 2360         }
 2361 
 2362         # @@@TODO: this isn't the best thing to do if a decode error occurred
 2363         $p =
 2364             &parse_document($uri, $response->base(), $response, 0,
 2365             $want_links);
 2366     }
 2367     else {
 2368 
 2369         # We already had the information
 2370         $p->{Anchors} = $results{$uri}{parsing}{Anchors};
 2371     }
 2372 
 2373     # Check that the fragments exist
 2374     for my $fragment (keys %$fragments) {
 2375         if (defined($p->{Anchors}{$fragment}) ||
 2376             &escape_match($fragment, $p->{Anchors}) ||
 2377             grep { $_ eq "$uri#$fragment" } @{$Opts{Suppress_Fragment}})
 2378         {
 2379             $results{$uri}{fragments}{$fragment} = 1;
 2380         }
 2381         else {
 2382             $results{$uri}{fragments}{$fragment} = 0;
 2383         }
 2384     }
 2385     return;
 2386 }
 2387 
 2388 sub escape_match ($\%)
 2389 {
 2390     my ($a, $hash) = (URI::Escape::uri_unescape($_[0]), $_[1]);
 2391     for my $b (keys %$hash) {
 2392         return 1 if ($a eq URI::Escape::uri_unescape($b));
 2393     }
 2394     return 0;
 2395 }
 2396 
 2397 ##########################
 2398 # Ask for authentication #
 2399 ##########################
 2400 
 2401 sub authentication ($;$$$$)
 2402 {
 2403     my ($response, $cookie, $params, $check_num, $is_start) = @_;
 2404 
 2405     my $realm = '';
 2406     if ($response->www_authenticate() =~ /Basic realm=\"([^\"]+)\"/) {
 2407         $realm = $1;
 2408     }
 2409 
 2410     if ($Opts{Command_Line}) {
 2411         printf STDERR <<'EOF', $response->request()->url(), $realm;
 2412 
 2413 Authentication is required for %s.
 2414 The realm is "%s".
 2415 Use the -u and -p options to specify a username and password and the -d option
 2416 to specify trusted domains.
 2417 EOF
 2418     }
 2419     else {
 2420 
 2421         printf(
 2422             "Status: 401 Authorization Required\nWWW-Authenticate: %s\n%sConnection: close\nContent-Language: en\nContent-Type: text/html; charset=utf-8\n\n",
 2423             $response->www_authenticate(),
 2424             $cookie ? "Set-Cookie: $cookie\n" : "",
 2425         );
 2426 
 2427         printf(
 2428             "%s
 2429 <html lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">
 2430 <head>
 2431 <title>W3C Link Checker: 401 Authorization Required</title>
 2432 %s</head>
 2433 <body>", $DocType, $Head
 2434         );
 2435         &banner(': 401 Authorization Required');
 2436         &print_form($params, $cookie, $check_num) if $is_start;
 2437         printf(
 2438             '<p>
 2439   %s
 2440   You need "%s" access to <a href="%s">%s</a> to perform link checking.<br />
 2441 ',
 2442             &status_icon(401),
 2443             &encode($realm), (&encode($response->request()->url())) x 2
 2444         );
 2445 
 2446         my $host = $response->request()->url()->host();
 2447         if ($Opts{Trusted} && $host !~ $Opts{Trusted}) {
 2448             printf <<'EOF', &encode($Opts{Trusted}), &encode($host);
 2449   This service has been configured to send authentication only to hostnames
 2450   matching the regular expression <code>%s</code>, but the hostname
 2451   <code>%s</code> does not match it.
 2452 EOF
 2453         }
 2454 
 2455         print "</p>\n";
 2456     }
 2457     return;
 2458 }
 2459 
 2460 ##################
 2461 # Get statistics #
 2462 ##################
 2463 
 2464 sub get_timestamp ()
 2465 {
 2466     return pack('LL', Time::HiRes::gettimeofday());
 2467 }
 2468 
 2469 sub time_diff ($$)
 2470 {
 2471     my @start = unpack('LL', $_[0]);
 2472     my @stop  = unpack('LL', $_[1]);
 2473     for ($start[1], $stop[1]) {
 2474         $_ /= 1_000_000;
 2475     }
 2476     return (sprintf("%.2f", ($stop[0] + $stop[1]) - ($start[0] + $start[1])));
 2477 }
 2478 
 2479 ########################
 2480 # Handle the redirects #
 2481 ########################
 2482 
 2483 # Record the redirects in a hash
 2484 sub record_redirects (\%$)
 2485 {
 2486     my ($redirects, $response) = @_;
 2487     for (my $prev = $response->previous(); $prev; $prev = $prev->previous()) {
 2488 
 2489         # Check for redirect match.
 2490         my $from = $prev->request()->url();
 2491         my $to   = $response->request()->url();  # same on every loop iteration
 2492         my $from_to = $from . '->' . $to;
 2493         my $match = grep { $_ eq $from_to } @{$Opts{Suppress_Redirect}};
 2494 
 2495         # print STDERR "Result $match of redirect checking $from_to\n";
 2496         if ($match) { next; }
 2497 
 2498         $match = grep { $from_to =~ /$_/ } @{$Opts{Suppress_Redirect_Prefix}};
 2499 
 2500         # print STDERR "Result $match of regexp checking $from_to\n";
 2501         if ($match) { next; }
 2502 
 2503         my $c = $prev->code();
 2504         if ($Opts{Suppress_Temp_Redirects} && ($c == 307 || $c == 302)) {
 2505             next;
 2506         }
 2507 
 2508         $redirects->{$prev->request()->url()} = $response->request()->url();
 2509     }
 2510     return;
 2511 }
 2512 
 2513 # Determine if a request is redirected
 2514 sub is_redirected ($%)
 2515 {
 2516     my ($uri, %redirects) = @_;
 2517     return (defined($redirects{$uri}));
 2518 }
 2519 
 2520 # Get a list of redirects for a URI
 2521 sub get_redirects ($%)
 2522 {
 2523     my ($uri, %redirects) = @_;
 2524     my @history = ($uri);
 2525     my %seen    = ($uri => 1);    # for tracking redirect loops
 2526     my $loop    = 0;
 2527     while ($redirects{$uri}) {
 2528         $uri = $redirects{$uri};
 2529         push(@history, $uri);
 2530         if ($seen{$uri}) {
 2531             $loop = 1;
 2532             last;
 2533         }
 2534         else {
 2535             $seen{$uri}++;
 2536         }
 2537     }
 2538     return ($loop, @history);
 2539 }
 2540 
 2541 ####################################################
 2542 # Tool for sorting the unique elements of an array #
 2543 ####################################################
 2544 
 2545 sub sort_unique (@)
 2546 {
 2547     my %saw;
 2548     @saw{@_} = ();
 2549     return (sort { $a <=> $b } keys %saw);
 2550 }
 2551 
 2552 #####################
 2553 # Print the results #
 2554 #####################
 2555 
 2556 sub line_number ($)
 2557 {
 2558     my $line = shift;
 2559     return $line if ($line >= 0);
 2560     return "(N/A)";
 2561 }
 2562 
 2563 sub http_rc ($)
 2564 {
 2565     my $rc = shift;
 2566     return $rc if ($rc >= 0);
 2567     return "(N/A)";
 2568 }
 2569 
 2570 # returns true if the given code is informational
 2571 sub informational ($)
 2572 {
 2573     my $rc = shift;
 2574     return $rc == RC_ROBOTS_TXT() ||
 2575         $rc == RC_IP_DISALLOWED() ||
 2576         $rc == RC_PROTOCOL_DISALLOWED();
 2577 }
 2578 
 2579 sub anchors_summary (\%\%)
 2580 {
 2581     my ($anchors, $errors) = @_;
 2582 
 2583     # Number of anchors found.
 2584     my $n = scalar(keys(%$anchors));
 2585     if (!$Opts{Quiet}) {
 2586         if ($Opts{HTML}) {
 2587             print("<h3>Anchors</h3>\n<p>");
 2588         }
 2589         else {
 2590             print("Anchors\n\n");
 2591         }
 2592         &hprintf("Found %d anchor%s.\n", $n, ($n == 1) ? '' : 's');
 2593         print("</p>\n") if $Opts{HTML};
 2594     }
 2595 
 2596     # List of the duplicates, if any.
 2597     my @errors = keys %{$errors};
 2598     if (!scalar(@errors)) {
 2599         print("<p>Valid anchors!</p>\n")
 2600             if (!$Opts{Quiet} && $Opts{HTML} && $n);
 2601         return;
 2602     }
 2603     undef $n;
 2604 
 2605     print_doc_header();
 2606     print('<p>') if $Opts{HTML};
 2607     print('List of duplicate and empty anchors');
 2608     print <<'EOF' if $Opts{HTML};
 2609 </p>
 2610 <table class="report" border="1" summary="List of duplicate and empty anchors.">
 2611 <thead>
 2612 <tr>
 2613 <th>Anchor</th>
 2614 <th>Lines</th>
 2615 </tr>
 2616 </thead>
 2617 <tbody>
 2618 EOF
 2619     print("\n");
 2620 
 2621     for my $anchor (@errors) {
 2622         my $format;
 2623         my @unique = &sort_unique(
 2624             map { line_number($_) }
 2625                 keys %{$anchors->{$anchor}}
 2626         );
 2627         if ($Opts{HTML}) {
 2628             $format = "<tr><td class=\"broken\">%s</td><td>%s</td></tr>\n";
 2629         }
 2630         else {
 2631             my $s = (scalar(@unique) > 1) ? 's' : '';
 2632             $format = "\t%s\tLine$s: %s\n";
 2633         }
 2634         printf($format,
 2635             &encode(length($anchor) ? $anchor : 'Empty anchor'),
 2636             join(', ', @unique));
 2637     }
 2638 
 2639     print("</tbody>\n</table>\n") if $Opts{HTML};
 2640 
 2641     return;
 2642 }
 2643 
 2644 sub show_link_report (\%\%\%\%\@;$\%)
 2645 {
 2646     my ($links, $results, $broken, $redirects, $urls, $codes, $todo) = @_;
 2647 
 2648     print("\n<dl class=\"report\">") if $Opts{HTML};
 2649     print("\n") if (!$Opts{Quiet});
 2650 
 2651     # Process each URL
 2652     my ($c, $previous_c);
 2653     for my $u (@$urls) {
 2654         my @fragments = keys %{$broken->{$u}{fragments}};
 2655 
 2656         # Did we get a redirect?
 2657         my $redirected = &is_redirected($u, %$redirects);
 2658 
 2659         # List of lines
 2660         my @total_lines;
 2661         push(@total_lines, keys(%{$links->{$u}{location}}));
 2662         for my $f (@fragments) {
 2663             push(@total_lines, keys(%{$links->{$u}{fragments}{$f}}))
 2664                 unless ($f eq $u && defined($links->{$u}{$u}{LINE_UNKNOWN()}));
 2665         }
 2666 
 2667         my ($redirect_loop, @redirects_urls) = get_redirects($u, %$redirects);
 2668         my $currloc = $results->{$u}{location};
 2669 
 2670         # Error type
 2671         $c = &code_shown($u, $results);
 2672 
 2673         # What to do
 2674         my $whattodo;
 2675         my $redirect_too;
 2676         if ($todo) {
 2677             if ($u =~ m/^javascript:/) {
 2678                 if ($Opts{HTML}) {
 2679                     $whattodo =
 2680                         'You must change this link: people using a browser without JavaScript support
 2681 will <em>not</em> be able to follow this link. See the
 2682 <a href="http://www.w3.org/TR/WAI-WEBCONTENT/#tech-scripts">Web Content
 2683 Accessibility Guidelines on the use of scripting on the Web</a> and the
 2684 <a href="http://www.w3.org/TR/WCAG10-HTML-TECHS/#directly-accessible-scripts">techniques
 2685 on how to solve this</a>.';
 2686                 }
 2687                 else {
 2688                     $whattodo =
 2689                         'Change this link: people using a browser without JavaScript support will not be able to follow this link.';
 2690                 }
 2691             }
 2692             elsif ($c == RC_ROBOTS_TXT()) {
 2693                 $whattodo =
 2694                     'The link was not checked due to robots exclusion ' .
 2695                     'rules. Check the link manually.';
 2696             }
 2697             elsif ($redirect_loop) {
 2698                 $whattodo =
 2699                     'Retrieving the URI results in a redirect loop, that should be '
 2700                     . 'fixed.  Examine the redirect sequence to see where the loop '
 2701                     . 'occurs.';
 2702             }
 2703             else {
 2704                 $whattodo = $todo->{$c};
 2705             }
 2706         }
 2707         elsif (defined($redirects{$u})) {
 2708 
 2709             # Redirects
 2710             if (($u . '/') eq $redirects{$u}) {
 2711                 $whattodo =
 2712                     'The link is missing a trailing slash, and caused a redirect. Adding the trailing slash would speed up browsing.';
 2713             }
 2714             elsif ($c == 307 || $c == 302) {
 2715                 $whattodo =
 2716                     'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.';
 2717             }
 2718             elsif ($c == 301) {
 2719                 $whattodo =
 2720                     'This is a permanent redirect. The link should be updated.';
 2721             }
 2722         }
 2723 
 2724         my @unique = &sort_unique(map { line_number($_) } @total_lines);
 2725         my $lines_list = join(', ', @unique);
 2726         my $s = (scalar(@unique) > 1) ? 's' : '';
 2727         undef @unique;
 2728 
 2729         my @http_codes = ($currloc->{code});
 2730         unshift(@http_codes, $currloc->{orig}) if $currloc->{orig};
 2731         @http_codes = map { http_rc($_) } @http_codes;
 2732 
 2733         if ($Opts{HTML}) {
 2734 
 2735             # Style stuff
 2736             my $idref = '';
 2737             if ($codes && (!defined($previous_c) || ($c != $previous_c))) {
 2738                 $idref      = ' id="d' . $doc_count . 'code_' . $c . '"';
 2739                 $previous_c = $c;
 2740             }
 2741 
 2742             # Main info
 2743             for (@redirects_urls) {
 2744                 $_ = &show_url($_);
 2745             }
 2746 
 2747             # HTTP message
 2748             my $http_message;
 2749             if ($currloc->{message}) {
 2750                 $http_message = &encode($currloc->{message});
 2751                 if ($c == 404 || $c == 500) {
 2752                     $http_message =
 2753                         '<span class="broken">' . $http_message . '</span>';
 2754                 }
 2755             }
 2756             my $redirmsg =
 2757                 $redirect_loop ? ' <em>redirect loop detected</em>' : '';
 2758             printf("
 2759 <dt%s>%s <span class='msg_loc'>Line%s: %s</span> %s</dt>
 2760 <dd class='responsecode'><strong>Status</strong>: %s %s %s</dd>
 2761 <dd class='message_explanation'><p>%s %s</p></dd>\n",
 2762 
 2763                 # Anchor for return codes
 2764                 $idref,
 2765 
 2766                 # Color
 2767                 &status_icon($c),
 2768                 $s,
 2769 
 2770                 # List of lines
 2771                 $lines_list,
 2772 
 2773                 # List of redirects
 2774                 $redirected ?
 2775                     join(' redirected to ', @redirects_urls) . $redirmsg :
 2776                     &show_url($u),
 2777 
 2778                 # Realm
 2779                 defined($currloc->{realm}) ?
 2780                     sprintf('Realm: %s<br />', &encode($currloc->{realm})) :
 2781                     '',
 2782 
 2783                 # HTTP original message
 2784                 # defined($currloc->{orig_message})
 2785                 # ? &encode($currloc->{orig_message}).
 2786                 # ' <span title="redirected to">-&gt;</span> '
 2787                 # : '',
 2788 
 2789                 # Response code chain
 2790                 join(
 2791                     ' <span class="redirected_to" title="redirected to">-&gt;</span> ',
 2792                     map { &encode($_) } @http_codes),
 2793 
 2794                 # HTTP final message
 2795                 $http_message,
 2796 
 2797                 # What to do
 2798                 $whattodo,
 2799 
 2800                 # Redirect too?
 2801                 $redirect_too ?
 2802                     sprintf(' <span %s>%s</span>',
 2803                     &bgcolor(301), $redirect_too) :
 2804                     '',
 2805             );
 2806             if ($#fragments >= 0) {
 2807                 printf("<dd>Broken fragments: <ul>\n");
 2808             }
 2809         }
 2810         else {
 2811             my $redirmsg = $redirect_loop ? ' redirect loop detected' : '';
 2812             printf(
 2813                 "\n%s\t%s\n  Code: %s %s\n%s\n",
 2814 
 2815                 # List of redirects
 2816                 $redirected ? join("\n-> ", @redirects_urls) . $redirmsg : $u,
 2817 
 2818                 # List of lines
 2819                 $lines_list ? sprintf("\n%6s: %s", "Line$s", $lines_list) : '',
 2820 
 2821                 # Response code chain
 2822                 join(' -> ', @http_codes),
 2823 
 2824                 # HTTP message
 2825                 $currloc->{message} || '',
 2826 
 2827                 # What to do
 2828                 wrap(' To do: ', '        ', $whattodo)
 2829             );
 2830             if ($#fragments >= 0) {
 2831                 if ($currloc->{code} == 200) {
 2832                     print("The following fragments need to be fixed:\n");
 2833                 }
 2834                 else {
 2835                     print("Fragments:\n");
 2836                 }
 2837             }
 2838         }
 2839 
 2840         # Fragments
 2841         for my $f (@fragments) {
 2842             my @unique_lines =
 2843                 &sort_unique(keys %{$links->{$u}{fragments}{$f}});
 2844             my $plural = (scalar(@unique_lines) > 1) ? 's' : '';
 2845             my $unique_lines = join(', ', @unique_lines);
 2846             if ($Opts{HTML}) {
 2847                 printf("<li>%s<em>#%s</em> (line%s %s)</li>\n",
 2848                     &encode($u), &encode($f), $plural, $unique_lines);
 2849             }
 2850             else {
 2851                 printf("\t%-30s\tLine%s: %s\n", $f, $plural, $unique_lines);
 2852             }
 2853         }
 2854 
 2855         print("</ul></dd>\n") if ($Opts{HTML} && scalar(@fragments));
 2856     }
 2857 
 2858     # End of the table
 2859     print("</dl>\n") if $Opts{HTML};
 2860 
 2861     return;
 2862 }
 2863 
 2864 sub code_shown ($$)
 2865 {
 2866     my ($u, $results) = @_;
 2867 
 2868     if ($results->{$u}{location}{record} == 200) {
 2869         return $results->{$u}{location}{orig} ||
 2870             $results->{$u}{location}{record};
 2871     }
 2872     else {
 2873         return $results->{$u}{location}{record};
 2874     }
 2875 }
 2876 
 2877 sub links_summary (\%\%\%\%)
 2878 {
 2879 
 2880     # Advices to fix the problems
 2881 
 2882     my %todo = (
 2883         200 =>
 2884             'Some of the links to this resource point to broken URI fragments (such as index.html#fragment).',
 2885         300 =>
 2886             'This often happens when a typo in the link gets corrected automatically by the server. For the sake of performance, the link should be fixed.',
 2887         301 =>
 2888             'This is a permanent redirect. The link should be updated to point to the more recent URI.',
 2889         302 =>
 2890             'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.',
 2891         303 =>
 2892             'This rare status code points to a "See Other" resource. There is generally nothing to be done.',
 2893         307 =>
 2894             'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.',
 2895         400 =>
 2896             'This is usually the sign of a malformed URL that cannot be parsed by the server. Check the syntax of the link.',
 2897         401 =>
 2898             "The link is not public and the actual resource is only available behind authentication. If not already done, you could specify it.",
 2899         403 =>
 2900             'The link is forbidden! This needs fixing. Usual suspects: a missing index.html or Overview.html, or a missing ACL.',
 2901         404 =>
 2902             'The link is broken. Double-check that you have not made any typo, or mistake in copy-pasting. If the link points to a resource that no longer exists, you may want to remove or fix the link.',
 2903         405 =>
 2904             'The server does not allow HTTP HEAD requests, which prevents the Link Checker to check the link automatically.  Check the link manually.',
 2905         406 =>
 2906             "The server isn't capable of responding according to the Accept* headers sent. This is likely to be a server-side issue with negotiation.",
 2907         407 => 'The link is a proxy, but requires Authentication.',
 2908         408 => 'The request timed out.',
 2909         410 => 'The resource is gone. You should remove this link.',
 2910         415 => 'The media type is not supported.',
 2911         500 => 'This is a server side problem. Check the URI.',
 2912         501 =>
 2913             'Could not check this link: method not implemented or scheme not supported.',
 2914         503 =>
 2915             'The server cannot service the request, for some unknown reason.',
 2916 
 2917         # Non-HTTP codes:
 2918         RC_ROBOTS_TXT() => sprintf(
 2919             'The link was not checked due to %srobots exclusion rules%s. Check the link manually, and see also the link checker %sdocumentation on robots exclusion%s.',
 2920             $Opts{HTML} ? (
 2921                 '<a href="http://www.robotstxt.org/robotstxt.html">', '</a>',
 2922                 "<a href=\"$Cfg{Doc_URI}#bot\">",                     '</a>'
 2923                 ) : ('') x 4
 2924         ),
 2925         RC_DNS_ERROR() =>
 2926             'The hostname could not be resolved. Check the link for typos.',
 2927         RC_IP_DISALLOWED() =>
 2928             sprintf(
 2929             'The link resolved to a %snon-public IP address%s, and this link checker instance has been configured to not access such addresses. This may be a real error or just a quirk of the name resolver configuration on the server where the link checker runs. Check the link manually, in particular its hostname/IP address.',
 2930             $Opts{HTML} ?
 2931                 ('<a href="http://www.ietf.org/rfc/rfc1918.txt">', '</a>') :
 2932                 ('') x 2),
 2933         RC_PROTOCOL_DISALLOWED() =>
 2934             'Accessing links with this URI scheme has been disabled in link checker.',
 2935     );
 2936     my %priority = (
 2937         410 => 1,
 2938         404 => 2,
 2939         403 => 5,
 2940         200 => 10,
 2941         300 => 15,
 2942         401 => 20
 2943     );
 2944 
 2945     my ($links, $results, $broken, $redirects) = @_;
 2946 
 2947     # List of the broken links
 2948     my @urls              = keys %{$broken};
 2949     my @dir_redirect_urls = ();
 2950     if ($Opts{Redirects}) {
 2951 
 2952         # Add the redirected URI's to the report
 2953         for my $l (keys %$redirects) {
 2954             next
 2955                 unless (defined($results->{$l}) &&
 2956                 defined($links->{$l}) &&
 2957                 !defined($broken->{$l}));
 2958 
 2959             # Check whether we have a "directory redirect"
 2960             # e.g. http://www.w3.org/TR -> http://www.w3.org/TR/
 2961             my ($redirect_loop, @redirects) = get_redirects($l, %$redirects);
 2962             if ($#redirects == 1) {
 2963                 push(@dir_redirect_urls, $l);
 2964                 next;
 2965             }
 2966             push(@urls, $l);
 2967         }
 2968     }
 2969 
 2970     # Broken links and redirects
 2971     if ($#urls < 0) {
 2972         if (!$Opts{Quiet}) {
 2973             print_doc_header();
 2974             if ($Opts{HTML}) {
 2975                 print "<h3>Links</h3>\n<p>Valid links!</p>\n";
 2976             }
 2977             else {
 2978                 print "\nValid links.\n";
 2979             }
 2980         }
 2981     }
 2982     else {
 2983         print_doc_header();
 2984         print('<h3>') if $Opts{HTML};
 2985         print("\nList of broken links and other issues");
 2986 
 2987         #print(' and redirects') if $Opts{Redirects};
 2988 
 2989         # Sort the URI's by HTTP Code
 2990         my %code_summary;
 2991         my @idx;
 2992         for my $u (@urls) {
 2993             if (defined($results->{$u}{location}{record})) {
 2994                 my $c = &code_shown($u, $results);
 2995                 $code_summary{$c}++;
 2996                 push(@idx, $c);
 2997             }
 2998         }
 2999         my @sorted = @urls[
 3000             sort {
 3001                 defined($priority{$idx[$a]}) ?
 3002                     defined($priority{$idx[$b]}) ?
 3003                     $priority{$idx[$a]} <=> $priority{$idx[$b]} :
 3004                         -1 :
 3005                     defined($priority{$idx[$b]}) ? 1 :
 3006                     $idx[$a] <=> $idx[$b]
 3007             } 0 .. $#idx
 3008         ];
 3009         @urls = @sorted;
 3010         undef(@sorted);
 3011         undef(@idx);
 3012 
 3013         if ($Opts{HTML}) {
 3014 
 3015             # Print a summary
 3016             print <<'EOF';
 3017 </h3>
 3018 <p><em>There are issues with the URLs listed below. The table summarizes the
 3019 issues and suggested actions by HTTP response status code.</em></p>
 3020 <table class="report" border="1" summary="List of issues and suggested actions.">
 3021 <thead>
 3022 <tr>
 3023 <th>Code</th>
 3024 <th>Occurrences</th>
 3025 <th>What to do</th>
 3026 </tr>
 3027 </thead>
 3028 <tbody>
 3029 EOF
 3030             for my $code (sort(keys(%code_summary))) {
 3031                 printf('<tr%s>', &bgcolor($code));
 3032                 printf('<td><a href="#d%scode_%s">%s</a></td>',
 3033                     $doc_count, $code, http_rc($code));
 3034                 printf('<td>%s</td>', $code_summary{$code});
 3035                 printf('<td>%s</td>', $todo{$code});
 3036                 print "</tr>\n";
 3037             }
 3038             print "</tbody>\n</table>\n";
 3039         }
 3040         else {
 3041             print(':');
 3042         }
 3043         &show_link_report($links, $results, $broken, $redirects, \@urls, 1,
 3044             \%todo);
 3045     }
 3046 
 3047     # Show directory redirects
 3048     if ($Opts{Dir_Redirects} && ($#dir_redirect_urls > -1)) {
 3049         print_doc_header();
 3050         print('<h3>') if $Opts{HTML};
 3051         print("\nList of redirects");
 3052         print(
 3053             "</h3>\n<p>The links below are not broken, but the document does not use the exact URL, and the links were redirected. It may be a good idea to link to the final location, for the sake of speed.</p>"
 3054         ) if $Opts{HTML};
 3055         &show_link_report($links, $results, $broken, $redirects,
 3056             \@dir_redirect_urls);
 3057     }
 3058 
 3059     return;
 3060 }
 3061 
 3062 ###############################################################################
 3063 
 3064 ################
 3065 # Global stats #
 3066 ################
 3067 
 3068 sub global_stats ()
 3069 {
 3070     my $stop = &get_timestamp();
 3071     my $n_docs =
 3072         ($doc_count <= $Opts{Max_Documents}) ? $doc_count :
 3073                                                $Opts{Max_Documents};
 3074     return sprintf(
 3075         'Checked %d document%s in %s seconds.',
 3076         $n_docs,
 3077         ($n_docs == 1) ? '' : 's',
 3078         &time_diff($timestamp, $stop)
 3079     );
 3080 }
 3081 
 3082 ##################
 3083 # HTML interface #
 3084 ##################
 3085 
 3086 sub html_header ($$)
 3087 {
 3088     my ($uri, $cookie) = @_;
 3089 
 3090     my $title = defined($uri) ? $uri : '';
 3091     $title = ': ' . $title if ($title =~ /\S/);
 3092 
 3093     my $headers = '';
 3094     if (!$Opts{Command_Line}) {
 3095         $headers .= "Cache-Control: no-cache\nPragma: no-cache\n" if $uri;
 3096         $headers .= "Content-Type: text/html; charset=utf-8\n";
 3097         $headers .= "Set-Cookie: $cookie\n"                       if $cookie;
 3098 
 3099         # mod_perl 1.99_05 doesn't seem to like it if the "\n\n" isn't in the same
 3100         # print() statement as the last header
 3101         $headers .= "Content-Language: en\n\n";
 3102     }
 3103 
 3104     my $onload = $uri ? '' :
 3105           ' onload="if(document.getElementById){document.getElementById(\'uri_1\').focus()}"';
 3106 
 3107     print $headers, $DocType, "
 3108 <html lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">
 3109 <head>
 3110 <title>W3C Link Checker", &encode($title), "</title>
 3111 ",      $Head,   "</head>
 3112 <body", $onload, '>';
 3113     &banner($title);
 3114     return;
 3115 }
 3116 
 3117 sub banner ($)
 3118 {
 3119     my $tagline = "Check links and anchors in Web pages or full Web sites";
 3120 
 3121     printf(
 3122         <<'EOF', URI->new_abs("../images/no_w3c.png", $Cfg{Doc_URI}), $tagline);
 3123 <div id="banner"><h1 id="title"><a href="http://www.w3.org/" title="W3C"><img alt="W3C" id="logo" src="%s" width="110" height="61" /></a>
 3124 <a href="checklink"><span>Link Checker</span></a></h1>
 3125 <p id="tagline">%s</p></div>
 3126 <div id="main">
 3127 EOF
 3128     return;
 3129 }
 3130 
 3131 sub status_icon($)
 3132 {
 3133     my ($code) = @_;
 3134     my $icon_type;
 3135     my $r = HTTP::Response->new($code);
 3136     if ($r->is_success()) {
 3137         $icon_type = 'error'
 3138             ; # if is success but reported, it's because of broken frags => error
 3139     }
 3140     elsif (&informational($code)) {
 3141         $icon_type = 'info';
 3142     }
 3143     elsif ($code == 300) {
 3144         $icon_type = 'info';
 3145     }
 3146     elsif ($code == 401) {
 3147         $icon_type = 'error';
 3148     }
 3149     elsif ($r->is_redirect()) {
 3150         $icon_type = 'warning';
 3151     }
 3152     elsif ($r->is_error()) {
 3153         $icon_type = 'error';
 3154     }
 3155     else {
 3156         $icon_type = 'error';
 3157     }
 3158     return sprintf('<span class="err_type"><img src="%s" alt="%s" /></span>',
 3159         URI->new_abs("../images/info_icons/$icon_type.png", $Cfg{Doc_URI}),
 3160         $icon_type);
 3161 }
 3162 
 3163 sub bgcolor ($)
 3164 {
 3165     my ($code) = @_;
 3166     my $class;
 3167     my $r = HTTP::Response->new($code);
 3168     if ($r->is_success()) {
 3169         return '';
 3170     }
 3171     elsif ($code == RC_ROBOTS_TXT() || $code == RC_IP_DISALLOWED()) {
 3172         $class = 'dubious';
 3173     }
 3174     elsif ($code == 300) {
 3175         $class = 'multiple';
 3176     }
 3177     elsif ($code == 401) {
 3178         $class = 'unauthorized';
 3179     }
 3180     elsif ($r->is_redirect()) {
 3181         $class = 'redirect';
 3182     }
 3183     elsif ($r->is_error()) {
 3184         $class = 'broken';
 3185     }
 3186     else {
 3187         $class = 'broken';
 3188     }
 3189     return (' class="' . $class . '"');
 3190 }
 3191 
 3192 sub show_url ($)
 3193 {
 3194     my ($url) = @_;
 3195     return sprintf('<a href="%s">%s</a>', (&encode($url)) x 2);
 3196 }
 3197 
 3198 sub html_footer ()
 3199 {
 3200     printf("<p>%s</p>\n", &global_stats())
 3201         if ($doc_count > 0 && !$Opts{Quiet});
 3202     if (!$doc_count) {
 3203         print <<'EOF';
 3204 <div class="intro">
 3205   <p>
 3206     This Link Checker looks for issues in links, anchors and referenced objects
 3207     in a Web page, CSS style sheet, or recursively on a whole Web site. For
 3208     best results, it is recommended to first ensure that the documents checked
 3209     use Valid <a href="http://validator.w3.org/">(X)HTML Markup</a> and
 3210     <a href="http://jigsaw.w3.org/css-validator/">CSS</a>. The Link Checker is
 3211     part of the W3C's <a href="http://www.w3.org/QA/Tools/">validators and
 3212     Quality Web tools</a>.
 3213   </p>
 3214 </div>
 3215 EOF
 3216     }
 3217     printf(<<'EOF', $Cfg{Doc_URI}, $Cfg{Doc_URI}, $PACKAGE, $REVISION);
 3218 </div><!-- main -->
 3219 <ul class="navbar" id="menu">
 3220   <li><a href="%s" accesskey="3" title="Documentation for this Link Checker Service">Docs</a></li>
 3221   <li><a href="http://search.cpan.org/dist/W3C-LinkChecker/" accesskey="2" title="Download the source / Install this service">Download</a></li>
 3222   <li><a href="%s#csb" title="feedback: comments, suggestions and bugs" accesskey="4">Feedback</a></li>
 3223   <li><a href="http://validator.w3.org/" title="Validate your markup with the W3C Markup Validation Service">Validator</a></li>
 3224 </ul>
 3225 <div>
 3226 <address>
 3227 %s<br /> %s
 3228 </address>
 3229 </div>
 3230 </body>
 3231 </html>
 3232 EOF
 3233     return;
 3234 }
 3235 
 3236 sub print_form (\%$$)
 3237 {
 3238     my ($params, $cookie, $check_num) = @_;
 3239 
 3240     # Split params on \0, see CGI's docs on Vars()
 3241     while (my ($key, $value) = each(%$params)) {
 3242         if ($value) {
 3243             my @vals = split(/\0/, $value, 2);
 3244             $params->{$key} = $vals[0];
 3245         }
 3246     }
 3247 
 3248     # Override undefined values from the cookie, if we got one.
 3249     my $valid_cookie = 0;
 3250     if ($cookie) {
 3251         my %cookie_values = $cookie->value();
 3252         if (!$cookie_values{clear})
 3253         {    # XXX no easy way to check if cookie expired?
 3254             $valid_cookie = 1;
 3255             while (my ($key, $value) = each(%cookie_values)) {
 3256                 $params->{$key} = $value unless defined($params->{$key});
 3257             }
 3258         }
 3259     }
 3260 
 3261     my $chk = ' checked="checked"';
 3262     $params->{hide_type} = 'all' unless $params->{hide_type};
 3263 
 3264     my $requested_uri = &encode($params->{uri} || '');
 3265     my $sum = $params->{summary}        ? $chk : '';
 3266     my $red = $params->{hide_redirects} ? $chk : '';
 3267     my $all = ($params->{hide_type} ne 'dir') ? $chk : '';
 3268     my $dir = $all                            ? ''   : $chk;
 3269     my $acc = $params->{no_accept_language}   ? $chk : '';
 3270     my $ref = $params->{no_referer}           ? $chk : '';
 3271     my $rec = $params->{recursive}            ? $chk : '';
 3272     my $dep = &encode($params->{depth} || '');
 3273 
 3274     my $cookie_options = '';
 3275     if ($valid_cookie) {
 3276         $cookie_options = "
 3277     <label for=\"cookie1_$check_num\"><input type=\"radio\" id=\"cookie1_$check_num\" name=\"cookie\" value=\"nochanges\" checked=\"checked\" /> Don't modify saved options</label>
 3278     <label for=\"cookie2_$check_num\"><input type=\"radio\" id=\"cookie2_$check_num\" name=\"cookie\" value=\"set\" /> Save these options</label>
 3279     <label for=\"cookie3_$check_num\"><input type=\"radio\" id=\"cookie3_$check_num\" name=\"cookie\" value=\"clear\" /> Clear saved options</label>";
 3280     }
 3281     else {
 3282         $cookie_options = "
 3283     <label for=\"cookie_$check_num\"><input type=\"checkbox\" id=\"cookie_$check_num\" name=\"cookie\" value=\"set\" /> Save options in a <a href=\"http://www.w3.org/Protocols/rfc2109/rfc2109\">cookie</a></label>";
 3284     }
 3285 
 3286     print "<form action=\"", $Opts{_Self_URI},
 3287         "\" method=\"get\" onsubmit=\"return uriOk($check_num)\" accept-charset=\"UTF-8\">
 3288 <p><label for=\"uri_$check_num\">Enter the address (<a href=\"http://www.w3.org/Addressing/\">URL</a>)
 3289 of a document that you would like to check:</label></p>
 3290 <p><input type=\"text\" size=\"50\" id=\"uri_$check_num\" name=\"uri\" value=\"",
 3291         $requested_uri, "\" /></p>
 3292 <fieldset id=\"extra_opt_uri_$check_num\" class=\"moreoptions\">
 3293     <legend class=\"toggletext\">More Options</legend>
 3294     <div class=\"options\">
 3295   <p>
 3296     <label for=\"summary_$check_num\"><input type=\"checkbox\" id=\"summary_$check_num\" name=\"summary\" value=\"on\"",
 3297         $sum, " /> Summary only</label>
 3298     <br />
 3299     <label for=\"hide_redirects_$check_num\"><input type=\"checkbox\" id=\"hide_redirects_$check_num\" name=\"hide_redirects\" value=\"on\"",
 3300         $red,
 3301         " /> Hide <a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html#sec10.3\">redirects</a>:</label>
 3302     <label for=\"hide_type_all_$check_num\"><input type=\"radio\" id=\"hide_type_all_$check_num\" name=\"hide_type\" value=\"all\"",
 3303         $all, " /> all</label>
 3304     <label for=\"hide_type_dir_$check_num\"><input type=\"radio\" id=\"hide_type_dir_$check_num\" name=\"hide_type\" value=\"dir\"",
 3305         $dir, " /> for directories only</label>
 3306     <br />
 3307     <label for=\"no_accept_language_$check_num\"><input type=\"checkbox\" id=\"no_accept_language_$check_num\" name=\"no_accept_language\" value=\"on\"",
 3308         $acc,
 3309         " /> Don't send the <tt><a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4\">Accept-Language</a></tt> header</label>
 3310     <br />
 3311     <label for=\"no_referer_$check_num\"><input type=\"checkbox\" id=\"no_referer_$check_num\" name=\"no_referer\" value=\"on\"",
 3312         $ref,
 3313         " /> Don't send the <tt><a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.36\">Referer</a></tt> header</label>
 3314     <br />
 3315     <label title=\"Check linked documents recursively (maximum: ",
 3316         $Opts{Max_Documents},
 3317         " documents)\" for=\"recursive_$check_num\"><input type=\"checkbox\" id=\"recursive_$check_num\" name=\"recursive\" value=\"on\"",
 3318         $rec, " /> Check linked documents recursively</label>,
 3319     <label title=\"Depth of the recursion (-1 is the default and means unlimited)\" for=\"depth_$check_num\">recursion depth: <input type=\"text\" size=\"3\" maxlength=\"3\" id=\"depth_$check_num\" name=\"depth\" value=\"",
 3320         $dep, "\" /></label>
 3321     <br /><br />", $cookie_options, "
 3322   </p>
 3323   </div>
 3324 </fieldset>
 3325 <p class=\"submit_button\"><input type=\"submit\" name=\"check\" value=\"Check\" /></p>
 3326 </form>
 3327 <div class=\"intro\" id=\"don_program\"></div>
 3328 <script type=\"text/javascript\" src=\"http://www.w3.org/QA/Tools/don_prog.js\"></script>
 3329 ";
 3330     return;
 3331 }
 3332 
 3333 sub encode (@)
 3334 {
 3335     return $Opts{HTML} ? HTML::Entities::encode(@_) : @_;
 3336 }
 3337 
 3338 sub hprintf (@)
 3339 {
 3340     print_doc_header();
 3341     if (!$Opts{HTML}) {
 3342         # can have undef values here; avoid useless warning. E.g.,
 3343         #   Error: -1 Forbidden by robots.txt
 3344         #   Use of uninitialized value $_[2] in printf at /usr/local/bin/checklink line 3245.
 3345         # and
 3346         #   Error: 404 File `/u/karl/gnu/src/akarl/doc/dejagnu.html' does not exist
 3347         #   Use of uninitialized value $_[2] in printf at /usr/local/bin/checklink line 3245.
 3348         my @args = ();
 3349         for my $a (@_) {
 3350           push (@args, defined $a ? $a : ""),
 3351         }
 3352         printf(@args);
 3353     }
 3354     else {
 3355         print HTML::Entities::encode(sprintf($_[0], @_[1 .. @_ - 1]));
 3356     }
 3357     return;
 3358 }
 3359 
 3360 # Print the document header, if it hasn't been printed already.
 3361 # This is invoked before most other output operations, in order
 3362 # to enable quiet processing that doesn't clutter the output with
 3363 # "Processing..." messages when nothing else will be reported.
 3364 sub print_doc_header ()
 3365 {
 3366     if (defined($doc_header)) {
 3367         print $doc_header;
 3368         undef($doc_header);
 3369     }
 3370 }
 3371 
 3372 # Local Variables:
 3373 # mode: perl
 3374 # indent-tabs-mode: nil
 3375 # cperl-indent-level: 4
 3376 # cperl-continued-statement-offset: 4
 3377 # cperl-brace-offset: -4
 3378 # perl-indent-level: 4
 3379 # End:
 3380 # ex: ts=4 sw=4 et