"Fossies" - the Fresh Open Source Software Archive

Member "install-tl-20200916/tlpkg/tlperl/site/lib/WWW/RobotRules.pm" (5 Apr 2016, 11155 Bytes) of package /windows/misc/install-tl.zip:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Perl source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file.

package WWW::RobotRules;

$VERSION = "6.02";
sub Version { $VERSION; }

use strict;
use URI ();



sub new {
    my($class, $ua) = @_;

    # This ugly hack is needed to ensure backwards compatibility.
    # The "WWW::RobotRules" class is now really abstract.
    $class = "WWW::RobotRules::InCore" if $class eq "WWW::RobotRules";

    my $self = bless { }, $class;
    $self->agent($ua);
    $self;
}


sub parse {
    my($self, $robot_txt_uri, $txt, $fresh_until) = @_;
    $robot_txt_uri = URI->new("$robot_txt_uri");
    my $netloc = $robot_txt_uri->host . ":" . $robot_txt_uri->port;

    $self->clear_rules($netloc);
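    # With no explicit expiry time, trust the parsed rules for one year.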
    $self->fresh_until($netloc, $fresh_until || (time + 365*24*3600));

    my $ua;
    my $is_me = 0;            # 1 iff this record is for me
    my $is_anon = 0;          # 1 iff this record is for *
    my $seen_disallow = 0;    # watch for missing record separators
    my @me_disallowed = ();   # rules disallowed for me
    my @anon_disallowed = (); # rules disallowed for *

    # blank lines are significant, so turn CRLF into LF to avoid generating
    # false ones
    $txt =~ s/\015\012/\012/g;

    # split at \012 (LF) or \015 (CR) (Mac text files have just CR for EOL)
    for(split(/[\012\015]/, $txt)) {

        # Lines containing only a comment are discarded completely, and
        # therefore do not indicate a record boundary.
        next if /^\s*\#/;

        s/\s*\#.*//;        # remove comments at end-of-line

        if (/^\s*$/) {      # blank line
            last if $is_me; # That was our record. No need to read the rest.
            $is_anon = 0;
            $seen_disallow = 0;
        }
        elsif (/^\s*User-Agent\s*:\s*(.*)/i) {
            $ua = $1;
            $ua =~ s/\s+$//;

            if ($seen_disallow) {
                # treat as start of a new record
                $seen_disallow = 0;
                last if $is_me; # That was our record. No need to read the rest.
                $is_anon = 0;
            }

            if ($is_me) {
                # This record already had a User-agent that
                # we matched, so just continue.
            }
            elsif ($ua eq '*') {
                $is_anon = 1;
            }
            elsif ($self->is_me($ua)) {
                $is_me = 1;
            }
        }
        elsif (/^\s*Disallow\s*:\s*(.*)/i) {
            unless (defined $ua) {
                warn "RobotRules <$robot_txt_uri>: Disallow without preceding User-agent\n" if $^W;
                $is_anon = 1;  # assume that User-agent: * was intended
            }
            my $disallow = $1;
            $disallow =~ s/\s+$//;
            $seen_disallow = 1;
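            # Normalize the rule to a path relative to the robots.txt URI
            # (robots.txt may contain full URLs) and ignore rules that point
            # at a different scheme, host, or port.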
            if (length $disallow) {
                my $ignore;
                eval {
                    my $u = URI->new_abs($disallow, $robot_txt_uri);
                    $ignore++ if $u->scheme ne $robot_txt_uri->scheme;
                    $ignore++ if lc($u->host) ne lc($robot_txt_uri->host);
                    $ignore++ if $u->port ne $robot_txt_uri->port;
                    $disallow = $u->path_query;
                    $disallow = "/" unless length $disallow;
                };
                next if $@;
                next if $ignore;
            }

            if ($is_me) {
                push(@me_disallowed, $disallow);
            }
            elsif ($is_anon) {
                push(@anon_disallowed, $disallow);
            }
        }
        elsif (/\S\s*:/) {
            # any other "Field: value" line (e.g. Crawl-delay, Sitemap) is ignored
        }
        else {
            warn "RobotRules <$robot_txt_uri>: Malformed record: <$_>\n" if $^W;
        }
    }

    if ($is_me) {
        $self->push_rules($netloc, @me_disallowed);
    }
    else {
        $self->push_rules($netloc, @anon_disallowed);
    }
}


#
# Returns TRUE if the given name matches the
# name of this robot
#
sub is_me {
    my($self, $ua_line) = @_;
    my $me = $self->agent;

    # See whether the name given on the "User-Agent: ..." line is a
    # substring of our own short name:

    if (index(lc($me), lc($ua_line)) >= 0) {
        return 1;
    }
    else {
        return '';
    }
}


sub allowed {
    my($self, $uri) = @_;
    $uri = URI->new("$uri");

    return 1 unless $uri->scheme eq 'http' or $uri->scheme eq 'https';
    # Robots.txt applies to only those schemes.

    my $netloc = $uri->host . ":" . $uri->port;

    my $fresh_until = $self->fresh_until($netloc);
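    # No fresh rules are cached for this host: return -1 so the caller can
    # refetch robots.txt, parse() it, and try again.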
    return -1 if !defined($fresh_until) || $fresh_until < time;

    my $str = $uri->path_query;
    my $rule;
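    # An empty rule means everything is allowed; a non-empty rule disallows
    # any path that starts with it.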
    for $rule ($self->rules($netloc)) {
        return 1 unless length $rule;
        return 0 if index($str, $rule) == 0;
    }
    return 1;
}


# The following methods must be provided by the subclass.
sub agent;
sub visit;
sub no_visits;
sub last_visits;
sub fresh_until;
sub push_rules;
sub clear_rules;
sub rules;
sub dump;



package WWW::RobotRules::InCore;

use vars qw(@ISA);
@ISA = qw(WWW::RobotRules);



sub agent {
    my ($self, $name) = @_;
    my $old = $self->{'ua'};
    if ($name) {
        # Strip it so that it's just the short name.
        # I.e., "FooBot"                                      => "FooBot"
        #       "FooBot/1.2"                                  => "FooBot"
        #       "FooBot/1.2 [http://foobot.int; foo@bot.int]" => "FooBot"

        $name = $1 if $name =~ m/(\S+)/; # get first word
        $name =~ s!/.*!!;  # get rid of version
        unless ($old && $old eq $name) {
            delete $self->{'loc'}; # all old info is now stale
            $self->{'ua'} = $name;
        }
    }
    $old;
}


sub visit {
    my($self, $netloc, $time) = @_;
    return unless $netloc;
    $time ||= time;
    $self->{'loc'}{$netloc}{'last'} = $time;
    my $count = \$self->{'loc'}{$netloc}{'count'};
    if (!defined $$count) {
        $$count = 1;
    }
    else {
        $$count++;
    }
}


sub no_visits {
    my ($self, $netloc) = @_;
    $self->{'loc'}{$netloc}{'count'};
}


sub last_visit {
    my ($self, $netloc) = @_;
    $self->{'loc'}{$netloc}{'last'};
}


sub fresh_until {
    my ($self, $netloc, $fresh_until) = @_;
    my $old = $self->{'loc'}{$netloc}{'fresh'};
    if (defined $fresh_until) {
        $self->{'loc'}{$netloc}{'fresh'} = $fresh_until;
    }
    $old;
}


sub push_rules {
    my($self, $netloc, @rules) = @_;
    push (@{$self->{'loc'}{$netloc}{'rules'}}, @rules);
}


sub clear_rules {
    my($self, $netloc) = @_;
    delete $self->{'loc'}{$netloc}{'rules'};
}


sub rules {
    my($self, $netloc) = @_;
    if (defined $self->{'loc'}{$netloc}{'rules'}) {
        return @{$self->{'loc'}{$netloc}{'rules'}};
    }
    else {
        return ();
    }
}

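# Dump the cached state (object attributes and the per-host rule lists);
# mainly useful for debugging.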
sub dump
{
    my $self = shift;
    for (keys %$self) {
        next if $_ eq 'loc';
        print "$_ = $self->{$_}\n";
    }
    for (keys %{$self->{'loc'}}) {
        my @rules = $self->rules($_);
        print "$_: ", join("; ", @rules), "\n";
    }
}


1;

__END__


# Bender: "Well, I don't have anything else
#          planned for today.  Let's get drunk!"

=head1 NAME

WWW::RobotRules - database of robots.txt-derived permissions

=head1 SYNOPSIS

 use WWW::RobotRules;
 my $rules = WWW::RobotRules->new('MOMspider/1.0');

 use LWP::Simple qw(get);

 {
   my $url = "http://some.place/robots.txt";
   my $robots_txt = get $url;
   $rules->parse($url, $robots_txt) if defined $robots_txt;
 }

 {
   my $url = "http://some.other.place/robots.txt";
   my $robots_txt = get $url;
   $rules->parse($url, $robots_txt) if defined $robots_txt;
 }

 # Now we can check if a URL is valid for those servers
 # whose "robots.txt" files we've gotten and parsed:
 if($rules->allowed($url)) {
     $c = get $url;
     ...
 }

=head1 DESCRIPTION

This module parses F</robots.txt> files as specified in
"A Standard for Robot Exclusion", at
<http://www.robotstxt.org/wc/norobots.html>.
Webmasters can use the F</robots.txt> file to forbid conforming
robots from accessing parts of their web site.

The parsed files are kept in a WWW::RobotRules object, and this object
provides methods to check if access to a given URL is prohibited.  The
same WWW::RobotRules object can be used for one or more parsed
F</robots.txt> files on any number of hosts.

The following methods are provided:

=over 4

=item $rules = WWW::RobotRules->new($robot_name)

This is the constructor for WWW::RobotRules objects.  The first
argument given to new() is the name of the robot.

=item $rules->parse($robot_txt_url, $content, $fresh_until)

The parse() method takes as arguments the URL that was used to
retrieve the F</robots.txt> file, and the contents of the file.  The
optional third argument, $fresh_until, is the time (in seconds since
the epoch) until which the parsed rules are considered fresh; if it is
omitted or false, the rules are trusted for one year.  See the example
below this list.

=item $rules->allowed($uri)

Returns TRUE if this robot is allowed to retrieve this URL.  If no
fresh rules are cached for the URL's host (or they have expired), -1
is returned instead.

=item $rules->agent([$name])

Get/set the agent name. NOTE: Changing the agent name will clear the robots.txt
rules and expire times out of the cache.

=back

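For example, a crawler typically fetches each host's F</robots.txt>
itself and decides how long to trust it.  The sketch below is only
illustrative: the robot name, the host C<www.example.com>, and the
one-hour freshness period are arbitrary choices, not something this
module prescribes.

 use WWW::RobotRules;
 use LWP::Simple qw(get);

 my $rules = WWW::RobotRules->new('MyBot/1.0');

 my $robots_url = "http://www.example.com/robots.txt";
 my $robots_txt = get($robots_url);

 # trust the parsed rules for one hour only
 $rules->parse($robots_url, $robots_txt, time + 3600)
     if defined $robots_txt;

 my $ok = $rules->allowed("http://www.example.com/private/page.html");
 if ($ok < 0) {
     # no fresh rules for this host; refetch robots.txt and parse() again
 }
 elsif ($ok) {
     # allowed; go ahead and fetch the page
 }
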
=head1 ROBOTS.TXT

The format and semantics of the "/robots.txt" file are as follows
(this is an edited abstract of
<http://www.robotstxt.org/wc/norobots.html>):

The file consists of one or more records separated by one or more
blank lines. Each record contains lines of the form

  <field-name>: <value>

The field name is case insensitive.  Text after the '#' character on a
line is ignored during parsing.  This is used for comments.  The
following <field-names> can be used:

=over 3

=item User-Agent

The value of this field is the name of the robot the record is
describing access policy for.  If more than one I<User-Agent> field is
present the record describes an identical access policy for more than
one robot. At least one field needs to be present per record.  If the
value is '*', the record describes the default access policy for any
robot that has not matched any of the other records.

The I<User-Agent> fields must occur before the I<Disallow> fields.  If a
record contains a I<User-Agent> field after a I<Disallow> field, that
constitutes a malformed record.  This parser will assume that a blank
line should have been placed before that I<User-Agent> field, and will
break the record into two.  All the fields before the I<User-Agent> field
will constitute a record, and the I<User-Agent> field will be the first
field in a new record.

=item Disallow

The value of this field specifies a partial URL that is not to be
visited. This can be a full path, or a partial path; any URL that
starts with this value will not be retrieved.

=back

Unrecognized records are ignored.

=head1 ROBOTS.TXT EXAMPLES

The following example "/robots.txt" file specifies that no robots
should visit any URL starting with "/cyberworld/map/" or "/tmp/":

  User-agent: *
  Disallow: /cyberworld/map/ # This is an infinite virtual URL space
  Disallow: /tmp/ # these will soon disappear

This example "/robots.txt" file specifies that no robots should visit
any URL starting with "/cyberworld/map/", except the robot called
"cybermapper":

  User-agent: *
  Disallow: /cyberworld/map/ # This is an infinite virtual URL space

  # Cybermapper knows where to go.
  User-agent: cybermapper
  Disallow:

This example indicates that no robots should visit this site further:

  # go away
  User-agent: *
  Disallow: /

This is an example of a malformed robots.txt file:

  # robots.txt for ancientcastle.example.com
  # I've locked myself away.
  User-agent: *
  Disallow: /
  # The castle is your home now, so you can go anywhere you like.
  User-agent: Belle
  Disallow: /west-wing/ # except the west wing!
  # It's good to be the Prince...
  User-agent: Beast
  Disallow:

This file is missing the required blank lines between records.
However, the intention is clear.
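For instance, when this file is parsed on behalf of a robot that calls
itself "Belle", the record beginning at C<User-agent: Belle> is the one
that applies, so only the west wing is off limits.  A minimal sketch
(the host name is the fictional one from the example above, and
$robots_txt is assumed to hold the file's contents):

 my $rules = WWW::RobotRules->new('Belle/1.0');
 $rules->parse("http://ancientcastle.example.com/robots.txt", $robots_txt);

 $rules->allowed("http://ancientcastle.example.com/library/");    # 1 (allowed)
 $rules->allowed("http://ancientcastle.example.com/west-wing/");  # 0 (disallowed)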

=head1 SEE ALSO

L<LWP::RobotUA>, L<WWW::RobotRules::AnyDBM_File>

=head1 COPYRIGHT

  Copyright 1995-2009, Gisle Aas
  Copyright 1995, Martijn Koster

This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.