"Fossies" - the Fresh Open Source Software Archive

Member "libs/PHPCrawler/PHPCrawlerRobotsTxtParser.class.php" (8 Jan 2013, 7881 Bytes) of package /linux/www/SitemapCreator.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) PHP source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file.

<?php
/**
 * Class for parsing robots.txt-files.
 *
 * @package phpcrawl
 * @internal
 */
class PHPCrawlerRobotsTxtParser
{
  /**
   * A PHPCrawlerHTTPRequest-object for requesting robots.txt-files.
   *
   * @var PHPCrawlerHTTPRequest
   */
  protected $PageRequest;
  
  public function __construct()
  {
    // Init PageRequest-class (include it relative to this file's directory if it isn't loaded yet)
    if (!class_exists("PHPCrawlerHTTPRequest")) include_once(dirname(__FILE__)."/PHPCrawlerHTTPRequest.class.php");
    $this->PageRequest = new PHPCrawlerHTTPRequest();
  }
  
  /**
   * Parses the robots.txt-file related to the given URL and returns regular-expression-rules
   * corresponding to the contained "disallow"-rules that are addressed to the given user-agent.
   *
   * @param PHPCrawlerURLDescriptor $Url The URL
   * @param string $user_agent_string User-agent.
   *
   * @return array Numeric array containing regular-expressions for each "disallow"-rule defined in the robots.txt-file
   *               that's addressed to the given user-agent.
   */
  public function parseRobotsTxt(PHPCrawlerURLDescriptor $Url, $user_agent_string)
  {
    PHPCrawlerBenchmark::start("processing_robotstxt");
    
    // URL of the robots.txt-file
    $RobotsTxtUrl = self::getRobotsTxtURL($Url);
    
    // Get the robots.txt-content related to the given URL
    $robots_txt_content = $this->getRobotsTxtContent($RobotsTxtUrl);
    
    $non_follow_reg_exps = array();
    
    // If content was found
    if ($robots_txt_content != null)
    {
      // Get all lines in the robots.txt-content that are addressed to our user-agent.
      $applying_lines = $this->getApplyingLines($robots_txt_content, $user_agent_string);
      
      // Get valid reg-expressions for the given disallow-paths.
      $non_follow_reg_exps = $this->buildRegExpressions($applying_lines, PHPCrawlerUtils::getRootUrl($Url->url_rebuild));
    }
    
    PHPCrawlerBenchmark::stop("processing_robotstxt");
    
    return $non_follow_reg_exps;
  }
  
  /**
   * Returns all RAW lines in the given robots.txt-content that apply to
   * the given user-agent-string.
   *
   * @return array Numeric array with the found lines
   */
  protected function getApplyingLines(&$robots_txt_content, $user_agent_string)
  {
    // Split the content into its lines
    $robotstxt_lines = explode("\n", $robots_txt_content);
    
    // Flag that will get TRUE if the loop over the lines gets
    // into a section that applies to our user_agent_string
    $matching_section = false;
    
    // Flag that indicates if the loop is in an "agent-define-section"
    // (the parts/blocks that contain the "User-agent"-lines.)
    $agent_define_section = false;
    
    // Flag that indicates if we have found a section that fits our
    // user-agent
    $matching_section_found = false;
    
    // Array to collect all the lines that apply to our user_agent
    $applying_lines = array();
    
    // Loop over the lines
    $cnt = count($robotstxt_lines);
    for ($x=0; $x<$cnt; $x++)
    {
      $robotstxt_lines[$x] = trim($robotstxt_lines[$x]);
      
      // Check if a line begins with "User-agent"
      if (preg_match("#^User-agent:#i", $robotstxt_lines[$x]))
      {
        // If a new "User-agent"-section begins -> reset the matching_section-flag
        if ($agent_define_section == false)
        {
          $matching_section = false;
        }
        
        $agent_define_section = true; // Now we are in an agent-define-section
        
        // The user-agent specified in the "User-agent"-line
        preg_match("#^User-agent:[ ]*(.*)$#i", $robotstxt_lines[$x], $match);
        $user_agent_section = trim($match[1]);
        
        // If the user-agent specified in the line fits our user-agent-string ("*" always fits)
        // -> switch the flag "matching_section" to true
        if ($user_agent_section == "*" || preg_match("#^".preg_quote($user_agent_section)."#i", $user_agent_string))
        {
          $matching_section = true;
          $matching_section_found = true;
        }
        
        continue; // Don't do anything else with the "User-agent"-lines, just go on
      }
      else
      {
        // We are not in an agent-define-section (anymore)
        $agent_define_section = false;
      }
      
      // If we are in a section that applies to our user_agent
      // -> store the line.
      if ($matching_section == true)
      {
        $applying_lines[] = $robotstxt_lines[$x];
      }
      
      // If we are NOT in a matching section (anymore) AND we've already found
      // and parsed a matching section -> we could stop looking further (that's what the RFC says)
      if ($matching_section == false && $matching_section_found == true)
      {
        // break;
      }
    }
    
    return $applying_lines;
  }
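  
  /*
   * For illustration (an assumed example, not taken from a real site): given a
   * robots.txt-content like
   *
   *   User-agent: *
   *   Disallow: /cgi-bin/
   *
   *   User-agent: MyCrawler
   *   Disallow: /private/
   *
   * a call like $this->getApplyingLines($content, "MyCrawler/1.0") collects the lines
   * of BOTH sections (including blank lines), because the "*"-section as well as the
   * "MyCrawler"-section match the given user-agent-string.
   */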
  
  /**
   * Returns an array containing regular-expressions corresponding
   * to the given robots.txt-style "Disallow"-lines
   *
   * @param array &$applying_lines Numeric array containing "disallow"-lines.
   * @param string $base_url       Base-URL the robots.txt-file was found in.
   *
   * @return array  Numeric array containing regular-expressions created for each "disallow"-line.
   */
  protected function buildRegExpressions(&$applying_lines, $base_url)
  { 
    // First, get all "Disallow:"-paths
    $disallow_pathes = array();
    for ($x=0; $x<count($applying_lines); $x++)
    {
      if (preg_match("#^Disallow:#i", $applying_lines[$x]))
      {
        preg_match("#^Disallow:[ ]*(.*)#", $applying_lines[$x], $match);
        $disallow_pathes[] = trim($match[1]);
      }
    }
    
    // Works like this:
    // The base-url is http://www.foo.com.
    // The directive says: "Disallow: /bla/"
    // This means: The nonFollowMatch is "#^http://www\.foo\.com/bla/#"
    
    $normalized_base_url = PHPCrawlerUtils::normalizeURL($base_url);
    
    $non_follow_expressions = array();
    
    for ($x=0; $x<count($disallow_pathes); $x++)
    {
      // If the disallow-path is empty -> simply ignore it
      if ($disallow_pathes[$x] == "") continue;
      
      $non_follow_path_complpete = $normalized_base_url.substr($disallow_pathes[$x], 1); // "http://www.foo.com/bla/"
      $non_follow_exp = preg_quote($non_follow_path_complpete, "#"); // "http://www\.foo\.com/bla/"
      $non_follow_exp = "#^".$non_follow_exp."#"; // "#^http://www\.foo\.com/bla/#"
        
      $non_follow_expressions[] = $non_follow_exp;
    }
    
    return $non_follow_expressions;
  }
  
  /**
   * Retrieves the content of a robots.txt-file
   *
   * @param PHPCrawlerURLDescriptor $Url The URL of the robots.txt-file
   * @return string The content of the robots.txt or NULL if no robots.txt was found.
   */
  protected function getRobotsTxtContent(PHPCrawlerURLDescriptor $Url)
  {
    // Request the robots.txt-file
    $this->PageRequest->setUrl($Url);
    $PageInfo = $this->PageRequest->sendRequest();

    // Return the content of the robots.txt-file if it was found, otherwise
    // return NULL
    if ($PageInfo->http_status_code == 200)
    {
      return $PageInfo->content;
    }
    else
    {
      return null;
    }
  }
  
  /** 
   * Returns the robots.txt-URL related to the given URL
   *
   * @param PHPCrawlerURLDescriptor $Url  The URL as PHPCrawlerURLDescriptor-object
   * @return PHPCrawlerURLDescriptor The robots.txt-URL related to the passed URL.
   */
  public static function getRobotsTxtURL(PHPCrawlerURLDescriptor $Url)
  {
    $url_parts = PHPCrawlerUtils::splitURL($Url->url_rebuild); 
    $robots_txt_url = $url_parts["protocol"].$url_parts["host"].":".$url_parts["port"]."/robots.txt";
    
    return new PHPCrawlerURLDescriptor($robots_txt_url);
  }
}
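
/*
 * Usage sketch (a minimal, illustrative example; the URL and the user-agent string
 * "MyCrawler/1.0" are assumed values, and the other PHPCrawl classes such as
 * PHPCrawlerURLDescriptor and PHPCrawlerUtils have to be loaded by the package as usual):
 *
 *   $parser = new PHPCrawlerRobotsTxtParser();
 *   $UrlDescriptor = new PHPCrawlerURLDescriptor("http://www.example.com/some/page.html");
 *   $disallow_regexes = $parser->parseRobotsTxt($UrlDescriptor, "MyCrawler/1.0");
 *
 *   foreach ($disallow_regexes as $regex)
 *   {
 *     if (preg_match($regex, "http://www.example.com/private/index.html"))
 *     {
 *       // This URL is disallowed by the site's robots.txt for the given user-agent
 *     }
 *   }
 */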
?>