"Fossies" - the Fresh Open Source Software Archive

Member "libs/PHPCrawler/PHPCrawlerURLFilter.class.php" (8 Jan 2013, 7116 Bytes) of package /linux/www/SitemapCreator.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) PHP source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file.

<?php
/**
 * Class for filtering URLs by given filter-rules.
 *
 * @package phpcrawl
 * @internal
 */
class PHPCrawlerURLFilter
{
  /**
   * The fully qualified and normalized URL the crawling-process was started with.
   *
   * @var string
   */
  protected $starting_url = "";
  
  /**
   * The URL-parts of the starting-url.
   *
   * @var array The URL-parts as returned by PHPCrawlerUtils::splitURL()
   */
  protected $starting_url_parts = array();
  
  /**
   * Array containing regex-rules for URLs that should be followed.
   *
   * @var array
   */
  protected $url_follow_rules = array();
  
  /**
   * Array containing regex-rules for URLs that should NOT be followed.
   *
   * @var array
   */
  protected $url_filter_rules = array();
  
  /**
   * Defines whether nofollow-tags should be obeyed.
   *
   * @var bool
   */
  public $obey_nofollow_tags = false;
  
  /**
   * The general follow-mode of the crawler.
   *
   * @var int The follow-mode
   *
   *          0 -> follow every link
   *          1 -> stay in domain
   *          2 -> stay in host
   *          3 -> stay in path
   */
  public $general_follow_mode = 2;
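  
  // Illustrative sketch (not part of the original source): what the
  // follow-modes mean for a crawl started at "http://www.foo.com/path/"
  // (a made-up URL), based on the checks in urlMatchesRules() below.
  //
  //   Mode 0: any http(s)-URL may be followed.
  //   Mode 1: "http://sub.foo.com/" may be followed (same domain),
  //           "http://bar.com/" is kicked out.
  //   Mode 2: "http://foo.com/other/" may be followed ("www." is ignored),
  //           "http://sub.foo.com/" is kicked out (different host).
  //   Mode 3: "http://www.foo.com/path/a.html" may be followed,
  //           "http://www.foo.com/elsewhere/" is kicked out (leaves the path).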
  
  /**
   * The PHPCrawlerDocumentInfo-object of the document currently being processed.
   *
   * @var PHPCrawlerDocumentInfo
   */
  protected $CurrentDocumentInfo = null;
  
  /**
   * Sets the base-URL of the crawling-process that some of the rules relate to.
   *
   * @param string $starting_url The URL the crawling-process was started with.
   */
  public function setBaseURL($starting_url)
  {
    $this->starting_url = $starting_url;
    
    // Parts of the starting-URL
    $this->starting_url_parts = PHPCrawlerUtils::splitURL($starting_url);
  }
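  
  // Usage sketch (illustrative only; the URL is made up): the base-URL
  // is the reference point for all general_follow_mode checks.
  //
  //   $filter = new PHPCrawlerURLFilter();
  //   $filter->setBaseURL("http://www.foo.com/path/");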
  
  /**
   * Filters the given URLs (contained in the given PHPCrawlerDocumentInfo-object) by the given rules.
   *
   * @param PHPCrawlerDocumentInfo $DocumentInfo PHPCrawlerDocumentInfo-object containing all found links of the current document.
   */
  public function filterUrls(PHPCrawlerDocumentInfo $DocumentInfo)
  {
    PHPCrawlerBenchmark::start("filtering_urls");
    
    $this->CurrentDocumentInfo = $DocumentInfo;
    
    // Null out the descriptors of all URLs that don't match the rules
    $cnt = count($DocumentInfo->links_found_url_descriptors);
    for ($x=0; $x<$cnt; $x++)
    {
      if (!$this->urlMatchesRules($DocumentInfo->links_found_url_descriptors[$x]))
      {
        $DocumentInfo->links_found_url_descriptors[$x] = null;
      }
    }
    
    PHPCrawlerBenchmark::stop("filtering_urls");
  }
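  
  // Illustrative sketch (not part of the original source): filterUrls()
  // nulls out non-matching entries in-place instead of reindexing the
  // array, so callers are assumed to skip null entries, e.g.:
  //
  //   $filter->filterUrls($DocumentInfo);
  //   foreach ($DocumentInfo->links_found_url_descriptors as $UrlDescriptor)
  //   {
  //     if ($UrlDescriptor === null) continue; // filtered out
  //     // ... queue $UrlDescriptor->url_rebuild for crawling ...
  //   }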
  
  /**
   * Filters out all non-redirect-URLs from the URLs given in the PHPCrawlerDocumentInfo-object
   *
   * @param PHPCrawlerDocumentInfo $DocumentInfo PHPCrawlerDocumentInfo-object containing all found links of the current document.
   */
  public static function keepRedirectUrls(PHPCrawlerDocumentInfo $DocumentInfo)
  {
    $cnt = count($DocumentInfo->links_found_url_descriptors);
    for ($x=0; $x<$cnt; $x++)
    {
      if ($DocumentInfo->links_found_url_descriptors[$x]->is_redirect_url == false)
      {
        $DocumentInfo->links_found_url_descriptors[$x] = null;
      }
    }
  }
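  
  // Illustrative sketch (not part of the original source): a crawler might
  // call this after receiving a redirect-response so that only the
  // redirect-target remains in the link-list. The http_status_code check
  // is an assumption about how a caller detects redirects, not taken
  // from this file.
  //
  //   if ($DocumentInfo->http_status_code >= 300 && $DocumentInfo->http_status_code < 400)
  //   {
  //     PHPCrawlerURLFilter::keepRedirectUrls($DocumentInfo);
  //   }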
  
  /**
   * Checks whether a given URL matches the rules.
   *
   * @param PHPCrawlerURLDescriptor $url The URL as a PHPCrawlerURLDescriptor-object
   * @return bool TRUE if the URL matches the defined rules.
   */
  protected function urlMatchesRules(PHPCrawlerURLDescriptor $url)
  {
    // URL-parts of the URL to check against the filter-rules
    $url_parts = PHPCrawlerUtils::splitURL($url->url_rebuild);
    
    // Kick out all links that are NOT of protocol "http" or "https"
    if ($url_parts["protocol"] != "http://" && $url_parts["protocol"] != "https://")
    {
      return false;
    }
    
    // If meta-tag "robots"->"nofollow" is present and obey_nofollow_tags is TRUE -> always kick out URL
    if ($this->obey_nofollow_tags == true &&
        isset($this->CurrentDocumentInfo->meta_attributes["robots"]) &&
        preg_match("#nofollow#i", $this->CurrentDocumentInfo->meta_attributes["robots"]))
    {
      return false;
    }
    
    // If linkcode contains "rel='nofollow'" and obey_nofollow_tags is TRUE -> always kick out URL
    if ($this->obey_nofollow_tags == true)
    {
      if (preg_match("#^<[^>]*rel\s*=\s*(?|\"\s*nofollow\s*\"|'\s*nofollow\s*'|\s*nofollow\s*)[^>]*>#", $url->linkcode))
      {
        return false;
      }
    }
    
    // Filter URLs to other domains if wanted
    if ($this->general_follow_mode >= 1)
    {
      if ($url_parts["domain"] != $this->starting_url_parts["domain"]) return false;
    }
    
    // Filter URLs to other hosts if wanted
    if ($this->general_follow_mode >= 2)
    {
      // Ignore "www." at the beginning of the host, because "www.foo.com" is the same host as "foo.com"
      if (preg_replace("#^www\.#", "", $url_parts["host"]) != preg_replace("#^www\.#", "", $this->starting_url_parts["host"]))
        return false;
    }
    
    // Filter URLs leading out of the starting-path if wanted
    if ($this->general_follow_mode == 3)
    {
      if ($url_parts["protocol"] != $this->starting_url_parts["protocol"] ||
          preg_replace("#^www\.#", "", $url_parts["host"]) != preg_replace("#^www\.#", "", $this->starting_url_parts["host"]) ||
          substr($url_parts["path"], 0, strlen($this->starting_url_parts["path"])) != $this->starting_url_parts["path"])
      {
        return false;
      }
    }
    
    // Filter URLs by url_filter_rules
    $cnt = count($this->url_filter_rules);
    for ($x=0; $x<$cnt; $x++)
    {
      if (preg_match($this->url_filter_rules[$x], $url->url_rebuild)) return false;
    }
    
    // Filter URLs by url_follow_rules
    $cnt = count($this->url_follow_rules);
    if ($cnt > 0)
    {
      $match_found = false;
      for ($x=0; $x<$cnt; $x++)
      {
        if (preg_match($this->url_follow_rules[$x], $url->url_rebuild))
        {
          $match_found = true;
          break;
        }
      }
      
      if ($match_found == false) return false;
    }
    
    return true;
  }
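  
  // Illustrative sketch (not part of the original source): as the checks
  // above show, filter-rules take precedence over follow-rules, and
  // follow-rules only apply if at least one was added. With the made-up
  // setup
  //
  //   $filter->addURLFollowRule("#/docs/#");
  //   $filter->addURLFilterRule("#\.pdf$#i");
  //
  // "http://www.foo.com/docs/a.html" passes both checks, while
  // "http://www.foo.com/docs/a.pdf" is kicked out by the filter-rule.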
  
  /**
   * Adds a rule to the list of rules that decide which URLs found on a page should be followed by the crawler.
   */
  public function addURLFollowRule($regex)
  {
    $check = PHPCrawlerUtils::checkRegexPattern($regex); // Check pattern
    
    if ($check == true)
    {
      $this->url_follow_rules[] = trim($regex);
    }
    return $check;
  }
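  
  // Usage sketch (illustrative only; the pattern is made up). The method
  // returns TRUE if the given pattern is a valid regex, FALSE otherwise:
  //
  //   $ok = $filter->addURLFollowRule("#^http://www\.foo\.com/news/#");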
  
  /**
   * Adds a rule to the list of rules that decide which URLs found on a page should be ignored by the crawler.
   */
  public function addURLFilterRule($regex)
  {
    $check = PHPCrawlerUtils::checkRegexPattern($regex); // Check pattern
    
    if ($check == true)
    {
      $this->url_filter_rules[] = trim($regex);
    }
    return $check;
  }
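  
  // Usage sketch (illustrative only; the pattern is made up), e.g. to keep
  // the crawler away from image-files:
  //
  //   $filter->addURLFilterRule("#\.(jpg|jpeg|gif|png)$#i");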
  
  /**
   * Adds a bunch of rules to the list of rules that decide which URLs found on a page should be ignored by the crawler.
   */
  public function addURLFilterRules($regex_array)
  {
    $cnt = count($regex_array);
    for ($x=0; $x<$cnt; $x++)
    {
      $this->addURLFilterRule($regex_array[$x]);
    }
  }
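  
  // Usage sketch (illustrative only; the patterns are made up):
  //
  //   $filter->addURLFilterRules(array("#\.(jpg|gif)$#i", "#logout#"));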
}
?>