"Fossies" - the Fresh Open Source Software Archive

Member "libs/PHPCrawler/UrlCache/PHPCrawlerURLCacheBase.class.php" (11 Jan 2013, 3775 Bytes) of package /linux/www/SitemapCreator.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) PHP source code syntax highlighting (style: standard) with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here.

    1 <?php
    2 /**
    3  * Abstract base class for all URL-cache implementations.
    4  *
    5  * @package phpcrawl
    6  * @internal
    7  */
    8 abstract class PHPCrawlerURLCacheBase
    9 {
   10   /** @var array List of priority-rules, each one array("match" => regex, "level" => int) */
   11   protected $url_priorities = array();
   12
   13   /**
   14    * Defines which property of an URL is used to ensure that each URL is only cached once.
   15    *
   16    * @var int One of the URLHASH_.. constants
   17    */
   18   public $url_distinct_property = self::URLHASH_URL;
   19
   20   const URLHASH_URL = 1;     // distinct-hash is built from the rebuilt URL (url_rebuild)
   21   const URLHASH_RAWLINK = 2; // distinct-hash is built from the raw link (link_raw)
   22   const URLHASH_NONE = 3;    // no distinct-hash is built at all
   23
   24   /**
   25    * Returns the next URL from the cache that should be crawled.
   26    *
   27    * @return PHPCrawlerURLDescriptor
   28    */
   29   abstract public function getNextUrl();
   30
   31   /**
   32    * Returns all URLs currently cached in the URL-cache.
   33    *
   34    * @return array Numeric array containing all URLs as PHPCrawlerURLDescriptor-objects
   35    */
   36   abstract public function getAllURLs();
   37
   38   /**
   39    * Removes all URLs and all priority-rules from the URL-cache.
   40    */
   41   abstract public function clear();
   42
   43   /**
   44    * Adds an URL to the url-cache
   45    *
   46    * @param PHPCrawlerURLDescriptor $UrlDescriptor
   47    */
   48   abstract public function addURL(PHPCrawlerURLDescriptor $UrlDescriptor);
   49
   50   /**
   51    * Adds a bunch of URLs to the url-cache
   52    *
   53    * @param array $urls  A numeric array containing the URLs as PHPCrawlerURLDescriptor-objects
   54    */
   55   abstract public function addURLs($urls);
   56
   57   /**
   58    * Checks whether there are URLs left in the cache or not.
   59    *
   60    * @return bool
   61    */
   62   abstract public function containsURLs();
   63
   64   /**
   65    * Marks the given URL in the cache as "followed"
   66    *
   67    * @param PHPCrawlerURLDescriptor $UrlDescriptor
   68    */
   69   abstract public function markUrlAsFollowed(PHPCrawlerURLDescriptor $UrlDescriptor);
   70
   71   /**
   72    * Do cleanups after the cache is not needed anymore
   73    */
   74   abstract public function cleanup();
   75
   76   /**
   77    * Cleans/purges the URL-cache from inconsistent entries.
   78    */
   79   abstract public function purgeCache();
   80
   81   /**
   82    * Returns the distinct-hash for the given URL that ensures that no URLs are cached more than once.
   83    *
   84    * @param PHPCrawlerURLDescriptor $UrlDescriptor The URL to build the hash for
   85    * @return string The md5-hash or NULL if no distinct-hash should be used.
   86    */
   87   protected function getDistinctURLHash(PHPCrawlerURLDescriptor $UrlDescriptor)
   88   {
   89     if ($this->url_distinct_property == self::URLHASH_URL)
   90       return md5($UrlDescriptor->url_rebuild);
   91     elseif ($this->url_distinct_property == self::URLHASH_RAWLINK)
   92       return md5($UrlDescriptor->link_raw);
   93     else
   94       return null; // URLHASH_NONE or any other value: no distinct-hash
   95   }
   96
   97   /**
   98    * Gets the priority-level of the given URL
   99    *
  100    * @param string $url
  101    * @return int Level of the first priority-rule whose regex matches the URL, 0 if none matches
  102    */
  103   protected function getUrlPriority($url)
  104   {
  105     foreach ($this->url_priorities as $priority)
  106     {
  107       if (preg_match($priority["match"], $url))
  108         return $priority["level"];
  109     }
  110     return 0;
  111   }
  112
  113   /**
  114    * Adds a Link-Priority-Level
  115    *
  116    * @param string $regex Regular expression (including delimiters) the URLs get matched against
  117    * @param int    $level Priority-level that gets assigned to matching URLs
  118    */
  119   public function addLinkPriority($regex, $level)
  120   {
  121     // Cast instead of trim(): trim() would turn the documented int level into a string.
  122     $this->url_priorities[] = array("match" => trim($regex), "level" => (int) $level);
  123     // Sort url-priority-array so that high priority-levels come first.
  124     PHPCrawlerUtils::sort2dArray($this->url_priorities, "level", SORT_DESC);
  125   }
  126
  127   /**
  128    * Adds a bunch of link-priorities, entries without the subkeys "match" and "level" are skipped
  129    *
  130    * @param array $priority_array Array of rules, each an array containing the subkeys "match" and "level"
  131    */
  132   public function addLinkPriorities($priority_array)
  133   {
  134     foreach ((array) $priority_array as $priority) // the cast turns NULL into "no rules"
  135     {
  136       if (is_array($priority) && isset($priority["match"], $priority["level"]))
  137         $this->addLinkPriority($priority["match"], $priority["level"]);
  138     }
  139   }
  140 }
  141 ?>