"Fossies" - the Fresh Open Source Software Archive

Member "libs/PHPCrawler/PHPCrawlerLinkFinder.class.php" (8 Jan 2013, 8876 Bytes) of package /linux/www/SitemapCreator.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) PHP source code syntax highlighting (style: standard), with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here.

    1 <?php
    2 /**
    3  * Class for finding links in HTML-documents.
    4  *
    5  * @package phpcrawl
    6  * @internal
    7  */
    8 class PHPCrawlerLinkFinder
    9 {
   10   /**
   11    * Numeric array containing all tags to extract links from
   12    *
   13    * @var array
   14    */
   15   public $extract_tags = array("href", "src", "url", "location", "codebase", "background", "data", "profile", "action", "open");
   16   
   17   /**
   18    * Specifies whether links will also be searched outside of HTML-tags
   19    *
   20    * @var bool
   21    */
   22   public $aggressive_search = true;
   23   
   24   /**
   25    * Specifies whether redirect-links set in http-headers should get found.
   26    *
   27    * @var bool
   28    */
   29   public $find_redirect_urls = true;
   30   
   31   /**
   32    * The URL of the html-source to find links from
   33    *
   34    * @var PHPCrawlerURLDescriptor
   35    */
   36   protected $SourceUrl;
   37   
   38   /**
   39    * Cache for storing found links/urls
   40    *
   41    * @var PHPCrawlerURLCache
   42    */
   43   protected $LinkCache;
   44   
   45   /**
   46    * Flag indicating whether the top lines of the HTML-source were processed.
   47    */
   48   protected $top_lines_processed = false;
   49   
   50   /**
   51    * Parts of the base-url as PHPCrawlerUrlPartsDescriptor-object
   52    *
   53    * @var PHPCrawlerUrlPartsDescriptor
   54    */
   55   protected $baseUrlParts;
   56   
   57   protected $found_links_map = array();
   58   
   59   /**
   60    * Meta-attributes found in the html-source.
   61    *
   62    * @var array
   63    */
   64   protected $meta_attributes = array();
   65   
   66   public function __construct()
   67   {
   68     if (!class_exists("PHPCrawlerMemoryURLCache")) include_once(dirname(__FILE__)."/UrlCache/PHPCrawlerMemoryURLCache.class.php");
   69     $this->LinkCache = new PHPCrawlerMemoryURLCache();
   70     $this->LinkCache->url_distinct_property = PHPCrawlerURLCacheBase::URLHASH_URL;
   71   }
   72   
   73   /**
   74    * Sets the source-URL of the document to find links in
   75    *
   76    * @param PHPCrawlerURLDescriptor $SourceUrl
   77    */
   78   public function setSourceUrl(PHPCrawlerURLDescriptor $SourceUrl)
   79   {
   80     $this->SourceUrl = $SourceUrl;
   81     $this->baseUrlParts = PHPCrawlerUrlPartsDescriptor::fromURL($SourceUrl->url_rebuild);
   82   }
   83   
   84   /**
   85    * Processes the response-header of the document.
   86    *
   87    * @param &string $header The response-header of the document.
   88    */
   89   public function processHTTPHeader(&$header)
   90   {
   91     if ($this->find_redirect_urls == true)
   92     {
   93       $this->findRedirectLinkInHeader($header);
   94     }
   95   }
   96   
   97   /**
   98    * Resets/clears the internal link-cache.
   99    */
  100   public function resetLinkCache()
  101   {
  102     $this->LinkCache->clear();
  103     $this->top_lines_processed = false;
  104   }
  105   
  106   /**
  107    * Checks for a redirect-URL in the given http-header and adds it to the internal link-cache.
  108    */
  109   protected function findRedirectLinkInHeader(&$http_header)
  110   {
  111     PHPCrawlerBenchmark::start("checking_for_redirect_link");
  112     
  113     // Get redirect-URL or link from header
  114     $redirect_link = PHPCrawlerUtils::getRedirectURLFromHeader($http_header);
  115     
  116     // Add redirect-URL to linkcache
  117     if ($redirect_link != null)
  118     {
  119       // Rebuild URL
  120       $url_rebuild = PHPCrawlerUtils::buildURLFromLink($redirect_link, $this->baseUrlParts);
  121       
  122       // Add URL to cache
  123       $UrlDescriptor = new PHPCrawlerURLDescriptor($url_rebuild, $redirect_link, "", "", $this->SourceUrl->url_rebuild);
  124       $UrlDescriptor->is_redirect_url = true;
  125       $this->LinkCache->addURL($UrlDescriptor);
  126     }
  127     
  128      PHPCrawlerBenchmark::stop("checking_for_redirect_link");
  129   }
  130   
  131   /**
  132    * Searches for links in the given HTML-chunk and adds found links the the internal link-cache.
  133    */
  134   public function findLinksInHTMLChunk(&$html_source)
  135   {
  136     PHPCrawlerBenchmark::start("searching_for_links_in_page");
  137     
  138     // Check for meta-base-URL and meta-tags in top of HTML-source
  139     if ($this->top_lines_processed == false)
  140     {
  141       $meta_base_url = PHPCrawlerUtils::getBaseUrlFromMetaTag($html_source);
  142       if ($meta_base_url != null)
  143       {
  144         $base_url = PHPCrawlerUtils::buildURLFromLink($meta_base_url, $this->baseUrlParts);
  145         $this->baseUrlParts = PHPCrawlerUrlPartsDescriptor::fromURL($base_url);
  146       }
  147       
  148       // Get all meta-tags
  149       $this->meta_attributes = PHPCrawlerUtils::getMetaTagAttributes($html_source);
  150       
  151       // Set flag that top-lines of source were processed
  152       $this->top_lines_processed == true;
  153     }
  154     
  155     // Build the RegEx-part for html-tags to search links in
  156     $tag_regex_part = "";
  157     $cnt = count($this->extract_tags);
  158     for ($x=0; $x<$cnt; $x++)
  159     {
  160       $tag_regex_part .= "|".$this->extract_tags[$x];
  161     }
  162     $tag_regex_part = substr($tag_regex_part, 1);
  163     
  164     // 1. <a href="...">LINKTEXT</a> (well formed link with </a> at the end and quotes around the link)
  165     // Get the link AND the linktext from these tags
  166     // This has to be done FIRST !!              
  167     preg_match_all("#<\s*a\s[^<>]*(?<=\s)(?:".$tag_regex_part.")\s*=\s*".
  168                    "(?|\"([^\"]+)\"|'([^']+)'|([^\s><'\"]+))[^<>]*>".
  169                    "((?:(?!<\s*\/a\s*>).){0,500})".
  170                    "<\s*\/a\s*># is", $html_source, $matches);
  171                           
  172     $cnt = count($matches[0]);
  173     for ($x=0; $x<$cnt; $x++)
  174     {  
  175       $link_raw = trim($matches[1][$x]);
  176       $linktext = $matches[2][$x];
  177       $linkcode = trim($matches[0][$x]);
  178 
  179       if (!empty($link_raw)) $this->addLinkToCache($link_raw, $linkcode, $linktext);
  180     }
  181                    
  182     // Second regex (everything that could be a link inside of <>-tags)
  183     preg_match_all("#<[^<>]*\s(?:".$tag_regex_part.")\s*=\s*".
  184                    "(?|\"([^\"]+)\"|'([^']+)'|([^\s><'\"]+))[^<>]*># is", $html_source, $matches);
  185 
  186     $cnt = count($matches[0]);
  187     for ($x=0; $x<$cnt; $x++)
  188     {
  189       $link_raw = trim($matches[1][$x]);
  190       $linktext = "";
  191       $linkcode = trim($matches[0][$x]);
  192       
  193       if (!empty($link_raw)) $this->addLinkToCache($link_raw, $linkcode, $linktext);
  194     }
  195     
  196     // Now, if agressive_mode is set to true, we look for some
  197     // other things
  198     $pregs = array();
  199     if ($this->aggressive_search == true)
  200     {
  201       // Links like "...:url("animage.gif")..."
  202       $pregs[]="/[\s\.:;](?:".$tag_regex_part.")\s*\(\s*([\"|']{0,1})([^\"'\) ]{1,500})['\"\)]/ is";
  203       
  204       // Everything like "...href="bla.html"..." with qoutes
  205       $pregs[]="/[\s\.:;](?:".$tag_regex_part.")\s*=\s*([\"|'])(.{0,500}?)\\1/ is";
  206       
  207       // Everything like "...href=bla.html..." without qoutes
  208       $pregs[]="/[\s\.:;](?:".$tag_regex_part.")\s*(=)\s*([^\s\">']{1,500})/ is";
  209       
  210       for ($x=0; $x<count($pregs); $x++)
  211       {
  212         unset($matches);
  213         preg_match_all($pregs[$x], $html_source, $matches);
  214         
  215         $cnt = count($matches[0]);
  216         for ($y=0; $y<$cnt; $y++)
  217         {
  218           $link_raw = trim($matches[2][$y]);
  219           $linkcode = trim($matches[0][$y]);
  220           $linktext = "";
  221           
  222           $this->addLinkToCache($link_raw, $linkcode, $linktext);
  223         }
  224       }
  225     }
  226     
  227     $this->found_links_map = array();
  228     
  229     PHPCrawlerBenchmark::stop("searching_for_links_in_page");
  230   }
  231   
  232   protected function addLinkToCache($link_raw, $link_code, $link_text = "")
  233   {
  234     //PHPCrawlerBenchmark::start("preparing_link_for_cache");
  235     
  236     // If liks already was found and processed -> skip this link
  237     if (isset($this->found_links_map[$link_raw])) return;
  238     
  239     // Rebuild URL from link
  240     $url_rebuild = PHPCrawlerUtils::buildURLFromLink($link_raw, $this->baseUrlParts);
  241 
  242     // If link coulnd't be rebuild
  243     if ($url_rebuild == null) return;
  244     
  245     // Create an PHPCrawlerURLDescriptor-object with URL-data
  246     $UrlDescriptor = new PHPCrawlerURLDescriptor($url_rebuild, $link_raw, $link_code, $link_text, $this->SourceUrl->url_rebuild);
  247     
  248     // Add the PHPCrawlerURLDescriptor-object to LinkCache
  249     $this->LinkCache->addURL($UrlDescriptor);
  250         
  251     // Add the PHPCrawlerURLDescriptor-object to found-links-array
  252     $map_key = $link_raw;
  253     $this->found_links_map[$map_key] = true;
  254     
  255     //PHPCrawlerBenchmark::stop("preparing_link_for_cache");
  256   }
  257   
  258   /**
  259    * Returns all URLs/links found so far in the document.
  260    *
  261    * @return array Numeric array containing all URLs as PHPCrawlerURLDescriptor-objects
  262    */
  263   public function getAllURLs()
  264   {
  265     return $this->LinkCache->getAllURLs();
  266   }
  267   
  268   /**
  269    * Returns all meta-tag attributes found so far in the document.
  270    *
  271    * @return array Assoziative array conatining all found meta-attributes.
  272    *               The keys are the meta-names, the values the content of the attributes.
  273    *               (like $tags["robots"] = "nofollow")
  274    *
  275    */
  276   public function getAllMetaAttributes()
  277   {
  278     return $this->meta_attributes;
  279   }
  280 }
  281 ?>