"Fossies" - the Fresh Open Source Software Archive

Member "libs/PHPCrawler/PHPCrawlerUtils.class.php" (8 Jan 2013, 19251 Bytes) of package /linux/www/SitemapCreator.tar.gz:


As a special service, "Fossies" has tried to format the requested source file as HTML, using (guessed) PHP syntax highlighting (style: standard) with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here.

    1 <?php
    /**
     * Static util-methods used by phpcrawl.
     *
     * A collection of stateless helpers for splitting/building/normalizing URLs,
     * picking values out of raw HTTP-headers and HTML-source, plus a few
     * filesystem and string utilities.
     *
     * @package phpcrawl
     * @internal
     */
    class PHPCrawlerUtils
    {
      /**
       * Splits an URL into its parts
       *
       * @param string $url  The URL. If it carries no protocol, "http://" is assumed.
       * @return array       An array containing the parts of the URL,
       *                     or NULL if the URL is too malformed to be parsed.
       *
       *                     The keys are:
       *
       *                     "protocol" (e.g. "http://")
       *                     "host"     (e.g. "www.bla.de")
       *                     "path"     (e.g. "/test/palimm/")
       *                     "file"     (e.g. "index.htm")
       *                     "query"    (e.g. "?foo=bar", including the leading "?")
       *                     "domain"   (e.g. "foo.com")
       *                     "port"     (e.g. 80)
       *                     "auth_username"
       *                     "auth_password"
       */
      public static function splitURL($url)
      {
        // Add a protocol to the URL (parse_url() can't cope without one)
        if (!preg_match("#^[a-z]+://# i", $url))
          $url = "http://" . $url;

        $parts = @parse_url($url);

        // parse_url() returns FALSE for seriously malformed URLs
        if (!is_array($parts)) return null;

        $protocol = (isset($parts["scheme"]) ? $parts["scheme"]."://" : "http://");
        $host = (isset($parts["host"]) ? $parts["host"] : "");
        $path = (isset($parts["path"]) ? $parts["path"] : "");
        $query = (isset($parts["query"]) ? "?".$parts["query"] : "");
        $auth_username = (isset($parts["user"]) ? $parts["user"] : "");
        $auth_password = (isset($parts["pass"]) ? $parts["pass"] : "");
        $port = (isset($parts["port"]) ? $parts["port"] : "");

        // File: everything behind the last "/" of the path
        $file = "";
        if (preg_match("#^(.*/)([^/]*)$#", $path, $match))
        {
          $file = trim($match[2]);
          $path = trim($match[1]);
        }

        // The domain-name taken from the host
        // Host: www.foo.com -> Domain: foo.com
        // Hosts with less than two dots and plain IP-addresses are kept as they are.
        $is_ip = (bool)preg_match("#^[0-9]+$#", str_replace(".", "", $host));
        if (substr_count($host, ".") < 2 || $is_ip)
        {
          $domain = $host;
        }
        else
        {
          $domain = substr($host, strpos($host, ".") + 1);
        }

        // If the port is empty -> set it to 80 or 443 (depending on the protocol)
        if ($port == "")
        {
          if (strtolower($protocol) == "http://") $port = 80;
          if (strtolower($protocol) == "https://") $port = 443;
        }

        // If the path is empty -> path is "/"
        if ($path == "") $path = "/";

        return array("protocol" => $protocol,
                     "host" => $host,
                     "path" => $path,
                     "file" => $file,
                     "query" => $query,
                     "domain" => $domain,
                     "port" => $port,
                     "auth_username" => $auth_username,
                     "auth_password" => $auth_password);
      }

      /**
       * Builds an URL from it's single parts.
       *
       * @param array $url_parts Array containing the URL-parts.
       *                         The keys should be:
       *
       *                         "protocol" (e.g. "http://") OPTIONAL, defaults to "http://"
       *                         "host"     (e.g. "www.bla.de")
       *                         "path"     (e.g. "/test/palimm/") OPTIONAL
       *                         "file"     (e.g. "index.htm") OPTIONAL
       *                         "query"    (e.g. "?foo=bar") OPTIONAL
       *                         "port"     (e.g. 80) OPTIONAL, defaults to 80 for "http://" and
       *                                    443 for "https://"; for other protocols no port is added
       *                         "auth_username" OPTIONAL
       *                         "auth_password" OPTIONAL
       * @param bool $normalize   If TRUE, the URL will be returned normalized.
       *                          (I.e. http://www.foo.com/path/ instead of http://www.foo.com:80/path/)
       * @return string The URL
       * @throws Exception if no host is given
       */
      public static function buildURLFromParts($url_parts, $normalize = false)
      {
        // Host has to be set at least
        if (!isset($url_parts["host"]))
        {
          throw new Exception("Cannot generate URL, host not specified!");
        }

        if (!isset($url_parts["protocol"]) || $url_parts["protocol"] == "") $url_parts["protocol"] = "http://";

        $optional_keys = array("path", "file", "query", "auth_username", "auth_password");
        foreach ($optional_keys as $key)
        {
          if (!isset($url_parts[$key])) $url_parts[$key] = "";
        }

        // Protocols are compared case-insensitively ("HTTP://" is the same as "http://")
        $protocol_lc = strtolower($url_parts["protocol"]);
        $is_http = ($protocol_lc == "http://");
        $is_https = ($protocol_lc == "https://");

        // Default-port depends on the protocol, unknown protocols get no port at all
        $port = (isset($url_parts["port"]) ? $url_parts["port"] : "");
        if ((string)$port == "")
        {
          if ($is_http) $port = 80;
          elseif ($is_https) $port = 443;
        }

        // Authentication-part (a username without password is valid, too)
        $auth_part = "";
        if ($url_parts["auth_username"] != "")
        {
          $auth_part = $url_parts["auth_username"];
          if ($url_parts["auth_password"] != "") $auth_part .= ":".$url_parts["auth_password"];
          $auth_part .= "@";
        }

        // Port-part (left out if there is no port, avoids a dangling ":")
        $port_part = ((string)$port == "" ? "" : ":".$port);

        // Normalize: drop default-ports and don't add ports to links
        // other than "http://" or "https://"
        if ($normalize == true)
        {
          $is_default_port = ($is_http && $port == 80) || ($is_https && $port == 443);
          if ($is_default_port || (!$is_http && !$is_https))
          {
            $port_part = "";
          }
        }

        // Put together the url
        return $url_parts["protocol"] . $auth_part . $url_parts["host"] . $port_part .
               $url_parts["path"] . $url_parts["file"] . $url_parts["query"];
      }

      /**
       * Normalizes an URL
       *
       * I.e. converts http://www.foo.com:80/path/ to http://www.foo.com/path/
       *
       * @param string $url
       * @return string OR NULL on failure
       */
      public static function normalizeURL($url)
      {
        $url_parts = self::splitURL($url);

        if ($url_parts === null) return null;

        return self::buildURLFromParts($url_parts, true);
      }

      /**
       * Checks whether a given RegEx-pattern is valid or not.
       *
       * @param string $pattern The pattern including delimiters and modifiers
       * @return bool
       */
      public static function checkRegexPattern($pattern)
      {
        // preg_match() returns an integer for every pattern that compiles, FALSE otherwise
        return is_int(@preg_match($pattern, "anything"));
      }

      /**
       * Gets the HTTP-statuscode from a given response-header.
       *
       * @param string $header  The response-header
       * @return int            The status-code or NULL if no status-code was found.
       */
      public static function getHTTPStatusCode($header)
      {
        // Take the first non-empty line of the header (without touching the
        // global state of strtok())
        $header = ltrim((string)$header, "\n");
        $line_end = strpos($header, "\n");
        $first_line = ($line_end === false ? $header : substr($header, 0, $line_end));

        if (preg_match("# ([0-9]{3})#", $first_line, $match))
          return (int)$match[1];

        return null;
      }

      /**
       * Reconstructs a full qualified and normalized URL from a given link relating to the URL the link was found in.
       *
       * @param string $link          The link (i.e. "../page.htm")
       * @param PHPCrawlerUrlPartsDescriptor $BaseUrlParts  The parts of the URL the link was found in (i.e. "http://www.foo.com/folder/index.html")
       *
       * @return string The rebuild, full qualified and normalized URL the link is leading to (i.e. "http://www.foo.com/page.htm")
       *                Or NULL if the link couldn't be rebuild correctly.
       */
      public static function buildURLFromLink($link, PHPCrawlerUrlPartsDescriptor $BaseUrlParts)
      {
        $url_parts = $BaseUrlParts->toArray();

        // "protocol://host[:port]" of the page the link was found in
        $base_port = (string)$url_parts["port"];
        $base_root = $url_parts["protocol"].$url_parts["host"].($base_port != "" ? ":".$base_port : "");

        // Entities-replacements.
        // "&amp;" is decoded LAST, otherwise something like "&amp;lt;" would get decoded twice.
        $entities = array("'&(quot|#34);'i",
                          "'&(lt|#60);'i",
                          "'&(gt|#62);'i",
                          "'&(nbsp|#160);'i",
                          "'&(iexcl|#161);'i",
                          "'&(cent|#162);'i",
                          "'&(pound|#163);'i",
                          "'&(copy|#169);'i",
                          "'&(amp|#38);'i");

        $replace = array("\"",
                         "<",
                         ">",
                         " ",
                         chr(161),
                         chr(162),
                         chr(163),
                         chr(169),
                         "&");

        // Remove "#..." at end, but ONLY at the end,
        // not if # is at the beginning !
        $link = preg_replace("/^(.{1,})#.{0,}$/", "\\1", $link);

        // Cases

        // Protocol-relative link like "//foo.com/bar.htm" -> inherits the protocol of the base-URL
        if (substr($link, 0, 2) == "//")
        {
          $scheme = preg_replace("#//$#", "", $url_parts["protocol"]);
          if ($scheme == "") $scheme = "http:";

          $link = $scheme.$link;
        }

        // 1. relative link starts with "/" --> doc_root
        // "/index.html" -> "http://www.foo.com/index.html"
        elseif (substr($link, 0, 1) == "/")
        {
          $link = $base_root.$link;
        }

        // 2. "./foo.htm" -> "foo.htm"
        elseif (substr($link, 0, 2) == "./")
        {
          $link = $base_root.$url_parts["path"].substr($link, 2);
        }

        // 3. Link is an absolute link with a given protocol and host (f.e. "http://...")
        elseif (preg_match("#^[a-z0-9]{1,}(:\/\/)# i", $link))
        {
          // Nothing to do, link stays as it is
        }

        // 4. Link is stuff like "javascript: ..." or something
        elseif (preg_match("/^[a-zA-Z]{0,}:[^\/]{0,1}/", $link))
        {
          $link = "";
        }

        // 5. "../../foo.html" -> remove the last folder from the base-path
        // and remove "../" from the link at the same time until there are
        // no more "../" at the beginning of the link
        elseif (substr($link, 0, 3) == "../")
        {
          $new_path = $url_parts["path"];

          while (substr($link, 0, 3) == "../")
          {
            $new_path = preg_replace('/\/[^\/]{0,}\/$/', "/", $new_path);
            $link = substr($link, 3);
          }

          $link = $base_root.$new_path.$link;
        }

        // 6. link starts with #
        // -> leads to the same site as we are on, trash
        elseif (substr($link, 0, 1) == "#")
        {
          $link = "";
        }

        // 7. link starts with "?" -> same file, different query
        elseif (substr($link, 0, 1) == "?")
        {
          $link = $base_root.$url_parts["path"].$url_parts["file"].$link;
        }

        // 8. thats it, else the absolute path is simply PATH.LINK ...
        else
        {
          $link = $base_root.$url_parts["path"].$link;
        }

        if ($link == "") return null;

        // Now, at last, replace all HTML-entities with normal text.
        // F.e.: HTML-code of the link is: <a href="index.php?x=1&amp;y=2">
        // -> Link has to be "index.php?x=1&y=2"
        $link = preg_replace($entities, $replace, $link);

        // Remove linebreaks from the link (happens if a link is wrapped in the sourcecode)
        $link = str_replace(array("\n", "\r"), "", $link);

        // "Normalize" URL
        return self::normalizeURL($link);
      }

      /**
       * Returns the base-URL specified in a base-tag in the given HTML-source
       *
       * @param string &$html_source The HTML-source (taken by reference, not modified)
       * @return string The base-URL or NULL if not found.
       */
      public static function getBaseUrlFromMetaTag(&$html_source)
      {
        preg_match("#<{1}[ ]{0,}((?i)base){1}[ ]{1,}((?i)href|src)[ ]{0,}=[ ]{0,}(\"|'){0,1}([^\"'><\n ]{0,})(\"|'|>|<|\n| )# i", $html_source, $match);

        if (!isset($match[4])) return null;

        return trim($match[4]);
      }

      /**
       * Returns the redirect-URL from the given HTML-header
       *
       * Looks for a "location:" or "content-location:" line that is terminated by a linebreak.
       *
       * @param string &$header The header (taken by reference, not modified)
       * @return string The redirect-URL or NULL if not found.
       */
      public static function getRedirectURLFromHeader(&$header)
      {
        // Get redirect-link from header
        preg_match("/((?i)location:|content-location:)(.{0,})[\n]/", $header, $match);

        if (!isset($match[2])) return null;

        return trim($match[2]);
      }

      /**
       * Checks whether a given string matches with one of the given regular-expressions.
       *
       * @param &string $string      The string
       * @param array   $regex_array Array containing the regular-expressions to check against.
       *
       * @return bool TRUE if one of the regexes matches the string or if no regexes
       *              were given at all, otherwise FALSE.
       */
      public static function checkStringAgainstRegexArray(&$string, $regex_array)
      {
        // No expressions given -> counts as a match
        if (count($regex_array) == 0) return true;

        // foreach instead of an index-loop, so arrays with arbitrary keys work as well
        foreach ($regex_array as $regex)
        {
          if (preg_match($regex, $string))
          {
            return true;
          }
        }

        return false;
      }

      /**
       * Gets the value of an header-directive from the given HTTP-header.
       *
       * Example:
       * <code>PHPCrawlerUtils::getHeaderValue($header, "content-type");</code>
       *
       * @param string $header    The HTTP-header
       * @param string $directive The header-directive (taken literally, matched case-insensitively)
       *
       * @return string The value of the given directive found in the header
       *                (up to the first linebreak or ";"). Or NULL if not found.
       */
      public static function getHeaderValue($header, $directive)
      {
        // Quote the directive, so that characters like "." aren't treated as regex-syntax
        $quoted_directive = preg_quote($directive, "#");

        preg_match("#[\r\n]".$quoted_directive.":(.*)[\r\n\;]# Ui", $header, $match);

        if (isset($match[1]) && trim($match[1]) != "")
        {
          return trim($match[1]);
        }

        return null;
      }

      /**
       * Returns all cookies from the give response-header.
       *
       * @param string $header      The response-header
       * @param string $source_url  URL the cookie was send from.
       * @return array Numeric array containing all cookies as PHPCrawlerCookieDescriptor-objects.
       */
      public static function getCookiesFromHeader($header, $source_url)
      {
        $cookies = array();

        $hits = preg_match_all("#[\r\n]set-cookie:(.*)[\r\n]# Ui", $header, $matches);

        if ($hits)
        {
          foreach ($matches[1] as $cookie_line)
          {
            $cookies[] = PHPCrawlerCookieDescriptor::getFromHeaderLine($cookie_line, $source_url);
          }
        }

        return $cookies;
      }

      /**
       * Returns the normalized root-URL of the given URL
       *
       * @param string $url The URL, e.g. "www.foo.com/something/index.html"
       * @return string The root-URL, e.g. "http://www.foo.com", or NULL if the URL couldn't be parsed.
       */
      public static function getRootUrl($url)
      {
        $url_parts = self::splitURL($url);

        if ($url_parts === null) return null;

        $root_parts = array("protocol" => $url_parts["protocol"],
                            "host" => $url_parts["host"],
                            "port" => $url_parts["port"]);

        return self::normalizeURL(self::buildURLFromParts($root_parts));
      }

      /**
       * Deletes a directory recursively
       *
       * Does nothing if the given path is not a directory.
       *
       * @param string $dir The directory to delete
       */
      public static function rmDir($dir)
      {
        if (!is_dir($dir)) return;

        $objects = scandir($dir);

        // Directory isn't readable -> nothing we can do here
        if ($objects === false) return;

        foreach ($objects as $object)
        {
          if ($object == "." || $object == "..") continue;

          $object_path = $dir.DIRECTORY_SEPARATOR.$object;

          if (filetype($object_path) == "dir")
            self::rmDir($object_path);
          else
            unlink($object_path);
        }

        rmdir($dir);
      }

      /**
       * Serializes data (objects, arrays etc.) and writes it to the given file.
       *
       * @param string $target_file The file to write to
       * @param mixed  $data        The data to serialize
       */
      public static function serializeToFile($target_file, $data)
      {
        file_put_contents($target_file, serialize($data));
      }

      /**
       * Returns deserialized data that is stored in a file.
       *
       * @param string $file The file containing the serialized data
       *
       * @return mixed The data or NULL if the file doesn't exist
       */
      public static function deserializeFromFile($file)
      {
        if (!file_exists($file)) return null;

        // NOTE(review): unserialize() must only be used on trusted files (i.e. files
        // written by serializeToFile()), it is not safe for data from foreign sources.
        return unserialize(file_get_contents($file));
      }

      /**
       * Sorts a twodimensional array.
       *
       * Takes a variable number of arguments: after the array, every string-argument is
       * the name of a field (column) to sort by, every other argument is passed through
       * to array_multisort() as a sort-flag (like SORT_ASC) for the preceding field.
       *
       * Example:
       * <code>PHPCrawlerUtils::sort2dArray($rows, "name", SORT_ASC, "age", SORT_DESC);</code>
       *
       * @param array &$array    The array to sort (sorted in place)
       * @param mixed $sort_args First field-name, further fields/flags may follow
       */
      public static function sort2dArray(&$array, $sort_args)
      {
        // Nothing to sort
        if (!is_array($array) || count($array) == 0) return;

        $args = func_get_args();
        $arg_count = count($args);

        $columns = array();
        $params = array();

        // Build the arguments for array_multisort (it needs references)
        for ($x = 1; $x < $arg_count; $x++)
        {
          if (is_string($args[$x]))
          {
            // Argument is a sort-field -> extract its column from all rows
            $columns[$x] = array();
            foreach ($array as $row)
            {
              $columns[$x][] = $row[$args[$x]];
            }

            $params[] = &$columns[$x];
          }
          else
          {
            // Argument is a sort-flag like "SORT_ASC"
            $params[] = &$args[$x];
          }
        }

        // The last parameter always is the array to sort (reference!)
        $params[] = &$array;

        // Sort the array
        call_user_func_array("array_multisort", $params);

        reset($array);
      }

      /**
       * Determinates the systems temporary-directory.
       *
       * @return string The path of the directory with a trailing "/"
       */
      public static function getSystemTempDir()
      {
        return rtrim(sys_get_temp_dir(), "/\\")."/";
      }

      /**
       * Gets all meta-tag attributes from the given HTML-source.
       *
       * Only meta-tags having the form <meta name=... content=...> (in this order) are found.
       *
       * @param &string &$html_source
       * @return array Associative array containing all found meta-attributes.
       *               The keys are the lowercased meta-names, the values the lowercased
       *               content of the attributes.
       *               (like $tags["robots"] = "nofollow")
       *
       */
      public static function getMetaTagAttributes(&$html_source)
      {
        preg_match_all("#<\s*meta\s+".
                       "name\s*=\s*(?|\"([^\"]+)\"|'([^']+)'|([^\s><'\"]+))\s+".
                       "content\s*=\s*(?|\"([^\"]+)\"|'([^']+)'|([^\s><'\"]+))".
                       ".*># Uis", $html_source, $matches);

        $tags = array();
        foreach ($matches[1] as $index => $name)
        {
          $tags[strtolower(trim($name))] = strtolower(trim($matches[2][$index]));
        }

        return $tags;
      }

      /**
       * Checks whether the given string is an UTF8-encoded string.
       *
       * A string is taken as UTF-8 if converting it from UTF-8 to UTF-8 by iconv
       * leaves it completely untouched.
       *
       * @param string $string The string
       * @return bool TRUE if the string is UTF-8 encoded.
       */
      public static function isUTF8String($string)
      {
        $string = (string)$string;

        // Strict comparison of the strings themselves (iconv gives FALSE or a
        // different string for invalid input)
        return @iconv('utf-8', 'utf-8', $string) === $string;
      }

      /**
       * Checks whether the given string is a valid, urlencoded URL (by RFC)
       *
       * Valid means that the string only consists of the unreserved and reserved
       * characters defined in RFC 3986 and the "%" used for percent-encoding.
       *
       * @param string $string The string
       * @return bool TRUE if the string is a valid url-string.
       */
      public static function isValidUrlString($string)
      {
        // unreserved: a-z 0-9 - . _ ~     reserved: : / ? # [ ] @ ! $ & ' ( ) * + , ; =
        return (preg_match('/^[a-z0-9\-._~:\/?#\[\]@!$&\'()*+,;=%]+$/i', $string) == 1);
      }
    }
  636 ?>