"Fossies" - the Fresh Open Source Software Archive

Member "libs/PHPCrawler/PHPCrawlerHTTPRequest.class.php" (8 Jan 2013, 31934 Bytes) of package /linux/www/SitemapCreator.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) PHP source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file.

    1 <?php
    2 /**
    3  * Class for performing HTTP-requests.
    4  *
    5  * @package phpcrawl
    6  * @internal
    7  */
    8 class PHPCrawlerHTTPRequest
    9 {
   10   /**
   11    * The user-agent-string
   12    */
   13   public $userAgentString = "PHPCrawl";
   14   
   15   /**
   16    * Timeout-value for socket-connection
   17    */
   18   public $socketConnectTimeout = 5;
   19   
   20   /**
   21    * Socket-read-timeout
   22    */
   23   public $socketReadTimeout = 2;
   24   
   25   /**
   26    * Limit for content-size to receive
   27    *
   28    * @var int The limit in bytes
   29    */
   30   protected $content_size_limit = 0;
   31   
   32   /**
   33    * Global counter for traffic this instance of the HTTPRequest-class caused.
   34    *
   35    * @var int Traffic in bytes
   36    */
   37   protected $global_traffic_count = 0;
   38   
   39   /**
   40    * The time it took to receive data-packets for the request.
   41    *
   42    * @var float time in seconds and milliseconds.
   43    */
   44   protected $data_transfer_time = 0;
   45   
   46   /**
   47    * Contains all rules defining the content-types that should be received
   48    *
   49    * @var array Numeric array containing the regex-rules
   50    */
   51   protected $receive_content_types = array();
   52   
   53   /**
   54    * Contains all rules defining the content-types of pages/files that should be streamed directly to
   55    * a temporary file (instead of to memory)
   56    *
   57    * @var array Numeric array containing the regex-rules
   58    */
   59   protected $receive_to_file_content_types = array();
   60   
   61   /**
   62    * Contains all rules defining the content-types of documents that should get checked for links.
   63    *
   64    * @var array Numeric array containing the regex-rules
   65    */
   66   protected $linksearch_content_types = array("#text/html# i");
   67   
   68   /**
   69    * The TMP-File to use when a page/file should be streamed to file.
   70    *
   71    * @var string
   72    */
   73   protected $tmpFile = "phpcrawl.tmp";
   74   
   75   /**
   76    * The URL for the request as PHPCrawlerURLDescriptor-object
   77    *
   78    * @var PHPCrawlerURLDescriptor
   79    */
   80   protected $UrlDescriptor;
   81   
   82   /**
   83    * The parts of the URL for the request as returned by PHPCrawlerUtils::splitURL()
   84    *
   85    * @var array
   86    */
   87   protected $url_parts = array();
   88   
   89   /**
   90    * DNS-cache
   91    *
   92    * @var PHPCrawlerDNSCache
   93    */
   94   public $DNSCache;
   95   
   96   /**
   97    * Link-finder object
   98    *
   99    * @var PHPCrawlerLinkFinder
  100    */
  101   protected $LinkFinder;
  102   
  103   /**
  104    * The last response-header this request-instance received.
  105    */
  106   protected $lastResponseHeader;
  107   
  108   /**
  109    * Array containing cookies to send with the request
  110    *
  111    * @var array
  112    */
  113   protected $cookie_array = array();
  114   
  115   /**
  116    * Array containing POST-data to send with the request
  117    *
  118    * @var array
  119    */
  120   protected $post_data = array();
  121   
  122   /**
  123    * The proxy to use
  124    *
  125    * @var array Array containing the keys "proxy_host", "proxy_port", "proxy_username", "proxy_password".
  126    */
  127   protected $proxy;
  128   
  129   /**
  130    * The socket used for HTTP-requests
  131    */
  132   protected $socket;
  133   
  134   protected $header_check_callback_function = null;
  135   
  public function __construct()
  {
    // Collaborator classes this request-class relies on. Each one lives in a file named
    // "<ClassName>.class.php" next to this file and is only included if not declared yet.
    $required_classes = array(
      "PHPCrawlerLinkFinder",
      "PHPCrawlerDNSCache",
      "PHPCrawlerCookieDescriptor",
      "PHPCrawlerResponseHeader"
    );

    foreach ($required_classes as $class_name)
    {
      if (!class_exists($class_name))
      {
        include_once(__DIR__."/".$class_name.".class.php");
      }
    }

    // Link-finder and DNS-cache are owned by this instance
    $this->LinkFinder = new PHPCrawlerLinkFinder();
    $this->DNSCache = new PHPCrawlerDNSCache();
  }
  152   
  153   /**
  154    * Sets the URL for the request.
  155    *
  156    * @param PHPCrawlerURLDescriptor $UrlDescriptor An PHPCrawlerURLDescriptor-object containing the URL to request
  157    */
  public function setUrl(PHPCrawlerURLDescriptor $UrlDescriptor)
  {
    // Remember the descriptor itself ...
    $this->UrlDescriptor = $UrlDescriptor;

    // ... and cache its rebuilt URL split into parts (protocol, host, path, ...).
    $rebuilt_url = $UrlDescriptor->url_rebuild;
    $this->url_parts = PHPCrawlerUtils::splitURL($rebuilt_url);
  }
  165   
  166   /**
  167    * Adds a cookie to send with the request.
  168    *
  169    * @param string $name Cookie-name
  170    * @param string $value Cookie-value
  171    */
  public function addCookie($name, $value)
  {
    // Cookies are keyed by name, so adding an existing name replaces its value.
    $cookies = &$this->cookie_array;
    $cookies[$name] = $value;
  }
  176   
  177   /**
  178    * Adds a cookie to send with the request.
  179    *
  180    * @param PHPCrawlerCookieDescriptor $Cookie
  181    */
  public function addCookieDescriptor(PHPCrawlerCookieDescriptor $Cookie)
  {
    // Only name and value of the descriptor are used for the request.
    $cookie_name = $Cookie->name;
    $cookie_value = $Cookie->value;

    $this->addCookie($cookie_name, $cookie_value);
  }
  187   
  188   /**
  189    * Adds a bunch of cookies to send with the request
  190    *
  191    * @param array $cookies Numeric array containing cookies as PHPCrawlerCookieDescriptor-objects
  192    */
  public function addCookieDescriptors($cookies)
  {
    // Nothing to add if no iterable list was passed (e.g. NULL when a page set no cookies).
    if (!is_array($cookies) && !($cookies instanceof Traversable)) return;

    // Iterate over the actual elements instead of the indexes 0..count-1, so arrays with
    // gaps or non-numeric keys don't pass an undefined element to addCookieDescriptor().
    foreach ($cookies as $Cookie)
    {
      $this->addCookieDescriptor($Cookie);
    }
  }
  201   
  202   /**
  203    * Removes all cookies to send with the request.
  204    */
  public function clearCookies()
  {
    // Start over with an empty cookie-list; subsequent requests send no "Cookie"-header
    // until cookies get added again.
    $no_cookies = array();
    $this->cookie_array = $no_cookies;
  }
  209   
  210   /**
  211    * Sets the html-tags from which to extract/find links from.
  212    *
  213    * @param array $tag_array Numeric array containing the tags, i.g. array("href", "src", "url", ...)
  214    * @return bool
  215    */
  public function setLinkExtractionTags($tag_array)
  {
    // Accept arrays only; anything else is rejected and leaves the LinkFinder untouched.
    if (is_array($tag_array))
    {
      $this->LinkFinder->extract_tags = $tag_array;
      return true;
    }

    return false;
  }
  223   
  224   /**
  225    * Specifies whether redirect-links set in http-headers should get searched for.
  226    *
  227    * @return bool
  228    */
  public function setFindRedirectURLs($mode)
  {
    // Only a real boolean is accepted; the flag is handed through to the LinkFinder.
    if (is_bool($mode))
    {
      $this->LinkFinder->find_redirect_urls = $mode;
      return true;
    }

    return false;
  }
  237   
  238   /**
  239    * Adds post-data to send with the request.
  240    */
  public function addPostData($key, $value)
  {
    // POST-fields are keyed by name; a repeated key overwrites the earlier value.
    // As soon as at least one field is set, buildRequestHeader() issues a POST-request.
    $fields = &$this->post_data;
    $fields[$key] = $value;
  }
  245   
  246   /**
  247    * Removes all post-data to send with the request.
  248    */
  public function clearPostData()
  {
    // With no POST-fields left, buildRequestHeader() falls back to GET-requests.
    $no_fields = array();
    $this->post_data = $no_fields;
  }
  253   
  /**
   * Sets the proxy-server all following requests get sent through.
   *
   * @param string $proxy_host     Hostname or IP of the proxy
   * @param int    $proxy_port     Port of the proxy
   * @param string $proxy_username Optional username for proxy-authentication
   * @param string $proxy_password Optional password for proxy-authentication
   */
  public function setProxy($proxy_host, $proxy_port, $proxy_username = null, $proxy_password = null)
  {
    $this->proxy = array(
      "proxy_host"     => $proxy_host,
      "proxy_port"     => $proxy_port,
      "proxy_username" => $proxy_username,
      "proxy_password" => $proxy_password
    );
  }
  262   
  263   /**
  264    * Sets basic-authentication login-data for protected URLs.
  265    */
  public function setBasicAuthentication($username, $password)
  {
    // The credentials are stored alongside the URL-parts. Note that setUrl() replaces
    // $this->url_parts as a whole, so credentials have to be set after setUrl().
    $credentials = array("auth_username" => $username, "auth_password" => $password);

    foreach ($credentials as $part_name => $part_value)
    {
      $this->url_parts[$part_name] = $part_value;
    }
  }
  271   
  272   /**
  273    * Enables/disables aggressive linksearch
  274    *
  275    * @param bool $mode
  276    * @return bool
  277    */
  public function enableAggressiveLinkSearch($mode)
  {
    // Only a real boolean is accepted; the flag is handed through to the LinkFinder.
    if (is_bool($mode))
    {
      $this->LinkFinder->aggressive_search = $mode;
      return true;
    }

    return false;
  }
  285   
  /**
   * Registers a method that gets called by sendRequest() with the received response-header
   * (a PHPCrawlerResponseHeader-object). If that method returns a value less than 0,
   * the content of the document will not be received.
   *
   * @param object &$obj        The object providing the method
   * @param string $method_name Name of the method to call on the object
   */
  public function setHeaderCheckCallbackFunction(&$obj, $method_name)
  {
    // Stored as a classic array-callable for call_user_func()
    $callback = array($obj, $method_name);
    $this->header_check_callback_function = $callback;
  }
  290   
  291   /**
  292    * Sends the HTTP-request and receives the page/file.
  293    *
  294    * @return A PHPCrawlerDocumentInfo-object containing all information about the received page/file
  295    */
  public function sendRequest()
  {
    // Prepare LinkFinder
    $this->LinkFinder->resetLinkCache();
    $this->LinkFinder->setSourceUrl($this->UrlDescriptor);
    
    // Initiate the Response-object and pass base-infos
    $PageInfo = new PHPCrawlerDocumentInfo();
    $PageInfo->url = $this->UrlDescriptor->url_rebuild;
    $PageInfo->protocol = $this->url_parts["protocol"];
    $PageInfo->host = $this->url_parts["host"];
    $PageInfo->path = $this->url_parts["path"];
    $PageInfo->file = $this->url_parts["file"];
    $PageInfo->query = $this->url_parts["query"];
    $PageInfo->port = $this->url_parts["port"];
    
    // Create header to send
    $request_header_lines = $this->buildRequestHeader();
    $header_string = trim(implode("", $request_header_lines));
    $PageInfo->header_send = $header_string;
    
    // Open socket (errors get reported through the by-reference arguments)
    $this->openSocket($PageInfo->error_code, $PageInfo->error_string);
    
    // If error occured
    if ($PageInfo->error_code != null)
    {
      // An unreachable proxy affects every request -> throw exception
      if ($PageInfo->error_code ==  PHPCrawlerRequestErrors::ERROR_PROXY_UNREACHABLE)
      {
        throw new Exception("Unable to connect to proxy '".$this->proxy["proxy_host"]."' on port '".$this->proxy["proxy_port"]."'");
      }
      
      $PageInfo->error_occured = true;
      return $PageInfo; 
    }
    
    // Send request
    $this->sendRequestHeader($request_header_lines);
    
    // Read response-header
    $response_header = $this->readResponseHeader($PageInfo->error_code, $PageInfo->error_string);
    
    // If error occured
    if ($PageInfo->error_code != null)
    {
      // The socket was opened successfully at this point, so release it before bailing out
      @fclose($this->socket);
      
      $PageInfo->error_occured = true;
      return $PageInfo; 
    }
    
    // Set header-infos
    $this->lastResponseHeader = new PHPCrawlerResponseHeader($response_header, $this->UrlDescriptor->url_rebuild);
    $PageInfo->responseHeader = $this->lastResponseHeader;
    $PageInfo->header = $this->lastResponseHeader->header_raw;
    $PageInfo->http_status_code = $this->lastResponseHeader->http_status_code;
    $PageInfo->content_type = $this->lastResponseHeader->content_type;
    $PageInfo->cookies = $this->lastResponseHeader->cookies;
    
    // Referer-Infos
    if ($this->UrlDescriptor->refering_url != null)
    {
      $PageInfo->referer_url = $this->UrlDescriptor->refering_url;
      $PageInfo->refering_linkcode = $this->UrlDescriptor->linkcode;
      $PageInfo->refering_link_raw = $this->UrlDescriptor->link_raw;
      $PageInfo->refering_linktext = $this->UrlDescriptor->linktext;
    }
      
    // Call header-check-callback (a return-value < 0 aborts the request after the header)
    $ret = 0;
    if ($this->header_check_callback_function != null)
      $ret = call_user_func($this->header_check_callback_function, $this->lastResponseHeader);
    
    // Check if content should be received
    $receive = $this->decideRecevieContent($this->lastResponseHeader);
    
    if ($ret < 0 || $receive == false)
    {
      @fclose($this->socket);
      $PageInfo->received = false;
      $PageInfo->links_found_url_descriptors = $this->LinkFinder->getAllURLs(); // Maybe found a link/redirect in the header
      $PageInfo->meta_attributes = $this->LinkFinder->getAllMetaAttributes();
      return $PageInfo;
    }
    
    $PageInfo->received = true;
    
    // Check if content should be streamed to file
    $stream_to_file = $this->decideStreamToFile($response_header);
                    
    // Read content
    $response_content = $this->readResponseContent($stream_to_file, $PageInfo->error_code, $PageInfo->error_string, $PageInfo->received_completely, $PageInfo->bytes_received);

    // If error occured
    if ($PageInfo->error_code != null)
    {
      $PageInfo->error_occured = true;
    }
    
    @fclose($this->socket);
    
    // Complete ResponseObject
    $PageInfo->content = $PageInfo->source = $response_content;
    $PageInfo->received_completly = $PageInfo->received_completely; // misspelled alias of the property, kept as-is
    $PageInfo->data_transfer_time = $this->data_transfer_time;
    
    // The transfer-time may be 0 (e.g. nothing was transferred because the tmp-file
    // couldn't be opened); dividing by it would raise a DivisionByZeroError as of PHP 8.
    if ($this->data_transfer_time > 0)
    {
      $PageInfo->data_transfer_rate = $PageInfo->bytes_received / $this->data_transfer_time;
    }
    else
    {
      $PageInfo->data_transfer_rate = 0;
    }
    
    if ($stream_to_file == true)
    {
      $PageInfo->received_to_file = true;
      $PageInfo->content_tmp_file = $this->tmpFile;
    }
    else $PageInfo->received_to_memory = true;
    
    $PageInfo->links_found_url_descriptors = $this->LinkFinder->getAllURLs();
    $PageInfo->meta_attributes = $this->LinkFinder->getAllMetaAttributes();
    $PageInfo->setLinksFoundArray();
    
    return $PageInfo;
  }
  418   
  419   /**
  420    * Opens the socket to the host.
  421    *
  422    * @param  int    &$error_code   Error-code by reference if an error occured.
  423    * @param  string &$error_string Error-string by reference
  424    * @return bool   TRUE if socket could be opened, otherwise FALSE.
  425    */
  protected function openSocket(&$error_code, &$error_string)
  {
    // SSL or not?
    $is_https = ($this->url_parts["protocol"] == "https://");
    $protocol_prefix = $is_https ? "ssl://" : "";
    
    // Direct SSL-request, but openssl is not installed -> report it right away.
    // Without this early return the connect-attempt below fails anyway and replaces
    // the error with a misleading "host unreachable".
    // Requests through a proxy are not affected: they connect to the proxy without the ssl-prefix.
    if ($this->proxy == null && $is_https && !extension_loaded("openssl"))
    {
      $error_code = PHPCrawlerRequestErrors::ERROR_SSL_NOT_SUPPORTED;
      $error_string = "Error connecting to ".$this->url_parts["protocol"].$this->url_parts["host"].": SSL/HTTPS-requests not supported, extension openssl not installed.";
      return false;
    }
    
    PHPCrawlerBenchmark::start("connecting_server");
    
    // Get IP for hostname
    $ip_address = $this->DNSCache->getIP($this->url_parts["host"]);
    
    // Open socket (stream_socket_client() fills $error_code and $error_str by reference)
    $error_str = "";
    if ($this->proxy != null)
    {
      $this->socket = @stream_socket_client($this->proxy["proxy_host"].":".$this->proxy["proxy_port"], $error_code, $error_str,
                                           $this->socketConnectTimeout, STREAM_CLIENT_CONNECT);
    }
    else
    {
      // If ssl -> perform Server name indication, since the connection goes to the IP, not the hostname
      if ($is_https)
        $context = stream_context_create(array('ssl' => array('SNI_server_name' => $this->url_parts["host"])));
      else
        $context = stream_context_create(array());
      
      $this->socket = @stream_socket_client($protocol_prefix.$ip_address.":".$this->url_parts["port"], $error_code, $error_str,
                                           $this->socketConnectTimeout, STREAM_CLIENT_CONNECT, $context);
    }
    
    PHPCrawlerBenchmark::stop("connecting_server");
    
    // Socket opened
    if ($this->socket != false) return true;
    
    // Socket not opened -> report which host couldn't be reached
    if ($this->proxy != null)
    {
      $error_code = PHPCrawlerRequestErrors::ERROR_PROXY_UNREACHABLE;
      $error_string = "Error connecting to proxy ".$this->proxy["proxy_host"].": Host unreachable (".$error_str.").";
    }
    else
    {
      $error_code = PHPCrawlerRequestErrors::ERROR_HOST_UNREACHABLE;
      $error_string = "Error connecting to ".$this->url_parts["protocol"].$this->url_parts["host"].": Host unreachable (".$error_str.").";
    }
    
    return false;
  }
  486   
  487   /**
  488    * Send the request-header.
  489    */
  protected function sendRequestHeader($request_header_lines)
  {
    PHPCrawlerBenchmark::start("sending_header");
    
    // Write the header to the socket line by line, in index-order
    $line_count = count($request_header_lines);
    $index = 0;
    while ($index < $line_count)
    {
      fputs($this->socket, $request_header_lines[$index]);
      $index++;
    }
    
    PHPCrawlerBenchmark::stop("sending_header");
  }
  503   
  504   /**
  505    * Reads the response-header.
  506    *
  507    * @param  int    &$error_code   Error-code by reference if an error occured.
  508    * @param  string &$error_string Error-string by reference
  509    * @return string  The response-header or NULL if an error occured
  510    */
  protected function readResponseHeader(&$error_code, &$error_string)
  {
    PHPCrawlerBenchmark::start("server_response_time");
    PHPCrawlerBenchmark::start("data_transfer_time", true);
    
    $socket_state = socket_get_status($this->socket);
    $buffer = "";
    $header = "";
    $first_read_done = false;
    
    while ($socket_state["eof"] == false)
    {
      socket_set_timeout($this->socket, $this->socketReadTimeout);
      
      // Read the next line (or at most 1023 bytes of it) from the socket
      $chunk = fgets($this->socket, 1024);
      
      // The first read returning marks the end of the server-response-time
      if (!$first_read_done)
      {
        $first_read_done = true;
        PHPCrawlerBenchmark::stop("server_response_time");
        PHPCrawlerBenchmark::start("retreiving_header");
      }
      
      $buffer .= $chunk;
      $this->global_traffic_count += strlen($chunk);
      
      $socket_state = socket_get_status($this->socket);
      
      // Socket timed out
      if ($socket_state["timed_out"] == true)
      {
        $error_code = PHPCrawlerRequestErrors::ERROR_SOCKET_TIMEOUT;
        $error_string = "Socket-stream timed out (timeout set to ".$this->socketReadTimeout." sec).";
        return $header;
      }
      
      // A response has to start with "HTTP"
      $response_start = strtolower(substr($buffer, 0, 4));
      if ($response_start != "http")
      {
        $error_code = PHPCrawlerRequestErrors::ERROR_NO_HTTP_HEADER;
        $error_string = "HTTP-protocol error.";
        return $header;
      }
      
      // An empty line terminates the header
      $header_complete = (substr($buffer, -4, 4) == "\r\n\r\n");
      if ($header_complete)
      {
        // Cut off the trailing empty line
        $header = substr($buffer, 0, strlen($buffer)-2);
        
        // Search for links (redirects) in the header
        $this->LinkFinder->processHTTPHeader($header);
        
        PHPCrawlerBenchmark::stop("retreiving_header");
        PHPCrawlerBenchmark::stop("data_transfer_time");
        return $header;
      }
    }
    
    // EOF reached without a complete header ($header is still empty here)
    $error_code = PHPCrawlerRequestErrors::ERROR_NO_HTTP_HEADER;
    $error_string = "Host doesn't respond with a HTTP-header.";
    return null;
  }
  578   
  579   /**
  580    * Reads the response-content.
  581    * 
  582    * @param bool    $stream_to_file If TRUE, the content will be streamed directly to the temporary file and
  583    *                                this method will not return the content as a string.
  584    * @param int     &$error_code    Error-code by reference if an error occured.
  585    * @param string  &$error_string  Error-string by reference
  586    * @param bool    &$document_received_completely Flag indicating whether the content was received completely, passed by reference
  587    * @param int     &$bytes_received Number of bytes received, passed by reference
  588    * @return string  The response-content/source. May be empty if an error occured or data was streamed to the tmp-file.
  589    */
  // Note: $stream_to_file carries no default value anymore. Since required parameters follow it,
  // the default could never be used, and PHP 8 deprecates optional-before-required parameters.
  protected function readResponseContent($stream_to_file, &$error_code, &$error_string, &$document_received_completely, &$bytes_received)
  {
    // Initialise the by-reference results first, so they are defined on every return-path
    $bytes_received = 0;
    $document_received_completely = true;
    
    // If content should be streamed to file -> open it before any timer gets started,
    // so a failure doesn't leave benchmarks running
    $fp = null;
    if ($stream_to_file == true)
    {
      $fp = @fopen($this->tmpFile, "w");
      
      if ($fp == false)
      {
        $error_code = PHPCrawlerRequestErrors::ERROR_TMP_FILE_NOT_WRITEABLE;
        $error_string = "Couldn't open the temporary file ".$this->tmpFile." for writing.";
        $document_received_completely = false;
        return "";
      }
    }
    
    PHPCrawlerBenchmark::start("retreiving_content");
    PHPCrawlerBenchmark::start("data_transfer_time", true);
    
    // Init
    $source_portion = "";   // not yet link-searched part of the content
    $source_complete = "";  // complete content (only filled if not streamed to file)
    $stop_receving = false;
    
    // Content-length as stated in the header. NOTE(review): presumably NULL or empty if the
    // server sent no content-length -- confirm against PHPCrawlerResponseHeader.
    $expected_length = $this->lastResponseHeader->content_length;
    
    while ($stop_receving == false)
    {
      socket_set_timeout($this->socket, $this->socketReadTimeout);
      
      // Read from socket. The @ suppresses the pointless "SSL fatal protocol error"-warning.
      $line_read = @fread($this->socket, 1024);
      if ($line_read === false) $line_read = "";
      
      // Check socket-status
      $status = socket_get_status($this->socket);
      
      // Check for EOF
      if ($status["eof"] == true) $stop_receving = true;
      
      // Socket timed out
      if ($status["timed_out"] == true)
      {
        $stop_receving = true;
        $error_code = PHPCrawlerRequestErrors::ERROR_SOCKET_TIMEOUT;
        $error_string = "Socket-stream timed out (timeout set to ".$this->socketReadTimeout." sec).";
        $document_received_completely = false;
      }
      else
      {
        $source_portion .= $line_read;
        $bytes_received += strlen($line_read);
        $this->global_traffic_count += strlen($line_read);
        
        // Stream to file or store source in memory
        if ($stream_to_file == true)
        {
          @fwrite($fp, $line_read);
        }
        else
        {
          $source_complete .= $line_read;
        }
      }
      
      // Check if content-length stated in the header is reached. Only a numeric length is
      // compared: a missing length would otherwise loosely equal 0 received bytes and abort
      // the transfer after an empty read.
      if (is_numeric($expected_length) && $expected_length == $bytes_received)
      {
        $stop_receving = true;
      }
      
      // Check if contentsize-limit is reached
      if ($this->content_size_limit > 0 && $this->content_size_limit <= $bytes_received)
      {
        $stop_receving = true;
      }
                
      // Find links in portion of the source (every ~100 KB and at the end of the transfer)
      if (strlen($source_portion) >= 100000 || $stop_receving == true)
      {
        if (PHPCrawlerUtils::checkStringAgainstRegexArray($this->lastResponseHeader->content_type, $this->linksearch_content_types))
        {
          // Link-searching doesn't count as transfer-time
          PHPCrawlerBenchmark::stop("retreiving_content");
          PHPCrawlerBenchmark::stop("data_transfer_time");
          
          $this->LinkFinder->findLinksInHTMLChunk($source_portion);
          
          // Keep the tail, so links spanning two portions still get found
          $source_portion = substr($source_portion, -1500);
          
          PHPCrawlerBenchmark::start("retreiving_content");
          PHPCrawlerBenchmark::start("data_transfer_time", true);
        }
      }
    }
    
    if ($stream_to_file == true) @fclose($fp);
    
    PHPCrawlerBenchmark::stop("retreiving_content");
    PHPCrawlerBenchmark::stop("data_transfer_time");
    
    $this->data_transfer_time = PHPCrawlerBenchmark::getElapsedTime("data_transfer_time");
    PHPCrawlerBenchmark::reset("data_transfer_time");
    
    return $source_complete;
  }
  695   
  696   /**
  697    * Builds the request-header from the given settings.
  698    *
  699    * @return array  Numeric array containing the lines of the request-header
  700    */
  protected function buildRequestHeader()
  {
    // Create header
    $headerlines = array();
    
    // Characters that must never end up inside a header-value (they would start a new header-line)
    $line_breaks = array("\r", "\n");
    
    // Method (GET or POST)
    if (count($this->post_data) > 0) $request_type = "POST";
    else $request_type = "GET";
    
    if ($this->proxy != null)
    {
      // A Proxy needs the full qualified URL in the GET or POST headerline.
      $headerlines[] = $request_type." ".$this->UrlDescriptor->url_rebuild ." HTTP/1.0\r\n";
    }
    else
    {
      $query = $this->prepareHTTPRequestQuery($this->url_parts["path"].$this->url_parts["file"].$this->url_parts["query"]);
      $headerlines[] = $request_type." ".$query." HTTP/1.0\r\n";
    }
    
    $headerlines[] = "HOST: ".$this->url_parts["host"]."\r\n";
    
    $headerlines[] = "User-Agent: ".str_replace($line_breaks, "", $this->userAgentString)."\r\n";
    
    // Referer (taken from a crawled document, so strip line-breaks as well)
    if ($this->UrlDescriptor->refering_url != null)
    {
      $headerlines[] = "Referer: ".str_replace($line_breaks, "", $this->UrlDescriptor->refering_url)."\r\n";
    }
    
    // Cookies (buildCookieHeader() returns an empty string if there are none)
    $cookie_header = $this->buildCookieHeader();
    if ($cookie_header != "") $headerlines[] = $cookie_header;
    
    // Authentication (the keys may be missing if no credentials were set)
    $auth_username = isset($this->url_parts["auth_username"]) ? $this->url_parts["auth_username"] : "";
    $auth_password = isset($this->url_parts["auth_password"]) ? $this->url_parts["auth_password"] : "";
    if ($auth_username != "" && $auth_password != "")
    {
      $auth_string = base64_encode($auth_username.":".$auth_password);
      $headerlines[] = "Authorization: Basic ".$auth_string."\r\n";
    }
    
    // Proxy authentication
    if ($this->proxy != null && $this->proxy["proxy_username"] != null)
    {
      $auth_string = base64_encode($this->proxy["proxy_username"].":".$this->proxy["proxy_password"]);
      $headerlines[] = "Proxy-Authorization: Basic ".$auth_string."\r\n";
    }
    
    $headerlines[] = "Connection: close\r\n";
    
    // If POST-request -> append the content-headers and the POST-content itself
    if ($request_type == "POST")
    {
      // Build post-content
      $post_content = $this->buildPostContent();
      
      $headerlines[] = "Content-Type: multipart/form-data; boundary=---------------------------10786153015124\r\n";
      $headerlines[] = "Content-Length: ".strlen($post_content)."\r\n\r\n";
      $headerlines[] = $post_content;
    }
    else
    {
      // Empty line terminates the header
      $headerlines[] = "\r\n";
    }

    return $headerlines;
  }
  767   
  768   /**
  769    * Prepares the given HTTP-query-string for the HTTP-request.
  770    *
  771    * HTTP-query-strings always should be utf8-encoded and urlencoded afterwards.
  772    * So "/path/file?test=tatütata" will be converted to "/path/file?test=tat%C3%BCtata":
  773    *
  774    * @param string The query-string (like "/path/file?test=tatütata")
  775    * @return string
  776    */
  protected function prepareHTTPRequestQuery($query)
  {
    // If string already is a valid URL -> do nothing
    if (PHPCrawlerUtils::isValidUrlString($query))
    {
      return $query;
    }
    
    // Decode query-string (for URLs that are partly urlencoded and partly not)
    $query = rawurldecode($query);
    
    // If query is not utf-8 encoded yet, treat it as ISO-8859-1 and convert it to utf-8.
    // utf8_encode() is deprecated as of PHP 8.2, so it only serves as fallback
    // for installations without the mbstring-extension.
    if (PHPCrawlerUtils::isUTF8String($query) != true)
    {
      if (function_exists("mb_convert_encoding"))
      {
        $query = mb_convert_encoding($query, "UTF-8", "ISO-8859-1");
      }
      else
      {
        $query = utf8_encode($query);
      }
    }
    
    $query = rawurlencode($query);
    
    // Replace url-specific signs back
    $encoded_signs = array("%2F", "%3F", "%3D", "%26");
    $plain_signs   = array("/",   "?",   "=",   "&");
    
    return str_replace($encoded_signs, $plain_signs, $query);
  }
  807   
  808   /**
  809    * Builds the post-content from the postdata-array for the header to send with the request (MIME-style)
  810    *
  811    * @return string  The POST-content (all multipart/form-data parts) to send after the header
  812    */
  protected function buildPostContent()
  {
    $post_content = "";
    
    // One multipart-part per POST-field.
    // (foreach instead of each(): each() doesn't exist anymore as of PHP 8.0)
    foreach ($this->post_data as $key => $value)
    {
      $post_content .= "-----------------------------10786153015124\r\n";
      $post_content .= "Content-Disposition: form-data; name=\"".$key."\"\r\n\r\n";
      $post_content .= $value."\r\n";
    }
    
    // Closing delimiter: a multipart-body has to end with the boundary followed by "--"
    $post_content .= "-----------------------------10786153015124--\r\n";
    
    return $post_content;
  }
  830   
  831   /**
  832    * Builds the cookie-header-part for the header to send.
  833    *
  834    * @return string  The cookie-header-part, i.e. "Cookie: test=bla; palimm=palaber"
  835    */
  protected function buildCookieHeader()
  {
    // Collect all cookies as "name=value"-pairs.
    // (foreach instead of each(): each() doesn't exist anymore as of PHP 8.0)
    $cookie_pairs = array();
    foreach ($this->cookie_array as $key => $value)
    {
      $cookie_pairs[] = $key."=".$value;
    }
    
    // No cookies -> no header-line
    if (count($cookie_pairs) == 0)
    {
      return "";
    }
    
    return "Cookie: ".implode("; ", $cookie_pairs)."\r\n";
  }
  855   
  /**
   * Decides whether the content of the requested page/file should be received.
   *
   * The content-type stated in the response-header gets matched against the rules stored in
   * $this->receive_content_types (filled by addReceiveContentType()).
   *
   * @param PHPCrawlerResponseHeader $responseHeader The response-header as an PHPCrawlerResponseHeader-object
   * @return bool TRUE if the content should be received.
   *              FALSE if the header states no content-type or the rule-check fails.
   */
  protected function decideRecevieContent(PHPCrawlerResponseHeader $responseHeader)
  {
    $type = $responseHeader->content_type;
    
    // Only a response that states a content-type can be checked against the rules
    if ($type != null)
    {
      return PHPCrawlerUtils::checkStringAgainstRegexArray($type, $this->receive_content_types);
    }
    
    return false;
  }
  876   
  /**
   * Decides whether the content of the requested page/file should be streamed directly to file.
   *
   * The "content-type" value read from the given response-header gets matched against the rules
   * stored in $this->receive_to_file_content_types (filled by addStreamToFileContentType()).
   *
   * @param string $response_header The response-header
   * @return bool TRUE if the content should be streamed to TMP-file.
   *              FALSE if there are no rules, no content-type was found or the rule-check fails.
   */
  protected function decideStreamToFile($response_header)
  {
    // Without any stream-to-file rule there is nothing to decide
    if (count($this->receive_to_file_content_types) != 0)
    {
      $type = PHPCrawlerUtils::getHeaderValue($response_header, "content-type");
      
      // A content-type is needed to check against the rules
      if ($type != null)
      {
        return PHPCrawlerUtils::checkStringAgainstRegexArray($type, $this->receive_to_file_content_types);
      }
    }
    
    return false;
  }
  898   
  /**
   * Registers a content-type rule deciding which pages or files should be received.
   *
   * The rule is appended to $this->receive_content_types, the list decideRecevieContent() checks
   * the content-type of a response against. The rule gets stored lowercased and trimmed.
   *
   * NOTE(review): the pattern is validated as given, but stored lowercased - this also lowercases
   * escape-sequences inside the expression (like "\S"); confirm that this is intended.
   *
   * @param string $regex The rule as a regular-expression
   * @return bool TRUE if the rule was added to the list.
   *              FALSE if the given regex is not valid.
   */
  public function addReceiveContentType($regex)
  {
    $is_valid = PHPCrawlerUtils::checkRegexPattern($regex);
    
    // Invalid patterns are not stored, the result of the check is handed back unchanged
    if ($is_valid != true) return $is_valid;
    
    $rule = strtolower($regex);
    $this->receive_content_types[] = trim($rule);
    
    return $is_valid;
  }
  919   
  /**
   * Registers a content-type rule deciding what types of content should be streamed directly to the temporary file.
   *
   * The trimmed rule is appended to $this->receive_to_file_content_types, the list
   * decideStreamToFile() checks the content-type of a response against. Content matching one of
   * these rules is meant to go directly into the temporary file given in setTmpFile() instead of local RAM.
   *
   * @param string $regex The rule as a regular-expression
   * @return bool         TRUE if the regex is valid and the rule was added to the list.
   *                      FALSE if the given regex is not valid.
   */
  public function addStreamToFileContentType($regex)
  {
    $is_valid = PHPCrawlerUtils::checkRegexPattern($regex);
    
    // Invalid patterns are not stored, the result of the check is handed back unchanged
    if ($is_valid != true) return $is_valid;
    
    $rule = trim($regex);
    $this->receive_to_file_content_types[] = $rule;
    
    return $is_valid;
  }
  939   
  /**
   * Sets the temporary file to use when content of found documents should be streamed directly into a temporary file.
   *
   * The file gets opened in write-mode once to find out whether it is writable; the path is only
   * taken over into $this->tmpFile if that succeeds.
   *
   * @param string $tmp_file The TMP-file to use.
   * @return bool TRUE if the file could be opened for writing and was set, otherwise FALSE.
   */
  public function setTmpFile($tmp_file)
  {
    // Opening in "w"-mode serves as the writable-check (warnings are suppressed on purpose)
    $handle = @fopen($tmp_file, "w");
    
    if ($handle)
    {
      fclose($handle);
      $this->tmpFile = $tmp_file;
      return true;
    }
    
    return false;
  }
  961   
  /**
   * Sets the size-limit in bytes for content the request should receive.
   *
   * Only non-negative whole numbers are accepted, given either as a number or as a string
   * consisting of digits only. The accepted value gets stored as an integer in
   * $this->content_size_limit; on rejection the current limit stays untouched.
   *
   * @param int $bytes The limit in bytes
   * @return bool TRUE if the limit was set, FALSE if the given value is not a plain sequence of digits.
   */
  public function setContentSizeLimit($bytes)
  {
    // "+" rejects the empty string, the "D"-modifier keeps "$" from matching in front of a
    // trailing newline (so "123\n" is no longer accepted). Non-scalar values (arrays, objects,
    // null) can't be a byte-count and are rejected before they reach preg_match().
    if (!is_scalar($bytes) || !preg_match('#^[0-9]+$#D', (string)$bytes))
    {
      return false;
    }
    
    $this->content_size_limit = (int)$bytes;
    return true;
  }
  977   
  /**
   * Gives back the value of the global traffic-counter ($this->global_traffic_count) of this
   * instance of the HTTPRequest-class.
   *
   * @return int The traffic in bytes.
   */
  public function getGlobalTrafficCount()
  {
    $traffic = $this->global_traffic_count;
    
    return $traffic;
  }
  987   
  /**
   * Adds a rule to the list of rules that decide what kind of documents should get
   * checked for links in (regarding their content-type)
   *
   * The trimmed rule is appended to $this->linksearch_content_types if it passes the pattern-check.
   *
   * @param string $regex Regular-expression defining the rule
   * @return bool         TRUE if the rule was successfully added.
   *                      FALSE if the given regex is not valid.
   */
  public function addLinkSearchContentType($regex)
  {
    $check = PHPCrawlerUtils::checkRegexPattern($regex); // Check pattern
    
    if ($check == true)
    {
      $this->linksearch_content_types[] = trim($regex);
    }
    return $check;
  }
 1004 }
 1005 ?>