"Fossies" - the Fresh Open Source Software Archive

Member "libs/PHPCrawler/PHPCrawlerDocumentInfo.class.php" (8 Jan 2013, 10245 Bytes) of package /linux/www/SitemapCreator.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) PHP source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here.

    1 <?php
    /**
     * Contains information about a page or file the crawler found and received during the crawling-process.
     *
     * This is a plain data container: every piece of information is exposed as a public property.
     * The only behaviour it offers is {@link setLinksFoundArray()} (rebuilds the legacy $links_found array)
     * and {@link toArray()} (exports all properties as an array).
     *
     * @package phpcrawl
     */
    class PHPCrawlerDocumentInfo
    {
      /**
       * The complete, fully qualified URL of the page or file, e.g. "http://www.foo.com/bar/page.html?x=y".
       *
       * @var string
       * @section 1 URL-related information
       */
      public $url = "";

      /**
       * The protocol-part of the URL of the page or file, e.g. "http://"
       *
       * @var string
       * @section 1 URL-related information
       */
      public $protocol = "";

      /**
       * The host-part of the URL of the requested page or file, e.g. "www.foo.com".
       *
       * @var string
       * @section 1 URL-related information
       */
      public $host = "";

      /**
       * The path in the URL of the requested page or file, e.g. "/page/".
       *
       * @var string
       * @section 1 URL-related information
       */
      public $path = "";

      /**
       * The name of the requested page or file, e.g. "page.html".
       *
       * @var string
       * @section 1 URL-related information
       */
      public $file = "";

      /**
       * The query-part of the URL of the requested page or file, e.g. "?x=y".
       *
       * @var string
       * @section 1 URL-related information
       */
      public $query = "";

      /**
       * The port of the URL the request was sent to, e.g. 80.
       *
       * Has no default value, so it is NULL until it gets assigned.
       *
       * @var int|null
       * @section 1 URL-related information
       */
      public $port;

      /**
       * The complete HTTP-header the webserver sent in response to the request for this page or file.
       *
       * @var string
       * @section 2 Content-related information
       */
      public $header = "";

      /**
       * The complete HTTP-header the webserver sent in response to the request for this page or file,
       * as a PHPCrawlerResponseHeader-object.
       *
       * Has no default value, so it is NULL until it gets assigned.
       *
       * @var PHPCrawlerResponseHeader|null
       * @section 2 Content-related information
       */
      public $responseHeader;

      /**
       * The complete HTTP-request-header the crawler sent to the server (debugging info).
       *
       * @var string
       */
      public $header_send = "";

      /**
       * Flag indicating whether content was received from the page or file.
       *
       * @var bool TRUE if the crawler received at least some source/content of this page or file.
       * @section 2 Content-related information
       */
      public $received = false;

      /**
       * Flag indicating whether content was completely received from the page or file.
       *
       * The content of the current document may not be received completely due to settings made
       * with {@link PHPCrawler::setContentSizeLimit()} and/or {@link PHPCrawler::setTrafficLimit()}.
       *
       * @var bool TRUE if the crawler received the complete source/content of this page or file.
       * @section 2 Content-related information
       */
      public $received_completely = false;

      /**
       * Alias for received_completely, which was spelled wrong in previous versions of phpcrawl.
       *
       * Nothing in this class keeps the alias in sync with $received_completely; it is an
       * independent property that has to be assigned separately by whoever fills this object.
       *
       * @var bool
       * @deprecated Use $received_completely instead.
       * @section 11 Deprecated
       */
      public $received_completly = false;

      /**
       * Will be true if the content was received into local memory.
       *
       * You will have access to the content of the current page or file through $pageInfo->source.
       *
       * @section 2 Content-related information
       * @var bool
       */
      public $received_to_memory = false;

      /**
       * Will be true if the content was received into a temporary file.
       *
       * The content is stored in the temporary file $pageInfo->content_tmp_file in this case.
       *
       * @section 2 Content-related information
       * @var bool
       */
      public $received_to_file = false;

      /**
       * The number of bytes the crawler received of the content of the document.
       *
       * @var int Received bytes
       * @section 2 Content-related information
       */
      public $bytes_received = 0;

      /**
       * The content-type of the page or file, e.g. "text/html" or "image/gif".
       *
       * @var string The content-type
       * @section 2 Content-related information
       */
      public $content_type = "";

      /**
       * The content of the requested document (html-sourcecode or content of file).
       *
       * Will be empty if "received" is FALSE, and won't be complete if "received_completely" is FALSE!
       *
       * @var string
       * @section 2 Content-related information
       */
      public $content = "";

      /**
       * Same as "content", the content of the requested document.
       *
       * @var string
       * @section 2 Content-related information
       */
      public $source = "";

      /**
       * The temporary file to which the content was received.
       *
       * Will be NULL if the content wasn't received to the temporary file.
       *
       * @var string|null
       * @section 2 Content-related information
       */
      public $content_tmp_file = null;

      /**
       * The HTTP-statuscode the webserver responded with, e.g. 200 (OK) or 404 (file not found).
       *
       * Defaults to NULL.
       *
       * @var int|null
       * @section 2 Content-related information
       */
      public $http_status_code = null;

      /**
       * Cookies sent by the server.
       *
       * @var array Numeric array containing all sent cookies as {@link PHPCrawlerCookieDescriptor}-objects.
       * @section 2 Content-related information
       */
      public $cookies = array();

      /**
       * A numeric array containing information about all links that were found in the source of the page.
       *
       * Every element of that numeric array is itself an array with the following keys:
       *
       * link_raw - contains the raw link as it was found
       * url_rebuild - contains the fully qualified URL the link leads to
       * linkcode - the html-codepart that contained the link.
       * linktext - the linktext the link was laid over (may be empty).
       *
       * The array is zero-based, so e.g. $page_data["links_found"][5]["link_raw"] contains the sixth
       * link that was found in the current page. (May be something like "../../foo.html").
       *
       * This array is (re)built from $links_found_url_descriptors by {@link setLinksFoundArray()}.
       *
       * @var array
       * @section 3 Information about found links
       */
      public $links_found = array();

      /**
       * A numeric array containing a PHPCrawlerURLDescriptor-object for every link that was found in the page.
       *
       * Example: Printing the third raw link that was found on the page (the array is zero-based)
       * <code>
       * echo $PageInfo->links_found_url_descriptors[2]->link_raw;
       * </code>
       *
       * @var array Numeric array containing {@link PHPCrawlerURLDescriptor}-objects
       * @section 3 Information about found links
       */
      public $links_found_url_descriptors = array();

      /**
       * The complete URL of the page that contained the link to this document.
       *
       * Defaults to NULL.
       *
       * @var string|null
       * @section 7 Referer information
       */
      public $referer_url = null;

      /**
       * The html-sourcecode that contained the link to the current document.
       *
       * (E.g. <a href="../foo.html">LINKTEXT</a>)
       *
       * Defaults to NULL.
       *
       * @var string|null
       * @section 7 Referer information
       */
      public $refering_linkcode = null;

      /**
       * Contains the raw link as it was found in the content of the referring URL. (E.g. "../foo.html")
       *
       * Defaults to NULL.
       *
       * @var string|null
       * @section 7 Referer information
       */
      public $refering_link_raw = null;

      /**
       * The linktext of the link that "linked" to this document.
       *
       * E.g. if the referring link was <a href="../foo.html">LINKTEXT</a>, the referring linktext is "LINKTEXT".
       * May contain html-tags of course.
       *
       * Defaults to NULL.
       *
       * @var string|null
       * @section 7 Referer information
       */
      public $refering_linktext = null;

      /**
       * Indicates whether an error occurred while requesting/receiving the document.
       *
       * @var bool TRUE if an error occurred.
       * @section 8 Error-handling
       */
      public $error_occured = false;

      /**
       * The code of the error that may have occurred while requesting/receiving the document.
       * (See PHPCrawlerRequestErrors::ERROR_... - constants)
       *
       * Defaults to NULL.
       *
       * @var int|null One of the {@link PHPCrawlerRequestErrors}::ERROR_ ... constants.
       * @section 8 Error-handling
       */
      public $error_code = null;

      /**
       * A human readable string representing the error that may have occurred while
       * requesting/receiving the document.
       *
       * Defaults to NULL.
       *
       * @var string|null A human readable error-string.
       * @section 8 Error-handling
       */
      public $error_string = null;

      /**
       * Indicates whether the traffic-limit set by the user was reached after downloading this document.
       *
       * @var bool TRUE if traffic-limit was reached.
       */
      public $traffic_limit_reached = false;

      /**
       * The time it took to receive the document.
       *
       * Defaults to NULL.
       *
       * @var float|null The time in seconds
       * @section 10 Benchmarks
       */
      public $data_transfer_time = null;

      /**
       * The average data-transferrate for this document.
       *
       * Defaults to NULL.
       *
       * @var float|null The rate in bytes per second.
       * @section 10 Benchmarks
       */
      public $data_transfer_rate = null;

      /**
       * Some internal benchmark-results as array.
       *
       * @var array Array containing some internal benchmark-results for receiving and processing this document.
       *            The keys are the identifiers, the values are the benchmark-times.
       * @section 10 Benchmarks
       * @internal
       */
      public $benchmarks = array();

      /**
       * All meta-tag attributes found in the source of the document.
       *
       * @var array Associative array containing all found meta-attributes.
       *            The keys are the meta-names, the values the content of the attributes.
       *            (like $tags["robots"] = "nofollow")
       * @section 2 Content-related information
       */
      public $meta_attributes = array();

      /**
       * Workaround-method, rebuilds $links_found from $links_found_url_descriptors.
       *
       * Every descriptor-object is converted to an associative array of its publicly accessible
       * properties (via get_object_vars()). The result is a zero-based, gap-free numeric array in the
       * iteration order of $links_found_url_descriptors, whatever keys that array uses.
       *
       * $links_found is replaced, not appended to, so calling this method repeatedly is safe and
       * never produces duplicate entries.
       *
       * @return void
       * @internal
       */
      public function setLinksFoundArray()
      {
        // Build into a local array first and assign once, so $links_found is never left half-filled.
        $links = array();

        // foreach instead of an index-based loop: works for non-sequential keys as well.
        foreach ($this->links_found_url_descriptors as $UrlDescriptor)
        {
          // Convert the descriptor-object to an array
          $links[] = get_object_vars($UrlDescriptor);
        }

        $this->links_found = $links;
      }

      /**
       * Returns an array with all properties of this object.
       *
       * The keys are the property-names, the values the current property-values. Since
       * get_object_vars() is called from inside the class, every property of the object is included.
       *
       * @return array
       * @internal
       */
      public function toArray()
      {
        return get_object_vars($this);
      }
    }
  362 ?>