"Fossies" - the Fresh Open Source Software Archive

Member "libs/PHPCrawler/PHPCrawler.class.php" (11 Jan 2013, 69228 Bytes) of package /linux/www/SitemapCreator.tar.gz:



    1 <?php
    2 /**
    3  * PHPCrawl mainclass
    4  *
    5  * @package phpcrawl
    6  * @author Uwe Hunfeld (phpcrawl@cuab.de)
    7  * @version 0.81
    8  * @License GPL2
    9  */
   10 class PHPCrawler
   11 {
   12   public $class_version = "0.81";
   13   
   14   /**
   15    * The PHPCrawlerHTTPRequest-Object
   16    *
   17    * @var PHPCrawlerHTTPRequest
   18    */
   19   protected $PageRequest;
   20   
   21   /**
   22    * The PHPCrawlerLinkCache-Object
   23    *
   24    * @var PHPCrawlerURLCache
   25    */
   26   public $LinkCache;
   27   
   28   /**
   29    * The PHPCrawlerCookieCache-Object
   30    *
   31    * @var  PHPCrawlerCookieCache
   32    */
   33   protected $CookieCache;
   34   
   35   /**
   36    * The UrlFilter-Object
   37    *
   38    * @var PHPCrawlerURLFilter
   39    */
   40   protected $UrlFilter;
   41   
   42   /**
   43    * The RobotsTxtParser-Object
   44    *
   45    * @var PHPCrawlerRobotsTxtParser
   46    */
   47   protected $RobotsTxtParser;
   48   
   49   /**
   50    * UserSendDataCache-object.
   51    *
   52    * @var PHPCrawlerUserSendDataCache
   53    */
   54   protected $UserSendDataCache;
   55   
   56   /**
   57    * The URL the crawler should start with.
   58    *
   59    * The URL is full qualified and normalized.
   60    *
   61    * @var string
   62    */
   63   protected $starting_url = "";
   64   
   65   /**
   66    * Defines whether robots.txt-file should be obeyed
   67    *
   68    * @var bool
   69    */
   70   protected $obey_robots_txt = false;
   71   
   72   /**
   73    * Limit of documents to receive
   74    *
   75    * @var int
   76    */
   77   protected $document_limit = 0;
   78   
   79   /**
   80    * Limit of bytes to receive
   81    *
   82    * @var int The limit in bytes
   83    */
   84   protected $traffic_limit = 0;
   85   
   86   /**
   87    * Defines if only documents that were received will be counted.
   88    *
   89    * @var bool
   90    */
   91   protected $only_count_received_documents = true;
   92   
   93   /**
   94    * Flag indicating whether cookie-handling is enabled/disabled
   95    *
   96    * @var bool
   97    */
   98   protected $cookie_handling_enabled = true;
   99   
  100   /**
  101    * The reason why the process was aborted/finished.
  102    *
  103    * @var int One of the PHPCrawlerAbortReasons::ABORTREASON-constants.
  104    */
  105   protected $porcess_abort_reason = null;
  106   
  107   /**
  108    * Flag indicating whether this instance is running in a child-process (if crawler runs multi-processed)
  109    */
  110   protected $is_chlid_process = false;
  111   
  112   /**
  113    * Flag indicating whether this instance is running in the parent-process (if crawler runs multi-processed)
  114    */
  115   protected $is_parent_process = false;
  116   
  117   /**
  118    * URL cache-type.
  119    *
  120    * @var int One of the PHPCrawlerUrlCacheTypes::URLCACHE..-constants.
  121    */
  122   protected $url_cache_type = 1;
  123   
  124   /**
  125    * UID of this instance of the crawler
  126    *
  127    * @var string
  128    */
  129   protected $crawler_uniqid = null;
  130   
  131   /**
  132    * Base-directory for temporary directories
  133    *
  134    * @var string
  135    */
  136   protected $working_base_directory;
  137   
  138   /**
  139    * Complete path to the temporary directory
  140    *
  141    * @var string
  142    */
  143   protected $working_directory = null;
  144   
  145   protected $link_priority_array = array();
  146   
  147   /**
  148    * Number of the child-process (NOT the PID!)
  149    *
  150    * @var int
  151    */
  152   protected $child_process_number = null;
  153   
  154   /**
  155    * ProcessCommunication-object
  156    *
  157    * @var PHPCrawlerProcessCommunication
  158    */
  159   protected $ProcessCommunication = null;
  160   
  161   /**
  162    * Multiprocess-mode the crawler is running in.
  163    *
  164    * @var int One of the PHPCrawlerMultiProcessModes-constants
  165    */
  166   protected $multiprocess_mode = 0;
  167   
  168   /**
  169    * DocumentInfoQueue-object
  170    *
  171    * @var PHPCrawlerDocumentInfoQueue
  172    */
  173   protected $DocumentInfoQueue = null;
  174   
  175   protected $follow_redirects_till_content = true;
  176   
  177   /**
  178    * Flag indicating whether resumption is activated
  179    *
  180    * @var bool
  181    */
  182   protected $resumtion_enabled = false;
  183   
  184   /**
  185    * Flag indicating whether the URL-cache was purged at the beginning of a crawling-process
  186    */
  187   protected $urlcache_purged = false;
  188   
  189   /**
  190    * Initiates a new crawler.
  191    */
  192   public function __construct()
  193   { 
  194     // Create uniqid for this crawlerinstance
  195     $this->crawler_uniqid = getmypid().time();
  196     
  197     // Include needed class-files
  198     $classpath = dirname(__FILE__);
  199     
  200     // Utils-class
  201     if (!class_exists("PHPCrawlerUtils")) include_once($classpath."/PHPCrawlerUtils.class.php");
  202     
  203     // URL-Cache-classes
  204     if (!class_exists("PHPCrawlerURLCacheBase")) include_once($classpath."/UrlCache/PHPCrawlerURLCacheBase.class.php");
  205     if (!class_exists("PHPCrawlerMemoryURLCache")) include_once($classpath."/UrlCache/PHPCrawlerMemoryURLCache.class.php");
  206     if (!class_exists("PHPCrawlerSQLiteURLCache")) include_once($classpath."/UrlCache/PHPCrawlerSQLiteURLCache.class.php");
  207     
  208     // PageRequest-class
  209     if (!class_exists("PHPCrawlerHTTPRequest")) include_once($classpath."/PHPCrawlerHTTPRequest.class.php");
  210     $this->PageRequest = new PHPCrawlerHTTPRequest();
  211     $this->PageRequest->setHeaderCheckCallbackFunction($this, "handleHeaderInfo");
  212       
  213     // Cookie-Cache-class
  214     if (!class_exists("PHPCrawlerCookieCacheBase")) include_once($classpath."/CookieCache/PHPCrawlerCookieCacheBase.class.php");
  215     if (!class_exists("PHPCrawlerMemoryCookieCache")) include_once($classpath."/CookieCache/PHPCrawlerMemoryCookieCache.class.php");
  216     if (!class_exists("PHPCrawlerSQLiteCookieCache")) include_once($classpath."/CookieCache/PHPCrawlerSQLiteCookieCache.class.php");
  217     
  218     // URL-filter-class
  219     if (!class_exists("PHPCrawlerURLFilter")) include_once($classpath."/PHPCrawlerURLFilter.class.php");
  220     $this->UrlFilter = new PHPCrawlerURLFilter();
  221     
  222     // RobotsTxtParser-class
  223     if (!class_exists("PHPCrawlerRobotsTxtParser")) include_once($classpath."/PHPCrawlerRobotsTxtParser.class.php");
  224     $this->RobotsTxtParser = new PHPCrawlerRobotsTxtParser();
  225     
  226     // ProcessReport-class
  227     if (!class_exists("PHPCrawlerProcessReport")) include_once($classpath."/PHPCrawlerProcessReport.class.php");
  228     
  229     // UserSendDataCache-class
  230     if (!class_exists("PHPCrawlerUserSendDataCache")) include_once($classpath."/PHPCrawlerUserSendDataCache.class.php");
  231     $this->UserSendDataCache = new PHPCrawlerUserSendDataCache();
  232     
  233     // URLDescriptor-class
  234     if (!class_exists("PHPCrawlerURLDescriptor")) include_once($classpath."/PHPCrawlerURLDescriptor.class.php");
  235     
  236     // PageInfo-class
  237     if (!class_exists("PHPCrawlerDocumentInfo")) include_once($classpath."/PHPCrawlerDocumentInfo.class.php");
  238     
  239     // Benchmark-class
  240     if (!class_exists("PHPCrawlerBenchmark")) include_once($classpath."/PHPCrawlerBenchmark.class.php");
  241     
  242     // URLDescriptor-class
  243     if (!class_exists("PHPCrawlerUrlPartsDescriptor")) include_once($classpath."/PHPCrawlerUrlPartsDescriptor.class.php");
  244     
  245     // CrawlerStatus-class
  246     if (!class_exists("PHPCrawlerStatus")) include_once($classpath."/PHPCrawlerStatus.class.php");
  247     
  248     // AbortReasons-class
  249     if (!class_exists("PHPCrawlerAbortReasons")) include_once($classpath."/Enums/PHPCrawlerAbortReasons.class.php");
  250     
  251     // RequestErrors-class
  252     if (!class_exists("PHPCrawlerRequestErrors")) include_once($classpath."/Enums/PHPCrawlerRequestErrors.class.php");
  253     
  254     // PHPCrawlerUrlCacheTypes-class
  255     if (!class_exists("PHPCrawlerUrlCacheTypes")) include_once($classpath."/Enums/PHPCrawlerUrlCacheTypes.class.php");
  256     
  257     // PHPCrawlerMultiProcessModes-class
  258     if (!class_exists("PHPCrawlerMultiProcessModes")) include_once($classpath."/Enums/PHPCrawlerMultiProcessModes.class.php");
  259     
  260     // PHPCrawlerProcessCommunication-class
  261     if (!class_exists("PHPCrawlerProcessCommunication")) include_once($classpath."/ProcessCommunication/PHPCrawlerProcessCommunication.class.php");
  262     
  263     // PHPCrawlerDocumentInfoQueue-class
  264     if (!class_exists("PHPCrawlerDocumentInfoQueue")) include_once($classpath."/ProcessCommunication/PHPCrawlerDocumentInfoQueue.class.php");
  265     
  266     // Set default temp-dir
  267     $this->working_base_directory = PHPCrawlerUtils::getSystemTempDir();
  268   }
  269   
  270   /**
  271    * Initiates a crawler-process
  272    */
  273   protected function initCrawlerProcess()
  274   {
  275     // Create working directory
  276     $this->createWorkingDirectory();
  277     
  278     // Setup url-cache
  279     if ($this->url_cache_type == PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE) 
  280       $this->LinkCache = new PHPCrawlerSQLiteURLCache($this->working_directory."urlcache.db3", true);
  281     else
  282       $this->LinkCache = new PHPCrawlerMemoryURLCache();
  283     
  284     // Purge/cleanup SQLite-urlcache for resumed crawling-processes (only ONCE!)
  285     if ($this->url_cache_type == PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE && $this->urlcache_purged == false)
  286     {
  287       $this->LinkCache->purgeCache();
  288       $this->urlcache_purged = true;
  289     }
  290     
  291     // Setup cookie-cache (use SQLite-cache if crawler runs multi-processed)
  292     if ($this->url_cache_type == PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE)
  293       $this->CookieCache = new PHPCrawlerSQLiteCookieCache($this->working_directory."cookiecache.db3", true);
  294     else $this->CookieCache = new PHPCrawlerMemoryCookieCache();
  295     
  296     // ProcessCommunication
  297     $this->ProcessCommunication = new PHPCrawlerProcessCommunication($this->crawler_uniqid, $this->multiprocess_mode, $this->working_directory, $this->resumtion_enabled);
  298     
  299     // DocumentInfo-Queue
  300     if ($this->multiprocess_mode == PHPCrawlerMultiProcessModes::MPMODE_PARENT_EXECUTES_USERCODE)
  301       $this->DocumentInfoQueue = new PHPCrawlerDocumentInfoQueue($this->working_directory."doc_queue.db3", true);
  302     
  303     // Set tmp-file for PageRequest
  304     $this->PageRequest->setTmpFile($this->working_directory."phpcrawl_".getmypid().".tmp");
  305     
  306     // Pass url-priorities to link-cache
  307     $this->LinkCache->addLinkPriorities($this->link_priority_array);
  308                 
  309     // Pass base-URL to the UrlFilter
  310     $this->UrlFilter->setBaseURL($this->starting_url);
  311     
  312     // Add the starting-URL to the url-cache
  313     $this->LinkCache->addUrl(new PHPCrawlerURLDescriptor($this->starting_url));
  314   }
  315   
  316   /**
  317    * Starts the crawling process in single-process-mode.
  318    *
  319    * Be sure you have overridden the {@link handleDocumentInfo()}- or {@link handlePageData()}-method before calling the go()-method
  320    * to process the documents the crawler finds.
  321    *
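         * Minimal usage sketch (illustrative only; the class-name "MyCrawler" and the example-URL are placeholders):
         * <code>
         * class MyCrawler extends PHPCrawler
         * {
         *   function handleDocumentInfo($PageInfo)
         *   {
         *     echo "Received: ".$PageInfo->url."\n";
         *   }
         * }
         *
         * $crawler = new MyCrawler();
         * $crawler->setURL("http://www.example.com");
         * $crawler->go();
         * </code>
         *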
  322    * @section 1 Basic settings
  323    */
  324   public function go()
  325   {
  326     // Process robots.txt
  327     if ($this->obey_robots_txt == true)
  328       $this->processRobotsTxt();
  329     
  330     $this->startChildProcessLoop();
  331   }
  332   
  333   /**
  334    * Starts the crawler by using multiple processes.
  335    * 
  336    * When using this method instead of the {@link go()}-method to start the crawler, phpcrawl will use the given
  337    * number of processes simultaneously for spidering the target-url.
  338    * Using multiple processes will speed up the crawling-process dramatically in most cases.
  339    *
  340    * There are some requirements though to successfully run the crawler in multi-process mode:
  341    * <ul>
  342    * <li> The multi-process mode only works on unix-based systems (linux)</li>
  343    * <li> Scripts using the crawler have to be run from the commandline (cli)</li>
  344    * <li> The <a href="http://php.net/manual/en/pcntl.installation.php">PCNTL-extension</a> for php (process control) has to be installed and activated.</li>
  345    * <li> The <a href="http://php.net/manual/en/sem.installation.php">SEMAPHORE-extension</a> for php has to be installed and activated.</li>
  346    * <li>The <a href="http://de.php.net/manual/en/posix.installation.php">POSIX-extension</a> for php has to be installed and activated.</li>
  347    * <li> The <a href="http://de2.php.net/manual/en/pdo.installation.php">PDO-extension</a> together with the SQLite-driver (PDO_SQLITE) has to be installed and activated.</li>
  348    * </ul>
  349    *
  350    * PHPCrawl supports two different modes of multiprocessing:
  351    * <ol>
  352    * <li><b>{@link PHPCrawlerMultiProcessModes}::MPMODE_PARENT_EXECUTES_USERCODE</b>
  353    *
  354    * The crawler uses multiple processes simultaneously for spidering the target URL, but the usercode provided to
  355    * the overridable function {@link handleDocumentInfo()} always gets executed on the same main-process. This
  356    * means that the <b>usercode never gets executed simultaneously</b> and so you don't have to care about
  357    * concurrent file/database/handle-accesses or similar things.
  358    * But on the other hand the usercode may slow down the crawling-procedure because every child-process has to
  359    * wait until the usercode has been executed on the main-process. <b>This is the recommended multiprocess-mode!</b>
  360    * </li>
  361    * <li><b>{@link PHPCrawlerMultiProcessModes}::MPMODE_CHILDS_EXECUTES_USERCODE</b>
  362    *
  363    * The crawler uses multiple processes simultaneously for spidering the target URL, and every child-process executes
  364    * the usercode provided to the overridable function {@link handleDocumentInfo()} directly from its own process. This
  365    * means that the <b>usercode gets executed simultaneously</b> by the different child-processes, and you should
  366    * properly take care of concurrent file/data/handle-accesses (if used).
  367    *
  368    * When using this mode and using any handles like database-connections or filestreams in your extended
  369    * crawler-class, you should open them within the overridden method {@link initChildProcess()} instead of opening
  370    * them from the constructor. For more details see the documentation of the {@link initChildProcess()}-method.
  371    * </li>
  372    * </ol>
  373    *
  374    * Example for starting the crawler with 5 processes using the recommended MPMODE_PARENT_EXECUTES_USERCODE-mode:
  375    * <code>
  376    * $crawler->goMultiProcessed(5, PHPCrawlerMultiProcessModes::MPMODE_PARENT_EXECUTES_USERCODE);
  377    * </code>
  378    *
  379    * Please note that increasing the number of processes to high values doesn't automatically mean that the crawling-process
  380    * will run faster! Using 3 to 5 processes should be a good value to start from.
  381    *
  382    * @param int $process_count     Number of processes to use
  383    * @param int $multiprocess_mode The multiprocess-mode to use.
  384    *                               One of the {@link PHPCrawlerMultiProcessModes}-constants
  385    * @section 1 Basic settings
  386    */
  387   public function goMultiProcessed($process_count = 3, $multiprocess_mode = 1)
  388   { 
  389     $this->multiprocess_mode = $multiprocess_mode;
  390     
  391     // Check if fork is supported
  392     if (!function_exists("pcntl_fork"))
  393     {
  394       throw new Exception("PHPCrawl running with multi processes not supported in this PHP-environment (function pcntl_fork() missing).".
  395                           "Try running from command-line (cli) and/or installing the PHP PCNTL-extension.");
  396     }
  397     
  398     if (!function_exists("sem_get"))
  399     {
  400       throw new Exception("PHPCrawl running with multi processes not supported in this PHP-environment (function sem_get() missing).".
  401                           "Try installing the PHP SEMAPHORE-extension.");
  402     }
  403     
  404     if (!function_exists("posix_kill"))
  405     {
  406       throw new Exception("PHPCrawl running with multi processes not supported in this PHP-environment (function posix_kill() missing).".
  407                           "Try installing the PHP POSIX-extension.");
  408     }
  409     
  410     if (!class_exists("PDO"))
  411     {
  412       throw new Exception("PHPCrawl running with multi processes not supported in this PHP-environment (class PDO missing).".
  413                           "Try installing the PHP PDO-extension.");
  414     }
  415     
  416     PHPCrawlerBenchmark::start("crawling_process");
  417     
  418     // Set url-cache-type to sqlite.
  419     $this->url_cache_type = PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE;
  420     
  421     // Init process
  422     $this->initCrawlerProcess();
  423     
  424     // Process robots.txt
  425     if ($this->obey_robots_txt == true)
  426       $this->processRobotsTxt();
  427     
  428     // Fork off child-processes
  429     $pids = array();
  430     
  431     for($i=1; $i<=$process_count; $i++)
  432     {
  433       $pids[$i] = pcntl_fork();
  434 
  435       if(!$pids[$i])
  436       {   
  437         // Childprocess goes here
  438         $this->is_chlid_process = true;
  439         $this->child_process_number = $i;
  440         $this->ProcessCommunication->registerChildPID(getmypid());
  441         $this->startChildProcessLoop();
  442       }
  443     }
  444         
  445     // Set flag "parent-process"
  446     $this->is_parent_process = true;
  447     
  448     // Determine all child-PIDs
  449     $this->child_pids = $this->ProcessCommunication->getChildPIDs($process_count);
  450     
  451     // If crawler runs in MPMODE_PARENT_EXECUTES_USERCODE-mode -> start controller-loop
  452     if ($this->multiprocess_mode == PHPCrawlerMultiProcessModes::MPMODE_PARENT_EXECUTES_USERCODE)
  453     {
  454       $this->starControllerProcessLoop();
  455     }
  456      
  457     // Wait for child-processes to finish
  458     for ($i=1; $i<=$process_count; $i++)
  459     {
  460       pcntl_waitpid($pids[$i], $status, WUNTRACED);
  461     }
  462     
  463     // Get crawler-status (needed for process-report)
  464     $this->crawlerStatus = $this->ProcessCommunication->getCrawlerStatus();
  465     
  466     // Cleanup crawler
  467     $this->cleanup();
  468     
  469     PHPCrawlerBenchmark::stop("crawling_process");
  470   }
  471   
  472   /**
  473    * Starts the loop of the controller-process (main-process).
  474    */
  475   protected function starControllerProcessLoop()
  476   {
  477     // If multiprocess-mode is not MPMODE_PARENT_EXECUTES_USERCODE -> exit process
  478     if ($this->multiprocess_mode != PHPCrawlerMultiProcessModes::MPMODE_PARENT_EXECUTES_USERCODE) exit;
  479     
  480     $this->initCrawlerProcess();
  481     $this->initChildProcess();
  482     
  483     while (true)
  484     { 
  485       // Check for abort
  486       if ($this->checkForAbort() !== null)
  487       {
  488         $this->ProcessCommunication->killChildProcesses();
  489         break;
  490       }
  491       
  492       // Get next DocInfo-object from queue
  493       $DocInfo = $this->DocumentInfoQueue->getNextDocumentInfo();
  494       
  495       if ($DocInfo == null)
  496       { 
  497         
  498         // If there are no more links in the cache AND no more DocInfo-objects in the queue -> passed through
  499         if ($this->LinkCache->containsURLs() == false && $this->DocumentInfoQueue->getDocumentInfoCount() == 0)
  500         {
  501           $this->ProcessCommunication->updateCrawlerStatus(null, PHPCrawlerAbortReasons::ABORTREASON_PASSEDTHROUGH);
  502         }
  503         
  504         usleep(200000); // sleep() only accepts whole seconds, so use usleep() to pause for 0.2 seconds
  505         continue;
  506       }
  507       
  508       // Update crawler-status
  509       $this->ProcessCommunication->updateCrawlerStatus($DocInfo);
  510       
  511       // Call the "abstract" method handlePageData
  512       $user_abort = false;
  513       $page_info = $DocInfo->toArray();
  514       $user_return_value = $this->handlePageData($page_info);
  515       if ($user_return_value < 0) $user_abort = true;
  516       
  517       // Call the "abstract" method handleDocumentInfo
  518       $user_return_value = $this->handleDocumentInfo($DocInfo);
  519       if ($user_return_value < 0) $user_abort = true;
  520         
  521       // Update status if user aborted process
  522       if ($user_abort == true) 
  523         $this->ProcessCommunication->updateCrawlerStatus(null, PHPCrawlerAbortReasons::ABORTREASON_USERABORT);
  524     }
  525   }
  526   
  527   /**
  528    * Starts the loop of a child-process.
  529    */
  530   protected function startChildProcessLoop()
  531   { 
  532     $this->initCrawlerProcess();
  533     
  534     // Call overridable method initChildProcess()
  535     $this->initChildProcess();
  536     
  537     // Start benchmark (if single-processed)
  538     if ($this->is_chlid_process == false)
  539     {
  540       PHPCrawlerBenchmark::start("crawling_process");
  541     }
  542     
  543     // Init vars
  544     $stop_crawling = false;
  545     
  546     // Main-Loop
  547     while ($stop_crawling == false)
  548     { 
  549       // Get next URL from cache
  550       $UrlDescriptor = $this->LinkCache->getNextUrl();
  551       
  552       // Process URL
  553       if ($UrlDescriptor != null)
  554       {
  555         $stop_crawling = $this->processUrl($UrlDescriptor);
  556       }
  557       else
  558       {
  559         sleep(1);
  560       }
  561       
  562       if ($this->multiprocess_mode != PHPCrawlerMultiProcessModes::MPMODE_PARENT_EXECUTES_USERCODE)
  563       {
  564         // If there's nothing more to do
  565         if ($this->LinkCache->containsURLs() == false)
  566         {
  567           $stop_crawling = true;
  568           $this->ProcessCommunication->updateCrawlerStatus(null, PHPCrawlerAbortReasons::ABORTREASON_PASSEDTHROUGH);
  569         }
  570         
  571         // Check for abort from other processes
  572         if ($this->checkForAbort() !== null) $stop_crawling = true;
  573       }
  574     }
  575 
  576     // Loop ended here. If this is a child-process -> exit it
  577     if ($this->is_chlid_process == true)
  578     {
  579       if ($this->multiprocess_mode == PHPCrawlerMultiProcessModes::MPMODE_PARENT_EXECUTES_USERCODE) return;
  580       else exit;
  581     }
  582     
  583     $this->crawlerStatus = $this->ProcessCommunication->getCrawlerStatus();
  584        
  585     // Cleanup crawler
  586     $this->cleanup();
  587     
  588     // Stop benchmark (if single-processed)
  589     if ($this->is_chlid_process == false)
  590     {
  591       PHPCrawlerBenchmark::stop("crawling_process");
  592     }
  593   }
  594   
  595   /**
  596    * Receives and processes the given URL
  597    *
  598    * @param PHPCrawlerURLDescriptor $UrlDescriptor The URL as PHPCrawlerURLDescriptor-object
  599    * @return bool TRUE if the crawling-process should be aborted after processing the URL, otherwise FALSE.
  600    */
  601   protected function processUrl(PHPCrawlerURLDescriptor $UrlDescriptor)
  602   { 
  603     PHPCrawlerBenchmark::start("processing_url");
  604     
  605     // Setup HTTP-request
  606     $this->PageRequest->setUrl($UrlDescriptor);
  607     
  608     // Add cookies to request
  609     if ($this->cookie_handling_enabled == true)
  610       $this->PageRequest->addCookieDescriptors($this->CookieCache->getCookiesForUrl($UrlDescriptor->url_rebuild));
  611     
  612     // Add basic-authentications to request
  613     $authentication = $this->UserSendDataCache->getBasicAuthenticationForUrl($UrlDescriptor->url_rebuild);
  614     if ($authentication != null)
  615     {
  616       $this->PageRequest->setBasicAuthentication($authentication["username"], $authentication["password"]);
  617     }
  618     
  619     // Add post-data to request
  620     $post_data = $this->UserSendDataCache->getPostDataForUrl($UrlDescriptor->url_rebuild);
  621     if ($post_data != null) foreach ($post_data as $post_key => $post_value)
  622     {
  623       $this->PageRequest->addPostData($post_key, $post_value);
  624     }
  625     
  626     // Do request
  627     $PageInfo = $this->PageRequest->sendRequest();
  628     
  629     if ($this->multiprocess_mode != PHPCrawlerMultiProcessModes::MPMODE_PARENT_EXECUTES_USERCODE)
  630     {
  631       // Check for abort
  632       $abort_reason = $this->checkForAbort();
  633       if ($abort_reason !== null) return true;
  634       
  635       $this->ProcessCommunication->updateCrawlerStatus($PageInfo);
  636     }
  637     
  638     // Remove post and cookie-data from request-object
  639     $this->PageRequest->clearCookies();
  640     $this->PageRequest->clearPostData();
  641     
  642     // Call user-methods if crawler doesn't run in MPMODE_PARENT_EXECUTES_USERCODE
  643     if ($this->multiprocess_mode != PHPCrawlerMultiProcessModes::MPMODE_PARENT_EXECUTES_USERCODE)
  644     {
  645       // Call the "abstract" method handlePageData
  646       $user_abort = false;
  647       $page_info = $PageInfo->toArray();
  648       $user_return_value = $this->handlePageData($page_info);
  649       if ($user_return_value < 0) $user_abort = true;
  650       
  651       // Call the "abstract" method handleDocumentInfo
  652       $user_return_value = $this->handleDocumentInfo($PageInfo);
  653       if ($user_return_value < 0) $user_abort = true;
  654       
  655       // Update status if user aborted process
  656       if ($user_abort == true) 
  657       {
  658         $this->ProcessCommunication->updateCrawlerStatus(null, PHPCrawlerAbortReasons::ABORTREASON_USERABORT);
  659       }
  660       
  661       // Check for abort from other processes
  662       if ($this->checkForAbort() !== null) return true;
  663     }
  664     
  665     // Filter found URLs by defined rules
  666     if ($this->follow_redirects_till_content == true)
  667     {
  668       $crawler_status = $this->ProcessCommunication->getCrawlerStatus();
  669       
  670       // If content wasn't found so far and content was found NOW
  671       if ($crawler_status->first_content_url == null && $PageInfo->http_status_code == 200)
  672       {
  673         $this->ProcessCommunication->updateCrawlerStatus(null, null, $PageInfo->url);
  674         $this->UrlFilter->setBaseURL($PageInfo->url); // Set current page as base-URL
  675         $this->UrlFilter->filterUrls($PageInfo);
  676         $this->follow_redirects_till_content = false; // Content was found, so this can be set to FALSE
  677       }
  678       else if ($crawler_status->first_content_url == null)
  679       {
  680         $this->UrlFilter->keepRedirectUrls($PageInfo); // Content wasn't found so far, so just keep redirect-urls 
  681       }
  682       else if ($crawler_status->first_content_url != null)
  683       {
  684         $this->follow_redirects_till_content = false;
  685         $this->UrlFilter->filterUrls($PageInfo);
  686       }
  687     }
  688     else
  689     {
  690       $this->UrlFilter->filterUrls($PageInfo);
  691     }
  692     
  693     // Add Cookies to Cookie-cache
  694     if ($this->cookie_handling_enabled == true) $this->CookieCache->addCookies($PageInfo->cookies);
  695 
  696     // Add filtered links to URL-cache
  697     $this->LinkCache->addURLs($PageInfo->links_found_url_descriptors);
  698     
  699     PHPCrawlerBenchmark::stop("processing_url");
  700     
  701     // Complete PageInfo-Object with benchmarks
  702     $PageInfo->benchmarks = PHPCrawlerBenchmark::getAllBenchmarks();
  703     
  704     if ($this->multiprocess_mode == PHPCrawlerMultiProcessModes::MPMODE_PARENT_EXECUTES_USERCODE)
  705     {
  706       $this->DocumentInfoQueue->addDocumentInfo($PageInfo);
  707     }
  708     
  709      // Mark URL as "followed"
  710     $this->LinkCache->markUrlAsFollowed($UrlDescriptor);
  711     
  712     PHPCrawlerBenchmark::resetAll(array("crawling_process"));
  713     
  714     return false;
  715   }
  716   
  717   protected function processRobotsTxt()
  718   {
  719     PHPCrawlerBenchmark::start("processing_robots_txt");
  720     $robotstxt_rules = $this->RobotsTxtParser->parseRobotsTxt(new PHPCrawlerURLDescriptor($this->starting_url), $this->PageRequest->userAgentString);
  721     $this->UrlFilter->addURLFilterRules($robotstxt_rules);
  722     PHPCrawlerBenchmark::stop("processing_robots_txt");
  723   }
  724   
  725   /**
  726    * Checks if the crawling-process should be aborted.
  727    *
  728    * @return int NULL if the process shouldn't be aborted yet, otherwise one of the PHPCrawlerAbortReasons::ABORTREASON-constants.
  729    */
  730   protected function checkForAbort()
  731   {
  732     PHPCrawlerBenchmark::start("checkning_for_abort");
  733     
  734     $abort_reason = null;
  735      
  736     // Get current status
  737     $crawler_status = $this->ProcessCommunication->getCrawlerStatus();
  738     
  739     // If the crawler-status is already marked for ABORT
  740     if ($crawler_status->abort_reason !== null)
  741     {
  742       $abort_reason = $crawler_status->abort_reason;
  743     }
  744     
  745     // Check for reached limits
  746     
  747     // If traffic-limit is reached
  748     if ($this->traffic_limit > 0 && $crawler_status->bytes_received >= $this->traffic_limit)
  749       $abort_reason = PHPCrawlerAbortReasons::ABORTREASON_TRAFFICLIMIT_REACHED;
  750     
  751     // If document-limit is set
  752     if ($this->document_limit > 0)
  753     {
  754       // If the document-limit refers to received documents
  755       if ($this->only_count_received_documents == true && $crawler_status->documents_received >= $this->document_limit)
  756       {
  757         $abort_reason = PHPCrawlerAbortReasons::ABORTREASON_FILELIMIT_REACHED;
  758       }
  759       elseif ($this->only_count_received_documents == false && $crawler_status->links_followed >= $this->document_limit)
  760       {
  761         $abort_reason = PHPCrawlerAbortReasons::ABORTREASON_FILELIMIT_REACHED;
  762       }
  763     }
  764     
  765     $this->ProcessCommunication->updateCrawlerStatus(null, $abort_reason);
  766     
  767     PHPCrawlerBenchmark::stop("checkning_for_abort");
  768     
  769     return $abort_reason;
  770   }
  771   
  772   /**
  773    * Creates the working-directory for this instance of the crawler.
  774    */
  775   protected function createWorkingDirectory()
  776   {
  777     $this->working_directory = $this->working_base_directory."phpcrawl_tmp_".$this->crawler_uniqid.DIRECTORY_SEPARATOR;
  778     
  779     // Check if writable
  780     if (!is_writeable($this->working_base_directory))
  781     {
  782       throw new Exception("Error creating working directory '".$this->working_directory."'");
  783     }
  784     
  785     // Create dir
  786     if (!file_exists($this->working_directory))
  787     {
  788       mkdir($this->working_directory);
  789     }
  790   }
  791   
  792   /**
  793    * Cleans up the crawler after it has finished.
  794    */
  795   protected function cleanup()
  796   {
  797     // Delete working-dir
  798     PHPCrawlerUtils::rmDir($this->working_directory);
  799     
  800     // Remove semaphore (if multiprocess-mode)
  801     if ($this->multiprocess_mode != PHPCrawlerMultiProcessModes::MPMODE_NONE)
  802     {
  803       $sem_key = sem_get($this->crawler_uniqid);
  804       sem_remove($sem_key);
  805     }
  806   }
  807   
  808   /**
  809    * Returns summarizing report-information about the crawling-process after it has finished.
  810    *
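         * Example sketch (call it after go() has returned; the echoed report-properties are the ones set by this method):
         * <code>
         * $crawler->go();
         * $Report = $crawler->getProcessReport();
         * echo "Links followed: ".$Report->links_followed."\n";
         * echo "Documents received: ".$Report->files_received."\n";
         * echo "Bytes received: ".$Report->bytes_received."\n";
         * </code>
         *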
  811    * @return PHPCrawlerProcessReport PHPCrawlerProcessReport-object containing process-summary-information
  812    * @section 1 Basic settings
  813    */
  814   public function getProcessReport()
  815   { 
  816     // Get current crawler-Status
  817     $CrawlerStatus = $this->crawlerStatus;
  818     
  819     // Create report
  820     $Report = new PHPCrawlerProcessReport();
  821     
  822     $Report->links_followed = $CrawlerStatus->links_followed;
  823     $Report->files_received = $CrawlerStatus->documents_received;
  824     $Report->bytes_received = $CrawlerStatus->bytes_received;
  825     $Report->process_runtime = PHPCrawlerBenchmark::getElapsedTime("crawling_process");
  826     
  827     if ($Report->process_runtime > 0)
  828       $Report->data_throughput = $Report->bytes_received / $Report->process_runtime;
  829     
  830     // Process abort-reason
  831     $Report->abort_reason = $CrawlerStatus->abort_reason;
  832     
  833     if ($CrawlerStatus->abort_reason == PHPCrawlerAbortReasons::ABORTREASON_TRAFFICLIMIT_REACHED)
  834       $Report->traffic_limit_reached = true;
  835     
  836     if ($CrawlerStatus->abort_reason == PHPCrawlerAbortReasons::ABORTREASON_FILELIMIT_REACHED)
  837       $Report->file_limit_reached = true;
  838     
  839     if ($CrawlerStatus->abort_reason == PHPCrawlerAbortReasons::ABORTREASON_USERABORT)
  840       $Report->user_abort = true;
  841     
  842     // Peak memory-usage
  843     if (function_exists("memory_get_peak_usage"))
  844       $Report->memory_peak_usage = memory_get_peak_usage(true);
  845     
  846     return $Report;
  847   }
  848   
  849   /**
  850    * Returns an array with summarizing report-information after the crawling-process has finished
  851    *
  852    * For detailed information on the contained array-keys see the PHPCrawlerProcessReport-class.
  853    * 
  854    * @deprecated Please use getProcessReport() instead.
  855    * @section 11 Deprecated
  856    */
  857   public function getReport()
  858   {
  859     return $this->getProcessReport()->toArray();
  860   }
  861   
  862   /**
  863    * Overridable method that will be called after the header of a document was received and BEFORE the content
  864    * will be received.
  865    *
  866    * Every time the header of a document was received, the crawler will call this method.
  867    * If this method returns any negative integer, the crawler will NOT receive the content of the particular page or file.
  868    *
  869    * Example:
  870    * <code>
  871    * class MyCrawler extends PHPCrawler 
  872    * {
  873    *   function handleHeaderInfo(PHPCrawlerResponseHeader $header)
  874    *   {
  875    *     // If the content-type of the document isn't "text/html" -> don't receive it.
  876    *     if ($header->content_type != "text/html")
  877    *     {
  878    *       return -1;
  879    *     }   
  880    *   }
  881    * 
  882    *   function handleDocumentInfo($PageInfo)
  883    *   {
  884    *     // ...
  885    *   }
  886    * }
  887    * </code>
  888    *
  889    * @param PHPCrawlerResponseHeader $header The header as PHPCrawlerResponseHeader-object
  890    * @return int                             The document won't be received if you let this method return any negative value.
  891    * @section 3 Overridable methods / User data-processing
  892    */
  893   public function handleHeaderInfo(PHPCrawlerResponseHeader $header)
  894   {
  895     return 1;
  896   }
  897   
  898   /**
  899    * Overridable method that will be called by every used child-process just before it starts the crawling-procedure.
  900    *
  901    * Every child-process of the crawler will call this method just before it starts its crawling-loop from within its
  902    * process-context.
  903    *
  904    * So when using the multi-process mode "{@link PHPCrawlerMultiProcessModes::MPMODE_CHILDS_EXECUTES_USERCODE}", this method
  905    * should be overridden and used to open any needed database-connections, file streams or other similar handles to ensure
  906    * that they will get opened and accessible for every used child-process.
  907    *
  908    * Example:
  909    * <code>
  910    * class MyCrawler extends PHPCrawler 
  911    * {
  912    *   protected $mysql_link;
  913    *
  914    *   function initChildProcess()
  915    *   {
  916    *     // Open a database-connection for every used process
  917    *     $this->mysql_link = mysql_connect("myhost", "myusername", "mypassword");
  918    *     mysql_select_db ("mydatabasename", $this->mysql_link);
  919    *   }
  920    * 
  921    *   function handleDocumentInfo($PageInfo) 
  922    *   {
  923    *     mysql_query("INSERT INTO urls SET url = '".$PageInfo->url."';", $this->mysql_link);
  924    *   }
  925    * }
  926    *
  927    * // Start crawler with 5 processes
  928    * $crawler = new MyCrawler();
  929    * $crawler->setURL("http://www.any-url.com");
  930    * $crawler->goMultiProcessed(5, PHPCrawlerMultiProcessModes::MPMODE_CHILDS_EXECUTES_USERCODE);
  931    * </code>
  932    *
  933    * @section 3 Overridable methods / User data-processing
  934    */
  935   public function initChildProcess()
  936   {
  937   }
  938   
  939   /**
  940    * Override this method to get access to all information about a page or file the crawler found and received.
  941    *
  942    * Every time the crawler finds and receives a document on its way, this method will be called.
  943    * The crawler passes all information about the currently received page or file to this method
  944    * by the array $page_data.
  945    *
  946    * @param array &$page_data Array containing all information about the currently received document.
  947    *                          For detailed information on the contained keys see the {@link PHPCrawlerDocumentInfo}-class.
  948    * @return int              The crawling-process will stop immediately if you let this method return any negative value.
  949    * @deprecated Please use and override the {@link handleDocumentInfo}-method to access document-information instead.
  950    * @section 3 Overridable methods / User data-processing
  951    */
  952   public function handlePageData(&$page_data){}
  953   
  954   /**
  955    * Override this method to get access to all information about a page or file the crawler found and received.
  956    *
  957    * Every time the crawler finds and receives a document on its way, this method will be called.
  958    * The crawler passes all information about the currently received page or file to this method
  959    * by a PHPCrawlerDocumentInfo-object.
  960    *
  961    * Please see the {@link PHPCrawlerDocumentInfo} documentation for a list of all properties describing the
  962    * html-document.
  963    *
  964    * Example:
  965    * <code>
  966    * class MyCrawler extends PHPCrawler
  967    * {
  968    *   function handleDocumentInfo($PageInfo)
  969    *   {
  970    *     // Print the URL of the document
  971    *     echo "URL: ".$PageInfo->url."<br />";
  972    *
  973    *     // Print the http-status-code
  974    *     echo "HTTP-statuscode: ".$PageInfo->http_status_code."<br />";
  975    *
  976    *     // Print the number of found links in this document
  977    *     echo "Links found: ".count($PageInfo->links_found_url_descriptors)."<br />";
  978    *     
  979    *     // ..
  980    *   }
  981    * }
  982    * </code>
  983    *
  984    * @param PHPCrawlerDocumentInfo $PageInfo A PHPCrawlerDocumentInfo-object containing all information about the currently received document.
  985    *                                         Please see the reference of the {@link PHPCrawlerDocumentInfo}-class for detailed information.
  986    * @return int                             The crawling-process will stop immediately if you let this method return any negative value.
  987    *
  988    * @section 3 Overridable methods / User data-processing
  989    */
  990   public function handleDocumentInfo(PHPCrawlerDocumentInfo $PageInfo){}
  991   
  992   /**
  993    * Sets the URL of the first page the crawler should crawl (root-page).
  994    *
  995    * The given url may contain the protocol (http://www.foo.com or https://www.foo.com), the port (http://www.foo.com:4500/index.php)
  996    * and/or basic-authentication-data (http://loginname:passwd@www.foo.com)
  997    *
  998    * This url has to be set before calling the {@link go()}-method (of course)!
  999    * If this root-page doesn't contain any further links, the crawling-process will stop immediately.
 1000    *
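         * Example (illustrative sketch; the hostnames, port and login-data are placeholders):
         * <code>
         * $crawler->setURL("http://www.example.com");
         * $crawler->setURL("https://www.example.com:4500/index.php");
         * $crawler->setURL("http://loginname:passwd@www.example.com");
         * </code>
         *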
 1001    * @param string $url The URL
 1002    * @return bool
 1003    *
 1004    * @section 1 Basic settings
 1005    */
 1006   public function setURL($url)
 1007   {
 1008     $url = trim($url);
 1009     
 1010     if ($url != "" && is_string($url))
 1011     {
 1012       $this->starting_url = PHPCrawlerUtils::normalizeURL($url);
 1013       return true;
 1014     }
 1015     else return false;
 1016   }
 1017   
 1018   /**
 1019    * Sets the port to connect to for crawling the starting-url set in setUrl().
 1020    *
 1021    * The default port is 80.
 1022    *
 1023    * Note:
 1024    * <code>
 1025    * $crawler->setURL("http://www.foo.com");
 1026    * $crawler->setPort(443);
 1027    * </code>
 1028    * has the same effect as
 1029    * 
 1030    * <code>
 1031    * $crawler->setURL("http://www.foo.com:443");
 1032    * </code>
 1033    *
 1034    * @param int $port The port
 1035    * @return bool
 1036    * @section 1 Basic settings
 1037    */
 1038   public function setPort($port)
 1039   {
 1040     // Check port
 1041     if (!preg_match("#^[0-9]{1,5}$#", $port)) return false;
 1042 
 1043     // Add port to the starting-URL
 1044     $url_parts = PHPCrawlerUtils::splitURL($this->starting_url);
 1045     $url_parts["port"] = $port;
 1046     $this->starting_url = PHPCrawlerUtils::buildURLFromParts($url_parts, true);
 1047     
 1048     return true;
 1049   }
 1050   
 1051   /**
 1052    * Adds a regular expression together with a priority-level to the list of rules that decide which links should be preferred.
 1053    *
 1054    * Links/URLs that match an expression with a high priority-level will be followed before links with a lower level.
 1055    * All links that don't match with any of the given rules will get the level 0 (lowest level) automatically.
 1056    *
 1057    * The level can be any positive integer.
 1058    *
 1059    * <b>Example:</b>
 1060    *
 1061    * Telling the crawler to follow links that contain the string "forum" before links that contain ".gif" before all other found links.
 1062    * <code>
 1063    * $crawler->addLinkPriority("/forum/", 10);
 1064    * $crawler->addLinkPriority("/\.gif/", 5);
 1065    * </code>
 1066    *
 1067    * @param string $regex  Regular expression defining the rule
 1068    * @param int    $level  The priority-level
 1069    *
 1070    * @return bool  TRUE if a valid preg-pattern is given as argument and was successfully added, otherwise it returns FALSE.
 1071    * @section 10 Other settings
 1072    */
 1073   function addLinkPriority($regex, $level)
 1074   {
 1075     $check = PHPCrawlerUtils::checkRegexPattern($regex); // Check pattern
 1076     if ($check == true && preg_match("/^[0-9]*$/", $level))
 1077     {
 1078       $c = count($this->link_priority_array);
 1079       $this->link_priority_array[$c]["match"] = trim($regex);
 1080       $this->link_priority_array[$c]["level"] = trim($level);
 1081     
 1082       return true;
 1083     }
 1084     else return false;
 1085   }
 1086   
 1087   /**
 1088    * Defines whether the crawler should follow redirects sent with headers by a webserver or not.
 1089    *
 1090    * @param bool $mode  If TRUE, the crawler will follow header-redirects.
 1091    *                    The default-value is TRUE.
 1092    * @return bool
 1093    * @section 10 Other settings
 1094    */
 1095   public function setFollowRedirects($mode)
 1096   {
 1097     return $this->PageRequest->setFindRedirectURLs($mode);
 1098   }
 1099   
 1100   /**
 1101    * Defines whether the crawler should follow HTTP-redirects until first content was found, regardless of defined filter-rules and follow-modes.
 1102    *
 1103    * Sometimes, when requesting an URL, the first thing the webserver does is sending a redirect to
 1104    * another location, and sometimes the server of this new location is sending a redirect again
 1105    * (and so on). 
 1106    * So in the end it's possible that you find the expected content on a totally different host
 1107    * than expected.
 1108    *
 1109    * If you set this option to TRUE, the crawler will follow all these redirects until it finds some content.
 1110    * If content finally was found, the root-url of the crawling-process will be set to this url and all
 1111    * defined options (follow-mode, filter-rules etc.) will relate to it from now on.
 1112    *
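         * Example (sketch; the URL is a placeholder):
         * <code>
         * // Follow all header-redirects until real content was found
         * $crawler->setURL("http://www.example.com");
         * $crawler->setFollowRedirectsTillContent(true);
         * </code>
         *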
 1113    * @param bool $mode If TRUE, the crawler will follow redirects until content was finally found.
 1114    *                   Defaults to TRUE.
 1115    * @section 10 Other settings
 1116    */
 1117   public function setFollowRedirectsTillContent($mode)
 1118   {
 1119     $this->follow_redirects_till_content = $mode;
 1120   }
 1121   
 1122   /**
 1123    * Sets the basic follow-mode of the crawler.
 1124    *
 1125    * The following list explains the supported follow-modes:
 1126    *
 1127    * <b>0 - The crawler will follow EVERY link, even if the link leads to a different host or domain.</b>
 1128    * If you choose this mode, you really should set a limit to the crawling-process (see limit-options),
 1129    * otherwise the crawler maybe will crawl the whole WWW!
 1130    *
 1131    * <b>1 - The crawler only follows links that lead to the same domain as the one in the root-url.</b>
 1132    * E.g. if the root-url (setURL()) is "http://www.foo.com", the crawler will follow links to "http://www.foo.com/..."
 1133    * and "http://bar.foo.com/...", but not to "http://www.another-domain.com/...".
 1134    *
 1135    * <b>2 - The crawler will only follow links that lead to the same host as the one in the root-url.</b>
 1136    * E.g. if the root-url (setURL()) is "http://www.foo.com", the crawler will ONLY follow links to "http://www.foo.com/...", but not
 1137    * to "http://bar.foo.com/..." and "http://www.another-domain.com/...". <b>This is the default mode.</b>
 1138    *
 1139    * <b>3 - The crawler only follows links to pages or files located in or under the same path as the one of the root-url.</b>
 1140    * E.g. if the root-url is "http://www.foo.com/bar/index.html", the crawler will follow links to "http://www.foo.com/bar/page.html" and
 1141    * "http://www.foo.com/bar/path/index.html", but not links to "http://www.foo.com/page.html".
 1142    *
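         * Example (sketch; mode 2 restricts the crawler to the host of the root-url, which is also the default):
         * <code>
         * $crawler->setURL("http://www.foo.com");
         * $crawler->setFollowMode(2);
         * </code>
         *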
 1143    * @param int $follow_mode The basic follow-mode for the crawling-process (0, 1, 2 or 3).
 1144    * @return bool
 1145    *
 1146    * @section 1 Basic settings
 1147    */
 1148   public function setFollowMode($follow_mode)
 1149   {
 1150     // Check mode
 1151     if (!preg_match("/^[0-3]{1}$/", $follow_mode)) return false;
 1152     
 1153     $this->UrlFilter->general_follow_mode = $follow_mode;
 1154     return true;
 1155   }
 1156   
 1157   /**
 1158    * Adds a rule to the list of rules that decides which pages or files - regarding their content-type - should be received
 1159    *
 1160    * After receiving the HTTP-header of a followed URL, the crawler checks - based on the given rules - whether the content of that URL
 1161    * should be received.
 1162    * If no rule matches with the content-type of the document, the content won't be received.
 1163    *
 1164    * Example:
 1165    * <code>
 1166    * $crawler->addContentTypeReceiveRule("#text/html#");
 1167    * $crawler->addContentTypeReceiveRule("#text/css#");
 1168    * </code>
 1169    * These rules let the crawler receive the content/source of pages with the Content-Type "text/html" AND "text/css".
 1170    * Other pages or files with different content-types (e.g. "image/gif") won't be received (if this is the only rule added to the list).
 1171    *
 1172    * <b>IMPORTANT:</b> By default, if no rule was added to the list, the crawler receives every content.
 1173    *
 1174    * Note: To reduce the traffic the crawler will cause, you should only add content-types of pages/files you really want to receive.
 1175    * But you should at least add the content-type "text/html" to this list, otherwise the crawler can't find any links.
 1176    *
 1177    * @param string $regex The rule as a regular-expression
 1178    * @return bool TRUE if the rule was added to the list.
 1179    *              FALSE if the given regex is not valid.
 1180    * @section 2 Filter-settings
 1181    */
 1182   public function addContentTypeReceiveRule($regex)
 1183   {
 1184     return $this->PageRequest->addReceiveContentType($regex);
 1185   }
 1186   
 1187   /**
 1188    * Alias for addContentTypeReceiveRule().
 1189    *
 1190    * @section 11 Deprecated
 1191    * @deprecated
 1192    * 
 1193    */
 1194   public function addReceiveContentType($regex)
 1195   {
 1196     return $this->addContentTypeReceiveRule($regex);
 1197   }
 1198   
 1199   /**
 1200    * Adds a rule to the list of rules that decide which URLs found on a page should be followed explicitly.
 1201    *
 1202    * If the crawler finds an URL and this URL doesn't match with any of the given regular-expressions, the crawler
 1203    * will ignore this URL and won't follow it.
 1204    *
 1205    * NOTE: By default and if no rule was added to this list, the crawler will NOT filter ANY URLs, every URL the crawler finds
 1206    * will be followed (except the ones "excluded" by other options of course).
 1207    *
 1208    * Example:
 1209    * <code>
 1210    * $crawler->addURLFollowRule("#(htm|html)$# i");
 1211    * $crawler->addURLFollowRule("#(php|php3|php4|php5)$# i");
 1212    * </code>
 1213    * These rules let the crawler ONLY follow URLs/links that end with "html", "htm", "php", "php3" etc.
 1214    *
 1215    * @param string $regex Regular-expression defining the rule
 1216    * @return bool TRUE if the regex is valid and the rule was added to the list, otherwise FALSE.
 1217    *
 1218    * @section 2 Filter-settings
 1219    */
 1220   public function addURLFollowRule($regex)
 1221   {
 1222     return $this->UrlFilter->addURLFollowRule($regex);
 1223   }
 1224   
 1225   /**
 1226    * Adds a rule to the list of rules that decide which URLs found on a page should be ignored by the crawler.
 1227    *
 1228    * If the crawler finds an URL and this URL matches with one of the given regular-expressions, the crawler
 1229    * will ignore this URL and won't follow it.
 1230    *
 1231    * Example:
 1232    * <code>
 1233    * $crawler->addURLFilterRule("#(jpg|jpeg|gif|png|bmp)$# i");
 1234    * $crawler->addURLFilterRule("#(css|js)$# i");
 1235    * </code>
 1236    * These rules let the crawler ignore URLs that end with "jpg", "jpeg", "gif", ..., "css"  and "js".
 1237    *
 1238    * @param string $regex Regular-expression defining the rule
 1239    * @return bool TRUE if the regex is valid and the rule was added to the list, otherwise FALSE.
 1240    *
 1241    * @section 2 Filter-settings
 1242    */
 1243   public function addURLFilterRule($regex)
 1244   {
 1245     return $this->UrlFilter->addURLFilterRule($regex);
 1246   }
 1247   
 1248   /**
 1249    * Alias for addURLFollowRule().
 1250    *
 1251    * @section 11 Deprecated
 1252    * @deprecated
 1253    * 
 1254    */
 1255   public function addFollowMatch($regex)
 1256   {
 1257     return $this->addURLFollowRule($regex);
 1258   }
 1259   
 1260   /**
 1261    * Alias for addURLFilterRule().
 1262    *
 1263    * @section 11 Deprecated
 1264    * @deprecated
 1265    * 
 1266    */
 1267   public function addNonFollowMatch($regex)
 1268   {
 1269     return $this->addURLFilterRule($regex);
 1270   }
 1271   
 1272   /**
 1273    * Adds a rule to the list of rules that decides what types of content should be streamed directly to a temporary file.
 1274    *
 1275    * If a content-type of a page or file matches with one of these rules, the content will be streamed directly into a
 1276    * temporary file without claiming local RAM.
 1277    *
 1278    * It's recommended to add all content-types of files that may be of bigger size to prevent memory-overflows.
 1279    * By default the crawler will receive every content to memory!
 1280    *
 1281    * The content/source of pages and files that were streamed to file are not accessible directly within the overridden method
 1282    * {@link handleDocumentInfo()}, instead you get information about the file the content was stored in.
 1283    * (see properties {@link PHPCrawlerDocumentInfo::received_to_file} and {@link PHPCrawlerDocumentInfo::content_tmp_file}).
 1284    *
 1285    * Please note that this setting doesn't affect the link-finding results; file-streams will also be checked for links.
 1286    *
 1287    * A common setup may look like this example:
 1288    * <code>
 1289    * // Basically let the crawler receive every content (default-setting)
 1290    * $crawler->addReceiveContentType("##");
 1291    *
 1292    * // Tell the crawler to stream everything but "text/html"-documents to a tmp-file
 1293    * $crawler->addStreamToFileContentType("#^((?!text/html).)*$#");
 1294    * </code>
 1295    * 
 1296    * @param string $regex The rule as a regular-expression
 1297    * @return bool         TRUE if the rule was added to the list and the regex is valid.
 1298    * @section 10 Other settings
 1299    */
 1300   public function addStreamToFileContentType($regex)
 1301   {
 1302     return $this->PageRequest->addStreamToFileContentType($regex);
 1303   }
 1304   
 1305   /**
 1306    * Has no function anymore.
 1307    *
 1308    * Please use setWorkingDirectory()
 1309    *
 1310    * @deprecated This method has no function anymore since v 0.8.
 1311    * @section 11 Deprecated
 1312    */
 1313   public function setTmpFile($tmp_file)
 1314   {
 1315   }
 1316   
 1317   /**
 1318    * Decides whether the crawler should parse and obey robots.txt-files. 
 1319    *
 1320    * If this is set to TRUE, the crawler looks for a robots.txt-file for every host that sites or files should be received
 1321    * from during the crawling process. If a robots.txt-file for a host was found, the containing directives applying to the
 1322    * useragent-identification of the crawler
 1323    * ("PHPCrawl" or manually set by calling {@link setUserAgentString()}) will be obeyed.
 1324    *
 1325    * The default-value is FALSE (for compatibility reasons).
 1326    *
 1327    * Please note that the directives found in a robots.txt-file have a higher priority than other settings made by the user.
 1328    * If e.g. {@link addFollowMatch}("#http://foo\.com/path/file\.html#") was set, but a directive in the robots.txt-file of the host
 1329    * foo.com says "Disallow: /path/", the URL http://foo.com/path/file.html will be ignored by the crawler anyway.
 1330    *
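         * Example (sketch):
         * <code>
         * // Let the crawler respect the robots.txt-files of the crawled hosts
         * $crawler->obeyRobotsTxt(true);
         * </code>
         *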
 1331    * @param bool $mode Set to TRUE if you want the crawler to obey robots.txt-files.
 1332    * @return bool
 1333    * @section 2 Filter-settings
 1334    */
 1335   public function obeyRobotsTxt($mode)
 1336   {
 1337     if (!is_bool($mode)) return false;
 1338     
 1339     $this->obey_robots_txt = $mode;
 1340     return true;
 1341   }
 1342   
 1343   /**
 1344    * Alias for addStreamToFileContentType().
 1345    *
 1346    * @deprecated
 1347    * @section 11 Deprecated
 1348    */ 
 1349   public function addReceiveToTmpFileMatch($regex)
 1350   {
 1351     return $this->addStreamToFileContentType($regex);
 1352   }
 1353   
 1354   /**
 1355    * Has no function anymore!
 1356    *
 1357    * This method was redundant, please use addStreamToFileContentType().
  1358    * It still exists only for compatibility-reasons.
 1359    *
 1360    * @deprecated This method has no function anymore since v 0.8.
 1361    * @section 11 Deprecated
 1362    */ 
 1363   public function addReceiveToMemoryMatch($regex)
 1364   {
 1365     return true;
 1366   }
 1367   
 1368   /**
 1369    * Sets a limit to the number of pages/files the crawler should follow.
 1370    *
 1371    * If the limit is reached, the crawler stops the crawling-process. The default-value is 0 (no limit).
 1372    *
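          * Example (the limit-value is illustrative):
          * <code>
          * // Stop the crawling-process after 500 received documents
          * $crawler->setPageLimit(500, true);
          * </code>
          *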
 1373    * @param int $limit                          The limit, set to 0 for no limit (default value).
 1374    * @param bool $only_count_received_documents OPTIONAL.
 1375    *                                            TRUE means that only documents the crawler received will be counted.
  1376    *                                            FALSE means that ALL followed and requested pages/files will be counted, even if their content wasn't received.
 1377    * @section 5 Limit-settings
 1378    */
 1379   public function setPageLimit($limit, $only_count_received_documents = false)
 1380   {
 1381     if (!preg_match("/^[0-9]*$/", $limit)) return false;
 1382     
 1383     $this->document_limit = $limit;
 1384     $this->only_count_received_documents = $only_count_received_documents;
 1385     return true;
 1386   }
 1387   
 1388   /**
 1389    * Sets the content-size-limit for content the crawler should receive from documents.
 1390    *
  1391    * If the crawler is receiving the content of a page or file and the content-size-limit is reached, the crawler stops receiving content
 1392    * from this page or file.
 1393    *
 1394    * Please note that the crawler can only find links in the received portion of a document.
 1395    * 
 1396    * The default-value is 0 (no limit).
 1397    *
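          * Example (the limit-value is illustrative):
          * <code>
          * // Receive at most 500 KB per document
          * $crawler->setContentSizeLimit(500000);
          * </code>
          *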
 1398    * @param int $bytes The limit in bytes.
 1399    * @return bool
 1400    * @section 5 Limit-settings
 1401    */
 1402   public function setContentSizeLimit($bytes)
 1403   {
 1404     return $this->PageRequest->setContentSizeLimit($bytes);
 1405   }
 1406   
 1407   /**
  1408    * Sets a limit to the number of bytes the crawler should receive altogether during the crawling-process.
 1409    *
 1410    * If the limit is reached, the crawler stops the crawling-process.
 1411    * The default-value is 0 (no limit).
 1412    *
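          * Example (the limit-value is illustrative):
          * <code>
          * // Stop the crawling-process after 10 MB of received traffic
          * $crawler->setTrafficLimit(10000000);
          * </code>
          *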
 1413    * @param int $bytes Maximum number of bytes
 1414    * @param bool $complete_requested_files This parameter has no function anymore!
 1415    *
 1416    * @return bool
 1417    * @section 5 Limit-settings
 1418    */
 1419   public function setTrafficLimit($bytes, $complete_requested_files = true)
 1420   {
 1421     if (preg_match("#^[0-9]*$#", $bytes))
 1422     {
 1423       $this->traffic_limit = $bytes;
 1424       return true;
 1425     }
 1426     else return false;
 1427   }
 1428   
 1429   /**
 1430    * Enables or disables cookie-handling.
 1431    *
 1432    * If cookie-handling is set to TRUE, the crawler will handle all cookies sent by webservers just like a common browser does.
 1433    * The default-value is TRUE.
 1434    *
 1435    * It's strongly recommended to set or leave the cookie-handling enabled!
 1436    *
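          * Example:
          * <code>
          * // Explicitly keep cookie-handling enabled (the default behaviour)
          * $crawler->enableCookieHandling(true);
          * </code>
          *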
 1437    * @param bool $mode
 1438    * @return bool
 1439    * @section 10 Other settings
 1440    */
 1441   public function enableCookieHandling($mode)
 1442   {
 1443     if (!is_bool($mode)) return false;
 1444     
 1445     $this->cookie_handling_enabled = $mode;
 1446     return true;
 1447   }
 1448   
 1449   /**
 1450    * Alias for enableCookieHandling()
 1451    *
 1452    * @section 11 Deprecated
 1453    * @deprecated Please use enableCookieHandling()
 1454    */
 1455   public function setCookieHandling($mode)
 1456   {
 1457     return $this->enableCookieHandling($mode);
 1458   }
 1459   
 1460   /**
  1461    * Enables or disables aggressive link-searching.
 1462    *
 1463    * If this is set to FALSE, the crawler tries to find links only inside html-tags (< and >).
 1464    * If this is set to TRUE, the crawler tries to find links everywhere in an html-page, even outside of html-tags.
 1465    * The default value is TRUE.
 1466    *
  1467    * Please note that if aggressive link-searching is enabled, the crawler may find links that are not meant as links, and it may also
  1468    * find links in script-parts of pages that can't be rebuilt correctly, since there is no javascript-parser/interpreter implemented.
 1469    * (E.g. javascript-code like document.location.href= a_var + ".html").
 1470    *
  1471    * Disabling aggressive link-searching results in a better crawling-performance.
 1472    *
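          * Example:
          * <code>
          * // Only search for links inside html-tags to speed up the process
          * $crawler->enableAggressiveLinkSearch(false);
          * </code>
          *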
 1473    * @param bool $mode
 1474    * @return bool
 1475    * @section 6 Linkfinding settings 
 1476    */
 1477   public function enableAggressiveLinkSearch($mode)
 1478   {
 1479     return $this->PageRequest->enableAggressiveLinkSearch($mode);
 1480   }
 1481   
 1482   /**
 1483    * Alias for enableAggressiveLinkSearch()
 1484    *
 1485    * @section 11 Deprecated
 1486    * @deprecated Please use enableAggressiveLinkSearch()
 1487    */
 1488   public function setAggressiveLinkExtraction($mode)
 1489   {
 1490     return $this->enableAggressiveLinkSearch($mode);
 1491   }
 1492   
 1493   /**
 1494    * Sets the list of html-tags the crawler should search for links in.
 1495    *
 1496    * By default the crawler searches for links in the following html-tags: href, src, url, location, codebase, background, data, profile, action and open.
  1497    * As soon as the list is set manually, this default list will be overwritten completely.
 1498    *
 1499    * Example:
 1500    * <code>$crawler->setLinkExtractionTags(array("href", "src"));</code>
 1501    * This setting lets the crawler search for links (only) in "href" and "src"-tags.
 1502    *
 1503    * Note: Reducing the number of tags in this list will improve the crawling-performance (a little).
 1504    *
 1505    * @param array $tag_array Numeric array containing the tags.
 1506    * @section 6 Linkfinding settings
 1507    */
 1508   public function setLinkExtractionTags($tag_array)
 1509   {
 1510     return $this->PageRequest->setLinkExtractionTags($tag_array);
 1511   }
 1512   
 1513   /**
  1514    * Sets the list of html-tags from which links should be extracted.
  1515    *
  1516    * This method was named incorrectly in previous versions of phpcrawl.
  1517    * It does not ADD tags, it SETS the tags from which links should be extracted.
 1518    * 
 1519    * Example
 1520    * <code>$crawler->addLinkExtractionTags("href", "src");</code>
 1521    *
 1522    * @section 11 Deprecated
 1523    * @deprecated Please use setLinkExtractionTags()
 1524    */
 1525   public function addLinkExtractionTags()
 1526   {
 1527     $tags = func_get_args();
 1528     return $this->setLinkExtractionTags($tags);
 1529   }
 1530   
 1531   /**
  1532    * Adds a basic-authentication (username and password) to the list of basic authentications that will be sent with requests.
 1533    *
 1534    * Example:
 1535    * <code>
 1536    * $crawler->addBasicAuthentication("#http://www\.foo\.com/protected_path/#", "myusername", "mypasswd");
 1537    * </code>
 1538    * This lets the crawler send the authentication "myusername/mypasswd" with every request for content placed
 1539    * in the path "protected_path" on the host "www.foo.com".
 1540    *
  1541    * @param string $url_regex Regular-expression defining the URL(s) the authentication should be sent to.
 1542    * @param string $username  The username
 1543    * @param string $password  The password
 1544    *
 1545    * @return bool
 1546    *
 1547    * @section 10 Other settings
 1548    */
 1549   public function addBasicAuthentication($url_regex, $username, $password)
 1550   {
 1551     return $this->UserSendDataCache->addBasicAuthentication($url_regex, $username, $password);
 1552   }
 1553   
 1554   /**
  1555    * Sets the "User-Agent" identification-string that will be sent with HTTP-requests.
 1556    *
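          * Example (the identification-string is illustrative):
          * <code>
          * $crawler->setUserAgentString("MyCrawler/1.0 (http://www.example.com/bot.html)");
          * </code>
          *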
 1557    * @param string $user_agent The user-agent-string. The default-value is "PHPCrawl".
 1558    * @section 10 Other settings
 1559    */
 1560   public function setUserAgentString($user_agent)
 1561   {
 1562     $this->PageRequest->userAgentString = $user_agent;
 1563     return true;
 1564   }
 1565   
 1566   /**
 1567    * Has no function anymore.
 1568    *
  1569    * This method has no function anymore; it just still exists for compatibility-reasons.
 1570    *
 1571    * @section 11 Deprecated
 1572    * @deprecated
 1573    */
 1574   public function disableExtendedLinkInfo($mode)
 1575   {
 1576   }
 1577   
 1578   /**
 1579    * Sets the working-directory the crawler should use for storing temporary data.
 1580    *
 1581    * Every instance of the crawler needs and creates a temporary directory for storing some
 1582    * internal data.
 1583    *
 1584    * This setting defines which base-directory the crawler will use to store the temporary
 1585    * directories in. By default, the crawler uses the systems temp-directory as working-directory.
 1586    * (i.e. "/tmp/" on linux-systems)
 1587    *
 1588    * All temporary directories created in the working-directory will be deleted automatically
 1589    * after a crawling-process has finished.
 1590    *
 1591    * NOTE: To speed up the performance of a crawling-process (especially when using the
  1592    * SQLite-urlcache), try to set a mounted shared-memory device as working-directory
 1593    * (i.e. "/dev/shm/" on Debian/Ubuntu-systems).
 1594    *
 1595    * Example:
 1596    * <code>
 1597    * $crawler->setWorkingDirectory("/tmp/");
 1598    * </code>
 1599    *
 1600    * @param string $directory The working-directory
 1601    * @return bool             TRUE on success, otherwise false.
 1602    * @section 1 Basic settings
 1603    */
 1604   public function setWorkingDirectory($directory)
 1605   {
  1606     if (is_writeable($directory))
 1607     {
 1608       $this->working_base_directory = $directory;
 1609       return true;
 1610     }
 1611     else return false;
 1612   }
 1613   
 1614   /**
 1615    * Assigns a proxy-server the crawler should use for all HTTP-Requests.
 1616    *
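          * Example (hostname, port and credentials are illustrative):
          * <code>
          * $crawler->setProxy("proxy.example.com", 8080, "proxyuser", "proxypass");
          * </code>
          *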
 1617    * @param string $proxy_host     Hostname or IP of the proxy-server
 1618    * @param int    $proxy_port     Port of the proxy-server
 1619    * @param string $proxy_username Optional. The username for proxy-authentication or NULL if no authentication is required.
 1620    * @param string $proxy_password Optional. The password for proxy-authentication or NULL if no authentication is required.
 1621    *
 1622    * @section 10 Other settings
 1623    */
 1624   public function setProxy($proxy_host, $proxy_port, $proxy_username = null, $proxy_password = null)
 1625   {
 1626     $this->PageRequest->setProxy($proxy_host, $proxy_port, $proxy_username, $proxy_password);
 1627   }
 1628   
 1629   /**
 1630    * Sets the timeout in seconds for connection tries to hosting webservers.
 1631    *
  1632    * If the connection to a host can't be established within the given time, the
 1633    * request will be aborted.
 1634    *
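          * Example:
          * <code>
          * // Abort connection-attempts that take longer than 10 seconds
          * $crawler->setConnectionTimeout(10);
          * </code>
          *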
 1635    * @param int $timeout The timeout in seconds, the default-value is 5 seconds.
 1636    * @return bool
 1637    *
 1638    * @section 10 Other settings
 1639    */
 1640   public function setConnectionTimeout($timeout)
 1641   {
 1642     if (preg_match("#[0-9]+#", $timeout))
 1643     {
 1644       $this->PageRequest->socketConnectTimeout = $timeout;
 1645       return true;
 1646     }
 1647     else
 1648     {
 1649       return false;
 1650     }
 1651   }
 1652   
 1653   /**
 1654    * Sets the timeout in seconds for waiting for data on an established server-connection.
 1655    *
  1656    * If the connection to a server was established but the server doesn't send any data anymore without
 1657    * closing the connection, the crawler will wait the time given in timeout and then close the connection.
 1658    *
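          * Example:
          * <code>
          * // Close a connection if the server sends no data for 5 seconds
          * $crawler->setStreamTimeout(5);
          * </code>
          *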
 1659    * @param int $timeout The timeout in seconds, the default-value is 2 seconds.
 1660    * @return bool
 1661    *
 1662    * @section 10 Other settings
 1663    */
 1664   public function setStreamTimeout($timeout)
 1665   {
 1666     if (preg_match("#[0-9]+#", $timeout))
 1667     {
 1668       $this->PageRequest->socketReadTimeout = $timeout;
 1669       return true;
 1670     }
 1671     else
 1672     {
 1673       return false;
 1674     }
 1675   }
 1676   
 1677   /**
 1678    * Adds a rule to the list of rules that decide in what kind of documents the crawler
 1679    * should search for links in (regarding their content-type)
 1680    *
 1681    * By default the crawler ONLY searches for links in documents of type "text/html".
 1682    * Use this method to add one or more other content-types the crawler should check for links.
 1683    *
 1684    * Example:
 1685    * <code>
 1686    * $crawler->addLinkSearchContentType("#text/css# i");
 1687    * $crawler->addLinkSearchContentType("#text/xml# i");
 1688    * </code>
  1689    * These rules let the crawler search for links in HTML-, CSS- and XML-documents.
 1690    *
  1691    * <b>Please note:</b> It is NOT recommended to let the crawler check for links in EVERY document-
 1692    * type! This could slow down the crawling-process dramatically (e.g. if the crawler receives large
 1693    * binary-files like images and tries to find links in them).
 1694    *
 1695    * @param string $regex Regular-expression defining the rule
 1696    * @return bool         TRUE if the rule was successfully added
 1697    *
 1698    * @section 6 Linkfinding settings
 1699    */
 1700   public function addLinkSearchContentType($regex)
 1701   {
 1702     return $this->PageRequest->addLinkSearchContentType($regex);
 1703   }
 1704   
 1705   /**
 1706    * Defines what type of cache will be internally used for caching URLs.
 1707    *
  1708    * Currently phpcrawl is able to use an in-memory-cache or a SQLite-database-cache for
 1709    * caching/storing found URLs internally.
 1710    *
 1711    * The memory-cache ({@link PHPCrawlerUrlCacheTypes}::URLCACHE_MEMORY) is recommended for spidering small to medium websites.
 1712    * It provides better performance, but the php-memory-limit may be hit when too many URLs get added to the cache.
 1713    * This is the default-setting.
 1714    *
  1715    * The SQLite-cache ({@link PHPCrawlerUrlCacheTypes}::URLCACHE_SQLITE) is recommended for spidering huge websites.
  1716    * URLs get cached in a SQLite-database-file, so the cache is only limited by available harddisk-space.
  1717    * To increase performance of the SQLite-cache, you may set its location to a shared-memory device like "/dev/shm/"
 1718    * by using the {@link setWorkingDirectory()}-method.
 1719    *
 1720    * Example:
 1721    * <code>
 1722    * $crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE);
 1723    * $crawler->setWorkingDirectory("/dev/shm/");
 1724    * </code>
 1725    *
 1726    * <b>NOTE:</b> When using phpcrawl in multi-process-mode ({@link goMultiProcessed()}), the cache-type is automatically set
 1727    * to PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE.
 1728    *
 1729    * @param int $url_cache_type 1 -> in-memory-cache (default setting)
  1730    *                            2 -> SQLite-database-cache
 1731    *
 1732    *                            Or one of the {@link PHPCrawlerUrlCacheTypes}::URLCACHE..-constants.
 1733    * @return bool
 1734    * @section 1 Basic settings
 1735    */
 1736   public function setUrlCacheType($url_cache_type)
 1737   {
 1738     if (preg_match("#[1-2]#", $url_cache_type))
 1739     {
 1740       $this->url_cache_type = $url_cache_type;
 1741       return true;
 1742     }
 1743     else return false;
 1744   }
 1745   
 1746   /**
 1747    * Decides whether the crawler should obey "nofollow"-tags
 1748    *
  1749    * If set to TRUE, the crawler will not follow links that are marked with rel="nofollow"
 1750    * (like &lt;a href="page.html" rel="nofollow"&gt;) nor links from pages containing the meta-tag
 1751    * <meta name="robots" content="nofollow">.
 1752    *
 1753    * By default, the crawler will NOT obey nofollow-tags.
 1754    * 
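          * Example:
          * <code>
          * // Skip links marked with rel="nofollow"
          * $crawler->obeyNoFollowTags(true);
          * </code>
          *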
 1755    * @param bool $mode If set to TRUE, the crawler will obey "nofollow"-tags
 1756    * @section 2 Filter-settings
 1757    */
 1758   public function obeyNoFollowTags($mode)
 1759   {
 1760     $this->UrlFilter->obey_nofollow_tags = $mode;
 1761   }
 1762   
 1763   /**
  1764    * Adds post-data together with a URL-rule to the list of post-data to send with requests.
 1765    *
 1766    * Example
 1767    * <code>
 1768    * $post_data = array("username" => "me", "password" => "my_password", "action" => "do_login");
 1769    * $crawler->addPostData("#http://www\.foo\.com/login.php#", $post_data);
 1770    * </code>
 1771    * This example sends the post-values "username=me", "password=my_password" and "action=do_login" to the URL
 1772    * http://www.foo.com/login.php
 1773    * 
  1774    * @param string $url_regex       Regular expression defining the URL(s) the post-data should be sent to.
 1775    * @param array  $post_data_array Post-data-array, the array-keys are the post-data-keys, the array-values the post-values.
  1776    *                                (like array("post_key1" => "post_value1", "post_key2" => "post_value2"))
 1777    *
 1778    * @return bool
 1779    * @section 10 Other settings
 1780    */
 1781   public function addPostData($url_regex, $post_data_array)
 1782   {
 1783     return $this->UserSendDataCache->addPostData($url_regex, $post_data_array);
 1784   }
 1785   
 1786   /**
 1787    * Returns the unique ID of the instance of the crawler
 1788    *
 1789    * @return int
 1790    * @section 9 Process resumption
 1791    */
 1792   public function getCrawlerId()
 1793   {
 1794     return $this->crawler_uniqid;
 1795   }
 1796   
 1797   /**
 1798    * Resumes the crawling-process with the given crawler-ID
 1799    *
 1800    * If a crawling-process was aborted (for whatever reasons), it is possible
 1801    * to resume it by calling the resume()-method before calling the go() or goMultiProcessed() method
 1802    * and passing the crawler-ID of the aborted process to it (as returned by {@link getCrawlerId()}).
 1803    * 
 1804    * In order to be able to resume a process, it is necessary that it was initially
 1805    * started with resumption enabled (by calling the {@link enableResumption()} method).
 1806    *
 1807    * This method throws an exception if resuming of a crawling-process failed.
 1808    *
 1809    *
 1810    * Example of a resumeable crawler-script:
 1811    * <code>
 1812    * // ...
 1813    * $crawler = new MyCrawler();
 1814    * $crawler->enableResumption();
 1815    * $crawler->setURL("www.url123.com");
 1816    *
 1817    * // If process was started the first time:
 1818    * // Get the crawler-ID and store it somewhere in order to be able to resume the process later on
 1819    * if (!file_exists("/tmp/crawlerid_for_url123.tmp"))
 1820    * {
 1821    *   $crawler_id = $crawler->getCrawlerId();
 1822    *   file_put_contents("/tmp/crawlerid_for_url123.tmp", $crawler_id);
 1823    * }
 1824    *
 1825    * // If process was restarted again (after a termination):
 1826    * // Read the crawler-id and resume the process
 1827    * else
 1828    * {
 1829    *   $crawler_id = file_get_contents("/tmp/crawlerid_for_url123.tmp");
 1830    *   $crawler->resume($crawler_id);
 1831    * }
 1832    *
 1833    * // ...
 1834    * 
 1835    * // Start your crawling process
 1836    * $crawler->goMultiProcessed(5);
 1837    *
 1838    * // After the process is finished completely: Delete the crawler-ID
 1839    * unlink("/tmp/crawlerid_for_url123.tmp");
 1840    * </code>
 1841    *
 1842    * @param int $crawler_id The crawler-ID of the crawling-process that should be resumed.
 1843    *                        (see {@link getCrawlerId()})
 1844    * @section 9 Process resumption
 1845    */
 1846   public function resume($crawler_id)
 1847   {
 1848     if ($this->resumtion_enabled == false)
  1849       throw new Exception("Resumption was not enabled, call enableResumption() before calling the resume()-method!");
 1850     
  1851     // Adopt crawler-id
 1852     $this->crawler_uniqid = $crawler_id;
 1853     
 1854     if (!file_exists($this->working_base_directory."phpcrawl_tmp_".$this->crawler_uniqid.DIRECTORY_SEPARATOR))
 1855     {
 1856       throw new Exception("Couldn't find any previous aborted crawling-process with crawler-id '".$this->crawler_uniqid."'");
 1857     }
 1858     
 1859     $this->createWorkingDirectory();
 1860     
 1861     // Unlinks pids file in working-dir (because all PIDs will change in new process)
 1862     if (file_exists($this->working_directory."pids")) unlink($this->working_directory."pids");
 1863   }
 1864   
 1865   /**
 1866    * Prepares the crawler for process-resumption.
 1867    *
 1868    * In order to be able to resume an aborted/terminated crawling-process, it is necessary to
 1869    * initially call the enableResumption() method in your script/project.
 1870    *
 1871    * For further details on how to resume aborted processes please see the documentation of the
 1872    * {@link resume()} method.
 1873    * @section 9 Process resumption
 1874    */
 1875   public function enableResumption()
 1876   {
 1877     $this->resumtion_enabled = true;
 1878     $this->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE);
 1879   }
 1880 }
 1881 ?>