"Fossies" - the Fresh Open Source Software Archive

Member "SitemapCreator.class.php" (20 Jan 2013, 40522 Bytes) of package /linux/www/SitemapCreator.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) PHP syntax highlighting (style: standard), with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here.

    1 <?php
    2 
    3 /**
    4  * Sitemap Creator Main class
    5  * 
    6  * @desc 
    7  * Sitemap Creator creates XML sitemaps files compatible with the standard 
    8  * sitemaps.org protocol and supported by Google and Bing. 
    9  * 
   10  * @link http://sitemapcreator.org/
   11  * @package SitemapCreator
   12  * @category MainClass
   13  * @author Waleed Gadelkareem <gadelkareem@gmail.com>
   14  * @link http://gadelkareem.com/
   15  * @version 1.0
   16  * @license  GPLv2
   17  * 
   18  */
   19 class SitemapCreator {
   20 
   21     public $class_version = "1.0";
   22     static $useragent = "Sitemaps Creator 1.0 (compatible; sitemapcreatorbot/1.0; +http://sitemapcreator.org/)";
   23 
   24     /**
   25      *  Disables priority calculations
   26      *  @see setPriority()
   27      * 
   28      */
   29     const PRIORITY_Disable = 0;
   30     /**
   31      *  Crawled first pages get higher priority
   32      *  Default
   33      *  @see setPriority()
   34      */
   35     const PRIORITY_CRAWLED_FIRST = 1;
   36     /**
   37      *  Deeper pathes get lower priority
   38      *  @see setPriority()
   39      */
   40     const PRIORITY_URL_STRUCTURE = 2;
   41 
   42     /**
   43      *  Priority mode 
   44      * @see setPriority()
   45      * @var int one of SitemapCreator::PRIORITY-constants
   46      */
   47     protected $priority_mode = 1; //0,1,2
   48     /**
   49      *  Minimum Priority
   50      * @see setMinPriority()
   51      * @var int 
   52      */
   53     protected $min_priority = 0;
   54 
   55     /**
   56      *  
   57      *  Disables frequency calculations
   58      * @see setFrequency()
   59      */
   60 
   61     const FREQUENCY_Disable = 0;
   62             /**
   63              *  Latest modified pages get higher frequency
   64              *  Default 
   65              * @see setFrequency()
   66              */
   67             const FREQUENCY_LAST_MODIFIED = 1;
   68             /**
   69              *  Higher priority pages get higher frequency
   70              * @see setFrequency()
   71              */
   72             const FREQUENCY_PRIORITY = 2;
   73 
   74     /**
   75      *  Frequency mode
   76      * @see setFrequency()
   77      * @var int one of SitemapCreator::FREQUENCY-constants
   78      */
   79     protected $frequency_mode = 1; //0,1,2
   80     /**
   81      *  Minimum Frequency
   82      * @see setMinFrequency()
   83      * @var string one of  {@link $frequency_types}
   84      */
   85     protected $min_frequency = 'never';
   86 
   87     /**
   88      *  Array contains Frequency types as keys and max time in seconds as values.
   89      * @var array
   90      */
   91     protected $frequency_types = array
   92         (
   93         'always' => 3600, //1 hour
   94         'hourly' => 86400, //1 day
   95         'daily' => 604800, //1 week
   96         'weekly' => 2678400, //1 month
   97         'monthly' => 31536000, //1 year
   98         'yearly' => 63072000, //2 years
   99         'never' => 94608000 //3 years
  100     );
  101 
  102     /**
  103      *  Current time().
  104      * @var int
  105      */
  106     protected $now;
  107 
  108     /**
  109      * The URL of the website.It should be full qualified and normalized.
  110      * Set on class creation {@link __construct()} or {@link setSite()}
  111      * 
  112      * Default: 'http://' . $_SERVER['HTTP_HOST'] . '/'
  113      * 
  114      * @var string
  115      */
  116     protected $site;
  117 
  118     /**
  119      * Array contianing the entries of the sitemap.
  120      *
  121      * @var array
  122      */
  123     protected $entries = array();
  124 
  125     /**
  126      * Maximum number of entries per sitemap file 
  127      * @see  setEntriesPerSitemap()
  128      *
  129      * @var int
  130      */
  131     protected $entries_per_sitemap = 50000;
  132 
  133     /**
  134      * Number of sitemap files created
  135      *
  136      * @var int
  137      */
  138     protected $sitemaps_count = 0;
  139 
  140     /**
  141      * XML string containing sitemap <urlset></urlset> elements
  142      *
  143      * @var string
  144      */
  145             protected $xml_url_set = '',
  146             $xml_head,
  147             $xml_foot;
  148 
  149     /**
  150      * Data directory Path.
  151      * 
  152      * Valid system path for the directory where sitemaps directories
  153      * will be created.The directory should be writable.
  154      * @see setDataDir()
  155      * @see getDataDir()
  156      * 
  157      * Default: sys_get_temp_dir()
  158      *
  159      * @var string
  160      */
  161     protected $data_dir = '';
  162 
  163     /**
  164      * Sitemaps directory path auto created in {@link prepareSitemapsDir()}
  165      * @see getSitemapPath()
  166      *
  167      * @var string
  168      */
  169     protected $sitemaps_dir;
  170 
  171     /**
  172      * Sitemap URL where the sitemap file name will be appended to the end of the URL.
  173      * If not set then the link will be generated automatically.
  174      * @example
  175      *  http://www.example.com/sitemap.php?s=
  176      * 
  177      * @see setSitemapURL()
  178      * @see getSitemapURL()
  179      *    
  180      * @var string
  181      */
  182     protected $sitemaps_url;
  183 
  184     /**
  185      * choose to save sitemaps in gzip format
  186      * @see useGzip()
  187      *
  188      * @var bool
  189      */
  190     protected $use_gzip = false;
  191 
  192     /**
  193      * Ping URLs of the search engines sitemaps API
  194      * @see addEngine()
  195      *
  196      * @var array
  197      */
  198     protected $engines = array
  199         (
  200         'Google' => 'http://www.google.com/webmasters/sitemaps/ping?sitemap=',
  201         'Live Search' => 'http://www.bing.com/webmaster/ping.aspx?siteMap='
  202     );
  203     /**
  204      * The SMCCrawler-Object: "Crawler" 
  205      *
  206      * @var SMCCrawler
  207      */
  208     public $Crawler;
  209     /**
  210      * Class path
  211      *
  212      * @var string
  213      */
  214     protected $classpath;
  215     /**
  216      * The PHPCrawlerProcessReport-Object:
  217      * contains summarizing report-information about the crawling-process after it has finished.  
  218      *
  219      * @var PHPCrawlerProcessReport
  220      */
  221     public $crawler_reports;
  222 
  223     /**
  224      * Initiates a new Sitemap.
  225      * 
  226      * @param string    $site   The url may contain the protocol (http://www.foo.com or https://www.foo.com), the port (http://www.foo.com:4500/index.php)
  227      *                                        and/or basic-authentication-data (http://loginname:passwd@www.foo.com)
  228      * @section 1 Settings                            
  229      */
  public function __construct($site = '') {
      // Remember where this class lives so the bundled libraries can be located.
      $this->classpath = dirname(__FILE__);
      // setSite() calls PHPCrawlerUtils::normalizeURL(), so load the helper on demand.
      if (!class_exists("PHPCrawlerUtils")) {
          require_once($this->classpath . "/libs/PHPCrawler/PHPCrawlerUtils.class.php");
      }
      // Default to the current HTTP host only when one is actually known.
      // On the command line $_SERVER['HTTP_HOST'] is not set; building a URL
      // from it would raise a notice and register the bogus site "http:///",
      // which would also pin $sitemaps_url to a useless value.
      if ($site == '' && !empty($_SERVER['HTTP_HOST'])) {
          $site = 'http://' . $_SERVER['HTTP_HOST'] . '/';
      }
      // With no usable URL the site simply stays unset until setSite() is called.
      if ($site != '') {
          $this->setSite($site);
      }
      // Reference timestamp for this run.
      $this->now = time();
      // Default data directory: the system temp directory.
      $this->data_dir = sys_get_temp_dir();
  }
  245 
  246     /**
  247      * Sets the URL of the website {@link $site}
  248      *
  249      * Normalizes the given URL and returns a full qualified and normalized URL.
  250      * The method also generates {@link $sitemaps_url} if not set.
  251      * 
  252      * * This method throws an exception if the URL is invalid
  253      *
  254      * @param string    $site   The url may contain the protocol (http://www.foo.com or https://www.foo.com), the port (http://www.foo.com:4500/index.php)
  255      *                                        and/or basic-authentication-data (http://loginname:passwd@www.foo.com)
  256      * @return string  SitemapCreator::$site The valid normalized URL; an invalid URL raises an Exception instead of returning false
  257      * @section 1 Settings  
  258      */
  public function setSite($site) {
      // Stores the normalized website URL and, on first use, derives the
      // default sitemap URL from it.
      //
      // @param string $site  website URL
      // @return string       the normalized URL now stored in $this->site
      // @throws Exception    when $site is not a non-empty scalar

      // Check the type before trimming: trim() on an array or object raises a
      // warning / TypeError instead of the exception this method promises.
      if (is_scalar($site)) {
          $site = trim((string) $site);
      }
      if (!is_string($site) || empty($site)) {
          $shown = is_string($site) ? $site : gettype($site);
          throw new Exception("Invalid URL: {$shown}.");
      }

      $this->site = PHPCrawlerUtils::normalizeURL($site);

      // Derive the sitemap URL from the website URL unless one was set already,
      // e.g. http://example.com/1/index.php -> http://example.com/1/sitemap.php?s=
      if (!isset($this->sitemaps_url)) {
          $this->sitemaps_url = preg_replace('`([^/]/)[^/]*$`', '\\1sitemap.php?s=', $this->site);
      }
      return $this->site;
  }
  273 
  274     /**
  275      * Sets the data directory path of the sitemaps {@link $data_dir}
  276      *
  277      * The directory is used to store sitemaps and csv files. If not set then
  278      * sys_get_temp_dir() will be used.
  279      * 
  280      * * This method throws an exception if the path is not valid or the directory is not writable.
  281      *
  282      * @example
  283      *  /server/html/example.com/sitemaps/
  284      * 
  285      * @see prepareSitemapsDir()
  286      *
  287      * @param string $dir data directory {@link $data_dir}
  288      * @section 1 Settings 
  289      */
  public function setDataDir($dir) {
      // Drop trailing slashes so paths can later be joined with a single '/'.
      $dir = rtrim($dir, '/');
      // The directory must exist and be writable (isDataDirWritable() is only
      // consulted when the path really is a directory).
      $usable = is_dir($dir) && $this->isDataDirWritable($dir);
      if (!$usable) {
          throw new Exception("Invalid directory path or directory is not writable: '{$dir}'");
      }
      $this->data_dir = $dir;
  }
  296 
  297     /**
  298      * Sets the URL of the sitemap files {@link $sitemaps_url}
  299      *
  300      * Sitemap URL where the sitemap file name will be appended to the end of the URL.
  301      * @example
  302      *  http://www.example.com/sitemap.php?s=
  303      * 
  304      * @see getSitemapURL()
  305      *
  306      * @param string $url The sitemap URL
  307      * @section 1 Settings 
  308      */
  public function setSitemapURL($url) {
      // Base URL to which getSitemapURL() appends the sitemap file name.
      // The value is stored as given, without validation.
      $base_url = $url;
      $this->sitemaps_url = $base_url;
  }
  312 
  313     /**
  314      * Sets number of URLs set for each sitemap file {@link $entries_per_sitemap}
  315      *
  316      * Each sitemap file should have a maximum of 50,000 URL. 
  317      * Use this function to change the number of URLs set per sitemap.
  318      * @example
  319      *  $sitemap->setEntriesPerSitemap(20000);
  320      *
  321      * @param int $number  number greater than 0
  322      * @return bool true on success, false otherwise
  323      * @section 1 Settings 
  324      */
  public function setEntriesPerSitemap($number) {
      // Sets how many URLs go into one sitemap file.
      //
      // @param int|string $number  numeric value between 1 and 50000
      // @return bool               true when stored, false when rejected
      if (!is_numeric($number) || 1 > $number || 50000 < $number) {
          return false;
      }
      // Store a real integer: CreateSitemaps() uses this value as a modulo
      // operand, where a numeric string or a float would be coerced implicitly.
      $this->entries_per_sitemap = (int) ($number + 0);
      return true;
  }
  331 
  332     /**
  333      *  Use gzip compressed sitemaps files {@link $use_gzip}
  334      *
  335      * @param bool $mode  true to enable gzip, false otherwise
  336      * @section 1 Settings 
  337      */
  public function useGzip($mode) {
      // Enables (truthy $mode) or disables (falsy $mode) gzip output.
      // The flag is normalized to a real boolean so that $use_gzip always
      // holds the bool its declaration documents, whatever the caller passed.
      $this->use_gzip = (bool) $mode;
  }
  341 
  342     /**
  343      *  Set the URLs sets manually {@link $entries}
  344      * 
  345      * * this method throws exception if $entries is not an array or 
  346      *   the first entry does not have 'URL' key 
  347      * 
  348      * @param array $entries  array of entries to be added to sitemap
  349      *                                        @example array(
  350      *                                                              array(
  351      *                                                                  "URL"=>"http://example.com/",
  352      *                                                                  "Priority" => 0.8,
  353      *                                                                  "Last-Modified" => 3455554,
  354      *                                                                  "Frequency" => "always"
  355      *                                                                   ),......);
  356      * @section 1 Settings 
  357      */
  public function setEntries($entries) {
      // Replaces the whole entry list.
      //
      // @param array $entries  list of entry arrays, each with at least a 'URL' key
      // @throws Exception      when $entries is not an array, is empty, or its
      //                        first entry has no 'URL'
      if (!is_array($entries)) {
          throw new Exception('Invalid URLs set');
      }
      // CreateSitemaps() walks the entries by numeric offset starting at 0, so
      // reindex first: arrays with gaps or custom keys (e.g. the result of
      // array_filter()) are accepted instead of being rejected or half-processed.
      $entries = array_values($entries);
      if (empty($entries[0]['URL'])) {
          throw new Exception('Invalid URLs set');
      }
      $this->entries = $entries;
  }
  363 
  364     /**
  365      *  Set Priority mode {@link $priority_mode}
  366      * 
  367      * Choose how the Priority of every URL should be calculated
  368      * @link http://www.sitemaps.org/protocol.html#prioritydef
  369      * 
  370      * @see setMinPriority()
  371      *
  372      * @param int $mode   number between 0 and 2 or  use the predefined constants
  373      *                                  SitemapCreator::PRIORITY_Disable  Disables priority calculations
  374      *                                  SitemapCreator::PRIORITY_CRAWLED_FIRST Crawled first pages get higher priority
  375      *                                  SitemapCreator::PRIORITY_URL_STRUCTURE Deeper pathes get lower priority
  376      * @return bool true on success, false otherwise
  377      * @section 1 Settings 
  378      */
  public function setPriority($mode) {
      // Selects how URL priorities are calculated.
      //
      // @param int|string $mode  0, 1 or 2 (see the PRIORITY_* constants)
      // @return bool             true when stored, false when rejected

      // Only scalars can be a mode; anything else would make preg_match() fail.
      // "\z" anchors at the very end of the string, whereas "$" would also
      // accept a value followed by a newline such as "1\n".
      if (!is_scalar($mode) || !preg_match('/^[0-2]\z/', (string) $mode)) {
          return false;
      }
      // Keep the mode as an integer, matching the PRIORITY_* constants.
      $this->priority_mode = (int) $mode;
      return true;
  }
  385 
  386     /**
  387      *  Set minimum Priority value for all URLs {@link $min_priority}
  388      * 
  389      * @see setPriority()
  390      *
  391      * @param float $mode  number between 0 and 1.0
  392      * @return bool true on success, false otherwise
  393      * @section 1 Settings 
  394      */
  public function setMinPriority($mode) {
      // Sets the lowest priority any URL may receive.
      //
      // @param int|float|string $mode  numeric value from 0 to 1.0 inclusive
      // @return bool                   true when stored, false when rejected

      // Validate numerically: the sitemap protocol allows 0.0 through 1.0, so
      // values such as 0, 1, "0.5" or 0.25 are all legitimate regardless of how
      // many characters they take to write down.
      if (!is_numeric($mode)) {
          return false;
      }
      $value = (float) $mode;
      if ($value < 0 || $value > 1) {
          return false;
      }
      $this->min_priority = $value;
      return true;
  }
  400 
  401     /**
  402      *  Set Frequency mode {@link $frequency_mode}
  403      * 
  404      * Choose how the Frequency of every URL should be calculated
  405      * @link http://www.sitemaps.org/protocol.html#changefreqdef
  406      * 
  407      * @see setMinFrequency()
  408      *
  409      * @param int $mode   number between 0 and 2 or  use the predefined constants
  410      *                                  SitemapCreator::FREQUENCY_Disable  Disables frequency calculations
  411      *                                  SitemapCreator::FREQUENCY_LAST_MODIFIED Latest modified pages get higher frequency
  412      *                                  SitemapCreator::FREQUENCY_PRIORITY Higher priority pages get higher frequency
  413      * @return bool true on success, false otherwise
  414      * @section 1 Settings 
  415      */
  public function setFrequency($mode) {
      // Selects how change frequencies are calculated.
      //
      // @param int|string $mode  0, 1 or 2 (see the FREQUENCY_* constants)
      // @return bool             true when stored, false when rejected

      // Only scalars can be a mode; anything else would make preg_match() fail.
      // "\z" anchors at the very end of the string, whereas "$" would also
      // accept a value followed by a newline such as "1\n".
      if (!is_scalar($mode) || !preg_match('/^[0-2]\z/', (string) $mode)) {
          return false;
      }
      // Keep the mode as an integer, matching the FREQUENCY_* constants.
      $this->frequency_mode = (int) $mode;
      return true;
  }
  422 
  423     /**
  424      *  Set minimum Frequency value for all URLs {@link $min_frequency}
  425      * 
  426      * @see setFrequency()
  427      *
  428      * @param string $mode  one of {@link $frequency_types} keys
  429      * @return bool true on success, false otherwise
  430      * @section 1 Settings 
  431      */
  public function setMinFrequency($mode) {
      // Normalize the requested name (case and surrounding whitespace).
      $wanted = strtolower(trim($mode));
      // Accept it only when it is one of the known frequency type names,
      // i.e. a key of $frequency_types.
      if (array_key_exists($wanted, $this->frequency_types)) {
          $this->min_frequency = $wanted;
          return true;
      }
      return false;
  }
  439 
  440     /**
  441      *  add URL entry manually to {@link $entries}
  442      * 
  443      * * this method throws exception if $entry is not an array or 
  444      *   does not have 'URL' key 
  445      * 
  446      * @param array $entry  URL set to be added to sitemap
  447      *                                      @example 
  448      *                                                  array(
  449      *                                                                  "URL"=>"http://example.com/",
  450      *                                                                  "Priority" => 0.8,
  451      *                                                                  "Last-Modified" => 3455554,
  452      *                                                                  "Frequency" => "always"
  453      *                                                     );
  454      * @section 1 Settings 
  455      */
  public function addEntry($entry) {
      // An entry is usable only if it is an array carrying a non-empty 'URL'.
      $valid = is_array($entry) && !empty($entry['URL']);
      if (!$valid) {
          throw new Exception('Invalid URLs set');
      }
      // Append to the end of the entry list.
      array_push($this->entries, $entry);
  }
  461 
  462     /**
  463      *  add a ping URL to the array {@link $engines}
  464      * 
  465      * 
  466      * @param string $url  ping URL of the search engine
  467      * @section 1 Settings 
  468      */
  public function addEngine($url) {
      // Append the ping URL to the engine list; it receives the next free
      // numeric key alongside the named default engines.
      array_push($this->engines, $url);
  }
  472 
  473     /**
  474      *  Get sitemap file path
  475      * 
  476      * @see setDataDir()
  477      *
  478      * @param string $filename  'index' string or number
  479      * @return bool|string false on invalid $filename parameter, sitemap path on success
  480      * @section 2 Info 
  481      */
  public function getSitemapPath($filename) {
      // Only 'index' or a plain number is accepted as a sitemap name.
      if (self::validSitemapName($filename)) {
          // Append the .xml / .xml.gz extension, then prefix the sitemaps directory.
          $file = $this->addSitemapEXT($filename);
          $directory = $this->getSitemapsDir();
          return $directory . "/$file";
      }
      return false;
  }
  488 
  489     /**
  490      *  Get sitemap file URL 
  491      * 
  492      * @see validSitemapName()
  493      * @see addSitemapEXT()
  494      *
  495      * @param string $filename  'index' string or number
  496      * @return bool|string false on invalid $filename parameter, sitemap URL on success
  497      * @section 2 Info 
  498      */
  public function getSitemapURL($filename) {
      // Only 'index' or a plain number is accepted as a sitemap name.
      if (self::validSitemapName($filename)) {
          // The file name, including its extension, is appended to the base URL.
          return $this->sitemaps_url . $this->addSitemapEXT($filename);
      }
      return false;
  }
  506 
  507     /**
  508      *  Get sitemap directory name
  509      * 
  510      *  Get this site's directory name where sitemaps are strored.
  511      *  The directory is created inside the {@link $data_dir} directory.
  512      * @example
  513      *  /server/datadir/http__example_com_
  514      * 
  515      * @return bool|string false if {@link $site} has not been set, sitemap dir path on success
  516      * @section 2 Info 
  517      */
  public function getSitemapDirName() {
      // The directory name is the site URL with every run of non-letter
      // characters collapsed into one underscore,
      // e.g. http://example.com/ -> http_example_com_
      // NOTE(review): digits are replaced as well, so sites that differ only in
      // digits or port number map to the same name - confirm this is intended.
      $site = $this->site;
      return ($site == '') ? false : preg_replace('`[^a-z]+`i', '_', $site);
  }
  523 
  524     /**
  525      *  Get sitemap directory path {@link $sitemaps_dir}
  526      * 
  527      *  Get this site's directory path where sitemaps are strored.
  528      *  The directory is created inside the {@link $data_dir} directory.
  529      * @example
  530      *  /server/datadir/http__example_com_
  531      * 
  532      * @return string SitemapCreator::sitemaps_dir sitemap dir path
  533      * @section 2 Info 
  534      */
  public function getSitemapsDir() {
      // Build the path once (data directory + per-site directory name) and
      // reuse the stored value on every later call.
      if ($this->sitemaps_dir === null) {
          $name = $this->getSitemapDirName();
          $this->sitemaps_dir = $this->data_dir . '/' . $name;
      }
      return $this->sitemaps_dir;
  }
  540 
  541     /**
  542      *  Get URLs sets array {@link $entries}
  543      * 
  544      * @return array {@link $entries}
  545      * @section 2 Info 
  546      */
  public function getEntries() {
      // Return the collected sitemap entries as they currently stand.
      $entries = $this->entries;
      return $entries;
  }
  550 
  551     /**
  552      *  Get data directory path {@link $data_dir}
  553      * 
  554      * @see setDataDir()
  555      * @return string {@link $data_dir}
  556      * @section 2 Info 
  557      */
  public function getDataDir() {
      // Return the directory under which the per-site sitemap directories live.
      $directory = $this->data_dir;
      return $directory;
  }
  561 
  562     /**
  563      *  Validates sitemap filename
  564      * 
  565      * @see getSitemapPath()
  566      * @see getSitemapURL()
  567      * @return bool
  568      * @section 2 Info 
  569      */
  static function validSitemapName($filename) {
      // Tells whether $filename is an acceptable sitemap name: the literal
      // 'index' or a non-empty run of digits.
      //
      // @param mixed $filename  candidate name
      // @return bool            true when valid, false otherwise

      // Arrays, objects and null cannot be names and must not reach preg_match().
      if (!is_scalar($filename)) {
          return false;
      }
      // "\z" matches only at the absolute end of the string; "$" would also
      // accept a trailing newline ("index\n"), which then ends up inside the
      // file paths and URLs built from this name.
      return preg_match('`^(index|[0-9]+)\z`', (string) $filename) === 1;
  }
  575 
  576     /**
  577      *  Initiate the crawler {@link $Crawler}
  578      * 
  579      * visit {@link http://phpcrawl.cuab.de/classreferences/index.html} for
  580      * full cralwer options which can be accessed through {@link $Crawler} object.
  581      * @example
  582      *  $sitemap->Crawler->enableCookieHandling(false);
  583      *       
  584      * * this method throws exception if {@link $site} has not been set
  585      * 
  586      * @see Crawl()
  587 
  588      * @return SMCCrawler object {@link SMCCrawler} 
  589      * @section 3 Crawler 
  590      */
  public function initCrawler() {
      // A crawler cannot be created before a site URL is known.
      if ($this->site == '') {
          throw new Exception("Please add a Site URL 'SitemapCreator::setSite()' Before starting the cralwer 'SitemapCreator::initCrawler()'.");
      }
      // Load the crawler classes on demand: class name => file relative to this class.
      $dependencies = array(
          "PHPCrawler" => "/libs/PHPCrawler/PHPCrawler.class.php",
          "SMCCrawler" => "/SitemapCreatorCrawler.class.php",
      );
      foreach ($dependencies as $class_name => $relative_file) {
          if (!class_exists($class_name)) {
              require_once($this->classpath . $relative_file);
          }
      }
      // Fresh crawler: apply the default settings first, then point it at the site.
      $this->Crawler = new SMCCrawler();
      $this->setCrawlerDefaults();
      $this->Crawler->setURL($this->site);
      // Returned so callers can tune it further, e.g. $crawler = $sitemap->initCrawler();
      return $this->Crawler;
  }
  611 
  612     /**
  613      *  Start the crawl process
  614      *  
  615      *  More related options could be found on {@link http://phpcrawl.cuab.de/classreferences/index.html}
  616      * 
  617      * @see initCrawler()
  618      * @section 3 Crawler 
  619      */
  public function Crawl() {
      // Runs the crawl and stores its results on this object.
      //
      // Creates a crawler with the default settings when none is prepared.
      // @throws Exception  from initCrawler() when no site is set, or from
      //                    setEntries() when the crawl produced no usable entries

      // isset() + instanceof is safe when the property is null or missing,
      // whereas get_class() on a non-object raises a warning (a TypeError on
      // PHP 8). A crawler derived from SMCCrawler is kept rather than replaced.
      if (!isset($this->Crawler) || !($this->Crawler instanceof SMCCrawler)) {
          $this->initCrawler();
      }
      // Start crawling.
      $this->Crawler->go();
      // Take over the entries gathered by the crawler.
      $this->setEntries($this->Crawler->entries);
      // Keep the process report before the crawler is released.
      $this->crawler_reports = $this->Crawler->getProcessReport();
      // Release the crawler to free memory. Assigning null (not unset())
      // keeps the declared property in place, so Crawl() can be called again.
      //@todo Not working! check PHPCrawl memory leak
      $this->Crawler = null;
      gc_collect_cycles();
  }
  635 
  636     /**
  637      *  Load default crawler settings for {@link $Crawler}
  638      *  
  639      * Internally load default options, no external calls allowed
  640      *  More related options could be found on 
  641      * {@link http://phpcrawl.cuab.de/classreferences/index.html}
  642      * 
  643      * @see Crawl()
  644      * @see initCrawler()
  645      * @section 3 Crawler 
  646      */
  protected function setCrawlerDefaults() {
      // All defaults are applied to the crawler created by initCrawler().
      $crawler = $this->Crawler;

      // Content: only receive documents whose content-type is "text/html".
      $crawler->addContentTypeReceiveRule("#text/html#");
      // Links: search inside html tags only (no aggressive search).
      $crawler->enableAggressiveLinkSearch(false);
      // Follow mode 2: stay on the same host as the root URL.
      $crawler->setFollowMode(2);
      // Identify with this library's User-Agent string.
      $crawler->setUserAgentString(self::$useragent);
      // Respect "nofollow" tags.
      $crawler->obeyNoFollowTags(true);
      // Never request images, scripts, stylesheets or icons.
      $crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png|bmp|js|css|ico)$# i");
      // Store and send cookies like a browser does.
      $crawler->enableCookieHandling(true);
      // Read the Last-Modified date from the page headers.
      $crawler->enableLastModifiedCount(true);
      // Parse and obey robots.txt files.
      $crawler->obeyRobotsTxt(true);
  }
  667 
  668     /**
  669      * Creates sitemaps directory {@link $sitemaps_dir}
  670      * 
  671      * * this method throws exception if data is not writable
  672      * @see setDataDir();
  673      * @section 4 Sitemap  
  674      */
  protected function prepareSitemapsDir() {
      // Makes sure the per-site sitemaps directory exists, creating it if needed.
      //
      // @throws Exception  when the data directory is not writable or the
      //                    sitemaps directory cannot be created

      // Resolve (and cache) the sitemaps directory path.
      $dir = $this->getSitemapsDir();
      // Work on fresh filesystem information.
      clearstatcache();
      // Nothing to do when the directory is already there. is_dir() is used so
      // that a regular file occupying the path is not mistaken for the directory.
      if (is_dir($dir)) {
          return;
      }
      // The parent data directory must be writable to create anything in it.
      if (!$this->isDataDirWritable($this->data_dir)) {
          throw new Exception("Data directory {$this->data_dir} is not writable");
      }
      // Create the directory and fail loudly if that does not work; the second
      // is_dir() check tolerates another process creating it at the same moment.
      if (!mkdir($dir) && !is_dir($dir)) {
          throw new Exception("Unable to create sitemaps directory {$dir}");
      }
  }
  688 
  689     /**
  690      * Check if data directory is writable {@link $data_dir}
  691      * 
  692      * @return bool true on success, false otherwise
  693      * @see setDataDir()
  694      * @see prepareSitemapsDir()
  695      * @section 4 Sitemap  
  696      */
  protected function isDataDirWritable($dir) {
      // Already writable: nothing else to do.
      if (is_writable($dir)) {
          return true;
      }
      // Otherwise try to open up the permissions; the outcome of chmod() decides.
      // NOTE(review): 0777 makes the directory world-writable - confirm this is acceptable.
      return chmod($dir, 0777) ? true : false;
  }
  702 
  703     /**
  704      * adds sitemap gzip extension to sitemap filename if {@link $use_gzip} enabled
  705      * 
  706      * @return string filename 
  707      * @see useGzip();
  708      * @section 4 Sitemap  
  709      */
  protected function addSitemapEXT($filename) {
      // Every sitemap file is XML; gzip output gets an extra ".gz" suffix.
      $name = $filename . '.xml';
      if ($this->use_gzip) {
          $name .= '.gz';
      }
      return $name;
  }
  713 
  714     /**
  715      * Create sitemaps files and index
  716      * 
  717      * @param array $entries  array of entries to be added to sitemap
  718      *                                        @example array(
  719      *                                                              array(
  720      *                                                                  "URL"=>"http://example.com/",
  721      *                                                                  "Priority" => 0.8,
  722      *                                                                  "Last-Modified" => 3455554,
  723      *                                                                  "Frequency" => "always"
  724      *                                                                   ),......);
  725      * @section 4 Sitemap  
  726      */
  727     //@todo add benchmark
  public function CreateSitemaps($entries = array()) {
      // Entries passed in replace whatever was collected so far.
      if (!empty($entries)) {
          $this->setEntries($entries);
      }
      // Make sure the output directory exists.
      $this->prepareSitemapsDir();
      //@todo add stylesheet
      // XML envelope shared by the sitemap files.
      $this->xml_head = '<?xml version="1.0" encoding="UTF-8"?>'
              . '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">';
      $this->xml_foot = '</urlset>';

      $total = count($this->entries);
      $position = 0;
      while ($position < $total) {
          // Fill in priority and frequency, then append the entry's XML.
          $this->calcPriority($position, $this->entries[$position], $total);
          $this->calcFrequency($position, $this->entries[$position]);
          $this->addXMLURLSet($this->entries[$position]);

          // $position now holds the number of entries processed so far.
          $position++;
          // Write a sitemap file whenever a full batch is reached
          // or the last entry has just been handled.
          $batch_complete = ($position % $this->entries_per_sitemap) == 0;
          if ($batch_complete || $position == $total) {
              $this->writeSitemap();
          }
      }
      // Finally create the sitemaps index.
      $this->writeIndex();
  }
  760 
  761     /**
  762      * calculates Priority for each entry {@link $priority_mode}
  763      * 
  764      *
  765      * @param int  $key array offset
  766      *                      $entry reference to array offset in {@link $entries}
  767      *                      $total_entries {@link $entries} array count
  768      * @section 4 Sitemap  
  769      */
  protected function calcPriority($key, &$entry, $total_entries) {
      // Computes $entry['Priority'] according to the active priority mode.
      //
      // @param int   $key            offset of the entry in $this->entries
      // @param array $entry          the entry, modified in place
      // @param int   $total_entries  number of entries in $this->entries

      // Priority calculation disabled: leave the entry untouched.
      if ($this->priority_mode == self::PRIORITY_Disable) {
          return;
      }
      // The first entry (the home page) always gets the highest priority.
      if ($key == 0) {
          $entry['Priority'] = 1.0;
          return;
      }

      if ($this->priority_mode == self::PRIORITY_URL_STRUCTURE) {
          // Depth is measured in '/'-separated slices relative to the home
          // page. The home page depth is recomputed on every call: a cached
          // function-static value would be shared by all instances and carry
          // over from one site to the next within the same process.
          $site_url_slices_num = count(explode('/', $this->entries[0]['URL']));
          $URL_slices = explode('/', $entry['URL']);
          // Deeper URLs have more slices and therefore a lower priority.
          $priority = round(( (1 / count($URL_slices)) * $site_url_slices_num) + 0.1, 1);
      } else {
          // PRIORITY_CRAWLED_FIRST (also the fallback for any other mode):
          // URLs crawled earlier rank higher.
          $priority = round(($total_entries - $key) / $total_entries, 1);
      }

      // The sitemap protocol only allows 0.0 to 1.0; the +0.1 bonus above can
      // push URLs at home page depth to 1.1 or more, so cap the value.
      if ($priority > 1.0) {
          $priority = 1.0;
      }
      // Respect the configured minimum priority.
      if ($priority < $this->min_priority) {
          $priority = $this->min_priority;
      }
      $entry['Priority'] = $priority;
  }
  802 
  803     /**
  804      * calculates Frequency for each entry {@link $priority_mode}
  805      * 
  806      * @see $frequency_types
  807      * @param int  $key array offset
  808      *                      $entry reference to array offset in {@link $entries}
  809      * @section 4 Sitemap  
  810      */
  811     //@todo Cache-Control: max-age=86400
  812     protected function calcFrequency($key, &$entry) {
  813         //Frequency disabled?
  814         if ($this->frequency_mode == self::FREQUENCY_Disable)
  815             return;
  816         //Home page is highest Frequency
  817         if ($key == 0) {
  818             $entry['Frequency'] = 'always';
  819             return;
  820         }
  821         switch ($this->frequency_mode) {
  822             case self::FREQUENCY_LAST_MODIFIED:
  823             default:
  824                 if (!empty($entry['Last-Modified'])) {
  825                     //get difference in time between the date page last modified
  826                     //and current date.
  827                     $diff = $this->now - $entry['Last-Modified'];
  828                     //compare difference to {@link $frequency_types} values
  829                     foreach ($this->frequency_types as $type => $value) {
  830                         if ($diff < $value) {
  831                             //set frequency to the suitable frequency type in {@link $frequency_types} 
  832                             $entry['Frequency'] = $type;
  833                             break;
  834                         }
  835                     }
  836                 }
  837                 break;
  838             case self::FREQUENCY_PRIORITY:
  839                 //set Frequency based on priority
  840                 if ($entry['Priority'] >= 0.9)
  841                     $entry['Frequency'] = 'always';
  842                 elseif ($entry['Priority'] >= 0.7)
  843                     $entry['Frequency'] = 'hourly';
  844                 elseif ($entry['Priority'] >= 0.6)
  845                     $entry['Frequency'] = 'daily';
  846                 elseif ($entry['Priority'] >= 0.4)
  847                     $entry['Frequency'] = 'weekly';
  848                 elseif ($entry['Priority'] >= 0.2)
  849                     $entry['Frequency'] = 'monthly';
  850                 elseif ($entry['Priority'] >= 0.1)
  851                     $entry['Frequency'] = 'yearly';
  852                 else
  853                     $entry['Frequency'] = 'never';
  854                 break;
  855         }
  856         //respect min frequency
  857         if ($this->frequency_types[$entry['Frequency']] > $this->frequency_types[$this->min_frequency])
  858             $entry['Frequency'] = $this->min_frequency;
  859     }
  860 
  861     /**
  862      * Add single XML code to {@link $xml_url_set}
  863      * 
  864      * @param array  $entry  URL set entry
  865      * @section 4 Sitemap  
  866      */
  867     protected function addXMLURLSet($entry) {
  868         //ignore if 'URL' offset is empty
  869         if (empty($entry['URL']))
  870             return;
  871         $this->xml_url_set .= '<url>';
  872         //encode URL
  873         $this->xml_url_set .= '<loc>' . utf8_encode(htmlentities($entry['URL'], ENT_QUOTES)) . '</loc>';
  874         if (!empty($entry['Last-Modified']))
  875             $this->xml_url_set.= "<lastmod>" . gmdate("Y-m-d\TH:i:s", $entry['Last-Modified']) . "+00:00</lastmod>";
  876         if (!empty($entry['Frequency']))
  877             $this->xml_url_set.= "<changefreq>{$entry['Frequency']}</changefreq>";
  878         if (!empty($entry['Priority']))
  879             $this->xml_url_set .= "<priority>{$entry['Priority']}</priority>";
  880         $this->xml_url_set .= '</url>';
  881     }
  882 
  883     /**
  884      *  Write sitemap file
  885      * @section 4 Sitemap  
  886      */
  887     protected function writeSitemap() {
  888         $this->sitemaps_count += 1;
  889         $this->putContent($this->getSitemapPath($this->sitemaps_count), $this->xml_head . $this->xml_url_set . $this->xml_foot);
  890         $this->xml_url_set = '';
  891     }
  892 
  893     /**
  894      *  Write sitemap index file
  895      * @section 4 Sitemap  
  896      */
  897     protected function writeIndex() {
  898         $xml = '<?xml version="1.0" encoding="UTF-8"?>' .
  899                 '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">';
  900         for ($i = 1; $i <= $this->sitemaps_count; $i++) {
  901             $xml .= '<sitemap>' .
  902                     '<loc>' . $this->getSitemapURL($i) . '</loc>' .
  903                     '<lastmod>' . gmdate("Y-m-d\TH:i:s", time()) . '</lastmod>' .
  904                     '</sitemap>';
  905         }
  906         $xml .= '</sitemapindex>';
  907         $this->putContent($this->getSitemapPath('index'), $xml);
  908     }
  909 
  910     /**
  911      * Ping search engines 
  912      * 
  913      * @see addEngine()
  914      * 
  915      * @param string|int  $filename 'index' string or number
  916      * @return array $results Array contains errors as $results['google']['error'], or respond body as $request['google']['body']
  917      * @section 4 Sitemap  
  918      */
  919     public function ping($filename = 'index') {
  920         $results = array();
  921         //get full sitemap encoded URL
  922         $sitemap = urlencode($this->getSitemapURL($filename));
  923         foreach ($this->engines as $engine => $url)
  924             $results[$engine] = self::openURL($url . $sitemap);
  925 
  926         return $results;
  927     }
  928 
  929     /**
  930      * Open URL and get respond body or error
  931      * 
  932      * @see Ping()
  933      * 
  934      * @param string  $url  URL to retrieve
  935      * @param int $max_redirects max allowed redirects (optional)
  936      * @param int $timeout process timeout (optional )
  937      * @return array $results Array contains errors as $results['error'], or respond body as $request['body']
  938      * @section 4 Sitemap  
  939      */
  940     static function openURL($url, $max_redirects = 5, $timeout = 15) {
  941         $user_agent = self::$useragent;
  942         ini_set('user_agent', $user_agent);
  943         static $redirects = 0;
  944         $result = array();
  945         $url_parts = PHPCrawlerUtils::splitURL($url);
  946         if (($fp = @fsockopen($url_parts['host'], $url_parts['port'], $errno, $errstr, $timeout)) === false) {
  947             switch ($errno) {
  948                 case -3: $result['error'] = 'Socket creation failed (-3)';
  949                     break;
  950                 case -4: $result['error'] = 'DNS lookup failure (-4)';
  951                     break;
  952                 default: $result['error'] = 'Connection failed (' . $errno . ') ' . $errstr;
  953                     break;
  954             }
  955             return $result;
  956         }
  957         $get = "GET {$url_parts['path']}{$url_parts['file']}{$url_parts['query']} HTTP/1.1\r\n";
  958         $get .= "Host: {$url_parts['host']}\r\n";
  959         $get .= "User-Agent: {$user_agent})\r\n";
  960         $get .= "Connection: close\r\n\r\n";
  961         socket_set_timeout($fp, $timeout);
  962         stream_set_blocking($fp, 3);
  963         fwrite($fp, $get);
  964         //reading headers
  965         while ('' != ($line = trim(fgets($fp)))) {
  966             if (false !== ($pos = strpos($line, ':'))) {
  967                 $header = strtolower(trim(substr($line, 0, $pos)));
  968                 $val = trim(substr($line, $pos + 1));
  969                 //redirection
  970                 if ($header == 'location') {
  971                     if ($redirects >= $max_redirects) {
  972                         $result['error'] = "Max redirects reached: {$max_redirects}";
  973                         return $result;
  974                     }
  975                     $redirects++;
  976                     return self::openURL($val);
  977                 }
  978                 //response code
  979             } elseif (preg_match('#(?:^|\s)(?!200|302|301)([0-9]{3})\s?(.*)$#', $line, $code)) {
  980                 $result['error'] = 'HTTP Error (' . $code[1] . ') ' . $code[2];
  981                 return $result;
  982             }
  983         }
  984         //body
  985         $result['body'] = '';
  986         while (!feof($fp)) {
  987             $result['body'] .= fread($fp, 1024);
  988         }
  989         fclose($fp);
  990         return $result;
  991     }
  992 
  993     /**
  994      * Write XML to disk
  995      * 
  996      * @param string  $file file path
  997      * @param $XML XML code
  998      * @section 4 Sitemap  
  999      */
 1000     public function putContent($file, $XML) {
 1001         //write files with gzip format
 1002         if ($this->use_gzip) {
 1003             $fh = gzopen($file, 'wb');
 1004             gzwrite($fh, $XML);
 1005             gzclose($fh);
 1006         }else
 1007             file_put_contents($file, $XML, LOCK_EX);
 1008     }
 1009 
 1010     /**
 1011      * Read sitemap file from disk
 1012      * 
 1013      * *this method throws exception if sitemap file doesn't exsit
 1014      * @param string|int  $filename 'index' string or number
 1015      * @section 4 Sitemap  
 1016      */
 1017     public function readSitemap($filename) {
 1018         $file = $this->getSitemapPath($filename);
 1019         if (!file_exists($file))
 1020             throw new Exception("Invalid sitemap {$file}");
 1021         //add XML header
 1022         header("Content-Type: text/xml");
 1023         if ($this->use_gzip)
 1024             readgzfile($file);
 1025         else
 1026             readfile($file);
 1027     }
 1028 
 1029     /**
 1030      * Add index sitemap URL to robots.txt file
 1031      * 
 1032      * * this method throws exception if robots file doesn't exist and is writable
 1033      * @param string $robots_file Robots.txt file path
 1034      * @return string $robotstxt robots.txt text 
 1035      * @section 4 Sitemap  
 1036      */
 1037     public function addToRobots($robots_file = '') {
 1038         //if no path given create one
 1039         if ($robots_file == '')
 1040             $robots_file = $_SERVER['DOCUMENT_ROOT'] . '/robots.txt';
 1041         //check if file doesn't exist and is writable
 1042         if (!@touch($robots_file))
 1043             throw new Exception("File {$robots_file}is not writable");
 1044         $robotstxt = file_get_contents($robots_file);
 1045         $addtxt = "\nSitemap : " . $this->getSitemapURL('index') . "\n";
 1046         //check if sitemap URL already added
 1047         if (stripos($robotstxt, $addtxt) !== false)
 1048             return false;
 1049         //add sitemap URL
 1050         if (!file_put_contents($robots_file, $robotstxt . $addtxt, LOCK_EX))
 1051             return false;
 1052         //return robots text
 1053         return $robotstxt;
 1054     }
 1055 
 1056     /**
 1057      * Delete all sitemap dir and files
 1058      * 
 1059      * @return bool true on success, false otherwise
 1060      * @section 4 Sitemap  
 1061      */
 1062     public function removeSitemaps() {
 1063         $this->getSitemapsDir();
 1064         //check if dir exists
 1065         if (!file_exists($this->sitemaps_dir))
 1066             return false;
 1067         //iterate through the sitemap directory to remove files
 1068         $iterator = new RecursiveDirectoryIterator($this->sitemaps_dir, RecursiveIteratorIterator::CHILD_FIRST | FilesystemIterator::SKIP_DOTS);
 1069         foreach ($iterator as $path)
 1070             if ($path->isFile())
 1071                 unlink($path->__toString());
 1072         //remove dir
 1073         return rmdir($this->sitemaps_dir);
 1074     }
 1075 
 1076     /**
 1077      * Write to CSV file
 1078      * 
 1079      * @param string file path
 1080      * @return bool true on success, false otherwise
 1081      * @section 5 CSV  
 1082      */
 1083     public function writeToCSV($file = '') {
 1084         //end if not entries
 1085         if (empty($this->entries))
 1086             return false;
 1087         //if path is set check if it's valid otherwise use default
 1088         if ($file != '')
 1089             $file = $this->csvFile();
 1090         //check if file path is valid
 1091         if (!touch($file))
 1092             return false;
 1093         //write file
 1094         $fp = fopen($file, 'w');
 1095         //CSV header
 1096         fwrite($fp, "URL\tPriority\tLast-Modified\tFrequency\n");
 1097         foreach ($this->entries as $entry)
 1098             fwrite($fp, "{$entry['URL']}\t{$entry['Priority']}\t{$entry['Last-Modified']}\t{$entry['Frequency']}\n");
 1099         fclose($fp);
 1100         return true;
 1101     }
 1102 
 1103     /**
 1104      * Read from CSV file and add to {@link $entries}
 1105      * 
 1106      * @param string file path
 1107      * @return bool true on success, false otherwise
 1108      * @section 5 CSV  
 1109      */
 1110     public function readFromCSV($file = '') {
 1111         //if path is set check if it's valid otherwise use default
 1112         if ($file == '')
 1113             $file = $this->csvFile();
 1114         //open file and make sure is readable
 1115         $fp = fopen($file, 'r');
 1116         if (!$fp)
 1117             return false;
 1118         //get file header
 1119         $header = fgetcsv($fp, 1000, "\t");
 1120         $header_count = count($header);
 1121         //add entries
 1122         while (($data = fgetcsv($fp, 1000, "\t")) !== false) {
 1123             //add headers to entry array
 1124             for ($i = 0, $entry = array(); $i < $header_count; $i++)
 1125                 $entry[$header[$i]] = $data[$i];
 1126             $this->addEntry($entry);
 1127         }
 1128         fclose($fp);
 1129         return true;
 1130     }
 1131 
 1132     /**
 1133      * get CSV file path
 1134      * 
 1135      * @return string CSV file path
 1136      * @section 5 CSV  
 1137      */
 1138     public function csvFile() {
 1139         return "{$this->data_dir}/" . $this->getSitemapDirName() . "/entries.csv";
 1140     }
 1141 
 1142 }