"Fossies" - the Fresh Open Source Software Archive

Member "libs/PHPCrawler/UrlCache/PHPCrawlerSQLiteURLCache.class.php" (8 Jan 2013, 9923 Bytes) of package /linux/www/SitemapCreator.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) PHP source-code syntax highlighting (style: standard) with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here.

    1 <?php
    2 /**
    3  * Class for caching/storing URLs/links in a SQLite-database-file.
    4  *
    5  * @package phpcrawl
    6  * @internal
    7  */
    8 class PHPCrawlerSQLiteURLCache extends PHPCrawlerURLCacheBase
    9 {
   10   /**
   11    * PDO-object for querying SQLite-file.
   12    *
   13    * @var PDO
   14    */
   15   protected $PDO;
   16   
   17   /**
   18    * Prepared statement for inserting URLS into the db-file as PDOStatement-object.
   19    *
   20    * @var PDOStatement
   21    */
   22   protected $PreparedInsertStatement;
   23   
   24   protected $sqlite_db_file;
   25   
   26   protected $db_analyzed = false;
   27   
   28   /**
   29    * Initiates an SQLite-URL-cache.
   30    *
   31    * @param string $file            The SQLite-fiel to use.
   32    * @param bool   $create_tables   Defines whether all necessary tables should be created
   33    */
   34   public function __construct($file, $create_tables = false)
   35   {
   36     $this->sqlite_db_file = $file;
   37     $this->openConnection($create_tables);
   38   }
   39   
   40   public function getUrlCount()
   41   {
   42     $Result = $this->PDO->query("SELECT count(id) AS sum FROM urls WHERE processed = 0;");
   43     $row = $Result->fetch(PDO::FETCH_ASSOC);
   44     return $row["sum"];
   45   }
   46   
   47   /**
   48    * Returns the next URL from the cache that should be crawled.
   49    *
   50    * @return PhpCrawlerURLDescriptor An PhpCrawlerURLDescriptor or NULL if currently no
   51    *                                 URL to process.
   52    */
   53   public function getNextUrl()
   54   {
   55     PHPCrawlerBenchmark::start("fetching_next_url_from_sqlitecache"); 
   56     
   57     $ok = $this->PDO->exec("BEGIN EXCLUSIVE TRANSACTION");
   58     
   59     // Get row with max priority-level
   60     $Result = $this->PDO->query("SELECT max(priority_level) AS max_priority_level FROM urls WHERE in_process = 0 AND processed = 0;");
   61     $row = $Result->fetch(PDO::FETCH_ASSOC);
   62     
   63     if ($row["max_priority_level"] == null) 
   64     {
   65       $Result->closeCursor();
   66       $this->PDO->exec("COMMIT;");
   67       return null;
   68     }
   69     
   70     $Result = $this->PDO->query("SELECT * FROM urls WHERE priority_level = ".$row["max_priority_level"]." and in_process = 0 AND processed = 0;");
   71     $row = $Result->fetch(PDO::FETCH_ASSOC);
   72     $Result->closeCursor();
   73      
   74     // Update row (set in process-flag)
   75     $this->PDO->exec("UPDATE urls SET in_process = 1 WHERE id = ".$row["id"].";");
   76     
   77     $this->PDO->exec("COMMIT;");
   78     
   79     PHPCrawlerBenchmark::stop("fetching_next_url_from_sqlitecache");
   80      
   81     // Return URL
   82     return new PHPCrawlerURLDescriptor($row["url_rebuild"], $row["link_raw"], $row["linkcode"], $row["linktext"], $row["refering_url"]);
   83   }
   84   
   85   /**
   86    * Has no function in this class
   87    */
   88   public function getAllURLs()
   89   {
   90   }
   91   
   92   /**
   93    * Removes all URLs and all priority-rules from the URL-cache.
   94    */
   95   public function clear()
   96   {
   97     $this->PDO->exec("DELETE FROM urls;");
   98     $this->PDO->exec("VACUUM;");
   99   }
  100   
  101   /**
  102    * Adds an URL to the url-cache
  103    *
  104    * @param PHPCrawlerURLDescriptor $UrlDescriptor      
  105    */
  106   public function addURL(PHPCrawlerURLDescriptor $UrlDescriptor)
  107   {
  108     if ($UrlDescriptor == null) return;
  109     
  110     // Hash of the URL
  111     $map_key = md5($UrlDescriptor->url_rebuild);
  112     
  113     // Get priority of URL
  114     $priority_level = $this->getUrlPriority($UrlDescriptor->url_rebuild);
  115     
  116     $this->createPreparedInsertStatement();
  117                                                                     
  118     // Insert URL via prepared statement
  119     $this->PreparedInsertStatement->execute(array(":priority_level" => $priority_level,
  120                                                   ":distinct_hash" => $map_key,
  121                                                   ":link_raw" => $UrlDescriptor->link_raw,
  122                                                   ":linkcode" => $UrlDescriptor->linkcode,
  123                                                   ":linktext" => $UrlDescriptor->linktext,
  124                                                   ":refering_url" => $UrlDescriptor->refering_url,
  125                                                   ":url_rebuild" => $UrlDescriptor->url_rebuild,
  126                                                   ":is_redirect_url" => $UrlDescriptor->is_redirect_url));
  127   }
  128   
  129   /**
  130    * Adds an bunch of URLs to the url-cache
  131    *
  132    * @param array $urls  A numeric array containing the URLs as PHPCrawlerURLDescriptor-objects
  133    */
  134   public function addURLs($urls)
  135   {
  136     PHPCrawlerBenchmark::start("adding_urls_to_sqlitecache"); 
  137     
  138     $this->PDO->exec("BEGIN EXCLUSIVE TRANSACTION;");
  139     
  140     $cnt = count($urls);
  141     for ($x=0; $x<$cnt; $x++)
  142     {
  143       if ($urls[$x] != null)
  144       {
  145         $this->addURL($urls[$x]);
  146       }
  147     }
  148     
  149     $this->PDO->exec("COMMIT;");
  150     $this->PreparedInsertStatement->closeCursor();
  151         
  152     if ($this->db_analyzed == false)
  153     {
  154       $this->PDO->exec("ANALYZE;");
  155       $this->db_analyzed = true;
  156     }
  157     
  158     PHPCrawlerBenchmark::stop("adding_urls_to_sqlitecache"); 
  159   }
  160   
  161   /**
  162    * Marks the given URL in the cache as "followed"
  163    *
  164    * @param PHPCrawlerURLDescriptor $UrlDescriptor
  165    */
  166   public function markUrlAsFollowed(PHPCrawlerURLDescriptor $UrlDescriptor)
  167   {
  168     PHPCrawlerBenchmark::start("marking_url_as_followes");
  169     $hash = md5($UrlDescriptor->url_rebuild);
  170     $this->PDO->exec("UPDATE urls SET processed = 1, in_process = 0 WHERE distinct_hash = '".$hash."';");
  171     PHPCrawlerBenchmark::stop("marking_url_as_followes"); 
  172   }
  173   
  174   /**
  175    * Checks whether there are URLs left in the cache that should be processed or not.
  176    *
  177    * @return bool
  178    */
  179   public function containsURLs()
  180   {
  181     PHPCrawlerBenchmark::start("checking_for_urls_in_cache");
  182     
  183     $Result = $this->PDO->query("SELECT id FROM urls WHERE processed = 0 OR in_process = 1 LIMIT 1;");
  184     
  185     $has_columns = $Result->fetchColumn();
  186     
  187     $Result->closeCursor();
  188     
  189     PHPCrawlerBenchmark::stop("checking_for_urls_in_cache");
  190     
  191     if ($has_columns != false)
  192     {
  193       return true;
  194     }
  195     else return false;
  196   }
  197   
  198   /**
  199    * Cleans/purges the URL-cache from inconsistent entries.
  200    */
  201   public function purgeCache()
  202   {
  203     // Set "in_process" to 0 for all URLs
  204     $this->PDO->exec("UPDATE urls SET in_process = 0;");
  205   }
  206   
  207   /**
  208    * Creates the sqlite-db-file and opens connection to it.
  209    *
  210    * @param bool $create_tables Defines whether all necessary tables should be created
  211    */
  212   protected function openConnection($create_tables = false)
  213   {
  214     PHPCrawlerBenchmark::start("connecting_to_sqlite_db");
  215     
  216     // Open sqlite-file
  217     try
  218     {
  219       $this->PDO = new PDO("sqlite:".$this->sqlite_db_file);
  220     }
  221     catch (Exception $e)
  222     {
  223       throw new Exception("Error creating SQLite-cache-file, ".$e->getMessage().", try installing sqlite3-extension for PHP.");
  224     }
  225     
  226     $this->PDO->exec("PRAGMA journal_mode = OFF");
  227     
  228     $this->PDO->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_WARNING);
  229     $this->PDO->setAttribute(PDO::ATTR_TIMEOUT, 100);
  230     
  231     if ($create_tables == true)
  232     {
  233       // Create url-table (if not exists)
  234       $this->PDO->exec("CREATE TABLE IF NOT EXISTS urls (id integer PRIMARY KEY AUTOINCREMENT,
  235                                                          in_process bool DEFAULT 0,
  236                                                          processed bool DEFAULT 0,
  237                                                          priority_level integer,
  238                                                          distinct_hash TEXT UNIQUE,
  239                                                          link_raw TEXT,
  240                                                          linkcode TEXT,
  241                                                          linktext TEXT,
  242                                                          refering_url TEXT,
  243                                                          url_rebuild TEXT,
  244                                                          is_redirect_url bool);");
  245       
  246       // Create indexes (seems that indexes make the whole thingy slower)
  247       $this->PDO->exec("CREATE INDEX IF NOT EXISTS priority_level ON urls (priority_level);");
  248       $this->PDO->exec("CREATE INDEX IF NOT EXISTS distinct_hash ON urls (distinct_hash);");
  249       $this->PDO->exec("CREATE INDEX IF NOT EXISTS in_process ON urls (in_process);");
  250       $this->PDO->exec("CREATE INDEX IF NOT EXISTS processed ON urls (processed);");
  251       
  252       $this->PDO->exec("ANALYZE;");
  253     }
  254     
  255     PHPCrawlerBenchmark::stop("connecting_to_sqlite_db");
  256   }
  257   
  258   /**
  259    * Creates the prepared statement for insterting URLs into database (if not done yet)
  260    */
  261   protected function createPreparedInsertStatement()
  262   {
  263     if ($this->PreparedInsertStatement == null)
  264     {
  265       // Prepared statement for URL-inserts                                      
  266       $this->PreparedInsertStatement = $this->PDO->prepare("INSERT OR IGNORE INTO urls (priority_level, distinct_hash, link_raw, linkcode, linktext, refering_url, url_rebuild, is_redirect_url)
  267                                                             VALUES(:priority_level,
  268                                                                    :distinct_hash,
  269                                                                    :link_raw,
  270                                                                    :linkcode,
  271                                                                    :linktext,
  272                                                                    :refering_url,
  273                                                                    :url_rebuild,
  274                                                                    :is_redirect_url);");
  275     }
  276   }
  277   
  278   /**
  279    * Cleans up the cache after is it not needed anymore.
  280    */
  281   public function cleanup()
  282   {
  283     unlink($this->sqlite_db_file);
  284   }
  285 }
  286 ?>