"Fossies" - the Fresh Open Source Software Archive

Member "sitemap_gen.py" (19 Jul 2007, 67145 Bytes) of package /linux/www/old/sitemap_gen_1.5.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Python source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively you can here view or download the uninterpreted source code file. For more information about "sitemap_gen.py" see the Fossies "Dox" file reference documentation.

    1 #!/usr/bin/env python
    2 #
    3 # Copyright (c) 2004, 2005 Google Inc.
    4 # All rights reserved.
    5 #
    6 # Redistribution and use in source and binary forms, with or without
    7 # modification, are permitted provided that the following conditions
    8 # are met:
    9 #
   10 # * Redistributions of source code must retain the above copyright
   11 #   notice, this list of conditions and the following disclaimer.
   12 #
   13 # * Redistributions in binary form must reproduce the above copyright
   14 #   notice, this list of conditions and the following disclaimer in
   15 #   the documentation and/or other materials provided with the
   16 #   distribution.
   17 #
   18 # * Neither the name of Google nor the names of its contributors may
   19 #   be used to endorse or promote products derived from this software
   20 #   without specific prior written permission.
   21 #
   22 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   23 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   24 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   25 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   26 # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   27 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   28 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   29 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   30 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   31 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   32 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   33 # POSSIBILITY OF SUCH DAMAGE.
   34 #
   35 #
   36 # The sitemap_gen.py script is written in Python 2.2 and released to
   37 # the open source community for continuous improvements under the BSD
   38 # 2.0 new license, which can be found at:
   39 #
   40 #   http://www.opensource.org/licenses/bsd-license.php
   41 #
   42 
   43 __usage__ = \
   44 """A simple script to automatically produce sitemaps for a webserver,
   45 in the Google Sitemap Protocol (GSP).
   46 
   47 Usage: python sitemap_gen.py --config=config.xml [--help] [--testing]
   48             --config=config.xml, specifies config file location
   49             --help, displays usage message
   50             --testing, specified when user is experimenting
   51 """
   52 
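       # A minimal configuration sketch for the --config flag.  The element and
       # attribute names below follow the input handlers later in this file
       # (URL, DIRECTORY, FILTER); the <site> attributes and the concrete values
       # are illustrative assumptions only -- consult the example configuration
       # shipped with the package for the authoritative format:
       #
       #   <?xml version="1.0" encoding="UTF-8"?>
       #   <site base_url="http://www.example.com/"
       #         store_into="/var/www/docroot/sitemap.xml.gz"
       #         verbose="1">
       #     <url href="http://www.example.com/stats?q=name" />
       #     <directory path="/var/www/docroot" url="http://www.example.com/"
       #                default_file="index.html" />
       #     <filter action="drop" type="wildcard" pattern="*~" />
       #   </site>
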
   53 # Please be careful that all syntax used in this file can be parsed on
   54 # Python 1.5 -- this version check is not evaluated until after the
   55 # entire file has been parsed.
   56 import sys
   57 if sys.hexversion < 0x02020000:
   58   print 'This script requires Python 2.2 or later.'
   59   print 'Currently run with version: %s' % sys.version
   60   sys.exit(1)
   61 
   62 import fnmatch
   63 import glob
   64 import gzip
   65 import md5
   66 import os
   67 import re
   68 import stat
   69 import time
   70 import types
   71 import urllib
   72 import urlparse
   73 import xml.sax
   74 
    75 # True and False were introduced in Python 2.2.2
   76 try:
   77   testTrue=True
   78   del testTrue
   79 except NameError:
   80   True=1
   81   False=0
   82 
   83 # Text encodings
   84 ENC_ASCII = 'ASCII'
   85 ENC_UTF8  = 'UTF-8'
   86 ENC_IDNA  = 'IDNA'
    87 ENC_ASCII_LIST = ['ASCII', 'US-ASCII', 'US', 'IBM367', 'CP367', 'ISO646-US',
   88                   'ISO_646.IRV:1991', 'ISO-IR-6', 'ANSI_X3.4-1968',
   89                   'ANSI_X3.4-1986', 'CPASCII' ]
   90 ENC_DEFAULT_LIST = ['ISO-8859-1', 'ISO-8859-2', 'ISO-8859-5']
   91 
   92 # Available Sitemap types
   93 SITEMAP_TYPES = ['web', 'mobile', 'news']
   94 
   95 # General Sitemap tags
   96 GENERAL_SITEMAP_TAGS = ['loc', 'changefreq', 'priority', 'lastmod']
   97 
   98 # News specific tags
   99 NEWS_SPECIFIC_TAGS = ['keywords', 'publication_date', 'stock_tickers']
  100 
  101 # News Sitemap tags
  102 NEWS_SITEMAP_TAGS = GENERAL_SITEMAP_TAGS + NEWS_SPECIFIC_TAGS
  103 
  104 # Maximum number of urls in each sitemap, before next Sitemap is created
  105 MAXURLS_PER_SITEMAP = 50000
  106 
  107 # Suffix on a Sitemap index file
  108 SITEINDEX_SUFFIX = '_index.xml'
  109 
  110 # Regular expressions tried for extracting URLs from access logs.
  111 ACCESSLOG_CLF_PATTERN = re.compile(
  112   r'.+\s+"([^\s]+)\s+([^\s]+)\s+HTTP/\d+\.\d+"\s+200\s+.*'
  113   )
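
       # For example, a Common Logfile Format line such as
       #   127.0.0.1 - - [19/Jul/2007:00:00:00 -0700] "GET /index.html HTTP/1.1" 200 2326
       # matches this pattern, yielding group(1) == 'GET' and group(2) == '/index.html'.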
  114 
  115 # Match patterns for lastmod attributes
  116 DATE_PATTERNS = map(re.compile, [
  117   r'^\d\d\d\d$',
  118   r'^\d\d\d\d-\d\d$',
  119   r'^\d\d\d\d-\d\d-\d\d$',
  120   r'^\d\d\d\d-\d\d-\d\dT\d\d:\d\dZ$',
  121   r'^\d\d\d\d-\d\d-\d\dT\d\d:\d\d[+-]\d\d:\d\d$',
  122   r'^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?Z$',
  123   r'^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?[+-]\d\d:\d\d$',
  124   ])
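
       # For example, lastmod values accepted by these patterns include '2004',
       # '2004-11', '2004-11-05', '2004-11-05T14:23Z', '2004-11-05T14:23+01:00',
       # '2004-11-05T14:23:07Z' and '2004-11-05T14:23:07.123-08:00'.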
  125 
  126 # Match patterns for changefreq attributes
  127 CHANGEFREQ_PATTERNS = [
  128   'always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never'
  129   ]
  130 
  131 # XML formats
  132 GENERAL_SITEINDEX_HEADER   = \
  133   '<?xml version="1.0" encoding="UTF-8"?>\n' \
  134   '<sitemapindex\n' \
  135   '  xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"\n' \
  136   '  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n' \
  137   '  xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9\n' \
  138   '                      http://www.sitemaps.org/schemas/sitemap/0.9/' \
  139   'siteindex.xsd">\n'
  140 
  141 NEWS_SITEINDEX_HEADER   = \
  142   '<?xml version="1.0" encoding="UTF-8"?>\n' \
  143   '<sitemapindex\n' \
  144   '  xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"\n' \
  145   '  xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"\n' \
  146   '  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n' \
  147   '  xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9\n' \
  148   '                      http://www.sitemaps.org/schemas/sitemap/0.9/' \
  149   'siteindex.xsd">\n'
  150 
  151 SITEINDEX_FOOTER   = '</sitemapindex>\n'
  152 SITEINDEX_ENTRY    = \
  153   ' <sitemap>\n' \
  154   '  <loc>%(loc)s</loc>\n' \
  155   '  <lastmod>%(lastmod)s</lastmod>\n' \
  156   ' </sitemap>\n'
  157 GENERAL_SITEMAP_HEADER     = \
  158   '<?xml version="1.0" encoding="UTF-8"?>\n' \
  159   '<urlset\n' \
  160   '  xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"\n' \
  161   '  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n' \
  162   '  xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9\n' \
  163   '                      http://www.sitemaps.org/schemas/sitemap/0.9/' \
  164   'sitemap.xsd">\n'
  165 
  166 NEWS_SITEMAP_HEADER = \
  167   '<?xml version="1.0" encoding="UTF-8"?>\n' \
  168   '<urlset\n' \
  169   '  xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"\n' \
  170   '  xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"\n' \
  171   '  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"\n' \
  172   '  xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9\n' \
  173   '                      http://www.sitemaps.org/schemas/sitemap/0.9/' \
  174   'sitemap.xsd">\n'
  175 
  176 SITEMAP_FOOTER     = '</urlset>\n'
  177 SITEURL_XML_PREFIX = ' <url>\n'
  178 SITEURL_XML_SUFFIX = ' </url>\n'
  179 
  180 NEWS_TAG_XML_PREFIX = '  <news:news>\n'
  181 NEWS_TAG_XML_SUFFIX = '  </news:news>\n'
  182 
  183 # Search engines to notify with the updated sitemaps
  184 #
  185 # This list is very non-obvious in what's going on.  Here's the gist:
  186 # Each item in the list is a 6-tuple of items.  The first 5 are "almost"
  187 # the same as the input arguments to urlparse.urlunsplit():
   188 #   0 - scheme
  189 #   1 - netloc
  190 #   2 - path
  191 #   3 - query    <-- EXCEPTION: specify a query map rather than a string
  192 #   4 - fragment
  193 # Additionally, add item 5:
  194 #   5 - query attribute that should be set to the new Sitemap URL
  195 # Clear as mud, I know.
  196 NOTIFICATION_SITES = [
  197   ('http', 'www.google.com', 'webmasters/sitemaps/ping', {}, '', 'sitemap'),
  198   ]
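
       # For example (a sketch only -- the notification code later in this file
       # fills in the query attribute and rebuilds the URL): with a sitemap at
       # http://www.example.com/sitemap.xml.gz, the single entry above would be
       # pinged roughly as:
       #   http://www.google.com/webmasters/sitemaps/ping?sitemap=
       #     http%3A%2F%2Fwww.example.com%2Fsitemap.xml.gz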
  199 
  200 
  201 class Error(Exception):
  202   """
  203   Base exception class.  In this module we tend not to use our own exception
  204   types for very much, but they come in very handy on XML parsing with SAX.
  205   """
  206   pass
  207 #end class Error
  208 
  209 
  210 class SchemaError(Error):
  211   """Failure to process an XML file according to the schema we know."""
  212   pass
   213 #end class SchemaError
  214 
  215 
  216 class Encoder:
  217   """
  218   Manages wide-character/narrow-character conversions for just about all
  219   text that flows into or out of the script.
  220 
  221   You should always use this class for string coercion, as opposed to
  222   letting Python handle coercions automatically.  Reason: Python
  223   usually assumes ASCII (7-bit) as a default narrow character encoding,
  224   which is not the kind of data we generally deal with.
  225 
  226   General high-level methodologies used in sitemap_gen:
  227 
  228   [PATHS]
  229   File system paths may be wide or narrow, depending on platform.
  230   This works fine, just be aware of it and be very careful to not
  231   mix them.  That is, if you have to pass several file path arguments
  232   into a library call, make sure they are all narrow or all wide.
  233   This class has MaybeNarrowPath() which should be called on every
  234   file system path you deal with.
  235 
  236   [URLS]
  237   URL locations are stored in Narrow form, already escaped.  This has the
  238   benefit of keeping escaping and encoding as close as possible to the format
  239   we read them in.  The downside is we may end up with URLs that have
  240   intermingled encodings -- the root path may be encoded in one way
  241   while the filename is encoded in another.  This is obviously wrong, but
  242   it should hopefully be an issue hit by very few users.  The workaround
  243   from the user level (assuming they notice) is to specify a default_encoding
  244   parameter in their config file.
  245 
  246   [OTHER]
  247   Other text, such as attributes of the URL class, configuration options,
  248   etc, are generally stored in Unicode for simplicity.
  249   """
  250 
  251   def __init__(self):
  252     self._user      = None                  # User-specified default encoding
  253     self._learned   = []                    # Learned default encodings
  254     self._widefiles = False                 # File system can be wide
  255 
  256     # Can the file system be Unicode?
  257     try:
  258       self._widefiles = os.path.supports_unicode_filenames
  259     except AttributeError:
  260       try:
  261         self._widefiles = sys.getwindowsversion() == os.VER_PLATFORM_WIN32_NT
  262       except AttributeError:
  263         pass
  264 
  265     # Try to guess a working default
  266     try:
  267       encoding = sys.getfilesystemencoding()
  268       if encoding and not (encoding.upper() in ENC_ASCII_LIST):
  269         self._learned = [ encoding ]
  270     except AttributeError:
  271       pass
  272 
  273     if not self._learned:
  274       encoding = sys.getdefaultencoding()
  275       if encoding and not (encoding.upper() in ENC_ASCII_LIST):
  276         self._learned = [ encoding ]
  277 
  278     # If we had no guesses, start with some European defaults
  279     if not self._learned:
  280       self._learned = ENC_DEFAULT_LIST
  281   #end def __init__
  282 
  283   def SetUserEncoding(self, encoding):
  284     self._user = encoding
  285   #end def SetUserEncoding
  286 
  287   def NarrowText(self, text, encoding):
  288     """ Narrow a piece of arbitrary text """
  289     if type(text) != types.UnicodeType:
  290       return text
  291 
  292     # Try the passed in preference
  293     if encoding:
  294       try:
  295         result = text.encode(encoding)
  296         if not encoding in self._learned:
  297           self._learned.append(encoding)
  298         return result
  299       except UnicodeError:
  300         pass
  301       except LookupError:
  302         output.Warn('Unknown encoding: %s' % encoding)
  303 
  304     # Try the user preference
  305     if self._user:
  306       try:
  307         return text.encode(self._user)
  308       except UnicodeError:
  309         pass
  310       except LookupError:
  311         temp = self._user
  312         self._user = None
  313         output.Warn('Unknown default_encoding: %s' % temp)
  314 
  315     # Look through learned defaults, knock any failing ones out of the list
  316     while self._learned:
  317       try:
  318         return text.encode(self._learned[0])
  319       except:
  320         del self._learned[0]
  321 
  322     # When all other defaults are exhausted, use UTF-8
  323     try:
  324       return text.encode(ENC_UTF8)
  325     except UnicodeError:
  326       pass
  327 
  328     # Something is seriously wrong if we get to here
  329     return text.encode(ENC_ASCII, 'ignore')
  330   #end def NarrowText
  331   
  332   def MaybeNarrowPath(self, text):
  333     """ Paths may be allowed to stay wide """
  334     if self._widefiles:
  335       return text
  336     return self.NarrowText(text, None)
  337   #end def MaybeNarrowPath
  338 
  339   def WidenText(self, text, encoding):
  340     """ Widen a piece of arbitrary text """
  341     if type(text) != types.StringType:
  342       return text
  343 
  344     # Try the passed in preference
  345     if encoding:
  346       try:
  347         result = unicode(text, encoding)
  348         if not encoding in self._learned:
  349           self._learned.append(encoding)
  350         return result
  351       except UnicodeError:
  352         pass
  353       except LookupError:
  354         output.Warn('Unknown encoding: %s' % encoding)
  355 
  356     # Try the user preference
  357     if self._user:
  358       try:
  359         return unicode(text, self._user)
  360       except UnicodeError:
  361         pass
  362       except LookupError:
  363         temp = self._user
  364         self._user = None
  365         output.Warn('Unknown default_encoding: %s' % temp)
  366 
  367     # Look through learned defaults, knock any failing ones out of the list
  368     while self._learned:
  369       try:
  370         return unicode(text, self._learned[0])
  371       except:
  372         del self._learned[0]
  373 
  374     # When all other defaults are exhausted, use UTF-8
  375     try:
  376       return unicode(text, ENC_UTF8)
  377     except UnicodeError:
  378       pass
  379 
  380     # Getting here means it wasn't UTF-8 and we had no working default.
  381     # We really don't have anything "right" we can do anymore.
  382     output.Warn('Unrecognized encoding in text: %s' % text)
  383     if not self._user:
  384       output.Warn('You may need to set a default_encoding in your '
  385                   'configuration file.')
  386     return text.decode(ENC_ASCII, 'ignore')
  387   #end def WidenText
  388 #end class Encoder
  389 encoder = Encoder()
  390 
  391 
  392 class Output:
  393   """
  394   Exposes logging functionality, and tracks how many errors
  395   we have thus output.
  396 
   397   Logging levels should be used as follows:
  398     Fatal     -- extremely sparingly
  399     Error     -- config errors, entire blocks of user 'intention' lost
  400     Warn      -- individual URLs lost
  401     Log(,0)   -- Un-suppressable text that's not an error
  402     Log(,1)   -- touched files, major actions
  403     Log(,2)   -- parsing notes, filtered or duplicated URLs
  404     Log(,3)   -- each accepted URL
  405   """
  406 
  407   def __init__(self):
  408     self.num_errors    = 0                   # Count of errors
  409     self.num_warns     = 0                   # Count of warnings
  410 
  411     self._errors_shown = {}                  # Shown errors
  412     self._warns_shown  = {}                  # Shown warnings
  413     self._verbose      = 0                   # Level of verbosity
  414   #end def __init__
  415 
  416   def Log(self, text, level):
  417     """ Output a blurb of diagnostic text, if the verbose level allows it """
  418     if text:
  419       text = encoder.NarrowText(text, None)
  420       if self._verbose >= level:
  421         print text
  422   #end def Log
  423 
  424   def Warn(self, text):
  425     """ Output and count a warning.  Suppress duplicate warnings. """
  426     if text:
  427       text = encoder.NarrowText(text, None)
  428       hash = md5.new(text).digest()
  429       if not self._warns_shown.has_key(hash):
  430         self._warns_shown[hash] = 1
  431         print '[WARNING] ' + text
  432       else:
  433         self.Log('(suppressed) [WARNING] ' + text, 3)
  434       self.num_warns = self.num_warns + 1
  435   #end def Warn
  436 
  437   def Error(self, text):
  438     """ Output and count an error.  Suppress duplicate errors. """
  439     if text:
  440       text = encoder.NarrowText(text, None)
  441       hash = md5.new(text).digest()
  442       if not self._errors_shown.has_key(hash):
  443         self._errors_shown[hash] = 1
  444         print '[ERROR] ' + text
  445       else:
  446         self.Log('(suppressed) [ERROR] ' + text, 3)
  447       self.num_errors = self.num_errors + 1
  448   #end def Error
  449 
  450   def Fatal(self, text):
  451     """ Output an error and terminate the program. """
  452     if text:
  453       text = encoder.NarrowText(text, None)
  454       print '[FATAL] ' + text
  455     else:
  456       print 'Fatal error.'
  457     sys.exit(1)
  458   #end def Fatal
  459 
  460   def SetVerbose(self, level):
  461     """ Sets the verbose level. """
  462     try:
  463       if type(level) != types.IntType:
  464         level = int(level)
  465       if (level >= 0) and (level <= 3):
  466         self._verbose = level
  467         return
  468     except ValueError:
  469       pass
  470     self.Error('Verbose level (%s) must be between 0 and 3 inclusive.' % level)
  471   #end def SetVerbose
  472 #end class Output
  473 output = Output()
  474 
  475 
  476 class URL(object):
  477   """ URL is a smart structure grouping together the properties we
  478   care about for a single web reference. """
  479   __slots__ = 'loc', 'lastmod', 'changefreq', 'priority'
  480 
  481   def __init__(self):
  482     self.loc        = None                  # URL -- in Narrow characters
  483     self.lastmod    = None                  # ISO8601 timestamp of last modify
  484     self.changefreq = None                  # Text term for update frequency
  485     self.priority   = None                  # Float between 0 and 1 (inc)
  486   #end def __init__
  487 
  488   def __cmp__(self, other):
  489     if self.loc < other.loc:
  490       return -1
  491     if self.loc > other.loc:
  492       return 1
  493     return 0
  494   #end def __cmp__
  495 
  496   def TrySetAttribute(self, attribute, value):
  497     """ Attempt to set the attribute to the value, with a pretty try
  498     block around it.  """
  499     if attribute == 'loc':
  500       self.loc = self.Canonicalize(value)
  501     else:
  502       try:
  503         setattr(self, attribute, value)
  504       except AttributeError:
  505         output.Warn('Unknown URL attribute: %s' % attribute)
  506   #end def TrySetAttribute
  507 
  508   def IsAbsolute(loc):
  509     """ Decide if the URL is absolute or not """
  510     if not loc:
  511       return False
  512     narrow = encoder.NarrowText(loc, None)
  513     (scheme, netloc, path, query, frag) = urlparse.urlsplit(narrow)
  514     if (not scheme) or (not netloc):
  515       return False
  516     return True
  517   #end def IsAbsolute
  518   IsAbsolute = staticmethod(IsAbsolute)
  519 
  520   def Canonicalize(loc):
  521     """ Do encoding and canonicalization on a URL string """
  522     if not loc:
  523       return loc
  524     
  525     # Let the encoder try to narrow it
  526     narrow = encoder.NarrowText(loc, None)
  527 
  528     # Escape components individually
  529     (scheme, netloc, path, query, frag) = urlparse.urlsplit(narrow)
  530     unr    = '-._~'
  531     sub    = '!$&\'()*+,;='
  532     netloc = urllib.quote(netloc, unr + sub + '%:@/[]')
  533     path   = urllib.quote(path,   unr + sub + '%:@/')
  534     query  = urllib.quote(query,  unr + sub + '%:@/?')
  535     frag   = urllib.quote(frag,   unr + sub + '%:@/?')
  536 
  537     # Try built-in IDNA encoding on the netloc
  538     try:
  539       (ignore, widenetloc, ignore, ignore, ignore) = urlparse.urlsplit(loc)
  540       for c in widenetloc:
  541         if c >= unichr(128):
  542           netloc = widenetloc.encode(ENC_IDNA)
  543           netloc = urllib.quote(netloc, unr + sub + '%:@/[]')
  544           break
  545     except UnicodeError:
  546       # urlsplit must have failed, based on implementation differences in the
  547       # library.  There is not much we can do here, except ignore it.
  548       pass
  549     except LookupError:
  550       output.Warn('An International Domain Name (IDN) is being used, but this '
  551                   'version of Python does not have support for IDNA encoding. '
  552                   ' (IDNA support was introduced in Python 2.3)  The encoding '
  553                   'we have used instead is wrong and will probably not yield '
  554                   'valid URLs.')
  555     bad_netloc = False
  556     if '%' in netloc:
  557       bad_netloc = True
  558 
  559     # Put it all back together
  560     narrow = urlparse.urlunsplit((scheme, netloc, path, query, frag))
  561 
  562     # I let '%' through.  Fix any that aren't pre-existing escapes.
  563     HEXDIG = '0123456789abcdefABCDEF'
  564     list   = narrow.split('%')
  565     narrow = list[0]
  566     del list[0]
  567     for item in list:
  568       if (len(item) >= 2) and (item[0] in HEXDIG) and (item[1] in HEXDIG):
  569         narrow = narrow + '%' + item
  570       else:
  571         narrow = narrow + '%25' + item
  572 
  573     # Issue a warning if this is a bad URL
  574     if bad_netloc:
  575       output.Warn('Invalid characters in the host or domain portion of a URL: '
  576                   + narrow)
  577 
  578     return narrow
  579   #end def Canonicalize
  580   Canonicalize = staticmethod(Canonicalize)
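
         # For example (illustrative): Canonicalize('http://www.example.com/a b')
         # returns 'http://www.example.com/a%20b', while a pre-existing escape such
         # as '%20' in the input is left intact rather than double-escaped to '%2520'.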
  581 
  582   def VerifyDate(self, date, metatag):
  583     """Verify the date format is valid"""
  584     match = False
  585     if date:
  586       date = date.upper()
  587       for pattern in DATE_PATTERNS:
  588         match = pattern.match(date)
  589         if match:
  590           return True
  591       if not match:
  592         output.Warn('The value for %s does not appear to be in ISO8601 '
  593             'format on URL: %s' % (metatag, self.loc))
  594         return False
  595   #end of VerifyDate
  596 
  597   def Validate(self, base_url, allow_fragment):
  598     """ Verify the data in this URL is well-formed, and override if not. """
  599     assert type(base_url) == types.StringType
  600     
  601     # Test (and normalize) the ref
  602     if not self.loc:
  603       output.Warn('Empty URL')
  604       return False
  605     if allow_fragment:
  606       self.loc = urlparse.urljoin(base_url, self.loc)
  607     if not self.loc.startswith(base_url):
  608       output.Warn('Discarded URL for not starting with the base_url: %s' %
  609                   self.loc)
  610       self.loc = None
  611       return False
  612 
  613     # Test the lastmod
  614     if self.lastmod:
  615       if not self.VerifyDate(self.lastmod, "lastmod"):
  616         self.lastmod = None
  617 
  618     # Test the changefreq
  619     if self.changefreq:
  620       match = False
  621       self.changefreq = self.changefreq.lower()
  622       for pattern in CHANGEFREQ_PATTERNS:
  623         if self.changefreq == pattern:
  624           match = True
  625           break
  626       if not match:
  627         output.Warn('Changefreq "%s" is not a valid change frequency on URL '
  628                     ': %s' % (self.changefreq, self.loc))
  629         self.changefreq = None
  630 
  631     # Test the priority
  632     if self.priority:
  633       priority = -1.0
  634       try:
  635         priority = float(self.priority)
  636       except ValueError:
  637         pass
  638       if (priority < 0.0) or (priority > 1.0):
  639         output.Warn('Priority "%s" is not a number between 0 and 1 inclusive '
  640                     'on URL: %s' % (self.priority, self.loc))
  641         self.priority = None
  642 
  643     return True
  644   #end def Validate
  645 
  646   def MakeHash(self):
  647     """ Provides a uniform way of hashing URLs """
  648     if not self.loc:
  649       return None
  650     if self.loc.endswith('/'):
  651       return md5.new(self.loc[:-1]).digest()
  652     return md5.new(self.loc).digest()
  653   #end def MakeHash
  654 
  655   def Log(self, prefix='URL', level=3):
  656     """ Dump the contents, empty or not, to the log. """
  657     out = prefix + ':'
  658     
  659     for attribute in self.__slots__:
  660       value = getattr(self, attribute)
  661       if not value:
  662         value = ''
  663       out = out + ('  %s=[%s]' % (attribute, value))
  664 
  665     output.Log('%s' % encoder.NarrowText(out, None), level)
  666   #end def Log
  667 
  668   def WriteXML(self, file):
  669     """ Dump non-empty contents to the output file, in XML format. """
  670     if not self.loc:
  671       return
  672     out = SITEURL_XML_PREFIX
  673 
  674     for attribute in self.__slots__:
  675       value = getattr(self, attribute)
  676       if value:
  677         if type(value) == types.UnicodeType:
  678           value = encoder.NarrowText(value, None)
  679         elif type(value) != types.StringType:
  680           value = str(value)
  681         value = xml.sax.saxutils.escape(value)
  682         out = out + ('  <%s>%s</%s>\n' % (attribute, value, attribute))
  683     
  684     out = out + SITEURL_XML_SUFFIX
  685     file.write(out)
  686   #end def WriteXML
  687 #end class URL
  688 
  689 class NewsURL(URL):
  690   """ NewsURL is a subclass of URL with News-Sitemap specific properties. """
  691   __slots__ = 'loc', 'lastmod', 'changefreq', 'priority', 'publication_date', \
  692           'keywords', 'stock_tickers'
  693 
  694   def __init__(self):
  695     URL.__init__(self)
  696     self.publication_date   = None  # ISO8601 timestamp of publication date
  697     self.keywords       = None  # Text keywords
  698     self.stock_tickers      = None  # Text stock
  699   #end def __init__
  700 
  701   def Validate(self, base_url, allow_fragment):
  702     """ Verify the data in this News URL is well-formed, and override if not. """
  703     assert type(base_url) == types.StringType
  704 
  705     if not URL.Validate(self, base_url, allow_fragment):
  706       return False
  707  
  708     if not URL.VerifyDate(self, self.publication_date, "publication_date"):
  709       self.publication_date = None
  710  
  711     return True
  712   #end def Validate
  713 
  714   def WriteXML(self, file):
  715     """ Dump non-empty contents to the output file, in XML format. """
  716     if not self.loc:
  717       return
  718     out = SITEURL_XML_PREFIX
  719  
  720     # printed_news_tag indicates if news-specific metatags are present
  721     printed_news_tag = False
  722     for attribute in self.__slots__:
  723       value = getattr(self, attribute)
  724       if value:
  725         if type(value) == types.UnicodeType:
  726           value = encoder.NarrowText(value, None)
  727         elif type(value) != types.StringType:
  728           value = str(value)
   729         value = xml.sax.saxutils.escape(value)
   730         if attribute in NEWS_SPECIFIC_TAGS:
   731           if not printed_news_tag:
   732             printed_news_tag = True
   733             out = out + NEWS_TAG_XML_PREFIX
   734           out = out + ('    <news:%s>%s</news:%s>\n' % (attribute, value, attribute))
   735         else:
   736           out = out + ('  <%s>%s</%s>\n' % (attribute, value, attribute))
  737  
  738     if printed_news_tag:
  739       out = out + NEWS_TAG_XML_SUFFIX
  740     out = out + SITEURL_XML_SUFFIX
  741     file.write(out)
  742   #end def WriteXML
  743 #end class NewsURL
  744 
  745 
  746 class Filter:
  747   """
  748   A filter on the stream of URLs we find.  A filter is, in essence,
  749   a wildcard applied to the stream.  You can think of this as an
  750   operator that returns a tri-state when given a URL:
  751 
  752     True  -- this URL is to be included in the sitemap
  753     None  -- this URL is undecided
  754     False -- this URL is to be dropped from the sitemap
  755   """
  756 
  757   def __init__(self, attributes):
  758     self._wildcard  = None                  # Pattern for wildcard match
  759     self._regexp    = None                  # Pattern for regexp match
  760     self._pass      = False                 # "Drop" filter vs. "Pass" filter
  761 
  762     if not ValidateAttributes('FILTER', attributes,
  763                               ('pattern', 'type', 'action')):
  764       return
  765 
  766     # Check error count on the way in
  767     num_errors = output.num_errors
  768 
  769     # Fetch the attributes
  770     pattern = attributes.get('pattern')
  771     type    = attributes.get('type', 'wildcard')
  772     action  = attributes.get('action', 'drop')
  773     if type:
  774       type = type.lower()
  775     if action:
  776       action = action.lower()
  777 
  778     # Verify the attributes
  779     if not pattern:
  780       output.Error('On a filter you must specify a "pattern" to match')
  781     elif (not type) or ((type != 'wildcard') and (type != 'regexp')):
  782       output.Error('On a filter you must specify either \'type="wildcard"\' '
  783                    'or \'type="regexp"\'')
  784     elif (action != 'pass') and (action != 'drop'):
  785       output.Error('If you specify a filter action, it must be either '
  786                    '\'action="pass"\' or \'action="drop"\'')
  787 
  788     # Set the rule
  789     if action == 'drop':
  790       self._pass = False
  791     elif action == 'pass':
  792       self._pass = True
  793 
  794     if type == 'wildcard':
  795       self._wildcard = pattern
  796     elif type == 'regexp':
  797       try:
  798         self._regexp = re.compile(pattern)
  799       except re.error:
  800         output.Error('Bad regular expression: %s' %  pattern)
  801 
  802     # Log the final results iff we didn't add any errors
  803     if num_errors == output.num_errors:
  804       output.Log('Filter: %s any URL that matches %s "%s"' %
  805                  (action, type, pattern), 2)
  806   #end def __init__
  807 
  808   def Apply(self, url):
  809     """ Process the URL, as above. """
  810     if (not url) or (not url.loc):
  811       return None
  812     
  813     if self._wildcard:
  814       if fnmatch.fnmatchcase(url.loc, self._wildcard):
  815         return self._pass
  816       return None
  817 
  818     if self._regexp:
  819       if self._regexp.search(url.loc):
  820         return self._pass
  821       return None
  822 
  823     assert False # unreachable
  824   #end def Apply
  825 #end class Filter
  826 
  827 
  828 class InputURL:
  829   """
  830   Each Input class knows how to yield a set of URLs from a data source.
  831 
  832   This one handles a single URL, manually specified in the config file.
  833   """
  834 
  835   def __init__(self, attributes):
  836     self._url = None                        # The lonely URL
  837 
  838     if not ValidateAttributes('URL', attributes,
  839                                 ('href', 'lastmod', 'changefreq', 'priority')):
  840       return
  841     
  842     url = URL()
  843     for attr in attributes.keys():
  844       if attr == 'href':
  845         url.TrySetAttribute('loc', attributes[attr])
  846       else:
  847         url.TrySetAttribute(attr, attributes[attr])
  848 
  849     if not url.loc:
  850       output.Error('Url entries must have an href attribute.')
  851       return
  852     
  853     self._url = url
  854     output.Log('Input: From URL "%s"' % self._url.loc, 2)
  855   #end def __init__
  856 
  857   def ProduceURLs(self, consumer):
  858     """ Produces URLs from our data source, hands them in to the consumer. """
  859     if self._url:
  860       consumer(self._url, True)
  861   #end def ProduceURLs
  862 #end class InputURL
  863 
  864 
  865 class InputURLList:
  866   """
  867   Each Input class knows how to yield a set of URLs from a data source.
  868 
  869   This one handles a text file with a list of URLs
  870   """
  871 
  872   def __init__(self, attributes):
  873     self._path      = None                  # The file path
  874     self._encoding  = None                  # Encoding of that file
  875 
  876     if not ValidateAttributes('URLLIST', attributes, ('path', 'encoding')):
  877       return
  878     
  879     self._path      = attributes.get('path')
  880     self._encoding  = attributes.get('encoding', ENC_UTF8)
  881     if self._path:
  882       self._path    = encoder.MaybeNarrowPath(self._path)
  883       if os.path.isfile(self._path):
  884         output.Log('Input: From URLLIST "%s"' % self._path, 2)
  885       else:
  886         output.Error('Can not locate file: %s' % self._path)
  887         self._path = None
  888     else:
  889       output.Error('Urllist entries must have a "path" attribute.')
  890   #end def __init__
  891 
  892   def ProduceURLs(self, consumer):
  893     """ Produces URLs from our data source, hands them in to the consumer. """
  894 
  895     # Open the file
  896     (frame, file) = OpenFileForRead(self._path, 'URLLIST')
  897     if not file:
  898       return
  899 
  900     # Iterate lines
  901     linenum = 0
  902     for line in file.readlines():
  903       linenum = linenum + 1
  904 
  905       # Strip comments and empty lines
  906       if self._encoding:
  907         line = encoder.WidenText(line, self._encoding)
  908       line = line.strip()
  909       if (not line) or line[0] == '#':
  910         continue
  911       
  912       # Split the line on space
  913       url = URL()
  914       cols = line.split(' ')
  915       for i in range(0,len(cols)):
  916         cols[i] = cols[i].strip()
  917       url.TrySetAttribute('loc', cols[0])
  918 
  919       # Extract attributes from the other columns
  920       for i in range(1,len(cols)):
  921         if cols[i]:
  922           try:
  923             (attr_name, attr_val) = cols[i].split('=', 1)
  924             url.TrySetAttribute(attr_name, attr_val)
  925           except ValueError:
  926             output.Warn('Line %d: Unable to parse attribute: %s' %
  927                         (linenum, cols[i]))
  928 
  929       # Pass it on
  930       consumer(url, False)
  931 
  932     file.close()
  933     if frame:
  934       frame.close()
  935   #end def ProduceURLs
  936 #end class InputURLList
  937 
  938 
  939 class InputNewsURLList:
  940   """
  941   Each Input class knows how to yield a set of URLs from a data source.
  942 
  943   This one handles a text file with a list of News URLs and their metadata
  944   """
  945 
  946   def __init__(self, attributes):
  947     self._path      = None                  # The file path
  948     self._encoding  = None                  # Encoding of that file
  949     self._tag_order = []                    # Order of URL metadata
  950  
  951     if not ValidateAttributes('URLLIST', attributes, ('path', 'encoding', \
  952                       'tag_order')):
  953       return
  954  
  955     self._path      = attributes.get('path')
  956     self._encoding  = attributes.get('encoding', ENC_UTF8)
  957     self._tag_order = attributes.get('tag_order')
  958  
  959     if self._path:
  960       self._path    = encoder.MaybeNarrowPath(self._path)
  961       if os.path.isfile(self._path):
  962         output.Log('Input: From URLLIST "%s"' % self._path, 2)
  963       else:
  964         output.Error('Can not locate file: %s' % self._path)
  965         self._path = None
  966     else:
  967       output.Error('Urllist entries must have a "path" attribute.')
  968 
  969     # parse tag_order into an array
  970     # tag_order_ascii created for more readable logging
  971     tag_order_ascii = []
  972     if self._tag_order:
  973       self._tag_order = self._tag_order.split(",")
  974       for i in range(0, len(self._tag_order)):
  975         element = self._tag_order[i].strip().lower()
   976         self._tag_order[i] = element
   977         tag_order_ascii.append(element.encode('ascii'))
  978       output.Log('Input: From URLLIST tag order is "%s"' % tag_order_ascii, 0)
  979     else:
  980       output.Error('News Urllist configuration file must contain tag_order '
  981            'to define Sitemap metatags.')
  982 
  983     # verify all tag_order inputs are valid
  984     tag_order_dict = {}
  985     for tag in self._tag_order:
  986       tag_order_dict[tag] = ""
  987     if not ValidateAttributes('URLLIST', tag_order_dict, \
  988             NEWS_SITEMAP_TAGS): 
  989       return
  990 
  991     # loc tag must be present
  992     loc_tag = False
  993     for tag in self._tag_order:
  994       if tag == 'loc':
  995         loc_tag = True
  996         break
  997     if not loc_tag:
  998       output.Error('News Urllist tag_order in configuration file '
  999            'does not contain "loc" value: %s' % tag_order_ascii)
 1000   #end def __init__
 1001 
 1002   def ProduceURLs(self, consumer):
 1003     """ Produces URLs from our data source, hands them in to the consumer. """
 1004 
 1005     # Open the file
 1006     (frame, file) = OpenFileForRead(self._path, 'URLLIST')
 1007     if not file:
 1008       return
 1009 
 1010     # Iterate lines
 1011     linenum = 0
 1012     for line in file.readlines():
 1013       linenum = linenum + 1
 1014 
 1015       # Strip comments and empty lines
 1016       if self._encoding:
 1017         line = encoder.WidenText(line, self._encoding)
 1018       line = line.strip()
 1019       if (not line) or line[0] == '#':
 1020         continue
 1021       
 1022       # Split the line on tabs
 1023       url = NewsURL()
 1024       cols = line.split('\t')
 1025       for i in range(0,len(cols)):
 1026         cols[i] = cols[i].strip()
 1027 
 1028       for i in range(0,len(cols)):
 1029         if cols[i]:
 1030           attr_value = cols[i]
  1031           if i < len(self._tag_order):
 1032             attr_name = self._tag_order[i]
 1033             try:
 1034               url.TrySetAttribute(attr_name, attr_value)
 1035             except ValueError:
 1036               output.Warn('Line %d: Unable to parse attribute: %s' %
 1037                         (linenum, cols[i]))
 1038 
 1039       # Pass it on
 1040       consumer(url, False)
 1041 
 1042     file.close()
 1043     if frame:
 1044       frame.close()
 1045   #end def ProduceURLs
 1046 #end class InputNewsURLList
 1047 
 1048 
 1049 class InputDirectory:
 1050   """
 1051   Each Input class knows how to yield a set of URLs from a data source.
 1052 
 1053   This one handles a directory that acts as base for walking the filesystem.
 1054   """
 1055 
 1056   def __init__(self, attributes, base_url):
 1057     self._path         = None               # The directory
 1058     self._url          = None               # The URL equivalent
 1059     self._default_file = None
 1060     self._remove_empty_directories = False 
 1061 
 1062     if not ValidateAttributes('DIRECTORY', attributes, ('path', 'url',
 1063                               'default_file', 'remove_empty_directories')):
 1064       return
 1065 
 1066     # Prep the path -- it MUST end in a sep
 1067     path = attributes.get('path')
 1068     if not path:
 1069       output.Error('Directory entries must have both "path" and "url" '
 1070                   'attributes')
 1071       return
 1072     path = encoder.MaybeNarrowPath(path)
 1073     if not path.endswith(os.sep):
 1074       path = path + os.sep
 1075     if not os.path.isdir(path):
 1076       output.Error('Can not locate directory: %s' % path)
 1077       return
 1078     
 1079     # Prep the URL -- it MUST end in a sep
 1080     url = attributes.get('url')
 1081     if not url:
 1082       output.Error('Directory entries must have both "path" and "url" '
 1083                   'attributes')
 1084       return
 1085     url = URL.Canonicalize(url)
 1086     if not url.endswith('/'):
 1087       url = url + '/'
 1088     if not url.startswith(base_url):
 1089       url = urlparse.urljoin(base_url, url)
 1090       if not url.startswith(base_url):
 1091         output.Error('The directory URL "%s" is not relative to the '
 1092                     'base_url: %s' % (url, base_url))
 1093         return
 1094 
 1095     # Prep the default file -- it MUST be just a filename
 1096     file = attributes.get('default_file')
 1097     if file:
 1098       file = encoder.MaybeNarrowPath(file)
 1099       if os.sep in file:
 1100         output.Error('The default_file "%s" can not include path information.'
 1101                      % file)
 1102         file = None
 1103 
 1104     # Prep the remove_empty_directories -- default is false
 1105     remove_empty_directories = attributes.get('remove_empty_directories')
 1106     if remove_empty_directories:
 1107       if (remove_empty_directories == '1') or \
 1108          (remove_empty_directories.lower() == 'true'):
 1109         remove_empty_directories = True
 1110       elif (remove_empty_directories == '0') or \
 1111        (remove_empty_directories.lower() == 'false'):
 1112         remove_empty_directories = False
 1113       # otherwise the user set a non-default value
 1114       else:
 1115         output.Error('Configuration file remove_empty_directories '
 1116              'value is not recognized.  Value must be true or false.')
 1117         return
 1118     else:
 1119       remove_empty_directories = False
 1120 
 1121     self._path         = path
 1122     self._url          = url
 1123     self._default_file = file
 1124     self._remove_empty_directories = remove_empty_directories
 1125 
 1126     if file:
 1127       output.Log('Input: From DIRECTORY "%s" (%s) with default file "%s"'
 1128                  % (path, url, file), 2)
 1129     else:
 1130       output.Log('Input: From DIRECTORY "%s" (%s) with no default file'
 1131                  % (path, url), 2)
 1132   #end def __init__
 1133   
 1134      
 1135   def ProduceURLs(self, consumer):
 1136     """ Produces URLs from our data source, hands them in to the consumer. """
 1137     if not self._path:
 1138       return
 1139 
 1140     root_path = self._path
 1141     root_URL  = self._url
 1142     root_file = self._default_file
 1143     remove_empty_directories = self._remove_empty_directories
 1144 
 1145     def HasReadPermissions(path):
 1146       """ Verifies a given path has read permissions. """  
 1147       stat_info = os.stat(path)
 1148       mode = stat_info[stat.ST_MODE]
 1149       if mode & stat.S_IREAD:
 1150         return True
 1151       else:
 1152         return None
 1153 
 1154     def PerFile(dirpath, name):
 1155       """
 1156       Called once per file.
 1157       Note that 'name' will occasionally be None -- for a directory itself
 1158       """
 1159       # Pull a timestamp
 1160       url           = URL()
 1161       isdir         = False
 1162       try:
 1163         if name:
 1164           path      = os.path.join(dirpath, name)
 1165         else:
 1166           path      = dirpath
 1167         isdir       = os.path.isdir(path)
 1168         time        = None
 1169         if isdir and root_file:
 1170           file      = os.path.join(path, root_file)
 1171           try:
 1172             time    = os.stat(file)[stat.ST_MTIME];
 1173           except OSError:
 1174             pass
 1175         if not time:
 1176           time      = os.stat(path)[stat.ST_MTIME];
 1177         url.lastmod = TimestampISO8601(time)
 1178       except OSError:
 1179         pass
 1180       except ValueError:
 1181         pass
 1182 
 1183       # Build a URL
 1184       middle        = dirpath[len(root_path):]
 1185       if os.sep != '/':
 1186         middle = middle.replace(os.sep, '/')
 1187       if middle:
 1188         middle      = middle + '/'
 1189       if name:
 1190         middle      = middle + name
 1191         if isdir:
 1192           middle    = middle + '/'
 1193       url.TrySetAttribute('loc', root_URL + encoder.WidenText(middle, None))
 1194 
 1195       # Suppress default files.  (All the way down here so we can log it.)
 1196       if name and (root_file == name):
 1197         url.Log(prefix='IGNORED (default file)', level=2)
 1198         return
 1199   
 1200       # Suppress directories when remove_empty_directories="true"
 1201       try:
 1202         if isdir:
  1203           if HasReadPermissions(path):
  1204             if remove_empty_directories and \
  1205                len(os.listdir(path)) == 0:
 1206               output.Log('IGNORED empty directory %s' % str(path), level=1)
 1207               return
 1208           elif path == self._path:
 1209             output.Error('IGNORED configuration file directory input %s due '
 1210              'to file permissions' % self._path)
 1211           else:
 1212             output.Log('IGNORED files within directory %s due to file '
 1213                'permissions' % str(path), level=0)
 1214       except OSError:
 1215         pass
 1216       except ValueError:
 1217         pass
 1218  
 1219       consumer(url, False)
 1220     #end def PerFile
 1221 
 1222     def PerDirectory(ignore, dirpath, namelist):
 1223       """
 1224       Called once per directory with a list of all the contained files/dirs.
 1225       """
 1226       ignore = ignore  # Avoid warnings of an unused parameter
 1227 
 1228       if not dirpath.startswith(root_path):
 1229         output.Warn('Unable to decide what the root path is for directory: '
 1230                     '%s' % dirpath)
 1231         return
 1232 
 1233       for name in namelist:
 1234         PerFile(dirpath, name)
 1235     #end def PerDirectory
 1236 
 1237     output.Log('Walking DIRECTORY "%s"' % self._path, 1)
 1238     PerFile(self._path, None)
 1239     os.path.walk(self._path, PerDirectory, None)
 1240   #end def ProduceURLs
 1241 #end class InputDirectory
 1242 
 1243 
 1244 class InputAccessLog:
 1245   """
 1246   Each Input class knows how to yield a set of URLs from a data source.
 1247 
 1248   This one handles access logs.  It's non-trivial in that we want to
 1249   auto-detect log files in the Common Logfile Format (as used by Apache,
 1250   for instance) and the Extended Log File Format (as used by IIS, for
 1251   instance).
 1252   """
 1253 
 1254   def __init__(self, attributes):
 1255     self._path         = None               # The file path
 1256     self._encoding     = None               # Encoding of that file
 1257     self._is_elf       = False              # Extended Log File Format?
 1258     self._is_clf       = False              # Common Logfile Format?
 1259     self._elf_status   = -1                 # ELF field: '200'
 1260     self._elf_method   = -1                 # ELF field: 'HEAD'
 1261     self._elf_uri      = -1                 # ELF field: '/foo?bar=1'
 1262     self._elf_urifrag1 = -1                 # ELF field: '/foo'
 1263     self._elf_urifrag2 = -1                 # ELF field: 'bar=1'
 1264 
 1265     if not ValidateAttributes('ACCESSLOG', attributes, ('path', 'encoding')):
 1266       return
 1267 
 1268     self._path      = attributes.get('path')
 1269     self._encoding  = attributes.get('encoding', ENC_UTF8)
 1270     if self._path:
 1271       self._path    = encoder.MaybeNarrowPath(self._path)
 1272       if os.path.isfile(self._path):
 1273         output.Log('Input: From ACCESSLOG "%s"' % self._path, 2)
 1274       else:
 1275         output.Error('Can not locate file: %s' % self._path)
 1276         self._path = None
 1277     else:
 1278       output.Error('Accesslog entries must have a "path" attribute.')
 1279   #end def __init__
 1280 
 1281   def RecognizeELFLine(self, line):
 1282     """ Recognize the Fields directive that heads an ELF file """
 1283     if not line.startswith('#Fields:'):
 1284       return False
 1285     fields = line.split(' ')
 1286     del fields[0]
 1287     for i in range(0, len(fields)):
 1288       field = fields[i].strip()
 1289       if field == 'sc-status':
 1290         self._elf_status   = i
 1291       elif field == 'cs-method':
 1292         self._elf_method   = i
 1293       elif field == 'cs-uri':
 1294         self._elf_uri      = i
 1295       elif field == 'cs-uri-stem':
 1296         self._elf_urifrag1 = i
 1297       elif field == 'cs-uri-query':
 1298         self._elf_urifrag2 = i
 1299     output.Log('Recognized an Extended Log File Format file.', 2)
 1300     return True
 1301   #end def RecognizeELFLine
 1302 
 1303   def GetELFLine(self, line):
 1304     """ Fetch the requested URL from an ELF line """
 1305     fields = line.split(' ')
 1306     count  = len(fields)
 1307 
 1308     # Verify status was Ok
 1309     if self._elf_status >= 0:
 1310       if self._elf_status >= count:
 1311         return None
 1312       if not fields[self._elf_status].strip() == '200':
 1313         return None
 1314 
 1315     # Verify method was HEAD or GET
 1316     if self._elf_method >= 0:
 1317       if self._elf_method >= count:
 1318         return None
 1319       if not fields[self._elf_method].strip() in ('HEAD', 'GET'):
 1320         return None
 1321 
 1322     # Pull the full URL if we can
 1323     if self._elf_uri >= 0:
 1324       if self._elf_uri >= count:
 1325         return None
 1326       url = fields[self._elf_uri].strip()
 1327       if url != '-':
 1328         return url
 1329 
 1330     # Put together a fragmentary URL
 1331     if self._elf_urifrag1 >= 0:
 1332       if self._elf_urifrag1 >= count or self._elf_urifrag2 >= count:
 1333         return None
 1334       urlfrag1 = fields[self._elf_urifrag1].strip()
 1335       urlfrag2 = None
 1336       if self._elf_urifrag2 >= 0:
 1337         urlfrag2 = fields[self._elf_urifrag2]
 1338       if urlfrag1 and (urlfrag1 != '-'):
 1339         if urlfrag2 and (urlfrag2 != '-'):
 1340           urlfrag1 = urlfrag1 + '?' + urlfrag2
 1341         return urlfrag1
 1342 
 1343     return None
 1344   #end def GetELFLine
 1345 
 1346   def RecognizeCLFLine(self, line):
 1347     """ Try to tokenize a logfile line according to CLF pattern and see if
 1348     it works. """
 1349     match = ACCESSLOG_CLF_PATTERN.match(line)
 1350     recognize = match and (match.group(1) in ('HEAD', 'GET'))
 1351     if recognize:
 1352       output.Log('Recognized a Common Logfile Format file.', 2)
 1353     return recognize
 1354   #end def RecognizeCLFLine
 1355 
 1356   def GetCLFLine(self, line):
 1357     """ Fetch the requested URL from a CLF line """
 1358     match = ACCESSLOG_CLF_PATTERN.match(line)
 1359     if match:
 1360       request = match.group(1)
 1361       if request in ('HEAD', 'GET'):
 1362         return match.group(2)
 1363     return None
 1364   #end def GetCLFLine
 1365 
 1366   def ProduceURLs(self, consumer):
 1367     """ Produces URLs from our data source, hands them in to the consumer. """
 1368 
 1369     # Open the file
 1370     (frame, file) = OpenFileForRead(self._path, 'ACCESSLOG')
 1371     if not file:
 1372       return
 1373 
 1374     # Iterate lines
 1375     for line in file.readlines():
 1376       if self._encoding:
 1377         line = encoder.WidenText(line, self._encoding)
 1378       line = line.strip()
 1379 
 1380       # If we don't know the format yet, try them both
 1381       if (not self._is_clf) and (not self._is_elf):
 1382         self._is_elf = self.RecognizeELFLine(line)
 1383         self._is_clf = self.RecognizeCLFLine(line)
 1384 
 1385       # Digest the line
 1386       match = None
 1387       if self._is_elf:
 1388         match = self.GetELFLine(line)
 1389       elif self._is_clf:
 1390         match = self.GetCLFLine(line)
 1391       if not match:
 1392         continue
 1393 
 1394       # Pass it on
 1395       url = URL()
 1396       url.TrySetAttribute('loc', match)
 1397       consumer(url, True)
 1398 
 1399     file.close()
 1400     if frame:
 1401       frame.close()
 1402   #end def ProduceURLs
 1403 #end class InputAccessLog
 1404 
 1405 
 1406 class FilePathGenerator:
 1407   """
 1408   This class generates filenames in a series, upon request.
 1409   You can request any iteration number at any time, you don't
 1410   have to go in order.
 1411 
 1412   Example of iterations for '/path/foo.xml.gz':
 1413     0           --> /path/foo.xml.gz
 1414     1           --> /path/foo1.xml.gz
 1415     2           --> /path/foo2.xml.gz
 1416     _index.xml  --> /path/foo_index.xml
 1417   """
 1418 
 1419   def __init__(self):
 1420     self.is_gzip     = False                 # Is this a  GZIP file?
 1421 
 1422     self._path       = None                  # '/path/'
 1423     self._prefix     = None                  # 'foo'
 1424     self._suffix     = None                  # '.xml.gz'
 1425   #end def __init__
 1426 
 1427   def Preload(self, path):
 1428     """ Splits up a path into forms ready for recombination. """
 1429     path = encoder.MaybeNarrowPath(path)
 1430 
 1431     # Get down to a base name
 1432     path = os.path.normpath(path)
 1433     base = os.path.basename(path).lower()
 1434     if not base:
 1435       output.Error('Couldn\'t parse the file path: %s' % path)
 1436       return False
 1437     lenbase = len(base)
 1438 
 1439     # Recognize extension
 1440     lensuffix = 0
 1441     compare_suffix = ['.xml', '.xml.gz', '.gz']
 1442     for suffix in compare_suffix:
 1443       if base.endswith(suffix):
 1444         lensuffix = len(suffix)
 1445         break
 1446     if not lensuffix:
 1447       output.Error('The path "%s" doesn\'t end in a supported file '
 1448                    'extension.' % path)
 1449       return False
 1450     self.is_gzip = suffix.endswith('.gz')
 1451 
 1452     # Split the original path
 1453     lenpath = len(path)
 1454     self._path   = path[:lenpath-lenbase]
 1455     self._prefix = path[lenpath-lenbase:lenpath-lensuffix]
 1456     self._suffix = path[lenpath-lensuffix:]
 1457 
 1458     return True
 1459   #end def Preload
 1460 
 1461   def GeneratePath(self, instance):
 1462     """ Generates the iterations, as described above. """
 1463     prefix = self._path + self._prefix
 1464     if type(instance) == types.IntType:
 1465       if instance:
 1466         return '%s%d%s' % (prefix, instance, self._suffix)
 1467       return prefix + self._suffix
 1468     return prefix + instance
 1469   #end def GeneratePath
 1470 
 1471   def GenerateURL(self, instance, root_url):
 1472     """ Generates iterations, but as a URL instead of a path. """
 1473     prefix = root_url + self._prefix
 1474     retval = None
 1475     if type(instance) == types.IntType:
 1476       if instance:
 1477         retval = '%s%d%s' % (prefix, instance, self._suffix)
 1478       else:
 1479         retval = prefix + self._suffix
 1480     else:
 1481       retval = prefix + instance
 1482     return URL.Canonicalize(retval)
 1483   #end def GenerateURL
 1484 
 1485   def GenerateWildURL(self, root_url):
 1486     """ Generates a wildcard that should match all our iterations """
 1487     prefix = URL.Canonicalize(root_url + self._prefix)
 1488     temp   = URL.Canonicalize(prefix + self._suffix)
 1489     suffix = temp[len(prefix):]
 1490     return prefix + '*' + suffix
 1491   #end def GenerateWildURL
 1492 #end class FilePathGenerator
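
# A minimal usage sketch of FilePathGenerator, added for illustration and
# never called by this script.  The path '/path/foo.xml.gz' is hypothetical;
# the expected results mirror the iteration table in the class docstring.
def _example_filepathgenerator_usage():
  gen = FilePathGenerator()
  if gen.Preload('/path/foo.xml.gz'):
    # Integer 0 keeps the base name, later integers insert a number,
    # and a string instance is appended verbatim to the prefix.
    print gen.GeneratePath(0)             # /path/foo.xml.gz
    print gen.GeneratePath(2)             # /path/foo2.xml.gz
    print gen.GeneratePath('_index.xml')  # /path/foo_index.xml
#end def _example_filepathgenerator_usage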
 1493 
 1494 
 1495 class PerURLStatistics:
 1496   """ Keep track of some simple per-URL statistics, like file extension. """
 1497 
 1498   def __init__(self):
 1499     self._extensions  = {}                  # Count of extension instances
 1500   #end def __init__
 1501 
 1502   def Consume(self, url):
 1503     """ Log some stats for the URL.  At the moment, that means extension. """
 1504     if url and url.loc:
 1505       (scheme, netloc, path, query, frag) = urlparse.urlsplit(url.loc)
 1506       if not path:
 1507         return
 1508 
 1509       # Recognize directories
 1510       if path.endswith('/'):
 1511         if self._extensions.has_key('/'):
 1512           self._extensions['/'] = self._extensions['/'] + 1
 1513         else:
 1514           self._extensions['/'] = 1
 1515         return
 1516 
 1517       # Strip to a filename
 1518       i = path.rfind('/')
 1519       if i >= 0:
 1520         assert i < len(path)
 1521         path = path[i:]
 1522 
 1523       # Find extension
 1524       i = path.rfind('.')
 1525       if i > 0:
 1526         assert i < len(path)
 1527         ext = path[i:].lower()
 1528         if self._extensions.has_key(ext):
 1529           self._extensions[ext] = self._extensions[ext] + 1
 1530         else:
 1531           self._extensions[ext] = 1
 1532       else:
 1533         if self._extensions.has_key('(no extension)'):
 1534           self._extensions['(no extension)'] = self._extensions[
 1535             '(no extension)'] + 1
 1536         else:
 1537           self._extensions['(no extension)'] = 1
 1538   #end def Consume
 1539 
 1540   def Log(self):
 1541     """ Dump out stats to the output. """
 1542     if len(self._extensions):
 1543       output.Log('Count of file extensions on URLs:', 1)
 1544       set = self._extensions.keys()
 1545       set.sort()
 1546       for ext in set:
 1547         output.Log(' %7d  %s' % (self._extensions[ext], ext), 1)
 1548   #end def Log
 1549 #end class PerURLStatistics
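
# A small sketch of PerURLStatistics, added for illustration and never called
# by this script.  The example.com locations are made up; Consume() buckets
# them by extension and Log() prints one count per bucket ('/', '.html',
# '.jpg' and '(no extension)').
def _example_perurlstatistics_usage():
  stats = PerURLStatistics()
  for loc in ('http://www.example.com/',
              'http://www.example.com/docs/page.html',
              'http://www.example.com/images/photo.JPG',
              'http://www.example.com/README'):
    url = URL()
    url.TrySetAttribute('loc', loc)
    stats.Consume(url)
  stats.Log()
#end def _example_perurlstatistics_usage
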
 1550 class Sitemap(xml.sax.handler.ContentHandler):
 1551   """
 1552   This is the big workhorse class that processes your inputs and spits
 1553   out sitemap files.  It is built as a SAX handler for setup purposes.
 1554   That is, it processes an XML stream to bring itself up.
 1555   """
 1556 
 1557   def __init__(self, suppress_notify):
 1558     xml.sax.handler.ContentHandler.__init__(self)
 1559     self._filters      = []                  # Filter objects
 1560     self._inputs       = []                  # Input objects
 1561     self._urls         = {}                  # Maps URLs to count of dups
 1562     self._set          = []                  # Current set of URLs
 1563     self._filegen      = None                # Path generator for output files
 1564     self._wildurl1     = None                # Sitemap URLs to filter out
 1565     self._wildurl2     = None                # Sitemap URLs to filter out
 1566     self._sitemaps     = 0                   # Number of output files
 1567     # We init _dup_max to 2 so the default priority is 0.5 instead of 1.0
 1568     self._dup_max      = 2                   # Max number of duplicate URLs
 1569     self._stat         = PerURLStatistics()  # Some simple stats
 1570     self._in_site      = False               # SAX: are we in a Site node?
 1571     self._in_site_ever = False               # SAX: were we ever in a Site?
 1572 
 1573     self._default_enc  = None                # Best encoding to try on URLs
 1574     self._base_url     = None                # Prefix to all valid URLs
 1575     self._store_into   = None                # Output filepath
 1576     self._sitemap_type = None                # Sitemap type (web, mobile or news)
 1577     self._suppress     = suppress_notify     # Suppress notify of servers
 1578   #end def __init__
 1579 
 1580   def ValidateBasicConfig(self):
 1581     """ Verifies (and cleans up) the basic user-configurable options. """
 1582     all_good = True
 1583 
 1584     if self._default_enc:
 1585       encoder.SetUserEncoding(self._default_enc)
 1586 
 1587     # Canonicalize the base_url
 1588     if all_good and not self._base_url:
 1589       output.Error('A site needs a "base_url" attribute.')
 1590       all_good = False
 1591     if all_good and not URL.IsAbsolute(self._base_url):
 1592         output.Error('The "base_url" must be absolute, not relative: %s' %
 1593                      self._base_url)
 1594         all_good = False
 1595     if all_good:
 1596       self._base_url = URL.Canonicalize(self._base_url)
 1597       if not self._base_url.endswith('/'):
 1598         self._base_url = self._base_url + '/'
 1599       output.Log('BaseURL is set to: %s' % self._base_url, 2)
 1600 
 1601     # Load store_into into a generator
 1602     if all_good:
 1603       if self._store_into:
 1604         self._filegen = FilePathGenerator()
 1605         if not self._filegen.Preload(self._store_into):
 1606           all_good = False
 1607       else:
 1608         output.Error('A site needs a "store_into" attribute.')
 1609         all_good = False
 1610 
 1611     # Ask the generator for patterns on what its output will look like
 1612     if all_good:
 1613       self._wildurl1 = self._filegen.GenerateWildURL(self._base_url)
 1614       self._wildurl2 = self._filegen.GenerateURL(SITEINDEX_SUFFIX,
 1615                                                  self._base_url)
 1616 
 1617     # Unify various forms of False
 1618     if all_good:
 1619       if self._suppress:
 1620         if (type(self._suppress) == types.StringType) or (type(self._suppress)
 1621                                  == types.UnicodeType):
 1622           if (self._suppress == '0') or (self._suppress.lower() == 'false'):
 1623             self._suppress = False
 1624 
 1625     # Clean up the sitemap_type
 1626     if all_good:
 1627       match = False
 1628       # If sitemap_type is not specified, default to web sitemap
 1629       if not self._sitemap_type:
 1630         self._sitemap_type = 'web'
 1631       else:
 1632         self._sitemap_type = self._sitemap_type.lower()
 1633         for pattern in SITEMAP_TYPES:
 1634           if self._sitemap_type == pattern:
 1635             match = True
 1636             break
 1637         if not match:
 1638           output.Error('The "sitemap_type" value must be "web", "mobile" '
 1639                'or "news": %s' % self._sitemap_type)
 1640           all_good = False
 1641       output.Log('The Sitemap type is %s Sitemap.' % \
 1642                 self._sitemap_type.upper(), 0)
 1643 
 1644     # Done
 1645     if not all_good:
 1646       output.Log('See "example_config.xml" for more information.', 0)
 1647     return all_good
 1648   #end def ValidateBasicConfig
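
  # Illustration only (values are made up; see example_config.xml in the
  # distribution for the real template): a "site" element that passes
  # ValidateBasicConfig.  base_url must be absolute and store_into must end
  # in .xml, .xml.gz or .gz.
  #
  #   <site base_url="http://www.example.com/"
  #         store_into="/var/www/html/sitemap.xml.gz"
  #         sitemap_type="web"
  #         verbose="1">
  #     ...
  #   </site>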
 1649 
 1650   def Generate(self):
 1651     """ Run over all the Inputs and ask them to Produce """
 1652     # Run the inputs
 1653     for input in self._inputs:
 1654       input.ProduceURLs(self.ConsumeURL)
 1655 
 1656     # Do last flushes
 1657     if len(self._set):
 1658       self.FlushSet()
 1659     if not self._sitemaps:
 1660       output.Warn('No URLs were recorded, writing an empty sitemap.')
 1661       self.FlushSet()
 1662 
 1663     # Write an index as needed
 1664     if self._sitemaps > 1:
 1665       self.WriteIndex()
 1666 
 1667     # Notify
 1668     self.NotifySearch()
 1669 
 1670     # Dump stats
 1671     self._stat.Log()
 1672   #end def Generate
 1673 
 1674   def ConsumeURL(self, url, allow_fragment):
 1675     """
 1676     All per-URL processing comes together here, regardless of Input.
 1677     Here we run filters, remove duplicates, spill to disk as needed, etc.
 1678     
 1679     """
 1680     if not url:
 1681       return
 1682 
 1683     # Validate
 1684     if not url.Validate(self._base_url, allow_fragment):
 1685       return
 1686 
 1687     # Run filters
 1688     accept = None
 1689     for filter in self._filters:
 1690       accept = filter.Apply(url)
 1691       if accept != None:
 1692         break
 1693     if not (accept or (accept == None)):
 1694       url.Log(prefix='FILTERED', level=2)
 1695       return
 1696 
 1697     # Ignore our own output URLs
 1698     if fnmatch.fnmatchcase(url.loc, self._wildurl1) or fnmatch.fnmatchcase(
 1699       url.loc, self._wildurl2):
 1700       url.Log(prefix='IGNORED (output file)', level=2)
 1701       return
 1702 
 1703     # Note the sighting
 1704     hash = url.MakeHash()
 1705     if self._urls.has_key(hash):
 1706       dup = self._urls[hash]
 1707       if dup > 0:
 1708         dup = dup + 1
 1709         self._urls[hash] = dup
 1710         if self._dup_max < dup:
 1711           self._dup_max = dup
 1712       url.Log(prefix='DUPLICATE')
 1713       return
 1714 
 1715     # Acceptance -- add to set
 1716     self._urls[hash] = 1
 1717     self._set.append(url)
 1718     self._stat.Consume(url)
 1719     url.Log()
 1720 
 1721     # Flush the set if needed
 1722     if len(self._set) >= MAXURLS_PER_SITEMAP:
 1723       self.FlushSet()
 1724   #end def ConsumeURL
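
  # Note on the filter loop above (added commentary, not original): filters
  # run in the order they appear in the configuration; the first filter that
  # returns a non-None verdict decides (a false value drops the URL, a true
  # value keeps it), and a URL that no filter claims (all return None) is
  # kept.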
 1725 
 1726   def FlushSet(self):
 1727     """
 1728     Flush the current set of URLs to the output.  This is a little
 1729     slow because we like to sort them all and normalize the priorities
 1730     before dumping.
 1731     """
 1732     
 1733     # Determine what Sitemap header to use (News or General)
 1734     if self._sitemap_type == 'news':
 1735       sitemap_header = NEWS_SITEMAP_HEADER
 1736     else:
 1737       sitemap_header = GENERAL_SITEMAP_HEADER
 1738       
 1739     # Sort and normalize
 1740     output.Log('Sorting and normalizing collected URLs.', 1)
 1741     self._set.sort()
 1742     for url in self._set:
 1743       hash = url.MakeHash()
 1744       dup = self._urls[hash]
 1745       if dup > 0:
 1746         self._urls[hash] = -1
 1747         if not url.priority:
 1748           url.priority = '%.4f' % (float(dup) / float(self._dup_max))
 1749 
 1750     # Get the filename we're going to write to
 1751     filename = self._filegen.GeneratePath(self._sitemaps)
 1752     if not filename:
 1753       output.Fatal('Unexpected: Couldn\'t generate output filename.')
 1754     self._sitemaps = self._sitemaps + 1
 1755     output.Log('Writing Sitemap file "%s" with %d URLs' %
 1756         (filename, len(self._set)), 1)
 1757 
 1758     # Write to it
 1759     frame = None
 1760     file  = None
 1761 
 1762     try:
 1763       if self._filegen.is_gzip:
 1764         basename = os.path.basename(filename)
 1765         frame = open(filename, 'wb')
 1766         file = gzip.GzipFile(fileobj=frame, filename=basename, mode='wt')
 1767       else:
 1768         file = open(filename, 'wt')
 1769 
 1770       file.write(sitemap_header)
 1771       for url in self._set:
 1772         url.WriteXML(file)
 1773       file.write(SITEMAP_FOOTER)
 1774 
 1775       file.close()
 1776       if frame:
 1777         frame.close()
 1778 
 1779       frame = None
 1780       file  = None
 1781     except IOError:
 1782       output.Fatal('Couldn\'t write out to file: %s' % filename)
 1783     os.chmod(filename, 0644)
 1784 
 1785     # Flush
 1786     self._set = []
 1787   #end def FlushSet
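
  # Worked example of the priority normalization above (added commentary, not
  # original): _dup_max starts at 2, so a URL seen once gets priority
  # 1/2 = 0.5000.  If some URL was seen 4 times, _dup_max grows to 4 and a
  # URL seen twice gets 2/4 = 0.5000 while the 4-time URL gets 4/4 = 1.0000.
  # A priority set explicitly on a URL is never overwritten.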
 1788 
 1789   def WriteIndex(self):
 1790     """ Write the master index of all Sitemap files """
 1791     # Make a filename
 1792     filename = self._filegen.GeneratePath(SITEINDEX_SUFFIX)
 1793     if not filename:
 1794       output.Fatal('Unexpected: Couldn\'t generate output index filename.')
 1795     output.Log('Writing index file "%s" with %d Sitemaps' %
 1796         (filename, self._sitemaps), 1)
 1797 
 1798     # Determine what Sitemap index header to use (News or General)
 1799     if self._sitemap_type == 'news':
 1800       sitemap_index_header = NEWS_SITEMAP_HEADER
 1801     else:
 1802       sitemap_index_header = GENERAL_SITEMAP_HEADER
 1803  
 1804     # Make a lastmod time
 1805     lastmod = TimestampISO8601(time.time())
 1806 
 1807     # Write to it
 1808     try:
 1809       fd = open(filename, 'wt')
 1810       fd.write(sitemap_index_header)
 1811 
 1812       for mapnumber in range(0,self._sitemaps):
 1813         # Write the entry
 1814         mapurl = self._filegen.GenerateURL(mapnumber, self._base_url)
 1815         mapattributes = { 'loc' : mapurl, 'lastmod' : lastmod }
 1816         fd.write(SITEINDEX_ENTRY % mapattributes)
 1817 
 1818       fd.write(SITEINDEX_FOOTER)
 1819 
 1820       fd.close()
 1821       fd = None
 1822     except IOError:
 1823       output.Fatal('Couldn\'t write out to file: %s' % filename)
 1824     os.chmod(filename, 0644)
 1825   #end def WriteIndex
 1826 
 1827   def NotifySearch(self):
 1828     """ Send notification of the new Sitemap(s) to the search engines. """
 1829     if self._suppress:
 1830       output.Log('Search engine notification is suppressed.', 1)
 1831       return
 1832 
 1833     output.Log('Notifying search engines.', 1)
 1834 
 1835     # Override the urllib's opener class with one that doesn't ignore 404s
 1836     class ExceptionURLopener(urllib.FancyURLopener):
 1837       def http_error_default(self, url, fp, errcode, errmsg, headers):
 1838         output.Log('HTTP error %d: %s' % (errcode, errmsg), 2)
 1839         raise IOError
 1840       #end def http_error_default
 1841     #end class ExceptionURLopener
 1842     old_opener = urllib._urlopener
 1843     urllib._urlopener = ExceptionURLopener()
 1844 
 1845     # Build the URL we want to send in
 1846     if self._sitemaps > 1:
 1847       url = self._filegen.GenerateURL(SITEINDEX_SUFFIX, self._base_url)
 1848     else:
 1849       url = self._filegen.GenerateURL(0, self._base_url)
 1850 
 1851     # Test if we can hit it ourselves
 1852     try:
 1853       u = urllib.urlopen(url)
 1854       u.close()
 1855     except IOError:
 1856       output.Error('When attempting to access our generated Sitemap at the '
 1857                    'following URL:\n    %s\n  we failed to read it.  Please '
 1858                    'verify the store_into path you specified in\n'
 1859                    '  your configuration file is web-accessible.  Consult '
 1860                    'the FAQ for more\n  information.' % url)
 1861       output.Warn('Proceeding to notify with an unverifiable URL.')
 1862 
 1863     # Cycle through notifications
 1864     # To understand this, see the comment near the NOTIFICATION_SITES definition
 1865     for ping in NOTIFICATION_SITES:
 1866       query_map             = ping[3]
 1867       query_attr            = ping[5]
 1868       query_map[query_attr] = url
 1869       query = urllib.urlencode(query_map)
 1870       notify = urlparse.urlunsplit((ping[0], ping[1], ping[2], query, ping[4]))
 1871 
 1872       # Send the notification
 1873       output.Log('Notifying: %s' % ping[1], 0)
 1874       output.Log('Notification URL: %s' % notify, 2)
 1875       try:
 1876         u = urllib.urlopen(notify)
 1877         u.read()
 1878         u.close()
 1879       except IOError:
 1880         output.Warn('Cannot contact: %s' % ping[1])
 1881 
 1882     if old_opener:
 1883       urllib._urlopener = old_opener
 1884   #end def NotifySearch
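
  # How one notification entry becomes a ping URL (added commentary, not
  # original).  The tuple below is hypothetical; only its shape
  # (scheme, netloc, path, query_map, fragment, query_attr) matches what the
  # loop above expects from NOTIFICATION_SITES:
  #
  #   ping = ('http', 'www.example-search.com', 'ping',
  #           {'sitemap': ''}, '', 'sitemap')
  #   ping[3][ping[5]] = 'http://www.example.com/sitemap.xml.gz'
  #   query  = urllib.urlencode(ping[3])
  #   notify = urlparse.urlunsplit((ping[0], ping[1], ping[2], query, ping[4]))
  #   # notify --> 'http://www.example-search.com/ping?sitemap=http%3A%2F%2F...'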
 1885 
 1886   def startElement(self, tag, attributes):
 1887     """ SAX processing, called per node in the config stream. """
 1888     if tag == 'site':
 1889       if self._in_site:
 1890         output.Error('Can not nest Site entries in the configuration.')
 1891       else:
 1892         self._in_site     = True
 1893 
 1894         if not ValidateAttributes('SITE', attributes,
 1895           ('verbose', 'default_encoding', 'base_url', 'store_into',
 1896            'suppress_search_engine_notify', 'sitemap_type')):
 1897           return
 1898 
 1899         verbose           = attributes.get('verbose', 0)
 1900         if verbose:
 1901           output.SetVerbose(verbose)
 1902 
 1903         self._default_enc = attributes.get('default_encoding')
 1904         self._base_url    = attributes.get('base_url')
 1905         self._store_into  = attributes.get('store_into')
 1906         self._sitemap_type = attributes.get('sitemap_type')
 1907         if not self._suppress:
 1908           self._suppress  = attributes.get('suppress_search_engine_notify',
 1909                                             False)
 1910         self.ValidateBasicConfig()
 1911     elif tag == 'filter':
 1912       self._filters.append(Filter(attributes))
 1913 
 1914     elif tag == 'url':
 1916       self._inputs.append(InputURL(attributes))
 1917 
 1918     elif tag == 'urllist':
 1919       for attributeset in ExpandPathAttribute(attributes, 'path'):
 1920         if self._sitemap_type == 'news':
 1921           self._inputs.append(InputNewsURLList(attributeset))
 1922         else:
 1923           self._inputs.append(InputURLList(attributeset))
 1924 
 1925     elif tag == 'directory':
 1926       self._inputs.append(InputDirectory(attributes, self._base_url))
 1927 
 1928     elif tag == 'accesslog':
 1929       for attributeset in ExpandPathAttribute(attributes, 'path'):
 1930         self._inputs.append(InputAccessLog(attributeset))
 1931     else:
 1932       output.Error('Unrecognized tag in the configuration: %s' % tag)
 1933   #end def startElement
 1934 
 1935   def endElement(self, tag):
 1936     """ SAX processing, called per node in the config stream. """
 1937     if tag == 'site':
 1938       assert self._in_site
 1939       self._in_site      = False
 1940       self._in_site_ever = True
 1941   #end def endElement
 1942 
 1943   def endDocument(self):
 1944     """ End of SAX, verify we can proceed. """
 1945     if not self._in_site_ever:
 1946       output.Error('The configuration must specify a "site" element.')
 1947     else:
 1948       if not self._inputs:
 1949         output.Warn('There were no inputs to generate a sitemap from.')
 1950   #end def endDocument
 1951 #end class Sitemap
 1952 
 1953 
 1954 def ValidateAttributes(tag, attributes, goodattributes):
 1955   """ Makes sure 'attributes' does not contain any attribute not
 1956       listed in 'goodattributes' """
 1957   all_good = True
 1958   for attr in attributes.keys():
 1959     if not attr in goodattributes:
 1960       output.Error('Unknown %s attribute: %s' % (tag, attr))
 1961       all_good = False
 1962   return all_good
 1963 #end def ValidateAttributes
 1964 
 1965 def ExpandPathAttribute(src, attrib):
 1966   """ Given a dictionary of attributes, return a list of dictionaries
 1967       with all the same attributes except for the one named attrib.
 1968       That one, we treat as a file path and expand into all its possible
 1969       variations. """
 1970   # Do the path expansion.  On any error, just return the source dictionary.
 1971   path = src.get(attrib)
 1972   if not path:
 1973     return [src]
 1974   path = encoder.MaybeNarrowPath(path)
 1975   pathlist = glob.glob(path)
 1976   if not pathlist:
 1977     return [src]
 1978 
 1979   # If this isn't actually a dictionary, make it one
 1980   if type(src) != types.DictionaryType:
 1981     tmp = {}
 1982     for key in src.keys():
 1983       tmp[key] = src[key]
 1984     src = tmp
 1985   # Create N new dictionaries
 1986   retval = []
 1987   for path in pathlist:
 1988     dst = src.copy()
 1989     dst[attrib] = path
 1990     retval.append(dst)
 1991 
 1992   return retval
 1993 #end def ExpandPathAttribute
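
# Illustration of ExpandPathAttribute (added commentary, not original code):
# given a hypothetical 'path' value with a wildcard, and assuming /var/log
# holds access.log.1 and access.log.2, one attribute set fans out into one
# dictionary per matching file:
#
#   ExpandPathAttribute({'path': '/var/log/access.log.*', 'encoding': 'UTF-8'},
#                       'path')
#   # --> [{'path': '/var/log/access.log.1', 'encoding': 'UTF-8'},
#   #      {'path': '/var/log/access.log.2', 'encoding': 'UTF-8'}]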
 1994 
 1995 def OpenFileForRead(path, logtext):
 1996   """ Opens a text file, be it GZip or plain """
 1997 
 1998   frame = None
 1999   file  = None
 2000 
 2001   if not path:
 2002     return (frame, file)
 2003 
 2004   try:
 2005     if path.endswith('.gz'):
 2006       frame = open(path, 'rb')
 2007       file = gzip.GzipFile(fileobj=frame, mode='rt')
 2008     else:
 2009       file = open(path, 'rt')
 2010 
 2011     if logtext:
 2012       output.Log('Opened %s file: %s' % (logtext, path), 1)
 2013     else:
 2014       output.Log('Opened file: %s' % path, 1)
 2015   except IOError:
 2016     output.Error('Can not open file: %s' % path)
 2017 
 2018   return (frame, file)
 2019 #end def OpenFileForRead
 2020 
 2021 def TimestampISO8601(t):
 2022   """Seconds since epoch (1970-01-01) --> ISO 8601 time string."""
 2023   return time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(t))
 2024 #end def TimestampISO8601
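
# Example (added commentary, not original code):
#   TimestampISO8601(0)  # --> '1970-01-01T00:00:00Z'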
 2025 
 2026 def CreateSitemapFromFile(configpath, suppress_notify):
 2027   """ Sets up a new Sitemap object from the specified configuration file.  """
 2028 
 2029   # Remember error count on the way in
 2030   num_errors = output.num_errors
 2031 
 2032   # Rev up SAX to parse the config
 2033   sitemap = Sitemap(suppress_notify)
 2034   try:
 2035     output.Log('Reading configuration file: %s' % configpath, 0)
 2036     xml.sax.parse(configpath, sitemap)
 2037   except IOError:
 2038     output.Error('Cannot read configuration file: %s' % configpath)
 2039   except xml.sax._exceptions.SAXParseException, e:
 2040     output.Error('XML error in the config file (line %d, column %d): %s' %
 2041                  (e._linenum, e._colnum, e.getMessage()))
 2042   except xml.sax._exceptions.SAXReaderNotAvailable:
 2043     output.Error('Some installs of Python 2.2 did not include complete support'
 2044                  ' for XML.\n  Please try upgrading your version of Python'
 2045                  ' and re-running the script.')
 2046 
 2047   # If we added any errors, return no sitemap
 2048   if num_errors == output.num_errors:
 2049     return sitemap
 2050   return None
 2051 #end def CreateSitemapFromFile
 2052 
 2053 def ProcessCommandFlags(args):
 2054   """
 2055   Parse command line flags per specified usage, pick off key, value pairs.
 2056   All flags of type "--key=value" will be processed as flags[key] = value,
 2057                     "--option" will be processed as flags[option] = option.
 2058   """
 2059 
 2060   flags   = {}
 2061   rkeyval = '--(?P<key>\S*)[=](?P<value>\S*)' # --key=val
 2062   roption = '--(?P<option>\S*)'               # --key
 2063   r = '(' + rkeyval + ')|(' + roption + ')'
 2064   rc = re.compile(r)
 2065   for a in args:
 2066     try:
 2067       rcg = rc.search(a).groupdict()
 2068       if rcg['key'] is not None:
 2069         flags[rcg['key']] = rcg['value']
 2070       if rcg['option'] is not None:
 2071         flags[rcg['option']] = rcg['option']
 2072     except AttributeError:
 2073       return None
 2074   return flags
 2075 #end def ProcessCommandFlags
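
# Example of the flag parsing above (added commentary, not original code):
#   ProcessCommandFlags(['--config=config.xml', '--testing'])
#   # --> {'config': 'config.xml', 'testing': 'testing'}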
 2076 
 2077 
 2078 #
 2079 # __main__
 2080 #
 2081 
 2082 if __name__ == '__main__':
 2083   flags = ProcessCommandFlags(sys.argv[1:])
 2084   if not flags or not flags.has_key('config') or flags.has_key('help'):
 2085     output.Log(__usage__, 0)
 2086   else:
 2087     suppress_notify = flags.has_key('testing')
 2088     sitemap = CreateSitemapFromFile(flags['config'], suppress_notify)
 2089     if not sitemap:
 2090       output.Log('Configuration file errors -- exiting.', 0)
 2091     else:
 2092       sitemap.Generate()
 2093       output.Log('Number of errors: %d' % output.num_errors, 1)
 2094       output.Log('Number of warnings: %d' % output.num_warns, 1)