"Fossies" - the Fresh Open Source Software Archive

Member "roundup-2.0.0/roundup/backends/indexer_dbm.py" (1 Jan 2020, 10564 Bytes) of package /linux/www/roundup-2.0.0.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML format using (guessed) Python source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively, you can view or download the uninterpreted source code file here. See also the latest Fossies "Diffs" side-by-side code-changes report for "indexer_dbm.py": 1.6.1_vs_2.0.0.

    1 #
    2 # This module is derived from the module described at:
    3 #   http://gnosis.cx/publish/programming/charming_python_15.txt
    4 # 
    5 # Author: David Mertz (mertz@gnosis.cx)
    6 # Thanks to: Pat Knight (p.knight@ktgroup.co.uk)
    7 #            Gregory Popovitch (greg@gpy.com)
    8 # 
    9 # The original module was released under this license, and remains under
   10 # it:
   11 #
   12 #     This file is released to the public domain.  I (dqm) would
   13 #     appreciate it if you choose to keep derived works under terms
   14 #     that promote freedom, but obviously am giving up any rights
   15 #     to compel such.
   16 # 
   17 '''This module provides an indexer class, RoundupIndexer, that stores text
   18 indices in a roundup instance.  This class makes searching the content of
   19 messages, string properties and text files possible.
   20 '''
   21 __docformat__ = 'restructuredtext'
   22 
   23 import os, shutil, re, mimetypes, marshal, zlib, errno
   24 from roundup.hyperdb import Link, Multilink
   25 from roundup.backends.indexer_common import Indexer as IndexerBase
   26 
   27 class Indexer(IndexerBase):
   28     '''Indexes information from roundup's hyperdb to allow efficient
   29     searching.
   30 
   31     Three structures are created by the indexer::
   32 
   33           files   {identifier: (fileid, wordcount)}
   34           words   {word: {fileid: count}}
   35           fileids {fileid: identifier}
   36 
   37     where identifier is (classname, nodeid, propertyname)
   38     '''
   39     def __init__(self, db):
   40         IndexerBase.__init__(self, db)
   41         self.indexdb_path = os.path.join(db.config.DATABASE, 'indexes')
   42         self.indexdb = os.path.join(self.indexdb_path, 'index.db')
   43         self.reindex = 0
   44         self.quiet = 9
   45         self.changed = 0
   46 
   47         # see if we need to reindex because of a change in code
   48         version = os.path.join(self.indexdb_path, 'version')
   49         if (not os.path.exists(self.indexdb_path) or
   50                 not os.path.exists(version)):
   51             # for now the file itself is a flag
   52             self.force_reindex()
   53         elif os.path.exists(version):
   54             version = open(version).read()
   55             # check the value and reindex if it's not the latest
   56             if version.strip() != '1':
   57                 self.force_reindex()
   58 
   59     def force_reindex(self):
   60         '''Force a reindex condition
   61         '''
   62         if os.path.exists(self.indexdb_path):
   63             shutil.rmtree(self.indexdb_path)
   64         os.makedirs(self.indexdb_path)
   65         os.chmod(self.indexdb_path, 0o775)  # nosec - allow group write
   66         open(os.path.join(self.indexdb_path, 'version'), 'w').write('1\n')
   67         self.reindex = 1
   68         self.changed = 1
   69 
   70     def should_reindex(self):
   71         '''Should we reindex?
   72         '''
   73         return self.reindex
   74 
   75     def add_text(self, identifier, text, mime_type='text/plain'):
   76         '''Add some text associated with the (classname, nodeid, property)
   77         identifier.
   78         '''
   79         # make sure the index is loaded
   80         self.load_index()
   81 
   82         # remove old entries for this identifier
   83         if identifier in self.files:
   84             self.purge_entry(identifier)
   85 
   86         # split into words
   87         words = self.splitter(text, mime_type)
   88 
   89         # Find new file index, and assign it to identifier
   90         # (_TOP uses trick of negative to avoid conflict with file index)
   91         self.files['_TOP'] = (self.files['_TOP'][0]-1, None)
   92         file_index = abs(self.files['_TOP'][0])
   93         self.files[identifier] = (file_index, len(words))
   94         self.fileids[file_index] = identifier
   95 
   96         # find the unique words
   97         filedict = {}
   98         for word in words:
   99             if self.is_stopword(word):
  100                 continue
  101             if word in filedict:
  102                 filedict[word] = filedict[word]+1
  103             else:
  104                 filedict[word] = 1
  105 
  106         # now add to the totals
  107         for word in filedict:
  108             # each word has a dict of {identifier: count}
  109             if word in self.words:
  110                 entry = self.words[word]
  111             else:
  112                 # new word
  113                 entry = {}
  114                 self.words[word] = entry
  115 
  116             # make a reference to the file for this word
  117             entry[file_index] = filedict[word]
  118 
  119         # save needed
  120         self.changed = 1
  121 
  122     def splitter(self, text, ftype):
  123         '''Split the contents of a text string into a list of 'words'
  124         '''
  125         if ftype == 'text/plain':
  126             words = self.text_splitter(text)
  127         else:
  128             return []
  129         return words
  130 
  131     def text_splitter(self, text):
  132         """Split text/plain string into a list of words
  133         """
  134         if not text:
  135             return []
  136         
  137         # case insensitive
  138         text = text.upper()
  139 
  140         # Split the raw text
  141         return re.findall(r'\b\w{%d,%d}\b' % (self.minlength, self.maxlength),
  142                           text, re.UNICODE)
  143 
  144     # we override this to ignore too short and too long words
  145     # and also to fix a bug - the (fail) case.
  146     def find(self, wordlist):
  147         '''Locate files that match ALL the words in wordlist
  148         '''
  149         if not hasattr(self, 'words'):
  150             self.load_index()
  151         self.load_index(wordlist=wordlist)
  152         entries = {}
  153         hits = None
  154         for word in wordlist:
  155             if not self.minlength <= len(word) <= self.maxlength:
  156                 # word outside the bounds of what we index - ignore
  157                 continue
  158             word = word.upper()
  159             if self.is_stopword(word):
  160                 continue
  161             entry = self.words.get(word)    # For each word, get index
  162             entries[word] = entry           #   of matching files
  163             if not entry:                   # Nothing for this one word (fail)
  164                 return {}
  165             if hits is None:
  166                 hits = {}
  167                 for k in entry:
  168                     if k not in self.fileids:
  169                         raise ValueError('Index is corrupted: re-generate it')
  170                     hits[k] = self.fileids[k]
  171             else:
  172                 # Eliminate hits for every non-match
  173                 for fileid in list(hits):
  174                     if fileid not in entry:
  175                         del hits[fileid]
  176         if hits is None:
  177             return {}
  178         return list(hits.values())
  179 
  180     segments = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_-!"
  181     def load_index(self, reload=0, wordlist=None):
  182         # Unless reload is indicated, do not load twice
  183         if self.index_loaded() and not reload:
  184             return 0
  185 
  186         # Ok, now let's actually load it
  187         db = {'WORDS': {}, 'FILES': {'_TOP':(0,None)}, 'FILEIDS': {}}
  188 
  189         # Identify the relevant word-dictionary segments
  190         if not wordlist:
  191             segments = self.segments
  192         else:
  193             segments = ['-','#']
  194             for word in wordlist:
  195                 initchar = word[0].upper()
  196                 if initchar not in self.segments:
  197                     initchar = '_'
  198                 segments.append(initchar)
  199 
  200         # Load the segments
  201         for segment in segments:
  202             try:
  203                 f = open(self.indexdb + segment, 'rb')
  204             except IOError as error:
  205                 # probably just nonexistent segment index file
  206                 if error.errno != errno.ENOENT: raise
  207             else:
  208                 pickle_str = zlib.decompress(f.read())
  209                 f.close()
  210                 dbslice = marshal.loads(pickle_str)
  211                 if dbslice.get('WORDS'):
  212                     # if it has some words, add them
  213                     for word, entry in dbslice['WORDS'].items():
  214                         db['WORDS'][word] = entry
  215                 if dbslice.get('FILES'):
  216                     # if it has some files, add them
  217                     db['FILES'] = dbslice['FILES']
  218                 if dbslice.get('FILEIDS'):
  219                     # if it has fileids, add them
  220                     db['FILEIDS'] = dbslice['FILEIDS']
  221 
  222         self.words = db['WORDS']
  223         self.files = db['FILES']
  224         self.fileids = db['FILEIDS']
  225         self.changed = 0
  226 
  227     def save_index(self):
  228         # only save if the index is loaded and changed
  229         if not self.index_loaded() or not self.changed:
  230             return
  231 
  232         # brutal space saver... delete all the small segments
  233         for segment in self.segments:
  234             try:
  235                 os.remove(self.indexdb + segment)
  236             except OSError as error:
  237                 # probably just nonexistent segment index file
  238                 if error.errno != errno.ENOENT: raise
  239 
  240         # First write the much simpler filename/fileid dictionaries
  241         dbfil = {'WORDS':None, 'FILES':self.files, 'FILEIDS':self.fileids}
  242         open(self.indexdb+'-','wb').write(zlib.compress(marshal.dumps(dbfil)))
  243 
  244         # The hard part is splitting the word dictionary up, of course
  245         letters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_"
  246         segdicts = {}                           # Need batch of empty dicts
  247         for segment in letters:
  248             segdicts[segment] = {}
  249         for word, entry in self.words.items():  # Split into segment dicts
  250             initchar = word[0].upper()
  251             if initchar not in letters:
  252                 # if it's a unicode character, add it to the '_' segment
  253                 initchar = '_'
  254             segdicts[initchar][word] = entry
  255 
  256         # save
  257         for initchar in letters:
  258             db = {'WORDS':segdicts[initchar], 'FILES':None, 'FILEIDS':None}
  259             pickle_str = marshal.dumps(db)
  260             filename = self.indexdb + initchar
  261             pickle_fh = open(filename, 'wb')
  262             pickle_fh.write(zlib.compress(pickle_str))
  263             os.chmod(filename, 0o664)
  264 
  265         # save done
  266         self.changed = 0
  267 
  268     def purge_entry(self, identifier):
  269         '''Remove a file from file index and word index
  270         '''
  271         self.load_index()
  272 
  273         if identifier not in self.files:
  274             return
  275 
  276         file_index = self.files[identifier][0]
  277         del self.files[identifier]
  278         del self.fileids[file_index]
  279 
  280         # The much harder part, cleanup the word index
  281         for key, occurs in self.words.items():
  282             if file_index in occurs:
  283                 del occurs[file_index]
  284 
  285         # save needed
  286         self.changed = 1
  287 
  288     def index_loaded(self):
  289         return (hasattr(self,'fileids') and hasattr(self,'files') and
  290             hasattr(self,'words'))
  291 
  292     def rollback(self):
  293         ''' load last saved index info. '''
  294         self.load_index(reload=1)
  295 
  296     def close(self):
  297         pass
  298 
  299 
  300 # vim: set filetype=python ts=4 sw=4 et si