"Fossies" - the Fresh Open Source Software Archive

Member "roundup-2.0.0/roundup/backends/indexer_rdbms.py" (26 Aug 2019, 5134 Bytes) of package /linux/www/roundup-2.0.0.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML format using (guessed) Python source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively, you can view or download the uninterpreted source code file here. See also the latest Fossies "Diffs" side-by-side code changes report for "indexer_rdbms.py": 1.6.1_vs_2.0.0.

    1 """ This implements the full-text indexer over two RDBMS tables. The first
    2 is a mapping of words to occurrence IDs. The second maps the IDs to (Class,
    3 propname, itemid) instances.
    4 """
    5 import re
    6 
    7 from roundup.backends.indexer_common import Indexer as IndexerBase
    8 from roundup.anypy.strings import us2u, u2s
    9 
   10 class Indexer(IndexerBase):
   11     def __init__(self, db):
   12         IndexerBase.__init__(self, db)
   13         self.db = db
   14         self.reindex = 0
   15 
   16     def close(self):
   17         """close the indexing database"""
   18         # just nuke the circular reference
   19         self.db = None
   20 
   21     def save_index(self):
   22         """Save the changes to the index."""
   23         # not necessary - the RDBMS connection will handle this for us
   24         pass
   25 
   26     def force_reindex(self):
   27         """Force a reindexing of the database.  This essentially
   28         empties the tables ids and index and sets a flag so
   29         that the databases are reindexed"""
   30         self.reindex = 1
   31 
   32     def should_reindex(self):
   33         """returns True if the indexes need to be rebuilt"""
   34         return self.reindex
   35 
   36     def add_text(self, identifier, text, mime_type='text/plain'):
   37         """ "identifier" is  (classname, itemid, property) """
   38         if mime_type != 'text/plain':
   39             return
   40 
   41         # Ensure all elements of the identifier are strings 'cos the itemid
   42         # column is varchar even if item ids may be numbers elsewhere in the
   43         # code. ugh.
   44         identifier = tuple(map(str, identifier))
   45 
   46         # first, find the id of the (classname, itemid, property)
   47         a = self.db.arg
   48         sql = 'select _textid from __textids where _class=%s and '\
   49             '_itemid=%s and _prop=%s'%(a, a, a)
   50         self.db.cursor.execute(sql, identifier)
   51         r = self.db.cursor.fetchone()
   52         if not r:
   53             # not previously indexed
   54             id = self.db.newid('__textids')
   55             sql = 'insert into __textids (_textid, _class, _itemid, _prop)'\
   56                 ' values (%s, %s, %s, %s)'%(a, a, a, a)
   57             self.db.cursor.execute(sql, (id, ) + identifier)
   58         else:
   59             id = int(r[0])
   60             # clear out any existing indexed values
   61             sql = 'delete from __words where _textid=%s'%a
   62             self.db.cursor.execute(sql, (id, ))
   63 
   64         # ok, find all the unique words in the text
   65         text = us2u(text, "replace")
   66         text = text.upper()
   67         wordlist = [u2s(w)
   68                     for w in re.findall(r'(?u)\b\w{%d,%d}\b'
   69                                         % (self.minlength, self.maxlength), text)]
   70         words = set()
   71         for word in wordlist:
   72             if self.is_stopword(word): continue
   73             words.add(word)
   74 
   75         # for each word, add an entry in the db
   76         sql = 'insert into __words (_word, _textid) values (%s, %s)'%(a, a)
   77         words = [(word, id) for word in words]
   78         self.db.cursor.executemany(sql, words)
   79 
   80     def find(self, wordlist):
   81         """look up all the words in the wordlist.
   82         If none are found return an empty dictionary
   83         * more rules here
   84         """
   85         if not wordlist:
   86             return []
   87 
   88         l = [word.upper() for word in wordlist
   89              if self.minlength <= len(word) <= self.maxlength]
   90         l = [word for word in l if not self.is_stopword(word)]
   91 
   92         if not l:
   93             return []
   94 
   95         if self.db.implements_intersect:
   96             # simple AND search
   97             sql = 'select distinct(_textid) from __words where _word=%s'%self.db.arg
   98             sql = '\nINTERSECT\n'.join([sql]*len(l))
   99             self.db.cursor.execute(sql, tuple(l))
  100             r = self.db.cursor.fetchall()
  101             if not r:
  102                 return []
  103             a = ','.join([self.db.arg] * len(r))
  104             sql = 'select _class, _itemid, _prop from __textids '\
  105                 'where _textid in (%s)'%a
  106             self.db.cursor.execute(sql, tuple([int(row[0]) for row in r]))
  107 
  108         else:
  109             # A more complex version for MySQL since it doesn't implement INTERSECT
  110 
  111             # Construct SQL statement to join __words table to itself
  112             # multiple times.
  113             sql = """select distinct(__words1._textid)
  114                         from __words as __words1 %s
  115                         where __words1._word=%s %s"""
  116 
  117             join_tmpl = ' left join __words as __words%d using (_textid) \n'
  118             match_tmpl = ' and __words%d._word=%s \n'
  119 
  120             join_list = []
  121             match_list = []
  122             for n in range(len(l) - 1):
  123                 join_list.append(join_tmpl % (n + 2))
  124                 match_list.append(match_tmpl % (n + 2, self.db.arg))
  125 
  126             sql = sql%(' '.join(join_list), self.db.arg, ' '.join(match_list))
  127             self.db.cursor.execute(sql, l)
  128 
  129             r = [x[0] for x in self.db.cursor.fetchall()]
  130             if not r:
  131                 return []
  132 
  133             a = ','.join([self.db.arg] * len(r))
  134             sql = 'select _class, _itemid, _prop from __textids '\
  135                 'where _textid in (%s)'%a
  136 
  137             self.db.cursor.execute(sql, tuple(map(int, r)))
  138 
  139         return self.db.cursor.fetchall()
  140