"Fossies" - the Fresh Open Source Software Archive  

Source code changes of the file "roundup/backends/indexer_whoosh.py" between
roundup-1.6.1.tar.gz and roundup-2.0.0.tar.gz

About: Roundup is a highly customisable issue-tracking system with command-line, web and e-mail interfaces (written in Python).

indexer_whoosh.py (roundup-1.6.1) → indexer_whoosh.py (roundup-2.0.0)
(unified view: lines marked "-" exist only in 1.6.1, lines marked "+" only in 2.0.0)

 ''' This implements the full-text indexer using Whoosh.
 '''
-import re, os
+import os

 from whoosh import fields, qparser, index, query, analysis

 from roundup.backends.indexer_common import Indexer as IndexerBase
+from roundup.anypy.strings import us2u


 class Indexer(IndexerBase):
     def __init__(self, db):
         IndexerBase.__init__(self, db)
         self.db_path = db.config.DATABASE
         self.reindex = 0
         self.writer = None
         self.index = None
         self.deleted = set()

     def _get_index(self):
         if self.index is None:
             path = os.path.join(self.db_path, 'whoosh-index')
             if not os.path.exists(path):
                 # StandardAnalyzer lowercases all words and configure it to
                 # block stopwords and words with lengths not between
                 # self.minlength and self.maxlength from indexer_common
                 stopfilter = analysis.StandardAnalyzer( #stoplist=self.stopwords,
                                                        minsize=self.minlength,
                                                        maxsize=self.maxlength)
                 os.mkdir(path)
                 schema = fields.Schema(identifier=fields.ID(stored=True,
                                                             unique=True),
                                        content=fields.TEXT(analyzer=stopfilter))
                 index.create_in(path, schema)
             self.index = index.open_dir(path)
         return self.index
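
For context (not part of the diff): Whoosh's StandardAnalyzer tokenizes, lowercases, drops stopwords, and discards tokens whose length falls outside minsize/maxsize, which is how the minlength/maxlength settings from indexer_common take effect at index time. A minimal sketch, with illustrative length values:

    from whoosh import analysis

    # minsize/maxsize here are illustrative; the indexer passes
    # self.minlength and self.maxlength from indexer_common.
    analyzer = analysis.StandardAnalyzer(minsize=2, maxsize=25)
    print([t.text for t in analyzer(u"A Short example of Token filtering")])
    # -> ['short', 'example', 'token', 'filtering']
    # 'a' and 'of' are stopwords; 'a' is also below minsize.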
     def save_index(self):
         '''Save the changes to the index.'''
         if not self.writer:
             return
         self.writer.commit()
         self.deleted = set()

 [... unchanged lines skipped (1.6.1 line 81 / 2.0.0 line 82) ...]

         return self._get_index().searcher()
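
As background (standard Whoosh behaviour, not a change in this diff): documents buffered on a writer become visible to searchers only after commit(), which is exactly what save_index() does before resetting the deleted set. A rough sketch of that lifecycle:

    import os
    from whoosh import index

    # 'db_path' stands in for the tracker's DATABASE directory.
    ix = index.open_dir(os.path.join(db_path, 'whoosh-index'))
    writer = ix.writer()
    writer.add_document(identifier=u'issue:1:title', content=u'some text')
    writer.commit()  # changes become searchable; the writer lock is released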
     def add_text(self, identifier, text, mime_type='text/plain'):
         ''' "identifier" is (classname, itemid, property) '''
         if mime_type != 'text/plain':
             return

         if not text:
             text = u''
-        if not isinstance(text, unicode):
-            text = unicode(text, "utf-8", "replace")
+        text = us2u(text, "replace")

         # We use the identifier twice: once in the actual "text" being
         # indexed so we can search on it, and again as the "data" being
         # indexed so we know what we're matching when we get results
-        identifier = u"%s:%s:%s"%identifier
+        identifier = u"%s:%s:%s" % identifier

         # FIXME need to enhance this to handle the whoosh.store.LockError
         # that maybe raised if there is already another process with a lock.
         writer = self._get_writer()

         # Whoosh gets upset if a document is deleted twice in one transaction,
         # so we keep a list of the documents we have so far deleted to make
         # sure that we only delete them once.
         if identifier not in self.deleted:
             searcher = self._get_searcher()

 [... unchanged lines skipped (1.6.1 line 114 / 2.0.0 line 114) ...]

         # better results that way.
         writer.add_document(identifier=identifier, content=text)
         self.save_index()
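
The one behavioural change in add_text is the text normalization: the Python-2-only unicode() call is replaced by us2u() from roundup.anypy.strings, so the same code runs on Python 2 and 3. A rough sketch of what such a helper does, assumed from context (see roundup/anypy/strings.py for the real implementation):

    def us2u(s, errors='strict'):
        # Bytes (a Python 2 'str') are decoded from UTF-8; text
        # (Python 2 'unicode' / Python 3 'str') passes through unchanged.
        if isinstance(s, bytes):
            return s.decode('utf-8', errors)
        return s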
     def find(self, wordlist):
         '''look up all the words in the wordlist.
         If none are found return an empty dictionary
         * more rules here
         '''
-        wordlist = [ word for word in wordlist
-                     if (self.minlength <= len(word) <= self.maxlength) and
-                     not self.is_stopword(word.upper()) ]
+        wordlist = [word for word in wordlist
+                    if (self.minlength <= len(word) <= self.maxlength) and
+                    not self.is_stopword(word.upper())]
         if not wordlist:
             return {}
         searcher = self._get_searcher()
-        q = query.And([ query.FuzzyTerm("content", word.lower())
-                        for word in wordlist ])
+        q = query.And([query.FuzzyTerm("content", word.lower())
+                       for word in wordlist])
         results = searcher.search(q, limit=None)
         return [tuple(result["identifier"].split(':'))
                 for result in results]
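
Illustrative usage (names and data assumed): find() AND-combines one FuzzyTerm per surviving word, so every word must match at least approximately, and each hit is rebuilt into a (classname, itemid, property) tuple from the stored identifier. Note that despite the docstring, matches come back as a list; only the early no-words exit returns a dictionary.

    indexer = Indexer(db)   # 'db' is an opened roundup database
    indexer.add_text(('issue', '42', 'title'), u'Crash when saving the index')
    indexer.find([u'crash', u'saving'])
    # -> [('issue', '42', 'title')]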
End of changes. 8 change blocks; 11 lines changed or deleted, 11 lines changed or added.
