import logging from contextlib import contextmanager from whoosh import highlight from whoosh.fields import Schema, TEXT, NUMERIC from whoosh.highlight import Formatter, get_text from whoosh.index import create_in, exists_in, open_dir from whoosh.qparser import MultifieldParser from whoosh.writing import AsyncWriter from paperless import settings logger = logging.getLogger(__name__) class JsonFormatter(Formatter): def __init__(self): self.seen = {} def format_token(self, text, token, replace=False): seen = self.seen ttext = self._text(get_text(text, token, replace)) if ttext in seen: termnum = seen[ttext] else: termnum = len(seen) seen[ttext] = termnum return {'text': ttext, 'term': termnum} def format_fragment(self, fragment, replace=False): output = [] index = fragment.startchar text = fragment.text for t in fragment.matches: if t.startchar is None: continue if t.startchar < index: continue if t.startchar > index: output.append({'text': text[index:t.startchar]}) output.append(self.format_token(text, t, replace)) index = t.endchar if index < fragment.endchar: output.append({'text': text[index:fragment.endchar]}) return output def format(self, fragments, replace=False): output = [] for fragment in fragments: output.append(self.format_fragment(fragment, replace=replace)) return output def get_schema(): return Schema( id=NUMERIC(stored=True, unique=True, numtype=int), title=TEXT(stored=True), content=TEXT(), correspondent=TEXT(stored=True) ) def open_index(recreate=False): if exists_in(settings.INDEX_DIR) and not recreate: return open_dir(settings.INDEX_DIR) else: # TODO: this is not thread safe. If 2 instances try to create the index # at the same time, this fails. This currently prevents parallel # tests. return create_in(settings.INDEX_DIR, get_schema()) def update_document(writer, doc): logger.debug("Indexing {}...".format(doc)) writer.update_document( id=doc.pk, title=doc.title, content=doc.content, correspondent=doc.correspondent.name if doc.correspondent else None ) def remove_document(writer, doc): logger.debug("Removing {} from index...".format(doc)) writer.delete_by_term('id', doc.pk) def add_or_update_document(document): ix = open_index() with AsyncWriter(ix) as writer: update_document(writer, document) def remove_document_from_index(document): ix = open_index() with AsyncWriter(ix) as writer: remove_document(writer, document) @contextmanager def query_page(ix, query, page): searcher = ix.searcher() try: query_parser = MultifieldParser(["content", "title", "correspondent"], ix.schema).parse(query) result_page = searcher.search_page(query_parser, page) result_page.results.fragmenter = highlight.ContextFragmenter( surround=50) result_page.results.formatter = JsonFormatter() yield result_page finally: searcher.close() def autocomplete(ix, term, limit=10): with ix.reader() as reader: terms = [] for (score, t) in reader.most_distinctive_terms("content", limit, term.lower()): terms.append(t) return terms