Merge index
@@ -220,7 +220,7 @@ def delete(doc_ids: list[int]) -> Literal["OK"]:
     try:
         Document.objects.filter(id__in=doc_ids).delete()
 
-        from documents import index
+        from paperless import index
 
         with index.open_index_writer() as writer:
             for id in doc_ids:
src/paperless/index.py (new file, 442 lines)
@@ -0,0 +1,442 @@
from __future__ import annotations

import logging
import math
from collections import Counter
from contextlib import contextmanager
from datetime import datetime
from datetime import timezone
from shutil import rmtree
from typing import TYPE_CHECKING
from typing import Literal

from django.conf import settings
from django.utils import timezone as django_timezone
from guardian.shortcuts import get_users_with_perms
from whoosh import classify
from whoosh import highlight
from whoosh import query
from whoosh.fields import BOOLEAN
from whoosh.fields import DATETIME
from whoosh.fields import KEYWORD
from whoosh.fields import NUMERIC
from whoosh.fields import TEXT
from whoosh.fields import Schema
from whoosh.highlight import HtmlFormatter
from whoosh.idsets import BitSet
from whoosh.idsets import DocIdSet
from whoosh.index import FileIndex
from whoosh.index import create_in
from whoosh.index import exists_in
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser
from whoosh.qparser import QueryParser
from whoosh.qparser.dateparse import DateParserPlugin
from whoosh.qparser.dateparse import English
from whoosh.qparser.plugins import FieldsPlugin
from whoosh.scoring import TF_IDF
from whoosh.util.times import timespan
from whoosh.writing import AsyncWriter

from paperless.models import CustomFieldInstance
from paperless.models import Document
from paperless.models import Note
from paperless.models import User

if TYPE_CHECKING:
    from django.db.models import QuerySet
    from whoosh.reading import IndexReader
    from whoosh.searching import ResultsPage
    from whoosh.searching import Searcher

logger = logging.getLogger("paperless.index")


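# One index entry per Document: free-text fields for searching plus typed
# metadata fields (ids, flags, dates) for filtering and sorting.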
def get_schema() -> Schema:
    return Schema(
        id=NUMERIC(stored=True, unique=True),
        title=TEXT(sortable=True),
        content=TEXT(),
        asn=NUMERIC(sortable=True, signed=False),
        correspondent=TEXT(sortable=True),
        correspondent_id=NUMERIC(),
        has_correspondent=BOOLEAN(),
        tag=KEYWORD(commas=True, scorable=True, lowercase=True),
        tag_id=KEYWORD(commas=True, scorable=True),
        has_tag=BOOLEAN(),
        type=TEXT(sortable=True),
        type_id=NUMERIC(),
        has_type=BOOLEAN(),
        created=DATETIME(sortable=True),
        modified=DATETIME(sortable=True),
        added=DATETIME(sortable=True),
        path=TEXT(sortable=True),
        path_id=NUMERIC(),
        has_path=BOOLEAN(),
        notes=TEXT(),
        num_notes=NUMERIC(sortable=True, signed=False),
        custom_fields=TEXT(),
        custom_field_count=NUMERIC(sortable=True, signed=False),
        has_custom_fields=BOOLEAN(),
        custom_fields_id=KEYWORD(commas=True),
        owner=TEXT(),
        owner_id=NUMERIC(),
        has_owner=BOOLEAN(),
        viewer_id=KEYWORD(commas=True),
        checksum=TEXT(),
        page_count=NUMERIC(sortable=True),
        original_filename=TEXT(sortable=True),
        is_shared=BOOLEAN(),
    )


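# Opens the existing on-disk index, or recreates it from an empty directory
# when it is missing, unreadable, or recreate=True is passed.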
def open_index(*, recreate=False) -> FileIndex:
    try:
        if exists_in(settings.INDEX_DIR) and not recreate:
            return open_dir(settings.INDEX_DIR, schema=get_schema())
    except Exception:
        logger.exception("Error while opening the index, recreating.")

    # create_in doesn't handle corrupted indexes very well, remove the directory entirely first
    if settings.INDEX_DIR.is_dir():
        rmtree(settings.INDEX_DIR)
    settings.INDEX_DIR.mkdir(parents=True, exist_ok=True)

    return create_in(settings.INDEX_DIR, get_schema())


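# Context manager around whoosh's AsyncWriter: pending changes are cancelled
# if the block raises, and the writer is committed (optionally optimizing
# the index) when the context exits.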
@contextmanager
def open_index_writer(*, optimize=False) -> AsyncWriter:
    writer = AsyncWriter(open_index())

    try:
        yield writer
    except Exception as e:
        logger.exception(str(e))
        writer.cancel()
    finally:
        writer.commit(optimize=optimize)


@contextmanager
def open_index_searcher() -> Searcher:
    searcher = open_index().searcher()

    try:
        yield searcher
    finally:
        searcher.close()


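# Flattens a Document (tags, notes, custom fields, permissions) into a single
# schema entry; update_document() replaces any entry with the same unique id,
# so this handles both adding and updating.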
def update_document(writer: AsyncWriter, doc: Document) -> None:
    tags = ",".join([t.name for t in doc.tags.all()])
    tags_ids = ",".join([str(t.id) for t in doc.tags.all()])
    notes = ",".join([str(c.note) for c in Note.objects.filter(document=doc)])
    custom_fields = ",".join(
        [str(c) for c in CustomFieldInstance.objects.filter(document=doc)],
    )
    custom_fields_ids = ",".join(
        [str(f.field.id) for f in CustomFieldInstance.objects.filter(document=doc)],
    )
    asn: int | None = doc.archive_serial_number
    if asn is not None and (
        asn < Document.ARCHIVE_SERIAL_NUMBER_MIN
        or asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
    ):
        # Out-of-range ASNs are logged and indexed as 0 rather than rejected.
        logger.error(
            f"Not indexing Archive Serial Number {asn} of document {doc.pk}. "
            f"ASN is out of range "
            f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
            f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}].",
        )
        asn = 0
    users_with_perms = get_users_with_perms(
        doc,
        only_with_perms_in=["view_document"],
    )
    viewer_ids: str = ",".join([str(u.id) for u in users_with_perms])
    writer.update_document(
        id=doc.pk,
        title=doc.title,
        content=doc.content,
        correspondent=doc.correspondent.name if doc.correspondent else None,
        correspondent_id=doc.correspondent.id if doc.correspondent else None,
        has_correspondent=doc.correspondent is not None,
        tag=tags if tags else None,
        tag_id=tags_ids if tags_ids else None,
        has_tag=len(tags) > 0,
        type=doc.document_type.name if doc.document_type else None,
        type_id=doc.document_type.id if doc.document_type else None,
        has_type=doc.document_type is not None,
        created=doc.created,
        added=doc.added,
        asn=asn,
        modified=doc.modified,
        path=doc.storage_path.name if doc.storage_path else None,
        path_id=doc.storage_path.id if doc.storage_path else None,
        has_path=doc.storage_path is not None,
        notes=notes,
        num_notes=len(notes),
        custom_fields=custom_fields,
        custom_field_count=len(doc.custom_fields.all()),
        has_custom_fields=len(custom_fields) > 0,
        custom_fields_id=custom_fields_ids if custom_fields_ids else None,
        owner=doc.owner.username if doc.owner else None,
        owner_id=doc.owner.id if doc.owner else None,
        has_owner=doc.owner is not None,
        viewer_id=viewer_ids if viewer_ids else None,
        checksum=doc.checksum,
        page_count=doc.page_count,
        original_filename=doc.original_filename,
        is_shared=len(viewer_ids) > 0,
    )


def remove_document(writer: AsyncWriter, doc: Document) -> None:
    remove_document_by_id(writer, doc.pk)


def remove_document_by_id(writer: AsyncWriter, doc_id) -> None:
    writer.delete_by_term("id", doc_id)


def add_or_update_document(document: Document) -> None:
    with open_index_writer() as writer:
        update_document(writer, document)


def remove_document_from_index(document: Document) -> None:
    with open_index_writer() as writer:
        remove_document(writer, document)


class MappedDocIdSet(DocIdSet):
    """
    A DocIdSet backed by a set of `Document` IDs.
    Supports efficiently looking up if a whoosh docnum is in the provided `filter_queryset`.
    """

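    # The queryset is evaluated once, here; a BitSet sized to the largest id
    # then gives cheap membership checks per matched document.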
    def __init__(self, filter_queryset: QuerySet, ixreader: IndexReader) -> None:
        super().__init__()
        document_ids = filter_queryset.order_by("id").values_list("id", flat=True)
        max_id = document_ids.last() or 0
        self.document_ids = BitSet(document_ids, size=max_id)
        self.ixreader = ixreader

    def __contains__(self, docnum) -> bool:
        document_id = self.ixreader.stored_fields(docnum)["id"]
        return document_id in self.document_ids

    def __bool__(self) -> Literal[True]:
        # searcher.search ignores a filter if it's "falsy".
        # We use this hack so this DocIdSet, when used as a filter, is never ignored.
        return True


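# Lazily executes a whoosh search: nothing runs until a page of results is
# sliced out, which lets pagination drive the actual queries.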
class DelayedQuery:
    def _get_query(self):
        raise NotImplementedError  # pragma: no cover

    def _get_query_sortedby(self) -> tuple[None, Literal[False]] | tuple[str, bool]:
        if "ordering" not in self.query_params:
            return None, False

        field: str = self.query_params["ordering"]

        sort_fields_map: dict[str, str] = {
            "created": "created",
            "modified": "modified",
            "added": "added",
            "title": "title",
            "correspondent__name": "correspondent",
            "document_type__name": "type",
            "archive_serial_number": "asn",
            "num_notes": "num_notes",
            "owner": "owner",
            "page_count": "page_count",
        }

        if field.startswith("-"):
            field = field[1:]
            reverse = True
        else:
            reverse = False

        if field not in sort_fields_map:
            return None, False
        else:
            return sort_fields_map[field], reverse

    def __init__(
        self,
        searcher: Searcher,
        query_params,
        page_size,
        filter_queryset: QuerySet,
    ) -> None:
        self.searcher = searcher
        self.query_params = query_params
        self.page_size = page_size
        self.saved_results = dict()
        self.first_score = None
        self.filter_queryset = filter_queryset

    def __len__(self) -> int:
        page = self[0:1]
        return len(page)

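    # Fetched pages are cached by slice start; relevance scores are rescaled
    # against the first hit so they stay comparable across pages.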
    def __getitem__(self, item):
        if item.start in self.saved_results:
            return self.saved_results[item.start]

        q, mask = self._get_query()
        sortedby, reverse = self._get_query_sortedby()

        page: ResultsPage = self.searcher.search_page(
            q,
            mask=mask,
            filter=MappedDocIdSet(self.filter_queryset, self.searcher.ixreader),
            pagenum=math.floor(item.start / self.page_size) + 1,
            pagelen=self.page_size,
            sortedby=sortedby,
            reverse=reverse,
        )
        page.results.fragmenter = highlight.ContextFragmenter(surround=50)
        page.results.formatter = HtmlFormatter(tagname="span", between=" ... ")

        if not self.first_score and len(page.results) > 0 and sortedby is None:
            self.first_score = page.results[0].score

        page.results.top_n = list(
            map(
                lambda hit: (
                    (hit[0] / self.first_score) if self.first_score else None,
                    hit[1],
                ),
                page.results.top_n,
            ),
        )

        self.saved_results[item.start] = page

        return page


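# whoosh parses dates as naive datetimes; reinterpret them in the active
# Django timezone and convert to UTC before they are used in queries.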
class LocalDateParser(English):
    def reverse_timezone_offset(self, d):
        return (d.replace(tzinfo=django_timezone.get_current_timezone())).astimezone(
            timezone.utc,
        )

    def date_from(self, *args, **kwargs):
        d = super().date_from(*args, **kwargs)
        if isinstance(d, timespan):
            d.start = self.reverse_timezone_offset(d.start)
            d.end = self.reverse_timezone_offset(d.end)
        elif isinstance(d, datetime):
            d = self.reverse_timezone_offset(d)
        return d


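# Full-text search across all indexed text fields, with natural-language date
# parsing; a spell-corrected variant of the query string is also computed.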
class DelayedFullTextQuery(DelayedQuery):
    def _get_query(self) -> tuple:
        q_str = self.query_params["query"]
        qp = MultifieldParser(
            [
                "content",
                "title",
                "correspondent",
                "tag",
                "type",
                "notes",
                "custom_fields",
            ],
            self.searcher.ixreader.schema,
        )
        qp.add_plugin(
            DateParserPlugin(
                basedate=django_timezone.now(),
                dateparser=LocalDateParser(),
            ),
        )
        q = qp.parse(q_str)

        corrected = self.searcher.correct_query(q, q_str)
        if corrected.query != q:
            corrected.query = corrected.string

        return q, None


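# "More like this": search for the 20 most characteristic terms of the given
# document's content, masking the source document out of the results.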
class DelayedMoreLikeThisQuery(DelayedQuery):
    def _get_query(self) -> tuple:
        more_like_doc_id = int(self.query_params["more_like_id"])
        content = Document.objects.get(id=more_like_doc_id).content

        docnum = self.searcher.document_number(id=more_like_doc_id)
        kts = self.searcher.key_terms_from_text(
            "content",
            content,
            numterms=20,
            model=classify.Bo1Model,
            normalize=False,
        )
        q = query.Or(
            [query.Term("content", word, boost=weight) for word, weight in kts],
        )
        mask: set = {docnum}

        return q, mask


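# Prefix search on the content field; matched terms are counted across all
# permitted hits and the most common ones are suggested, exact match first.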
def autocomplete(
    ix: FileIndex,
    term: str,
    limit: int = 10,
    user: User | None = None,
) -> list:
    """
    Mimics whoosh.reading.IndexReader.most_distinctive_terms with permissions
    and without scoring
    """
    terms = []

    with ix.searcher(weighting=TF_IDF()) as s:
        qp = QueryParser("content", schema=ix.schema)
        # Don't let a search term that happens to match a field name be parsed
        # as a field query; everything should query the content field.
        qp.remove_plugin_class(FieldsPlugin)
        q = qp.parse(f"{term.lower()}*")
        user_criterias: list = get_permissions_criterias(user)

        results = s.search(
            q,
            terms=True,
            filter=query.Or(user_criterias) if user_criterias is not None else None,
        )

        termCounts = Counter()
        if results.has_matched_terms():
            for hit in results:
                for _, match in hit.matched_terms():
                    termCounts[match] += 1
            terms = [t for t, _ in termCounts.most_common(limit)]

        term_encoded: bytes = term.encode("UTF-8")
        if term_encoded in terms:
            terms.insert(0, terms.pop(terms.index(term_encoded)))

    return terms


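# Unowned documents are visible to everyone; superusers skip filtering, and
# other users match as owner or as an explicitly granted viewer.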
def get_permissions_criterias(user: User | None = None) -> list:
    user_criterias = [query.Term("has_owner", text=False)]
    if user is not None:
        if user.is_superuser:  # superusers see all docs
            user_criterias = []
        else:
            user_criterias.append(query.Term("owner_id", user.id))
            user_criterias.append(
                query.Term("viewer_id", str(user.id)),
            )
    return user_criterias

@@ -89,7 +89,6 @@ from rest_framework.viewsets import ModelViewSet
 from rest_framework.viewsets import ReadOnlyModelViewSet
 from rest_framework.viewsets import ViewSet
 
-from documents import index
 from documents.bulk_download import ArchiveOnlyStrategy
 from documents.bulk_download import OriginalAndArchiveStrategy
 from documents.bulk_download import OriginalsOnlyStrategy
@@ -118,7 +117,6 @@ from documents.filters import PaperlessTaskFilterSet
 from documents.filters import ShareLinkFilterSet
 from documents.filters import StoragePathFilterSet
 from documents.filters import TagFilterSet
-from documents.index import DelayedQuery
 from documents.mail import send_email
 from documents.parsers import get_parser_class_for_mime_type
 from documents.parsers import parse_date_generator
@@ -137,6 +135,7 @@ from documents.tasks import sanity_check
 from documents.tasks import train_classifier
 from documents.templating.filepath import validate_filepath_template_and_render
 from paperless import bulk_edit
+from paperless import index
 from paperless import version
 from paperless.celery import app as celery_app
 from paperless.config import GeneralConfig
@@ -146,6 +145,7 @@ from paperless.data_models import DocumentSource
 from paperless.db import GnuPG
 from paperless.filters import GroupFilterSet
 from paperless.filters import UserFilterSet
+from paperless.index import DelayedQuery
 from paperless.matching import match_correspondents
 from paperless.matching import match_document_types
 from paperless.matching import match_storage_paths
@@ -978,7 +978,7 @@ class DocumentViewSet(
 
     def update(self, request, *args, **kwargs):
         response = super().update(request, *args, **kwargs)
-        from documents import index
+        from paperless import index
 
         index.add_or_update_document(self.get_object())
 
@@ -990,7 +990,7 @@ class DocumentViewSet(
         return response
 
     def destroy(self, request, *args, **kwargs):
-        from documents import index
+        from paperless import index
 
         index.remove_document_from_index(self.get_object())
         try:
@@ -1266,7 +1266,7 @@ class DocumentViewSet(
         doc.modified = timezone.now()
         doc.save()
 
-        from documents import index
+        from paperless import index
 
         index.add_or_update_document(doc)
 
@@ -1303,7 +1303,7 @@ class DocumentViewSet(
         doc.modified = timezone.now()
         doc.save()
 
-        from documents import index
+        from paperless import index
 
         index.add_or_update_document(doc)
 
@@ -1498,7 +1498,7 @@ class UnifiedSearchViewSet(DocumentViewSet):
         filtered_queryset = super().filter_queryset(queryset)
 
         if self._is_search_request():
-            from documents import index
+            from paperless import index
 
             if "query" in self.request.query_params:
                 query_class = index.DelayedFullTextQuery
@@ -1518,7 +1518,7 @@ class UnifiedSearchViewSet(DocumentViewSet):
 
     def list(self, request, *args, **kwargs):
        if self._is_search_request():
-            from documents import index
+            from paperless import index
 
            try:
                with index.open_index_searcher() as s:
@@ -2032,7 +2032,7 @@ class SearchAutoCompleteView(GenericAPIView):
         else:
             limit = 10
 
-        from documents import index
+        from paperless import index
 
         ix = index.open_index()
 
@@ -2110,7 +2110,7 @@ class GlobalSearchView(PassUserMixin):
         docs = all_docs.filter(title__icontains=query)
         if not db_only and len(docs) < OBJECT_LIMIT:
             # If we don't have enough results, search by content
-            from documents import index
+            from paperless import index
 
             with index.open_index_searcher() as s:
                 fts_query = index.DelayedFullTextQuery(