This commit is contained in:
jonaswinkler 2020-12-05 00:37:05 +01:00
parent e9758d5224
commit f88cf69173
3 changed files with 17 additions and 7 deletions

View File

@ -82,6 +82,10 @@ def open_index(recreate=False):
def update_document(writer, doc):
# TODO: this line caused many issues all around, since
# we need to make sure that this method does not get called with
# deserialized documents (i.e., document objects that don't come from
# Django's ORM interfaces directly).
logger.debug("Indexing {}...".format(doc))
tags = ",".join([t.name for t in doc.tags.all()])
writer.update_document(
@ -98,6 +102,7 @@ def update_document(writer, doc):
def remove_document(writer, doc):
# TODO: see the note in update_document() about deserialized documents.
logger.debug("Removing {} from index...".format(doc))
writer.delete_by_term('id', doc.pk)

View File

@ -23,7 +23,9 @@ from ...parsers import get_parser_class_for_mime_type
logger = logging.getLogger(__name__)
def handle_document(document):
def handle_document(document_id):
document = Document.objects.get(id=document_id)
mime_type = document.mime_type
parser_class = get_parser_class_for_mime_type(mime_type)
@ -98,9 +100,12 @@ class Command(Renderable, BaseCommand):
else:
documents = Document.objects.all()
documents_to_process = list(filter(
lambda d: overwrite or not d.archive_checksum,
documents
document_ids = list(map(
lambda doc: doc.id,
filter(
lambda d: overwrite or not d.archive_checksum,
documents
)
))
logging.getLogger().handlers[0].level = logging.ERROR
@ -108,7 +113,7 @@ class Command(Renderable, BaseCommand):
list(tqdm.tqdm(
pool.imap_unordered(
handle_document,
documents_to_process
document_ids
),
total=len(documents_to_process)
total=len(document_ids)
))

View File

@ -32,7 +32,7 @@ class TestArchiver(DirectoriesMixin, TestCase):
shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "0000001.pdf"))
self.make_models()
handle_document(self.d1)
handle_document(self.d1.pk)
doc = Document.objects.get(id=self.d1.id)