added a simple document archiver that produces archived versions of all originals.
This commit is contained in:
parent f7e554a3c1
commit 074b682312
src/documents/management/commands/document_archiver.py (new file, 89 lines)
@@ -0,0 +1,89 @@
import logging
import multiprocessing
import os
import shutil
import uuid

from django.conf import settings
from django.core.management.base import BaseCommand
from whoosh.writing import AsyncWriter

from documents.models import Document

from ... import index
from ...mixins import Renderable
from ...parsers import get_parser_class_for_mime_type


def handle_document(document):
    mime_type = document.mime_type

    parser_class = get_parser_class_for_mime_type(mime_type)

    parser = parser_class(logging_group=uuid.uuid4())
    parser.parse(document.source_path, mime_type)

    if parser.get_archive_path():
        shutil.copy(parser.get_archive_path(), document.archive_path)
    else:
        logging.getLogger(__name__).warning(
            f"Parser {parser} did not produce an archived document "
            f"for {document.file_name}"
        )

    # The freshly parsed text is typically more accurate than what was
    # extracted at import time, so update the stored content as well.
    if parser.get_text():
        document.content = parser.get_text()
        document.save()

    parser.cleanup()


class Command(Renderable, BaseCommand):

    help = """
        Produces archived versions of all original documents that do not
        yet have one. Use -f/--overwrite to recreate archived versions
        that already exist.
    """.replace("    ", "")

    def __init__(self, *args, **kwargs):
        self.verbosity = 0
        BaseCommand.__init__(self, *args, **kwargs)

    def add_arguments(self, parser):
        parser.add_argument(
            "-f", "--overwrite",
            default=False,
            action="store_true",
            help="Recreates the archived document for documents that already "
                 "have an archived version."
        )

    def handle(self, *args, **options):

        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)

        overwrite = options["overwrite"]

        documents = Document.objects.all()

        # Materialize the filter into a list: it is iterated twice, once by
        # the worker pool and again when updating the search index below.
        documents_to_process = list(filter(
            lambda d: overwrite or not os.path.exists(d.archive_path),
            documents
        ))

        with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
            # Consume the imap iterator so that all documents are processed
            # before the pool shuts down.
            list(pool.imap(handle_document, documents_to_process))

        ix = index.open_index()
        with AsyncWriter(ix) as writer:
            for d in documents_to_process:
                index.update_document(writer, d)
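Since Django management commands are named after their module, the new command runs like any other. A quick sketch of an invocation, assuming the usual manage.py entry point of a paperless install:

    python3 manage.py document_archiver              # archive documents that lack an archived version
    python3 manage.py document_archiver --overwrite  # recreate existing archived versions as well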