added a simple document archiver that produces archived versions of all originals.
This commit is contained in:
parent f7e554a3c1
commit 074b682312
src/documents/management/commands/document_archiver.py (new file, 89 lines)
@@ -0,0 +1,89 @@
import logging
import multiprocessing
import os
import shutil
import uuid

from django.conf import settings
from django.core.management.base import BaseCommand
from whoosh.writing import AsyncWriter

from documents.models import Document

from ... import index
from ...mixins import Renderable
from ...parsers import get_parser_class_for_mime_type


def handle_document(document):
    mime_type = document.mime_type

    parser_class = get_parser_class_for_mime_type(mime_type)

    parser = parser_class(logging_group=uuid.uuid4())
    parser.parse(document.source_path, mime_type)

    if parser.get_archive_path():
        shutil.copy(parser.get_archive_path(), document.archive_path)
    else:
        logging.getLogger(__name__).warning(
            f"Parser {parser} did not produce an archived document "
            f"for {document.file_name}"
        )

    # The freshly parsed text is typically more accurate than what was
    # extracted at import time, so update the stored content as well.
    if parser.get_text():
        document.content = parser.get_text()
        document.save()

    parser.cleanup()


class Command(Renderable, BaseCommand):

    help = """
        Produces archived versions of all original documents that do not
        yet have one. Use -f/--overwrite to recreate archived versions
        that already exist.
    """.replace("    ", "")

    def __init__(self, *args, **kwargs):
        self.verbosity = 0
        BaseCommand.__init__(self, *args, **kwargs)

    def add_arguments(self, parser):
        parser.add_argument(
            "-f", "--overwrite",
            default=False,
            action="store_true",
            help="Recreates the archived document for documents that already "
                 "have an archived version."
        )

    def handle(self, *args, **options):

        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)

        overwrite = options["overwrite"]

        documents = Document.objects.all()

        # Materialize the filter into a list: it is iterated twice, once by
        # the worker pool and again when updating the search index below.
        documents_to_process = list(filter(
            lambda d: overwrite or not os.path.exists(d.archive_path),
            documents
        ))

        with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
            # Consume the imap iterator so that all documents are processed
            # before the pool shuts down.
            list(pool.imap(handle_document, documents_to_process))

        ix = index.open_index()
        with AsyncWriter(ix) as writer:
            for d in documents_to_process:
                index.update_document(writer, d)
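Since Django management commands are named after their module, the new command runs like any other. A quick sketch of an invocation, assuming the usual manage.py entry point of a paperless install:

    python3 manage.py document_archiver              # archive documents that lack an archived version
    python3 manage.py document_archiver --overwrite  # recreate existing archived versions as well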