Mirror of https://github.com/paperless-ngx/paperless-ngx.git, synced 2025-04-17 10:13:56 -05:00
proper document archiver with progress bar.
This commit is contained in:
parent e22769ca63
commit 72a4ff0fca
@@ -5,38 +5,55 @@ import logging
 import os
 import shutil
 import uuid
+from time import sleep
 
+import tqdm
 from django.conf import settings
 from django.core.management.base import BaseCommand
+from django.db import transaction
 from whoosh.writing import AsyncWriter
 
 from documents.models import Document
 from ... import index
+from ...file_handling import create_source_path_directory
 from ...mixins import Renderable
 from ...parsers import get_parser_class_for_mime_type
 
 
+logger = logging.getLogger(__name__)
+
+
 def handle_document(document):
     mime_type = document.mime_type
 
     parser_class = get_parser_class_for_mime_type(mime_type)
 
     parser = parser_class(logging_group=uuid.uuid4())
 
-    parser.parse(document.source_path, mime_type)
+    try:
+        parser.parse(document.source_path, mime_type)
 
-    if parser.get_archive_path():
-        shutil.copy(parser.get_archive_path(), document.archive_path)
-        with document.archive_file as f:
-            document.archive_checksum = hashlib.md5(f.read()).hexdigest()
-    else:
-        logging.getLogger(__name__).warning(
-            f"Parser {parser} did not produce an archived document "
-            f"for {document.file_name}"
-        )
+        if parser.get_archive_path():
+            with transaction.atomic():
+                with open(parser.get_archive_path(), 'rb') as f:
+                    checksum = hashlib.md5(f.read()).hexdigest()
+                # i'm going to save first so that in case the file move
+                # fails, the database is rolled back.
+                # we also don't use save() since that triggers the filehandling
+                # logic, and we don't want that yet (file not yet in place)
+                Document.objects.filter(pk=document.pk).update(
+                    archive_checksum=checksum,
+                    content=parser.get_text()
+                )
+                create_source_path_directory(document.archive_path)
+                shutil.move(parser.get_archive_path(), document.archive_path)
 
-    if parser.get_text():
-        document.content = parser.get_text()
-    document.save()
+        with AsyncWriter(index.open_index()) as writer:
+            index.update_document(writer, document)
 
-    parser.cleanup()
+    except Exception as e:
+        logger.error(f"Error while parsing document {document}: {str(e)}")
+    finally:
+        parser.cleanup()
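The checksum step reads the whole archive file into memory before hashing it, which is fine for typical documents; for very large files the usual idiom is a chunked read. A minimal stand-alone sketch (the helper name is illustrative, not part of the commit):

import hashlib


def file_md5(path, blocksize=1 << 20):
    # Hash the file in 1 MiB blocks instead of reading it into memory at once.
    digest = hashlib.md5()
    with open(path, 'rb') as f:
        for block in iter(lambda: f.read(blocksize), b''):
            digest.update(block)
    return digest.hexdigest()

The ordering inside transaction.atomic() is deliberate, as the inline comments note: the database row is updated first and the file is moved second, so an exception from shutil.move rolls the checksum update back rather than leaving the database pointing at a file that never arrived.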
@@ -61,6 +78,14 @@ class Command(Renderable, BaseCommand):
             help="Recreates the archived document for documents that already "
                  "have an archived version."
         )
+        parser.add_argument(
+            "-d", "--document",
+            default=None,
+            type=int,
+            required=False,
+            help="Specify the ID of a document, and this command will only "
+                 "run on this specific document."
+        )
 
     def handle(self, *args, **options):
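The new -d/--document flag restricts a run to a single document. Since BaseCommand.add_arguments receives a standard argparse parser, the flag behaves like this stand-alone sketch (the parser construction here is illustrative):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "-d", "--document",
    default=None,
    type=int,
    required=False,
    help="Specify the ID of a document, and this command will only "
         "run on this specific document."
)

options = vars(parser.parse_args(["-d", "123"]))
assert options["document"] == 123  # an omitted flag would yield None

Assuming the module keeps the usual management-command layout, an invocation along the lines of `python manage.py document_archiver -d 123` would then archive only that document; the exact command name comes from the module's file name, which this diff does not show.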
@@ -68,22 +93,22 @@ class Command(Renderable, BaseCommand):
 
         overwrite = options["overwrite"]
 
-        documents = Document.objects.all()
+        if options['document']:
+            documents = Document.objects.filter(pk=options['document'])
+        else:
+            documents = Document.objects.all()
 
-        documents_to_process = filter(
-            lambda d: overwrite or not os.path.exists(d.archive_path),
+        documents_to_process = list(filter(
+            lambda d: overwrite or not d.archive_checksum,
             documents
-        )
+        ))
 
+        logging.getLogger().handlers[0].level = logging.ERROR
         with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
-            list(
-                pool.imap(
+            list(tqdm.tqdm(
+                pool.imap_unordered(
                     handle_document,
-                    list(documents_to_process)
-                )
-            )
-
-        ix = index.open_index()
-        with AsyncWriter(ix) as writer:
-            for d in documents_to_process:
-                index.update_document(writer, d)
+                    documents_to_process
+                ),
+                total=len(documents_to_process)
+            ))
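The rewritten handle() wires the worker pool into tqdm: imap_unordered hands back results as workers finish (order is irrelevant here because handle_document returns nothing), total= gives the bar a length the lazy iterator cannot report itself, and the outer list(...) drains the iterator so every document is actually processed. A runnable sketch of the same pattern with a stand-in worker:

import multiprocessing

import tqdm


def work(n):
    # Stand-in for handle_document; any picklable callable works here.
    return n * n


if __name__ == "__main__":
    items = list(range(1000))
    with multiprocessing.Pool(processes=4) as pool:
        # tqdm wraps the result iterator; list() forces it to completion
        # before the pool is torn down.
        results = list(tqdm.tqdm(
            pool.imap_unordered(work, items),
            total=len(items)
        ))

Setting the root logger's first handler to ERROR just before the pool starts keeps per-document log output from breaking the progress bar, which is also why failures inside handle_document are now reported through logger.error instead.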