Merge pull request #1451 from paperless-ngx/feature-better-redo-ocr

Feature: Even better re-do of OCR
This commit is contained in:
Quinn Casey
2022-08-25 17:01:54 -07:00
committed by GitHub
5 changed files with 70 additions and 146 deletions

View File

@@ -1,85 +1,18 @@
import hashlib
import logging
import multiprocessing
import os
import shutil
import uuid
import tqdm
from django import db
from django.conf import settings
from django.core.management.base import BaseCommand
from django.db import transaction
from documents.models import Document
from filelock import FileLock
from ... import index
from ...file_handling import create_source_path_directory
from ...file_handling import generate_unique_filename
from ...parsers import get_parser_class_for_mime_type
from documents.tasks import update_document_archive_file
logger = logging.getLogger("paperless.management.archiver")
def handle_document(document_id):
    """Re-parse one document and store its archive version and thumbnail.

    Looks the document up by ``document_id``, runs the parser registered for
    its mime type and, when the parser produced an archive file, records the
    new checksum, text content and archive filename in the database, moves
    the archive file and thumbnail into place, and updates the search index.

    Nothing is raised to the caller: a missing parser is logged as an error,
    and any exception during parsing or storing is logged with its traceback.
    The parser is always cleaned up once it has been created.
    """
    document = Document.objects.get(id=document_id)
    mime_type = document.mime_type

    parser_cls = get_parser_class_for_mime_type(mime_type)
    if not parser_cls:
        logger.error(
            f"No parser found for mime type {mime_type}, cannot "
            f"archive document {document} (ID: {document_id})",
        )
        return

    parser = parser_cls(logging_group=uuid.uuid4())

    try:
        public_name = document.get_public_filename()
        parser.parse(document.source_path, mime_type, public_name)

        thumbnail = parser.get_thumbnail(
            document.source_path,
            mime_type,
            document.get_public_filename(),
        )

        # Without an archive file there is nothing to store; the ``finally``
        # clause below still cleans the parser up.
        if not parser.get_archive_path():
            return

        with transaction.atomic():
            with open(parser.get_archive_path(), "rb") as archive_file:
                archive_bytes = archive_file.read()
            archive_checksum = hashlib.md5(archive_bytes).hexdigest()

            # The database is written before the files are moved so that a
            # failed move rolls the database changes back. ``save()`` is
            # deliberately avoided: it triggers the file handling logic,
            # which must not run while the file is not yet in place.
            document.archive_filename = generate_unique_filename(
                document,
                archive_filename=True,
            )
            Document.objects.filter(pk=document.pk).update(
                archive_checksum=archive_checksum,
                content=parser.get_text(),
                archive_filename=document.archive_filename,
            )

            with FileLock(settings.MEDIA_LOCK):
                create_source_path_directory(document.archive_path)
                shutil.move(parser.get_archive_path(), document.archive_path)
                shutil.move(thumbnail, document.thumbnail_path)

        with index.open_index_writer() as index_writer:
            index.update_document(index_writer, document)
    except Exception:
        logger.exception(
            f"Error while parsing document {document} " f"(ID: {document_id})",
        )
    finally:
        parser.cleanup()
class Command(BaseCommand):
help = """
@@ -146,7 +79,7 @@ class Command(BaseCommand):
with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
list(
tqdm.tqdm(
pool.imap_unordered(handle_document, document_ids),
pool.imap_unordered(update_document_archive_file, document_ids),
total=len(document_ids),
disable=options["no_progress_bar"],
),

View File

@@ -1,35 +0,0 @@
import tqdm
from django.core.management.base import BaseCommand
from documents.tasks import redo_ocr
class Command(BaseCommand):
    """Management command that re-processes OCR for the given documents."""

    # Help text shown on the command line. It now describes this command;
    # the previous text described renaming documents to the latest filename
    # format, which this command does not do, and was passed through a
    # ``replace`` call that removed every space from it.
    help = (
        "This will re-process OCR for the documents with the given "
        "primary keys."
    )

    def add_arguments(self, parser):
        """Register the command-line arguments on ``parser``.

        ``--no-progress-bar`` is an optional flag that hides the progress
        bar; ``documents`` takes one or more document primary keys.
        """
        parser.add_argument(
            "--no-progress-bar",
            default=False,
            action="store_true",
            help="If set, the progress bar will not be shown",
        )
        parser.add_argument(
            "documents",
            nargs="+",
            help="Document primary keys for re-processing OCR on",
        )

    def handle(self, *args, **options):
        """Pass the requested primary keys to ``redo_ocr``.

        The keys are wrapped in a ``tqdm`` progress bar, which is disabled
        when ``--no-progress-bar`` was given.
        """
        doc_pks = tqdm.tqdm(
            options["documents"],
            disable=options["no_progress_bar"],
        )
        redo_ocr(doc_pks)