mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-07-28 18:24:38 -05:00
Merge pull request #1451 from paperless-ngx/feature-better-redo-ocr
Feature: Even better re-do of OCR
This commit is contained in:
@@ -1,85 +1,18 @@
|
||||
import hashlib
|
||||
import logging
|
||||
import multiprocessing
|
||||
import os
|
||||
import shutil
|
||||
import uuid
|
||||
|
||||
import tqdm
|
||||
from django import db
|
||||
from django.conf import settings
|
||||
from django.core.management.base import BaseCommand
|
||||
from django.db import transaction
|
||||
from documents.models import Document
|
||||
from filelock import FileLock
|
||||
|
||||
from ... import index
|
||||
from ...file_handling import create_source_path_directory
|
||||
from ...file_handling import generate_unique_filename
|
||||
from ...parsers import get_parser_class_for_mime_type
|
||||
from documents.tasks import update_document_archive_file
|
||||
|
||||
|
||||
logger = logging.getLogger("paperless.management.archiver")
|
||||
|
||||
|
||||
def handle_document(document_id):
|
||||
document = Document.objects.get(id=document_id)
|
||||
|
||||
mime_type = document.mime_type
|
||||
|
||||
parser_class = get_parser_class_for_mime_type(mime_type)
|
||||
|
||||
if not parser_class:
|
||||
logger.error(
|
||||
f"No parser found for mime type {mime_type}, cannot "
|
||||
f"archive document {document} (ID: {document_id})",
|
||||
)
|
||||
return
|
||||
|
||||
parser = parser_class(logging_group=uuid.uuid4())
|
||||
|
||||
try:
|
||||
parser.parse(document.source_path, mime_type, document.get_public_filename())
|
||||
|
||||
thumbnail = parser.get_thumbnail(
|
||||
document.source_path,
|
||||
mime_type,
|
||||
document.get_public_filename(),
|
||||
)
|
||||
|
||||
if parser.get_archive_path():
|
||||
with transaction.atomic():
|
||||
with open(parser.get_archive_path(), "rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
# I'm going to save first so that in case the file move
|
||||
# fails, the database is rolled back.
|
||||
# We also don't use save() since that triggers the filehandling
|
||||
# logic, and we don't want that yet (file not yet in place)
|
||||
document.archive_filename = generate_unique_filename(
|
||||
document,
|
||||
archive_filename=True,
|
||||
)
|
||||
Document.objects.filter(pk=document.pk).update(
|
||||
archive_checksum=checksum,
|
||||
content=parser.get_text(),
|
||||
archive_filename=document.archive_filename,
|
||||
)
|
||||
with FileLock(settings.MEDIA_LOCK):
|
||||
create_source_path_directory(document.archive_path)
|
||||
shutil.move(parser.get_archive_path(), document.archive_path)
|
||||
shutil.move(thumbnail, document.thumbnail_path)
|
||||
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, document)
|
||||
|
||||
except Exception:
|
||||
logger.exception(
|
||||
f"Error while parsing document {document} " f"(ID: {document_id})",
|
||||
)
|
||||
finally:
|
||||
parser.cleanup()
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
|
||||
help = """
|
||||
@@ -146,7 +79,7 @@ class Command(BaseCommand):
|
||||
with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
|
||||
list(
|
||||
tqdm.tqdm(
|
||||
pool.imap_unordered(handle_document, document_ids),
|
||||
pool.imap_unordered(update_document_archive_file, document_ids),
|
||||
total=len(document_ids),
|
||||
disable=options["no_progress_bar"],
|
||||
),
|
||||
|
@@ -1,35 +0,0 @@
|
||||
import tqdm
|
||||
from django.core.management.base import BaseCommand
|
||||
from documents.tasks import redo_ocr
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
|
||||
help = """
|
||||
This will rename all documents to match the latest filename format.
|
||||
""".replace(
|
||||
" ",
|
||||
"",
|
||||
)
|
||||
|
||||
def add_arguments(self, parser):
|
||||
|
||||
parser.add_argument(
|
||||
"--no-progress-bar",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="If set, the progress bar will not be shown",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"documents",
|
||||
nargs="+",
|
||||
help="Document primary keys for re-processing OCR on",
|
||||
)
|
||||
|
||||
def handle(self, *args, **options):
|
||||
doc_pks = tqdm.tqdm(
|
||||
options["documents"],
|
||||
disable=options["no_progress_bar"],
|
||||
)
|
||||
redo_ocr(doc_pks)
|
Reference in New Issue
Block a user