Merge pull request #1451 from paperless-ngx/feature-better-redo-ocr

Feature: Even better re-do of OCR
This commit is contained in:
Quinn Casey 2022-08-25 17:01:54 -07:00 committed by GitHub
commit 44e596b0c4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 70 additions and 146 deletions

View File

@ -122,6 +122,10 @@ def delete(doc_ids):
def redo_ocr(doc_ids):
async_task("documents.tasks.redo_ocr", document_ids=doc_ids)
for document_id in doc_ids:
async_task(
"documents.tasks.update_document_archive_file",
document_id=document_id,
)
return "OK"

View File

@ -1,85 +1,18 @@
import hashlib
import logging
import multiprocessing
import os
import shutil
import uuid
import tqdm
from django import db
from django.conf import settings
from django.core.management.base import BaseCommand
from django.db import transaction
from documents.models import Document
from filelock import FileLock
from ... import index
from ...file_handling import create_source_path_directory
from ...file_handling import generate_unique_filename
from ...parsers import get_parser_class_for_mime_type
from documents.tasks import update_document_archive_file
logger = logging.getLogger("paperless.management.archiver")
def handle_document(document_id):
document = Document.objects.get(id=document_id)
mime_type = document.mime_type
parser_class = get_parser_class_for_mime_type(mime_type)
if not parser_class:
logger.error(
f"No parser found for mime type {mime_type}, cannot "
f"archive document {document} (ID: {document_id})",
)
return
parser = parser_class(logging_group=uuid.uuid4())
try:
parser.parse(document.source_path, mime_type, document.get_public_filename())
thumbnail = parser.get_thumbnail(
document.source_path,
mime_type,
document.get_public_filename(),
)
if parser.get_archive_path():
with transaction.atomic():
with open(parser.get_archive_path(), "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
# I'm going to save first so that in case the file move
# fails, the database is rolled back.
# We also don't use save() since that triggers the filehandling
# logic, and we don't want that yet (file not yet in place)
document.archive_filename = generate_unique_filename(
document,
archive_filename=True,
)
Document.objects.filter(pk=document.pk).update(
archive_checksum=checksum,
content=parser.get_text(),
archive_filename=document.archive_filename,
)
with FileLock(settings.MEDIA_LOCK):
create_source_path_directory(document.archive_path)
shutil.move(parser.get_archive_path(), document.archive_path)
shutil.move(thumbnail, document.thumbnail_path)
with index.open_index_writer() as writer:
index.update_document(writer, document)
except Exception:
logger.exception(
f"Error while parsing document {document} " f"(ID: {document_id})",
)
finally:
parser.cleanup()
class Command(BaseCommand):
help = """
@ -146,7 +79,7 @@ class Command(BaseCommand):
with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
list(
tqdm.tqdm(
pool.imap_unordered(handle_document, document_ids),
pool.imap_unordered(update_document_archive_file, document_ids),
total=len(document_ids),
disable=options["no_progress_bar"],
),

View File

@ -1,35 +0,0 @@
import tqdm
from django.core.management.base import BaseCommand
from documents.tasks import redo_ocr
class Command(BaseCommand):
help = """
This will rename all documents to match the latest filename format.
""".replace(
" ",
"",
)
def add_arguments(self, parser):
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown",
)
parser.add_argument(
"documents",
nargs="+",
help="Document primary keys for re-processing OCR on",
)
def handle(self, *args, **options):
doc_pks = tqdm.tqdm(
options["documents"],
disable=options["no_progress_bar"],
)
redo_ocr(doc_pks)

View File

@ -1,6 +1,8 @@
import hashlib
import logging
import os
import shutil
import uuid
from pathlib import Path
from typing import Type
@ -8,7 +10,7 @@ import tqdm
from asgiref.sync import async_to_sync
from channels.layers import get_channel_layer
from django.conf import settings
from django.core.exceptions import ObjectDoesNotExist
from django.db import transaction
from django.db.models.signals import post_save
from documents import barcodes
from documents import index
@ -17,6 +19,8 @@ from documents.classifier import DocumentClassifier
from documents.classifier import load_classifier
from documents.consumer import Consumer
from documents.consumer import ConsumerError
from documents.file_handling import create_source_path_directory
from documents.file_handling import generate_unique_filename
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
@ -24,8 +28,8 @@ from documents.models import StoragePath
from documents.models import Tag
from documents.parsers import DocumentParser
from documents.parsers import get_parser_class_for_mime_type
from documents.parsers import ParseError
from documents.sanity_checker import SanityCheckFailedException
from filelock import FileLock
from whoosh.writing import AsyncWriter
@ -213,44 +217,62 @@ def bulk_update_documents(document_ids):
index.update_document(writer, doc)
def redo_ocr(document_ids):
all_docs = Document.objects.all()
def update_document_archive_file(document_id):
"""
Re-creates the archive file of a document, including new OCR content and thumbnail
"""
document = Document.objects.get(id=document_id)
for doc_pk in document_ids:
try:
logger.info(f"Parsing document {doc_pk}")
doc: Document = all_docs.get(pk=doc_pk)
except ObjectDoesNotExist:
logger.error(f"Document {doc_pk} does not exist")
continue
mime_type = document.mime_type
# Get the correct parser for this mime type
parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(
doc.mime_type,
parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(mime_type)
if not parser_class:
logger.error(
f"No parser found for mime type {mime_type}, cannot "
f"archive document {document} (ID: {document_id})",
)
document_parser: DocumentParser = parser_class(
"redo-ocr",
return
parser: DocumentParser = parser_class(logging_group=uuid.uuid4())
try:
parser.parse(document.source_path, mime_type, document.get_public_filename())
thumbnail = parser.get_thumbnail(
document.source_path,
mime_type,
document.get_public_filename(),
)
# Create a file path to copy the original file to for working on
temp_file = (Path(document_parser.tempdir) / Path("new-ocr-file")).resolve()
if parser.get_archive_path():
with transaction.atomic():
with open(parser.get_archive_path(), "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
# I'm going to save first so that in case the file move
# fails, the database is rolled back.
# We also don't use save() since that triggers the filehandling
# logic, and we don't want that yet (file not yet in place)
document.archive_filename = generate_unique_filename(
document,
archive_filename=True,
)
Document.objects.filter(pk=document.pk).update(
archive_checksum=checksum,
content=parser.get_text(),
archive_filename=document.archive_filename,
)
with FileLock(settings.MEDIA_LOCK):
create_source_path_directory(document.archive_path)
shutil.move(parser.get_archive_path(), document.archive_path)
shutil.move(thumbnail, document.thumbnail_path)
shutil.copy(doc.source_path, temp_file)
with index.open_index_writer() as writer:
index.update_document(writer, document)
try:
logger.info(
f"Using {type(document_parser).__name__} for document",
)
# Try to re-parse the document into text
document_parser.parse(str(temp_file), doc.mime_type)
doc.content = document_parser.get_text()
doc.save()
logger.info("Document OCR updated")
except ParseError as e:
logger.error(f"Error parsing document: {e}")
finally:
# Remove the file path if it was created
if temp_file.exists() and temp_file.is_file():
temp_file.unlink()
except Exception:
logger.exception(
f"Error while parsing document {document} " f"(ID: {document_id})",
)
finally:
parser.cleanup()

View File

@ -10,8 +10,8 @@ from django.core.management import call_command
from django.test import override_settings
from django.test import TestCase
from documents.file_handling import generate_filename
from documents.management.commands.document_archiver import handle_document
from documents.models import Document
from documents.tasks import update_document_archive_file
from documents.tests.utils import DirectoriesMixin
@ -46,7 +46,7 @@ class TestArchiver(DirectoriesMixin, TestCase):
os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"),
)
handle_document(doc.pk)
update_document_archive_file(doc.pk)
doc = Document.objects.get(id=doc.id)
@ -63,7 +63,7 @@ class TestArchiver(DirectoriesMixin, TestCase):
doc.save()
shutil.copy(sample_file, doc.source_path)
handle_document(doc.pk)
update_document_archive_file(doc.pk)
doc = Document.objects.get(id=doc.id)
@ -94,8 +94,8 @@ class TestArchiver(DirectoriesMixin, TestCase):
os.path.join(self.dirs.originals_dir, f"document_01.pdf"),
)
handle_document(doc2.pk)
handle_document(doc1.pk)
update_document_archive_file(doc2.pk)
update_document_archive_file(doc1.pk)
doc1 = Document.objects.get(id=doc1.id)
doc2 = Document.objects.get(id=doc2.id)