Merge pull request #1451 from paperless-ngx/feature-better-redo-ocr

Feature: Even better re-do of OCR
This commit is contained in:
Quinn Casey 2022-08-25 17:01:54 -07:00 committed by GitHub
commit 44e596b0c4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 70 additions and 146 deletions

View File

@ -122,6 +122,10 @@ def delete(doc_ids):
def redo_ocr(doc_ids):
    """
    Queue an archive-file regeneration task for every given document.

    Each id in ``doc_ids`` is passed, one call per document, to
    ``async_task`` with ``documents.tasks.update_document_archive_file`` as
    the target. Returns the string ``"OK"`` after all ids have been handed
    over.
    """
    task_path = "documents.tasks.update_document_archive_file"

    for doc_id in doc_ids:
        async_task(task_path, document_id=doc_id)

    return "OK"

View File

@ -1,85 +1,18 @@
import hashlib
import logging import logging
import multiprocessing import multiprocessing
import os import os
import shutil
import uuid
import tqdm import tqdm
from django import db from django import db
from django.conf import settings from django.conf import settings
from django.core.management.base import BaseCommand from django.core.management.base import BaseCommand
from django.db import transaction
from documents.models import Document from documents.models import Document
from filelock import FileLock from documents.tasks import update_document_archive_file
from ... import index
from ...file_handling import create_source_path_directory
from ...file_handling import generate_unique_filename
from ...parsers import get_parser_class_for_mime_type
logger = logging.getLogger("paperless.management.archiver") logger = logging.getLogger("paperless.management.archiver")
def handle_document(document_id):
    """
    Re-parse one document and replace its archive file, thumbnail and content.

    The document is looked up by primary key and run through the parser
    class registered for its mime type. If no parser class is found, an
    error is logged and nothing else happens. If the parser produces an
    archive file, the archive checksum, extracted text and archive filename
    are written to the database, the archive file and thumbnail are moved
    into place, and the search index entry is updated.

    Exceptions raised while parsing or storing the results are logged and
    not re-raised; the parser's ``cleanup()`` always runs. The initial
    document lookup happens outside that handler, so a failing lookup
    propagates to the caller.

    :param document_id: primary key of the document to process
    """
    document = Document.objects.get(id=document_id)
    mime_type = document.mime_type

    parser_class = get_parser_class_for_mime_type(mime_type)
    if not parser_class:
        logger.error(
            f"No parser found for mime type {mime_type}, cannot "
            f"archive document {document} (ID: {document_id})",
        )
        return

    parser = parser_class(logging_group=uuid.uuid4())

    try:
        parser.parse(document.source_path, mime_type, document.get_public_filename())

        thumbnail = parser.get_thumbnail(
            document.source_path,
            mime_type,
            document.get_public_filename(),
        )

        archive_path = parser.get_archive_path()
        if archive_path:
            with transaction.atomic():
                # Hash the archive in fixed-size chunks so a large file is
                # never held in memory in one piece.
                digest = hashlib.md5()
                with open(archive_path, "rb") as f:
                    for chunk in iter(lambda: f.read(1024 * 1024), b""):
                        digest.update(chunk)
                checksum = digest.hexdigest()

                # I'm going to save first so that in case the file move
                # fails, the database is rolled back.
                # We also don't use save() since that triggers the filehandling
                # logic, and we don't want that yet (file not yet in place)
                document.archive_filename = generate_unique_filename(
                    document,
                    archive_filename=True,
                )
                Document.objects.filter(pk=document.pk).update(
                    archive_checksum=checksum,
                    content=parser.get_text(),
                    archive_filename=document.archive_filename,
                )
                with FileLock(settings.MEDIA_LOCK):
                    create_source_path_directory(document.archive_path)
                    shutil.move(archive_path, document.archive_path)
                    shutil.move(thumbnail, document.thumbnail_path)

            with index.open_index_writer() as writer:
                index.update_document(writer, document)

    except Exception:
        logger.exception(
            f"Error while parsing document {document} " f"(ID: {document_id})",
        )
    finally:
        parser.cleanup()
class Command(BaseCommand): class Command(BaseCommand):
help = """ help = """
@ -146,7 +79,7 @@ class Command(BaseCommand):
with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool: with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
list( list(
tqdm.tqdm( tqdm.tqdm(
pool.imap_unordered(handle_document, document_ids), pool.imap_unordered(update_document_archive_file, document_ids),
total=len(document_ids), total=len(document_ids),
disable=options["no_progress_bar"], disable=options["no_progress_bar"],
), ),

View File

@ -1,35 +0,0 @@
import tqdm
from django.core.management.base import BaseCommand
from documents.tasks import redo_ocr
class Command(BaseCommand):
    """Management command that re-processes OCR for documents given by primary key."""

    # Describes what this command actually does; a plain literal avoids any
    # whitespace post-processing of the text.
    help = "Re-processes OCR for the documents with the given primary keys."

    def add_arguments(self, parser):
        """Register the progress-bar switch and the list of document primary keys."""
        parser.add_argument(
            "--no-progress-bar",
            default=False,
            action="store_true",
            help="If set, the progress bar will not be shown",
        )
        parser.add_argument(
            "documents",
            nargs="+",
            # Reject non-numeric keys at argument parsing time instead of
            # letting them reach the database lookup.
            type=int,
            help="Document primary keys for re-processing OCR on",
        )

    def handle(self, *args, **options):
        """Pass the requested primary keys, wrapped in a progress bar, to ``redo_ocr``."""
        doc_pks = tqdm.tqdm(
            options["documents"],
            disable=options["no_progress_bar"],
        )
        redo_ocr(doc_pks)

View File

@ -1,6 +1,8 @@
import hashlib
import logging import logging
import os import os
import shutil import shutil
import uuid
from pathlib import Path from pathlib import Path
from typing import Type from typing import Type
@ -8,7 +10,7 @@ import tqdm
from asgiref.sync import async_to_sync from asgiref.sync import async_to_sync
from channels.layers import get_channel_layer from channels.layers import get_channel_layer
from django.conf import settings from django.conf import settings
from django.core.exceptions import ObjectDoesNotExist from django.db import transaction
from django.db.models.signals import post_save from django.db.models.signals import post_save
from documents import barcodes from documents import barcodes
from documents import index from documents import index
@ -17,6 +19,8 @@ from documents.classifier import DocumentClassifier
from documents.classifier import load_classifier from documents.classifier import load_classifier
from documents.consumer import Consumer from documents.consumer import Consumer
from documents.consumer import ConsumerError from documents.consumer import ConsumerError
from documents.file_handling import create_source_path_directory
from documents.file_handling import generate_unique_filename
from documents.models import Correspondent from documents.models import Correspondent
from documents.models import Document from documents.models import Document
from documents.models import DocumentType from documents.models import DocumentType
@ -24,8 +28,8 @@ from documents.models import StoragePath
from documents.models import Tag from documents.models import Tag
from documents.parsers import DocumentParser from documents.parsers import DocumentParser
from documents.parsers import get_parser_class_for_mime_type from documents.parsers import get_parser_class_for_mime_type
from documents.parsers import ParseError
from documents.sanity_checker import SanityCheckFailedException from documents.sanity_checker import SanityCheckFailedException
from filelock import FileLock
from whoosh.writing import AsyncWriter from whoosh.writing import AsyncWriter
@ -213,44 +217,62 @@ def bulk_update_documents(document_ids):
index.update_document(writer, doc) index.update_document(writer, doc)
def update_document_archive_file(document_id):
    """
    Re-creates the archive file of a document, including new OCR content and thumbnail

    The document is looked up by primary key and run through the parser
    class registered for its mime type. If no parser class is found, an
    error is logged and nothing else happens. If the parser produces an
    archive file, the archive checksum, extracted text and archive filename
    are written to the database, the archive file and thumbnail are moved
    into place, and the search index entry is updated.

    Exceptions raised while parsing or storing the results are logged and
    not re-raised; the parser's ``cleanup()`` always runs. The initial
    document lookup happens outside that handler, so a failing lookup
    propagates to the caller.

    :param document_id: primary key of the document to process
    """
    document = Document.objects.get(id=document_id)
    mime_type = document.mime_type

    parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(mime_type)
    if not parser_class:
        logger.error(
            f"No parser found for mime type {mime_type}, cannot "
            f"archive document {document} (ID: {document_id})",
        )
        return

    parser: DocumentParser = parser_class(logging_group=uuid.uuid4())

    try:
        parser.parse(document.source_path, mime_type, document.get_public_filename())

        thumbnail = parser.get_thumbnail(
            document.source_path,
            mime_type,
            document.get_public_filename(),
        )

        archive_path = parser.get_archive_path()
        if archive_path:
            with transaction.atomic():
                # Hash the archive in fixed-size chunks so a large file is
                # never held in memory in one piece.
                digest = hashlib.md5()
                with open(archive_path, "rb") as f:
                    for chunk in iter(lambda: f.read(1024 * 1024), b""):
                        digest.update(chunk)
                checksum = digest.hexdigest()

                # I'm going to save first so that in case the file move
                # fails, the database is rolled back.
                # We also don't use save() since that triggers the filehandling
                # logic, and we don't want that yet (file not yet in place)
                document.archive_filename = generate_unique_filename(
                    document,
                    archive_filename=True,
                )
                Document.objects.filter(pk=document.pk).update(
                    archive_checksum=checksum,
                    content=parser.get_text(),
                    archive_filename=document.archive_filename,
                )
                with FileLock(settings.MEDIA_LOCK):
                    create_source_path_directory(document.archive_path)
                    shutil.move(archive_path, document.archive_path)
                    shutil.move(thumbnail, document.thumbnail_path)

            with index.open_index_writer() as writer:
                index.update_document(writer, document)

    except Exception:
        logger.exception(
            f"Error while parsing document {document} " f"(ID: {document_id})",
        )
    finally:
        parser.cleanup()

View File

@ -10,8 +10,8 @@ from django.core.management import call_command
from django.test import override_settings from django.test import override_settings
from django.test import TestCase from django.test import TestCase
from documents.file_handling import generate_filename from documents.file_handling import generate_filename
from documents.management.commands.document_archiver import handle_document
from documents.models import Document from documents.models import Document
from documents.tasks import update_document_archive_file
from documents.tests.utils import DirectoriesMixin from documents.tests.utils import DirectoriesMixin
@ -46,7 +46,7 @@ class TestArchiver(DirectoriesMixin, TestCase):
os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"), os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"),
) )
handle_document(doc.pk) update_document_archive_file(doc.pk)
doc = Document.objects.get(id=doc.id) doc = Document.objects.get(id=doc.id)
@ -63,7 +63,7 @@ class TestArchiver(DirectoriesMixin, TestCase):
doc.save() doc.save()
shutil.copy(sample_file, doc.source_path) shutil.copy(sample_file, doc.source_path)
handle_document(doc.pk) update_document_archive_file(doc.pk)
doc = Document.objects.get(id=doc.id) doc = Document.objects.get(id=doc.id)
@ -94,8 +94,8 @@ class TestArchiver(DirectoriesMixin, TestCase):
os.path.join(self.dirs.originals_dir, f"document_01.pdf"), os.path.join(self.dirs.originals_dir, f"document_01.pdf"),
) )
handle_document(doc2.pk) update_document_archive_file(doc2.pk)
handle_document(doc1.pk) update_document_archive_file(doc1.pk)
doc1 = Document.objects.get(id=doc1.id) doc1 = Document.objects.get(id=doc1.id)
doc2 = Document.objects.get(id=doc2.id) doc2 = Document.objects.get(id=doc2.id)