mirror of https://github.com/paperless-ngx/paperless-ngx.git (synced 2025-04-02 13:45:10 -05:00)

Merge pull request #1451 from paperless-ngx/feature-better-redo-ocr

Feature: Even better re-do of OCR

This commit is contained in: commit 44e596b0c4
@@ -122,6 +122,10 @@ def delete(doc_ids):
 
 
 def redo_ocr(doc_ids):
-    async_task("documents.tasks.redo_ocr", document_ids=doc_ids)
+    for document_id in doc_ids:
+        async_task(
+            "documents.tasks.update_document_archive_file",
+            document_id=document_id,
+        )
 
     return "OK"
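
The hunk above fans the bulk-edit action out into one queued task per document instead of a single batched task, so a failure while processing one document no longer aborts the rest of the batch. A minimal sketch of the new call path, assuming django-q's async_task (the import is not shown in this hunk, only the call):

# Sketch of the reworked bulk_edit.redo_ocr; import assumed, not shown in the hunk.
from django_q.tasks import async_task

def redo_ocr(doc_ids):
    # One task per document id: workers pick them up in parallel, and each
    # document's OCR redo fails or succeeds independently of the others.
    for document_id in doc_ids:
        async_task(
            "documents.tasks.update_document_archive_file",
            document_id=document_id,
        )
    return "OK"
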
@@ -1,85 +1,18 @@
-import hashlib
 import logging
 import multiprocessing
 import os
-import shutil
-import uuid
 
 import tqdm
 from django import db
 from django.conf import settings
 from django.core.management.base import BaseCommand
-from django.db import transaction
 from documents.models import Document
-from filelock import FileLock
-
-from ... import index
-from ...file_handling import create_source_path_directory
-from ...file_handling import generate_unique_filename
-from ...parsers import get_parser_class_for_mime_type
+from documents.tasks import update_document_archive_file
 
 
 logger = logging.getLogger("paperless.management.archiver")
 
 
-def handle_document(document_id):
-    document = Document.objects.get(id=document_id)
-
-    mime_type = document.mime_type
-
-    parser_class = get_parser_class_for_mime_type(mime_type)
-
-    if not parser_class:
-        logger.error(
-            f"No parser found for mime type {mime_type}, cannot "
-            f"archive document {document} (ID: {document_id})",
-        )
-        return
-
-    parser = parser_class(logging_group=uuid.uuid4())
-
-    try:
-        parser.parse(document.source_path, mime_type, document.get_public_filename())
-
-        thumbnail = parser.get_thumbnail(
-            document.source_path,
-            mime_type,
-            document.get_public_filename(),
-        )
-
-        if parser.get_archive_path():
-            with transaction.atomic():
-                with open(parser.get_archive_path(), "rb") as f:
-                    checksum = hashlib.md5(f.read()).hexdigest()
-                # I'm going to save first so that in case the file move
-                # fails, the database is rolled back.
-                # We also don't use save() since that triggers the filehandling
-                # logic, and we don't want that yet (file not yet in place)
-                document.archive_filename = generate_unique_filename(
-                    document,
-                    archive_filename=True,
-                )
-                Document.objects.filter(pk=document.pk).update(
-                    archive_checksum=checksum,
-                    content=parser.get_text(),
-                    archive_filename=document.archive_filename,
-                )
-            with FileLock(settings.MEDIA_LOCK):
-                create_source_path_directory(document.archive_path)
-                shutil.move(parser.get_archive_path(), document.archive_path)
-                shutil.move(thumbnail, document.thumbnail_path)
-
-        with index.open_index_writer() as writer:
-            index.update_document(writer, document)
-
-    except Exception:
-        logger.exception(
-            f"Error while parsing document {document} " f"(ID: {document_id})",
-        )
-    finally:
-        parser.cleanup()
-
-
 class Command(BaseCommand):
 
     help = """
@@ -146,7 +79,7 @@ class Command(BaseCommand):
         with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
             list(
                 tqdm.tqdm(
-                    pool.imap_unordered(handle_document, document_ids),
+                    pool.imap_unordered(update_document_archive_file, document_ids),
                     total=len(document_ids),
                     disable=options["no_progress_bar"],
                 ),
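
With the archiver command's local handle_document removed, the command now maps the shared task over a process pool. A self-contained sketch of that fan-out pattern (the helper name and the workers default are illustrative; the real command reads its worker count from settings.TASK_WORKERS and the flag from argparse options):

import multiprocessing

import tqdm

from documents.tasks import update_document_archive_file

def archive_documents(document_ids, workers=4, no_progress_bar=False):
    # imap_unordered yields a result as each worker finishes, which lets
    # tqdm report live progress; list() merely drains the iterator.
    with multiprocessing.Pool(processes=workers) as pool:
        list(
            tqdm.tqdm(
                pool.imap_unordered(update_document_archive_file, document_ids),
                total=len(document_ids),
                disable=no_progress_bar,
            ),
        )
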
@@ -1,35 +0,0 @@
-import tqdm
-from django.core.management.base import BaseCommand
-
-from documents.tasks import redo_ocr
-
-
-class Command(BaseCommand):
-
-    help = """
-        This will rename all documents to match the latest filename format.
-    """.replace(
-        "    ",
-        "",
-    )
-
-    def add_arguments(self, parser):
-        parser.add_argument(
-            "--no-progress-bar",
-            default=False,
-            action="store_true",
-            help="If set, the progress bar will not be shown",
-        )
-
-        parser.add_argument(
-            "documents",
-            nargs="+",
-            help="Document primary keys for re-processing OCR on",
-        )
-
-    def handle(self, *args, **options):
-        doc_pks = tqdm.tqdm(
-            options["documents"],
-            disable=options["no_progress_bar"],
-        )
-        redo_ocr(doc_pks)
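
The hunk above deletes the standalone redo-OCR management command outright (note its help text was a copy-paste leftover from the renamer command). With it gone, the same one-off work can still be triggered per document from a Django shell; a hypothetical session using only names this diff introduces:

# python manage.py shell
from documents.tasks import update_document_archive_file

for pk in (42, 43):  # hypothetical document primary keys
    update_document_archive_file(pk)
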
@@ -1,6 +1,8 @@
+import hashlib
 import logging
 import os
 import shutil
+import uuid
 from pathlib import Path
 from typing import Type
 
@@ -8,7 +10,7 @@ import tqdm
 from asgiref.sync import async_to_sync
 from channels.layers import get_channel_layer
 from django.conf import settings
-from django.core.exceptions import ObjectDoesNotExist
+from django.db import transaction
 from django.db.models.signals import post_save
 from documents import barcodes
 from documents import index
@@ -17,6 +19,8 @@ from documents.classifier import DocumentClassifier
 from documents.classifier import load_classifier
 from documents.consumer import Consumer
 from documents.consumer import ConsumerError
+from documents.file_handling import create_source_path_directory
+from documents.file_handling import generate_unique_filename
 from documents.models import Correspondent
 from documents.models import Document
 from documents.models import DocumentType
@@ -24,8 +28,8 @@ from documents.models import StoragePath
 from documents.models import Tag
 from documents.parsers import DocumentParser
 from documents.parsers import get_parser_class_for_mime_type
-from documents.parsers import ParseError
 from documents.sanity_checker import SanityCheckFailedException
+from filelock import FileLock
 from whoosh.writing import AsyncWriter
 
 
@@ -213,44 +217,62 @@ def bulk_update_documents(document_ids):
             index.update_document(writer, doc)
 
 
-def redo_ocr(document_ids):
-    all_docs = Document.objects.all()
-
-    for doc_pk in document_ids:
-        try:
-            logger.info(f"Parsing document {doc_pk}")
-            doc: Document = all_docs.get(pk=doc_pk)
-        except ObjectDoesNotExist:
-            logger.error(f"Document {doc_pk} does not exist")
-            continue
-
-        # Get the correct parser for this mime type
-        parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(
-            doc.mime_type,
-        )
-        document_parser: DocumentParser = parser_class(
-            "redo-ocr",
-        )
-
-        # Create a file path to copy the original file to for working on
-        temp_file = (Path(document_parser.tempdir) / Path("new-ocr-file")).resolve()
-
-        shutil.copy(doc.source_path, temp_file)
-
-        try:
-            logger.info(
-                f"Using {type(document_parser).__name__} for document",
-            )
-            # Try to re-parse the document into text
-            document_parser.parse(str(temp_file), doc.mime_type)
-
-            doc.content = document_parser.get_text()
-            doc.save()
-            logger.info("Document OCR updated")
-
-        except ParseError as e:
-            logger.error(f"Error parsing document: {e}")
-        finally:
-            # Remove the file path if it was created
-            if temp_file.exists() and temp_file.is_file():
-                temp_file.unlink()
+def update_document_archive_file(document_id):
+    """
+    Re-creates the archive file of a document, including new OCR content and thumbnail
+    """
+    document = Document.objects.get(id=document_id)
+
+    mime_type = document.mime_type
+
+    parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(mime_type)
+
+    if not parser_class:
+        logger.error(
+            f"No parser found for mime type {mime_type}, cannot "
+            f"archive document {document} (ID: {document_id})",
+        )
+        return
+
+    parser: DocumentParser = parser_class(logging_group=uuid.uuid4())
+
+    try:
+        parser.parse(document.source_path, mime_type, document.get_public_filename())
+
+        thumbnail = parser.get_thumbnail(
+            document.source_path,
+            mime_type,
+            document.get_public_filename(),
+        )
+
+        if parser.get_archive_path():
+            with transaction.atomic():
+                with open(parser.get_archive_path(), "rb") as f:
+                    checksum = hashlib.md5(f.read()).hexdigest()
+                # I'm going to save first so that in case the file move
+                # fails, the database is rolled back.
+                # We also don't use save() since that triggers the filehandling
+                # logic, and we don't want that yet (file not yet in place)
+                document.archive_filename = generate_unique_filename(
+                    document,
+                    archive_filename=True,
+                )
+                Document.objects.filter(pk=document.pk).update(
+                    archive_checksum=checksum,
+                    content=parser.get_text(),
+                    archive_filename=document.archive_filename,
+                )
+            with FileLock(settings.MEDIA_LOCK):
+                create_source_path_directory(document.archive_path)
+                shutil.move(parser.get_archive_path(), document.archive_path)
+                shutil.move(thumbnail, document.thumbnail_path)
+
+        with index.open_index_writer() as writer:
+            index.update_document(writer, document)
+
+    except Exception:
+        logger.exception(
+            f"Error while parsing document {document} " f"(ID: {document_id})",
+        )
+    finally:
+        parser.cleanup()
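
Two details of the new task are worth noting. First, the database row is updated inside transaction.atomic() before the files are moved, so a failed move rolls the row back and the old archive file stays authoritative (the comments in the code say as much). Second, the checksum reads the whole archive file into memory via f.read(). A hedged sketch of a chunked alternative for the latter, not what the task does, just a memory-flat variant:

import hashlib

def md5_of(path, chunk_size=2 ** 20):
    # Stream the file in 1 MiB chunks instead of reading it all at once.
    digest = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()
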
@@ -10,8 +10,8 @@ from django.core.management import call_command
 from django.test import override_settings
 from django.test import TestCase
 from documents.file_handling import generate_filename
-from documents.management.commands.document_archiver import handle_document
 from documents.models import Document
+from documents.tasks import update_document_archive_file
 from documents.tests.utils import DirectoriesMixin
 
 
@@ -46,7 +46,7 @@ class TestArchiver(DirectoriesMixin, TestCase):
             os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"),
         )
 
-        handle_document(doc.pk)
+        update_document_archive_file(doc.pk)
 
         doc = Document.objects.get(id=doc.id)
 
@@ -63,7 +63,7 @@ class TestArchiver(DirectoriesMixin, TestCase):
         doc.save()
         shutil.copy(sample_file, doc.source_path)
 
-        handle_document(doc.pk)
+        update_document_archive_file(doc.pk)
 
         doc = Document.objects.get(id=doc.id)
 
@@ -94,8 +94,8 @@ class TestArchiver(DirectoriesMixin, TestCase):
             os.path.join(self.dirs.originals_dir, f"document_01.pdf"),
        )
 
-        handle_document(doc2.pk)
-        handle_document(doc1.pk)
+        update_document_archive_file(doc2.pk)
+        update_document_archive_file(doc1.pk)
 
         doc1 = Document.objects.get(id=doc1.id)
         doc2 = Document.objects.get(id=doc2.id)