From dfd16c5187ea493c3ce5c745a84cd742b3c58ac8 Mon Sep 17 00:00:00 2001 From: Trenton Holmes Date: Tue, 31 May 2022 09:33:09 -0700 Subject: [PATCH 1/6] Saves work on a new management command to re-ocr a file --- .../management/commands/document_redo_ocr.py | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 src/documents/management/commands/document_redo_ocr.py diff --git a/src/documents/management/commands/document_redo_ocr.py b/src/documents/management/commands/document_redo_ocr.py new file mode 100644 index 000000000..c35639487 --- /dev/null +++ b/src/documents/management/commands/document_redo_ocr.py @@ -0,0 +1,69 @@ +import logging +import shutil +from pathlib import Path +from typing import Type + +from django.core.exceptions import ObjectDoesNotExist +from django.core.management.base import BaseCommand +from documents.models import Document +from documents.parsers import DocumentParser +from documents.parsers import get_parser_class_for_mime_type +from documents.parsers import ParseError + + +class Command(BaseCommand): + + help = """ + This will rename all documents to match the latest filename format. 
+ """.replace( + " ", + "", + ) + + def add_arguments(self, parser): + parser.add_argument( + "documents", + nargs="+", + help="Document primary keys for re-processing OCR on", + ) + + def handle(self, *args, **options): + + logging.getLogger().handlers[0].level = logging.ERROR + + all_docs = Document.objects.all() + + for doc_pk in args.documents: + try: + self.stdout.write(f"Parsing document {doc_pk}") + doc: Document = all_docs.get(pk=doc_pk) + except ObjectDoesNotExist: + self.stdout.write(self.style.ERROR(f"Document {doc_pk} does not exist")) + continue + + # Get the correct parser for this mime type + parser_class: Type[DocumentParser] = get_parser_class_for_mime_type( + doc.mime_type, + ) + document_parser: DocumentParser = parser_class( + "redo-ocr", + ) + + # Create a file path to copy the original file to for working on + temp_file = (Path(document_parser.tempdir) / Path("new-ocr-file")).resolve() + + shutil.copy(doc.source_path, temp_file) + + try: + # Try to re-parse the document into text + document_parser.parse(str(temp_file), doc.mime_type) + + doc.content = document_parser.get_text() + doc.save() + + except ParseError as e: + self.stdout.write(self.style.ERROR(f"Error parsing document: {e}")) + finally: + # Remove the file path if it was created + if temp_file.exists() and temp_file.is_file(): + temp_file.unlink() From b7250477b5184b589669596a67c1914e4c332e14 Mon Sep 17 00:00:00 2001 From: Trenton Holmes Date: Tue, 31 May 2022 11:20:10 -0700 Subject: [PATCH 2/6] Includes the progress bar --- .../management/commands/document_redo_ocr.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/documents/management/commands/document_redo_ocr.py b/src/documents/management/commands/document_redo_ocr.py index c35639487..c76218d99 100644 --- a/src/documents/management/commands/document_redo_ocr.py +++ b/src/documents/management/commands/document_redo_ocr.py @@ -3,6 +3,7 @@ import shutil from pathlib import Path from typing 
import Type +import tqdm from django.core.exceptions import ObjectDoesNotExist from django.core.management.base import BaseCommand from documents.models import Document @@ -21,6 +22,14 @@ class Command(BaseCommand): ) def add_arguments(self, parser): + + parser.add_argument( + "--no-progress-bar", + default=False, + action="store_true", + help="If set, the progress bar will not be shown", + ) + parser.add_argument( "documents", nargs="+", @@ -33,9 +42,9 @@ class Command(BaseCommand): all_docs = Document.objects.all() - for doc_pk in args.documents: + for doc_pk in tqdm.tqdm(args.documents, disable=options["no_progress_bar"]): try: - self.stdout.write(f"Parsing document {doc_pk}") + self.stdout.write(self.style.INFO(f"Parsing document {doc_pk}")) doc: Document = all_docs.get(pk=doc_pk) except ObjectDoesNotExist: self.stdout.write(self.style.ERROR(f"Document {doc_pk} does not exist")) From a090cf7a101d27516e7b36f8ab27b5440550f330 Mon Sep 17 00:00:00 2001 From: Trenton Holmes Date: Fri, 10 Jun 2022 11:23:24 -0700 Subject: [PATCH 3/6] Updates following testing of command --- .../management/commands/document_redo_ocr.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/documents/management/commands/document_redo_ocr.py b/src/documents/management/commands/document_redo_ocr.py index c76218d99..3ead5a485 100644 --- a/src/documents/management/commands/document_redo_ocr.py +++ b/src/documents/management/commands/document_redo_ocr.py @@ -1,4 +1,3 @@ -import logging import shutil from pathlib import Path from typing import Type @@ -38,13 +37,14 @@ class Command(BaseCommand): def handle(self, *args, **options): - logging.getLogger().handlers[0].level = logging.ERROR - all_docs = Document.objects.all() - for doc_pk in tqdm.tqdm(args.documents, disable=options["no_progress_bar"]): + for doc_pk in tqdm.tqdm( + options["documents"], + disable=options["no_progress_bar"], + ): try: - self.stdout.write(self.style.INFO(f"Parsing document {doc_pk}")) + 
self.stdout.write(f"Parsing document {doc_pk}") doc: Document = all_docs.get(pk=doc_pk) except ObjectDoesNotExist: self.stdout.write(self.style.ERROR(f"Document {doc_pk} does not exist")) @@ -64,11 +64,15 @@ class Command(BaseCommand): shutil.copy(doc.source_path, temp_file) try: + self.stdout.write( + f"Using {type(document_parser).__name__} for document", + ) # Try to re-parse the document into text document_parser.parse(str(temp_file), doc.mime_type) doc.content = document_parser.get_text() doc.save() + self.stdout.write("Document OCR updated") except ParseError as e: self.stdout.write(self.style.ERROR(f"Error parsing document: {e}")) From 13ffe468dfa1af4bb55a257c641be54d52adf223 Mon Sep 17 00:00:00 2001 From: Michael Shamoon <4887959+shamoon@users.noreply.github.com> Date: Wed, 22 Jun 2022 05:53:13 -0700 Subject: [PATCH 4/6] Extract redo ocr to task --- .../management/commands/document_redo_ocr.py | 55 ++----------------- src/documents/tasks.py | 49 +++++++++++++++++ 2 files changed, 53 insertions(+), 51 deletions(-) diff --git a/src/documents/management/commands/document_redo_ocr.py b/src/documents/management/commands/document_redo_ocr.py index 3ead5a485..1e44e6134 100644 --- a/src/documents/management/commands/document_redo_ocr.py +++ b/src/documents/management/commands/document_redo_ocr.py @@ -1,14 +1,6 @@ -import shutil -from pathlib import Path -from typing import Type - import tqdm -from django.core.exceptions import ObjectDoesNotExist from django.core.management.base import BaseCommand -from documents.models import Document -from documents.parsers import DocumentParser -from documents.parsers import get_parser_class_for_mime_type -from documents.parsers import ParseError +from documents.tasks import redo_ocr class Command(BaseCommand): @@ -36,47 +28,8 @@ class Command(BaseCommand): ) def handle(self, *args, **options): - - all_docs = Document.objects.all() - - for doc_pk in tqdm.tqdm( + doc_pks = tqdm.tqdm( options["documents"], 
disable=options["no_progress_bar"], - ): - try: - self.stdout.write(f"Parsing document {doc_pk}") - doc: Document = all_docs.get(pk=doc_pk) - except ObjectDoesNotExist: - self.stdout.write(self.style.ERROR(f"Document {doc_pk} does not exist")) - continue - - # Get the correct parser for this mime type - parser_class: Type[DocumentParser] = get_parser_class_for_mime_type( - doc.mime_type, - ) - document_parser: DocumentParser = parser_class( - "redo-ocr", - ) - - # Create a file path to copy the original file to for working on - temp_file = (Path(document_parser.tempdir) / Path("new-ocr-file")).resolve() - - shutil.copy(doc.source_path, temp_file) - - try: - self.stdout.write( - f"Using {type(document_parser).__name__} for document", - ) - # Try to re-parse the document into text - document_parser.parse(str(temp_file), doc.mime_type) - - doc.content = document_parser.get_text() - doc.save() - self.stdout.write("Document OCR updated") - - except ParseError as e: - self.stdout.write(self.style.ERROR(f"Error parsing document: {e}")) - finally: - # Remove the file path if it was created - if temp_file.exists() and temp_file.is_file(): - temp_file.unlink() + ) + redo_ocr(doc_pks) diff --git a/src/documents/tasks.py b/src/documents/tasks.py index 4c57b2eee..1070471ba 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -2,13 +2,16 @@ import logging import os import shutil import tempfile +from pathlib import Path from typing import List # for type hinting. 
Can be removed, if only Python >3.8 is used +from typing import Type import magic import tqdm from asgiref.sync import async_to_sync from channels.layers import get_channel_layer from django.conf import settings +from django.core.exceptions import ObjectDoesNotExist from django.db.models.signals import post_save from documents import index from documents import sanity_checker @@ -21,6 +24,9 @@ from documents.models import Document from documents.models import DocumentType from documents.models import StoragePath from documents.models import Tag +from documents.parsers import DocumentParser +from documents.parsers import get_parser_class_for_mime_type +from documents.parsers import ParseError from documents.sanity_checker import SanityCheckFailedException from pdf2image import convert_from_path from pikepdf import Pdf @@ -359,3 +365,46 @@ def bulk_update_documents(document_ids): with AsyncWriter(ix) as writer: for doc in documents: index.update_document(writer, doc) + + +def redo_ocr(document_ids): + all_docs = Document.objects.all() + + for doc_pk in document_ids: + try: + logger.info(f"Parsing document {doc_pk}") + doc: Document = all_docs.get(pk=doc_pk) + except ObjectDoesNotExist: + logger.error(f"Document {doc_pk} does not exist") + continue + + # Get the correct parser for this mime type + parser_class: Type[DocumentParser] = get_parser_class_for_mime_type( + doc.mime_type, + ) + document_parser: DocumentParser = parser_class( + "redo-ocr", + ) + + # Create a file path to copy the original file to for working on + temp_file = (Path(document_parser.tempdir) / Path("new-ocr-file")).resolve() + + shutil.copy(doc.source_path, temp_file) + + try: + logger.info( + f"Using {type(document_parser).__name__} for document", + ) + # Try to re-parse the document into text + document_parser.parse(str(temp_file), doc.mime_type) + + doc.content = document_parser.get_text() + doc.save() + logger.info("Document OCR updated") + + except ParseError as e: + logger.error(f"Error 
parsing document: {e}") + finally: + # Remove the file path if it was created + if temp_file.exists() and temp_file.is_file(): + temp_file.unlink() From c9bdf1c1841905be17d54cd25cfe6af4020b7867 Mon Sep 17 00:00:00 2001 From: Michael Shamoon <4887959+shamoon@users.noreply.github.com> Date: Wed, 22 Jun 2022 05:53:41 -0700 Subject: [PATCH 5/6] Frontend UI for redo OCR --- .../bulk-editor/bulk-editor.component.html | 37 +++++++++++-------- .../bulk-editor/bulk-editor.component.ts | 15 ++++++++ src/documents/bulk_edit.py | 7 ++++ src/documents/serialisers.py | 3 ++ 4 files changed, 47 insertions(+), 15 deletions(-) diff --git a/src-ui/src/app/components/document-list/bulk-editor/bulk-editor.component.html b/src-ui/src/app/components/document-list/bulk-editor/bulk-editor.component.html index 8d9389df3..04069d997 100644 --- a/src-ui/src/app/components/document-list/bulk-editor/bulk-editor.component.html +++ b/src-ui/src/app/components/document-list/bulk-editor/bulk-editor.component.html @@ -66,23 +66,30 @@
- -
- -