Extract redo ocr to task

This commit is contained in:
Michael Shamoon 2022-06-22 05:53:13 -07:00
parent a090cf7a10
commit 13ffe468df
2 changed files with 53 additions and 51 deletions

View File

@ -1,14 +1,6 @@
import shutil
from pathlib import Path
from typing import Type
import tqdm import tqdm
from django.core.exceptions import ObjectDoesNotExist
from django.core.management.base import BaseCommand from django.core.management.base import BaseCommand
from documents.models import Document from documents.tasks import redo_ocr
from documents.parsers import DocumentParser
from documents.parsers import get_parser_class_for_mime_type
from documents.parsers import ParseError
class Command(BaseCommand): class Command(BaseCommand):
@ -36,47 +28,8 @@ class Command(BaseCommand):
) )
def handle(self, *args, **options): def handle(self, *args, **options):
doc_pks = tqdm.tqdm(
all_docs = Document.objects.all()
for doc_pk in tqdm.tqdm(
options["documents"], options["documents"],
disable=options["no_progress_bar"], disable=options["no_progress_bar"],
):
try:
self.stdout.write(f"Parsing document {doc_pk}")
doc: Document = all_docs.get(pk=doc_pk)
except ObjectDoesNotExist:
self.stdout.write(self.style.ERROR(f"Document {doc_pk} does not exist"))
continue
# Get the correct parser for this mime type
parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(
doc.mime_type,
) )
document_parser: DocumentParser = parser_class( redo_ocr(doc_pks)
"redo-ocr",
)
# Create a file path to copy the original file to for working on
temp_file = (Path(document_parser.tempdir) / Path("new-ocr-file")).resolve()
shutil.copy(doc.source_path, temp_file)
try:
self.stdout.write(
f"Using {type(document_parser).__name__} for document",
)
# Try to re-parse the document into text
document_parser.parse(str(temp_file), doc.mime_type)
doc.content = document_parser.get_text()
doc.save()
self.stdout.write("Document OCR updated")
except ParseError as e:
self.stdout.write(self.style.ERROR(f"Error parsing document: {e}"))
finally:
# Remove the file path if it was created
if temp_file.exists() and temp_file.is_file():
temp_file.unlink()

View File

@ -2,13 +2,16 @@ import logging
import os import os
import shutil import shutil
import tempfile import tempfile
from pathlib import Path
from typing import List # for type hinting. Can be removed, if only Python >3.8 is used from typing import List # for type hinting. Can be removed, if only Python >3.8 is used
from typing import Type
import magic import magic
import tqdm import tqdm
from asgiref.sync import async_to_sync from asgiref.sync import async_to_sync
from channels.layers import get_channel_layer from channels.layers import get_channel_layer
from django.conf import settings from django.conf import settings
from django.core.exceptions import ObjectDoesNotExist
from django.db.models.signals import post_save from django.db.models.signals import post_save
from documents import index from documents import index
from documents import sanity_checker from documents import sanity_checker
@ -21,6 +24,9 @@ from documents.models import Document
from documents.models import DocumentType from documents.models import DocumentType
from documents.models import StoragePath from documents.models import StoragePath
from documents.models import Tag from documents.models import Tag
from documents.parsers import DocumentParser
from documents.parsers import get_parser_class_for_mime_type
from documents.parsers import ParseError
from documents.sanity_checker import SanityCheckFailedException from documents.sanity_checker import SanityCheckFailedException
from pdf2image import convert_from_path from pdf2image import convert_from_path
from pikepdf import Pdf from pikepdf import Pdf
@ -359,3 +365,46 @@ def bulk_update_documents(document_ids):
with AsyncWriter(ix) as writer: with AsyncWriter(ix) as writer:
for doc in documents: for doc in documents:
index.update_document(writer, doc) index.update_document(writer, doc)
def redo_ocr(document_ids):
    """
    Re-run the parser over the given documents and store the fresh text.

    For every primary key in ``document_ids`` the document is looked up, its
    source file is copied into the parser's temporary directory, the copy is
    parsed, and the extracted text is saved to the document's ``content``.

    Problems that affect a single document (unknown primary key, no parser
    available for its mime type, ``ParseError`` raised while parsing) are
    logged and processing continues with the next id. Any other exception,
    for example an ``OSError`` from the file copy, still propagates to the
    caller.

    :param document_ids: iterable of document primary keys to reprocess
    """
    all_docs = Document.objects.all()

    for doc_pk in document_ids:
        logger.info(f"Parsing document {doc_pk}")
        # Keep the try body down to the one call that can raise the
        # exception handled here
        try:
            doc: Document = all_docs.get(pk=doc_pk)
        except ObjectDoesNotExist:
            logger.error(f"Document {doc_pk} does not exist")
            continue

        # Get the correct parser for this mime type
        parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(
            doc.mime_type,
        )
        # NOTE(review): presumably the lookup yields a falsy value when no
        # parser handles this mime type - confirm against documents.parsers.
        # Skip such a document instead of letting the call below fail and
        # abort the rest of the batch.
        if not parser_class:
            logger.error(
                f"No parser found for mime type {doc.mime_type}, "
                f"cannot redo OCR for document {doc_pk}",
            )
            continue

        document_parser: DocumentParser = parser_class("redo-ocr")

        # Work on a copy placed in the parser's temp dir rather than on the
        # original file
        temp_file = (Path(document_parser.tempdir) / "new-ocr-file").resolve()

        try:
            # The copy happens inside the try block so that a partially
            # written file is still removed by the finally clause when the
            # copy itself fails
            shutil.copy(doc.source_path, temp_file)

            logger.info(
                f"Using {type(document_parser).__name__} for document",
            )

            # Try to re-parse the document into text
            document_parser.parse(str(temp_file), doc.mime_type)
            doc.content = document_parser.get_text()
            doc.save()
            logger.info("Document OCR updated")
        except ParseError as e:
            logger.error(f"Error parsing document: {e}")
        finally:
            # Remove the working copy if it was created; is_file() is already
            # False for a path that does not exist
            if temp_file.is_file():
                temp_file.unlink()