Implements a better re-do of OCR by making the document archiver function common. Actually creates updated file now

This commit is contained in:
Trenton Holmes
2022-08-21 18:20:59 -07:00
parent 87cf215f3e
commit 059242b7dc
5 changed files with 70 additions and 146 deletions

View File

@@ -1,6 +1,8 @@
import hashlib
import logging
import os
import shutil
import uuid
from pathlib import Path
from typing import Type
@@ -8,7 +10,7 @@ import tqdm
from asgiref.sync import async_to_sync
from channels.layers import get_channel_layer
from django.conf import settings
from django.core.exceptions import ObjectDoesNotExist
from django.db import transaction
from django.db.models.signals import post_save
from documents import barcodes
from documents import index
@@ -17,6 +19,8 @@ from documents.classifier import DocumentClassifier
from documents.classifier import load_classifier
from documents.consumer import Consumer
from documents.consumer import ConsumerError
from documents.file_handling import create_source_path_directory
from documents.file_handling import generate_unique_filename
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
@@ -24,8 +28,8 @@ from documents.models import StoragePath
from documents.models import Tag
from documents.parsers import DocumentParser
from documents.parsers import get_parser_class_for_mime_type
from documents.parsers import ParseError
from documents.sanity_checker import SanityCheckFailedException
from filelock import FileLock
from whoosh.writing import AsyncWriter
@@ -213,44 +217,62 @@ def bulk_update_documents(document_ids):
index.update_document(writer, doc)
def redo_ocr(document_ids):
all_docs = Document.objects.all()
def update_document_archive_file(document_id):
"""
Re-creates the archive file of a document, including new OCR content and thumbnail
"""
document = Document.objects.get(id=document_id)
for doc_pk in document_ids:
try:
logger.info(f"Parsing document {doc_pk}")
doc: Document = all_docs.get(pk=doc_pk)
except ObjectDoesNotExist:
logger.error(f"Document {doc_pk} does not exist")
continue
mime_type = document.mime_type
# Get the correct parser for this mime type
parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(
doc.mime_type,
parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(mime_type)
if not parser_class:
logger.error(
f"No parser found for mime type {mime_type}, cannot "
f"archive document {document} (ID: {document_id})",
)
document_parser: DocumentParser = parser_class(
"redo-ocr",
return
parser: DocumentParser = parser_class(logging_group=uuid.uuid4())
try:
parser.parse(document.source_path, mime_type, document.get_public_filename())
thumbnail = parser.get_thumbnail(
document.source_path,
mime_type,
document.get_public_filename(),
)
# Create a file path to copy the original file to for working on
temp_file = (Path(document_parser.tempdir) / Path("new-ocr-file")).resolve()
if parser.get_archive_path():
with transaction.atomic():
with open(parser.get_archive_path(), "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
# I'm going to save first so that in case the file move
# fails, the database is rolled back.
# We also don't use save() since that triggers the filehandling
# logic, and we don't want that yet (file not yet in place)
document.archive_filename = generate_unique_filename(
document,
archive_filename=True,
)
Document.objects.filter(pk=document.pk).update(
archive_checksum=checksum,
content=parser.get_text(),
archive_filename=document.archive_filename,
)
with FileLock(settings.MEDIA_LOCK):
create_source_path_directory(document.archive_path)
shutil.move(parser.get_archive_path(), document.archive_path)
shutil.move(thumbnail, document.thumbnail_path)
shutil.copy(doc.source_path, temp_file)
with index.open_index_writer() as writer:
index.update_document(writer, document)
try:
logger.info(
f"Using {type(document_parser).__name__} for document",
)
# Try to re-parse the document into text
document_parser.parse(str(temp_file), doc.mime_type)
doc.content = document_parser.get_text()
doc.save()
logger.info("Document OCR updated")
except ParseError as e:
logger.error(f"Error parsing document: {e}")
finally:
# Remove the file path if it was created
if temp_file.exists() and temp_file.is_file():
temp_file.unlink()
except Exception:
logger.exception(
f"Error while parsing document {document} " f"(ID: {document_id})",
)
finally:
parser.cleanup()