If extracting text from a fallback file (i.e. one produced by forced OCR), allow the text to be used

This commit is contained in:
Trenton Holmes 2022-12-30 13:07:29 -08:00 committed by Trenton H
parent 28b26eb4c7
commit 26c7fad005

View File

@ -1,6 +1,8 @@
import json import json
import os import os
import re import re
from pathlib import Path
from typing import Optional
from django.conf import settings from django.conf import settings
from documents.parsers import DocumentParser from documents.parsers import DocumentParser
@ -99,7 +101,7 @@ class RasterisedDocumentParser(DocumentParser):
self.log("warning", f"Error while calculating DPI for image {image}: {e}") self.log("warning", f"Error while calculating DPI for image {image}: {e}")
return None return None
def extract_text(self, sidecar_file, pdf_file): def extract_text(self, sidecar_file: Optional[Path], pdf_file: Path):
# When re-doing OCR, the sidecar contains ONLY the new text, not # When re-doing OCR, the sidecar contains ONLY the new text, not
# the whole text, so do not utilize it in that case # the whole text, so do not utilize it in that case
if ( if (
@ -139,11 +141,15 @@ class RasterisedDocumentParser(DocumentParser):
self.log("debug", f"Detected language {lang}") self.log("debug", f"Detected language {lang}")
if lang in { if (
lang
in {
"ar", # Arabic "ar", # Arabic
"he", # Hebrew, "he", # Hebrew,
"fa", # Persian "fa", # Persian
}: }
and pdf_file.name != "archive-fallback.pdf"
):
raise RtlLanguageException() raise RtlLanguageException()
return stripped return stripped
except RtlLanguageException: except RtlLanguageException:
@ -275,7 +281,7 @@ class RasterisedDocumentParser(DocumentParser):
return ocrmypdf_args return ocrmypdf_args
def parse(self, document_path, mime_type, file_name=None): def parse(self, document_path: Path, mime_type, file_name=None):
# This forces tesseract to use one core per page. # This forces tesseract to use one core per page.
os.environ["OMP_THREAD_LIMIT"] = "1" os.environ["OMP_THREAD_LIMIT"] = "1"
@ -300,8 +306,8 @@ class RasterisedDocumentParser(DocumentParser):
import ocrmypdf import ocrmypdf
from ocrmypdf import InputFileError, EncryptedPdfError from ocrmypdf import InputFileError, EncryptedPdfError
archive_path = os.path.join(self.tempdir, "archive.pdf") archive_path = Path(os.path.join(self.tempdir, "archive.pdf"))
sidecar_file = os.path.join(self.tempdir, "sidecar.txt") sidecar_file = Path(os.path.join(self.tempdir, "sidecar.txt"))
args = self.construct_ocrmypdf_parameters( args = self.construct_ocrmypdf_parameters(
document_path, document_path,
@ -335,8 +341,12 @@ class RasterisedDocumentParser(DocumentParser):
f"Attempting force OCR to get the text.", f"Attempting force OCR to get the text.",
) )
archive_path_fallback = os.path.join(self.tempdir, "archive-fallback.pdf") archive_path_fallback = Path(
sidecar_file_fallback = os.path.join(self.tempdir, "sidecar-fallback.txt") os.path.join(self.tempdir, "archive-fallback.pdf"),
)
sidecar_file_fallback = Path(
os.path.join(self.tempdir, "sidecar-fallback.txt"),
)
# Attempt to run OCR with safe settings. # Attempt to run OCR with safe settings.