From 26c7fad00523c5a6ff06b7893e348da2f23bd739 Mon Sep 17 00:00:00 2001 From: Trenton Holmes <797416+stumpylog@users.noreply.github.com> Date: Fri, 30 Dec 2022 13:07:29 -0800 Subject: [PATCH] If extracting text from a fallback file (ie forced), allow the text to be used --- src/paperless_tesseract/parsers.py | 32 ++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 4cc9b8e5f..4107cace8 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -1,6 +1,8 @@ import json import os import re +from pathlib import Path +from typing import Optional from django.conf import settings from documents.parsers import DocumentParser @@ -99,7 +101,7 @@ class RasterisedDocumentParser(DocumentParser): self.log("warning", f"Error while calculating DPI for image {image}: {e}") return None - def extract_text(self, sidecar_file, pdf_file): + def extract_text(self, sidecar_file: Optional[Path], pdf_file: Path): # When re-doing OCR, the sidecar contains ONLY the new text, not # the whole text, so do not utilize it in that case if ( @@ -139,11 +141,15 @@ class RasterisedDocumentParser(DocumentParser): self.log("debug", f"Detected language {lang}") - if lang in { - "ar", # Arabic - "he", # Hebrew, - "fa", # Persian - }: + if ( + lang + in { + "ar", # Arabic + "he", # Hebrew, + "fa", # Persian + } + and pdf_file.name != "archive-fallback.pdf" + ): raise RtlLanguageException() return stripped except RtlLanguageException: @@ -275,7 +281,7 @@ class RasterisedDocumentParser(DocumentParser): return ocrmypdf_args - def parse(self, document_path, mime_type, file_name=None): + def parse(self, document_path: Path, mime_type, file_name=None): # This forces tesseract to use one core per page. os.environ["OMP_THREAD_LIMIT"] = "1" @@ -300,8 +306,8 @@ class RasterisedDocumentParser(DocumentParser): import ocrmypdf from ocrmypdf import InputFileError, EncryptedPdfError - archive_path = os.path.join(self.tempdir, "archive.pdf") - sidecar_file = os.path.join(self.tempdir, "sidecar.txt") + archive_path = Path(os.path.join(self.tempdir, "archive.pdf")) + sidecar_file = Path(os.path.join(self.tempdir, "sidecar.txt")) args = self.construct_ocrmypdf_parameters( document_path, @@ -335,8 +341,12 @@ class RasterisedDocumentParser(DocumentParser): f"Attempting force OCR to get the text.", ) - archive_path_fallback = os.path.join(self.tempdir, "archive-fallback.pdf") - sidecar_file_fallback = os.path.join(self.tempdir, "sidecar-fallback.txt") + archive_path_fallback = Path( + os.path.join(self.tempdir, "archive-fallback.pdf"), + ) + sidecar_file_fallback = Path( + os.path.join(self.tempdir, "sidecar-fallback.txt"), + ) # Attempt to run OCR with safe settings.