diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 44671fa11..8e0bac5a7 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -2,6 +2,7 @@ import json import os import re import subprocess +import tempfile from pathlib import Path from typing import Optional @@ -137,36 +138,27 @@ class RasterisedDocumentParser(DocumentParser): if not os.path.isfile(pdf_file): return None - from pdfminer.high_level import extract_text as pdfminer_extract_text - try: - stripped = post_process_text(pdfminer_extract_text(pdf_file)) + text = None + with tempfile.NamedTemporaryFile( + mode="w+", + dir=settings.SCRATCH_DIR, + ) as tmp: + subprocess.run( + [ + "pdftotext", + "-q", + "-layout", + "-enc", + "UTF-8", + pdf_file, + tmp.name, + ], + ) + text = tmp.read() - self.log("debug", f"Extracted text from PDF file {pdf_file}") + return post_process_text(text) - # pdfminer.six does not handle RTL text - # as a hack, for some languages, return no text, to force - # OCRMyPdf/Tesseract do handle this correctly - from langdetect import detect - - lang = detect(stripped) - - self.log("debug", f"Detected language {lang}") - - if ( - lang - in { - "ar", # Arabic - "he", # Hebrew, - "fa", # Persian - } - and pdf_file.name != "archive-fallback.pdf" - ): - raise RtlLanguageException() - return stripped - except RtlLanguageException: - self.log("warning", f"Detected RTL language {lang}") - return None except Exception: # TODO catch all for various issues with PDFminer.six. # If PDFminer fails, fall back to OCR. @@ -342,7 +334,7 @@ class RasterisedDocumentParser(DocumentParser): ) if original_has_text: self.text = text_original - except (NoTextFoundException, RtlLanguageException, InputFileError) as e: + except (NoTextFoundException, InputFileError) as e: self.log( "warning", f"Encountered an error while running OCR: {str(e)}. " diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index 28af8dec1..53af68f8d 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -661,28 +661,14 @@ class TestParser(DirectoriesMixin, TestCase): - Text from the document is extracted """ parser = RasterisedDocumentParser(None) - with mock.patch.object( - parser, - "construct_ocrmypdf_parameters", - wraps=parser.construct_ocrmypdf_parameters, - ) as wrapped: - parser.parse( - os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"), - "application/pdf", - ) + parser.parse( + os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"), + "application/pdf", + ) - # There isn't a good way to actually check this working, with RTL correctly return - # as it would require tesseract-ocr-ara installed for everyone running the - # test suite. This test does provide the coverage though and attempts to ensure - # the force OCR happens - self.assertIsNotNone(parser.get_text()) - - self.assertEqual(parser.construct_ocrmypdf_parameters.call_count, 2) - # Check the last call kwargs - self.assertTrue( - parser.construct_ocrmypdf_parameters.call_args.kwargs["safe_fallback"], - ) + # Copied from the PDF to here. Don't even look at it + self.assertIn("ةﯾﻠﺧﺎدﻻ ةرازو", parser.get_text()) class TestParserFileTypes(DirectoriesMixin, TestCase):