Try a new way of extracting text from a given PDF file

This commit is contained in:
Trenton Holmes 2023-01-01 15:57:22 -08:00 committed by Trenton H
parent da38efebdf
commit 7be9ae9c02
2 changed files with 26 additions and 48 deletions

View File

@ -2,6 +2,7 @@ import json
import os import os
import re import re
import subprocess import subprocess
import tempfile
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
@ -137,36 +138,27 @@ class RasterisedDocumentParser(DocumentParser):
if not os.path.isfile(pdf_file): if not os.path.isfile(pdf_file):
return None return None
from pdfminer.high_level import extract_text as pdfminer_extract_text
try: try:
stripped = post_process_text(pdfminer_extract_text(pdf_file)) text = None
with tempfile.NamedTemporaryFile(
mode="w+",
dir=settings.SCRATCH_DIR,
) as tmp:
subprocess.run(
[
"pdftotext",
"-q",
"-layout",
"-enc",
"UTF-8",
pdf_file,
tmp.name,
],
)
text = tmp.read()
self.log("debug", f"Extracted text from PDF file {pdf_file}") return post_process_text(text)
# pdfminer.six does not handle RTL text
# as a hack, for some languages, return no text, to force
# OCRMyPdf/Tesseract do handle this correctly
from langdetect import detect
lang = detect(stripped)
self.log("debug", f"Detected language {lang}")
if (
lang
in {
"ar", # Arabic
"he", # Hebrew,
"fa", # Persian
}
and pdf_file.name != "archive-fallback.pdf"
):
raise RtlLanguageException()
return stripped
except RtlLanguageException:
self.log("warning", f"Detected RTL language {lang}")
return None
except Exception: except Exception:
# TODO catch all for various issues with PDFminer.six. # TODO catch all for various issues with PDFminer.six.
# If PDFminer fails, fall back to OCR. # If PDFminer fails, fall back to OCR.
@ -342,7 +334,7 @@ class RasterisedDocumentParser(DocumentParser):
) )
if original_has_text: if original_has_text:
self.text = text_original self.text = text_original
except (NoTextFoundException, RtlLanguageException, InputFileError) as e: except (NoTextFoundException, InputFileError) as e:
self.log( self.log(
"warning", "warning",
f"Encountered an error while running OCR: {str(e)}. " f"Encountered an error while running OCR: {str(e)}. "

View File

@ -661,28 +661,14 @@ class TestParser(DirectoriesMixin, TestCase):
- Text from the document is extracted - Text from the document is extracted
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
with mock.patch.object(
parser,
"construct_ocrmypdf_parameters",
wraps=parser.construct_ocrmypdf_parameters,
) as wrapped:
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"), os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
"application/pdf", "application/pdf",
) )
# There isn't a good way to actually check this working, with RTL correctly return # Copied from the PDF to here. Don't even look at it
# as it would require tesseract-ocr-ara installed for everyone running the self.assertIn("ةﯾﻠﺧﺎدﻻ ةرازو", parser.get_text())
# test suite. This test does provide the coverage though and attempts to ensure
# the force OCR happens
self.assertIsNotNone(parser.get_text())
self.assertEqual(parser.construct_ocrmypdf_parameters.call_count, 2)
# Check the last call kwargs
self.assertTrue(
parser.construct_ocrmypdf_parameters.call_args.kwargs["safe_fallback"],
)
class TestParserFileTypes(DirectoriesMixin, TestCase): class TestParserFileTypes(DirectoriesMixin, TestCase):