Try a new way of extracting text from a given PDF file

This commit is contained in:
Trenton Holmes 2023-01-01 15:57:22 -08:00 committed by Trenton H
parent da38efebdf
commit 7be9ae9c02
2 changed files with 26 additions and 48 deletions

View File

@ -2,6 +2,7 @@ import json
import os
import re
import subprocess
import tempfile
from pathlib import Path
from typing import Optional
@ -137,36 +138,27 @@ class RasterisedDocumentParser(DocumentParser):
if not os.path.isfile(pdf_file):
return None
from pdfminer.high_level import extract_text as pdfminer_extract_text
try:
stripped = post_process_text(pdfminer_extract_text(pdf_file))
text = None
with tempfile.NamedTemporaryFile(
mode="w+",
dir=settings.SCRATCH_DIR,
) as tmp:
subprocess.run(
[
"pdftotext",
"-q",
"-layout",
"-enc",
"UTF-8",
pdf_file,
tmp.name,
],
)
text = tmp.read()
self.log("debug", f"Extracted text from PDF file {pdf_file}")
return post_process_text(text)
# pdfminer.six does not handle RTL text.
# As a hack, for some languages, return no text, to force
# OCRmyPDF/Tesseract to handle this correctly.
from langdetect import detect
lang = detect(stripped)
self.log("debug", f"Detected language {lang}")
if (
lang
in {
"ar", # Arabic
"he", # Hebrew,
"fa", # Persian
}
and pdf_file.name != "archive-fallback.pdf"
):
raise RtlLanguageException()
return stripped
except RtlLanguageException:
self.log("warning", f"Detected RTL language {lang}")
return None
except Exception:
# TODO catch all for various issues with PDFminer.six.
# If PDFminer fails, fall back to OCR.
@ -342,7 +334,7 @@ class RasterisedDocumentParser(DocumentParser):
)
if original_has_text:
self.text = text_original
except (NoTextFoundException, RtlLanguageException, InputFileError) as e:
except (NoTextFoundException, InputFileError) as e:
self.log(
"warning",
f"Encountered an error while running OCR: {str(e)}. "

View File

@ -661,28 +661,14 @@ class TestParser(DirectoriesMixin, TestCase):
- Text from the document is extracted
"""
parser = RasterisedDocumentParser(None)
with mock.patch.object(
parser,
"construct_ocrmypdf_parameters",
wraps=parser.construct_ocrmypdf_parameters,
) as wrapped:
parser.parse(
os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
"application/pdf",
)
parser.parse(
os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
"application/pdf",
)
# There isn't a good way to actually check that this works, with RTL text correctly
# returned, as it would require tesseract-ocr-ara to be installed for everyone running
# the test suite. This test does provide the coverage, though, and attempts to ensure
# the forced OCR happens.
self.assertIsNotNone(parser.get_text())
self.assertEqual(parser.construct_ocrmypdf_parameters.call_count, 2)
# Check the last call kwargs
self.assertTrue(
parser.construct_ocrmypdf_parameters.call_args.kwargs["safe_fallback"],
)
# Copied from the PDF to here. Don't even look at it
self.assertIn("ةﯾﻠﺧﺎدﻻ ةرازو", parser.get_text())
class TestParserFileTypes(DirectoriesMixin, TestCase):