diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index d381ed94f..be8f45e7b 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -104,7 +104,7 @@ class RasterisedDocumentParser(DocumentParser): # This happens when there's already text in the input file. # The sidecar file will only contain text for OCR'ed pages. self.log("debug", "Using text from sidecar file") - return text + return post_process_text(text) else: self.log("debug", "Incomplete sidecar file: discarding.") @@ -113,13 +113,11 @@ class RasterisedDocumentParser(DocumentParser): if not os.path.isfile(pdf_file): return None - from pdfminer.high_level import extract_text + from pdfminer.high_level import extract_text as pdfminer_extract_text from pdfminer.pdftypes import PDFException try: - text = extract_text(pdf_file) - stripped = strip_excess_whitespace(text) - stripped = stripped.replace("\0", " ") + stripped = post_process_text(pdfminer_extract_text(pdf_file)) self.log("debug", f"Extracted text from PDF file {pdf_file}") return stripped @@ -296,7 +294,7 @@ class RasterisedDocumentParser(DocumentParser): self.text = "" -def strip_excess_whitespace(text): +def post_process_text(text): if not text: return None @@ -307,4 +305,6 @@ def strip_excess_whitespace(text): r"([^\S\n\r]+)$", '', no_leading_whitespace) # TODO: this needs a rework - return no_trailing_whitespace.strip() + # replace \0 prevents issues with saving to postgres. + # text may contain \0 when this character is present in PDF files. + return no_trailing_whitespace.strip().replace("\0", " ") diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index fe4e4733b..e39f87017 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -7,7 +7,7 @@ from django.test import TestCase, override_settings from documents.parsers import ParseError, run_convert from documents.tests.utils import DirectoriesMixin -from paperless_tesseract.parsers import RasterisedDocumentParser, strip_excess_whitespace +from paperless_tesseract.parsers import RasterisedDocumentParser, post_process_text image_to_string_calls = [] @@ -32,8 +32,6 @@ class FakeImageFile(ContextManager): return os.path.basename(self.fname) - - class TestParser(DirectoriesMixin, TestCase): def assertContainsStrings(self, content, strings): @@ -58,9 +56,9 @@ class TestParser(DirectoriesMixin, TestCase): ) ] - def test_strip_excess_whitespace(self): + def test_post_process_text(self): for source, result in self.text_cases: - actual_result = strip_excess_whitespace(source) + actual_result = post_process_text(source) self.assertEqual( result, actual_result,