also apply \0 removal to sidecar contents

2026-02-03 23:22:42 -06:00 · 2021-03-22 23:08:34 +01:00
parent fda2bfbea7
commit 0e596bd1fc
2 changed files with 10 additions and 12 deletions
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -104,7 +104,7 @@ class RasterisedDocumentParser(DocumentParser):
                # This happens when there's already text in the input file.
                # The sidecar file will only contain text for OCR'ed pages.
                self.log("debug", "Using text from sidecar file")
-                return text
+                return post_process_text(text)
            else:
                self.log("debug", "Incomplete sidecar file: discarding.")
@@ -113,13 +113,11 @@ class RasterisedDocumentParser(DocumentParser):
        if not os.path.isfile(pdf_file):
            return None
-        from pdfminer.high_level import extract_text
+        from pdfminer.high_level import extract_text as pdfminer_extract_text
        from pdfminer.pdftypes import PDFException
        try:
-            text = extract_text(pdf_file)
+            stripped = post_process_text(pdfminer_extract_text(pdf_file))
-            stripped = strip_excess_whitespace(text)
-            stripped = stripped.replace("\0", " ")
            self.log("debug", f"Extracted text from PDF file {pdf_file}")
            return stripped
@@ -296,7 +294,7 @@ class RasterisedDocumentParser(DocumentParser):
                self.text = ""
-def strip_excess_whitespace(text):
+def post_process_text(text):
    if not text:
        return None
@@ -307,4 +305,6 @@ def strip_excess_whitespace(text):
        r"([^\S\n\r]+)$", '', no_leading_whitespace)
    # TODO: this needs a rework
-    return no_trailing_whitespace.strip()
+    # replace \0 prevents issues with saving to postgres.
+    # text may contain \0 when this character is present in PDF files.
+    return no_trailing_whitespace.strip().replace("\0", " ")
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@@ -7,7 +7,7 @@ from django.test import TestCase, override_settings
 from documents.parsers import ParseError, run_convert
 from documents.tests.utils import DirectoriesMixin
-from paperless_tesseract.parsers import RasterisedDocumentParser, strip_excess_whitespace
+from paperless_tesseract.parsers import RasterisedDocumentParser, post_process_text
 image_to_string_calls = []
@@ -32,8 +32,6 @@ class FakeImageFile(ContextManager):
        return os.path.basename(self.fname)
 class TestParser(DirectoriesMixin, TestCase):
    def assertContainsStrings(self, content, strings):
@@ -58,9 +56,9 @@ class TestParser(DirectoriesMixin, TestCase):
        )
    ]
-    def test_strip_excess_whitespace(self):
+    def test_post_process_text(self):
        for source, result in self.text_cases:
-            actual_result = strip_excess_whitespace(source)
+            actual_result = post_process_text(source)
            self.assertEqual(
                result,
                actual_result,