Adds better handling for files with invalid UTF-8 content

2025-12-24 02:05:48 -06:00 · 2023-05-12 14:21:32 -07:00
parent 350c20d6ab
commit 111960c530
6 changed files with 47 additions and 16 deletions
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -122,8 +122,7 @@ class RasterisedDocumentParser(DocumentParser):
            and os.path.isfile(sidecar_file)
            and settings.OCR_MODE != "redo"
        ):
-            with open(sidecar_file) as f:
-                text = f.read()
+            text = self.read_file_handle_unicode_errors(sidecar_file)

            if "[OCR skipped on page" not in text:
                # This happens when there's already text in the input file.
@@ -155,7 +154,7 @@ class RasterisedDocumentParser(DocumentParser):
                        tmp.name,
                    ],
                )
-                text = tmp.read()
+                text = self.read_file_handle_unicode_errors(Path(tmp.name))

            return post_process_text(text)

--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@@ -2,6 +2,7 @@ import os
 import shutil
 import tempfile
 import uuid
+from pathlib import Path
 from typing import ContextManager
 from unittest import mock

@@ -39,7 +40,7 @@ class FakeImageFile(ContextManager):


 class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
-    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
+    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"

    def assertContainsStrings(self, content, strings):
        # Asserts that all strings appear in content, in the given order.
@@ -77,7 +78,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
        parser = RasterisedDocumentParser(uuid.uuid4())
        text = parser.extract_text(
            None,
-            os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"),
+            self.SAMPLE_FILES / "simple-digital.pdf",
        )

        self.assertContainsStrings(text.strip(), ["This is a test document."])