Add better handling for files with invalid UTF-8 content

2026-02-07 23:42:46 -06:00 · 2023-05-12 14:21:32 -07:00
parent 350c20d6ab
commit 111960c530
6 changed files with 47 additions and 16 deletions
--- a/src/paperless_text/parsers.py
+++ b/src/paperless_text/parsers.py
@@ -16,11 +16,7 @@ class TextDocumentParser(DocumentParser):
    logging_name = "paperless.parsing.text"

    def get_thumbnail(self, document_path, mime_type, file_name=None):
-        def read_text():
-            with open(document_path) as src:
-                lines = [line.strip() for line in src.readlines()]
-                text = "\n".join(lines[:50])
-                return text
+        text = self.read_file_handle_unicode_errors(document_path)

        img = Image.new("RGB", (500, 700), color="white")
        draw = ImageDraw.Draw(img)
@@ -29,7 +25,7 @@ class TextDocumentParser(DocumentParser):
            size=20,
            layout_engine=ImageFont.Layout.BASIC,
        )
-        draw.text((5, 5), read_text(), font=font, fill="black")
+        draw.text((5, 5), text, font=font, fill="black")

        out_path = os.path.join(self.tempdir, "thumb.webp")
        img.save(out_path, format="WEBP")
@@ -37,5 +33,4 @@ class TextDocumentParser(DocumentParser):
        return out_path

    def parse(self, document_path, mime_type, file_name=None):
-        with open(document_path) as f:
-            self.text = f.read()
+        self.text = self.read_file_handle_unicode_errors(document_path)
--- a/src/paperless_text/tests/samples/decode_error.txt
+++ b/src/paperless_text/tests/samples/decode_error.txt
@@ -0,0 +1 @@
+Pantothensäure
--- a/src/paperless_text/tests/test_parser.py
+++ b/src/paperless_text/tests/test_parser.py
@@ -1,4 +1,4 @@
-import os
+from pathlib import Path

 from django.test import TestCase

@@ -8,12 +8,14 @@ from paperless_text.parsers import TextDocumentParser


 class TestTextParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
+    SAMPLE_DIR = Path(__file__).resolve().parent / "samples"
+
    def test_thumbnail(self):
        parser = TextDocumentParser(None)

        # just make sure that it does not crash
        f = parser.get_thumbnail(
-            os.path.join(os.path.dirname(__file__), "samples", "test.txt"),
+            self.SAMPLE_DIR / "test.txt",
            "text/plain",
        )
        self.assertIsFile(f)
@@ -22,9 +24,29 @@ class TestTextParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
        parser = TextDocumentParser(None)

        parser.parse(
-            os.path.join(os.path.dirname(__file__), "samples", "test.txt"),
+            self.SAMPLE_DIR / "test.txt",
            "text/plain",
        )

        self.assertEqual(parser.get_text(), "This is a test file.\n")
        self.assertIsNone(parser.get_archive_path())
+
+    def test_parse_invalid_bytes(self):
+        """
+        GIVEN:
+            - Text file which contains invalid UTF bytes
+        WHEN:
+            - The file is parsed
+        THEN:
+            - Parsing continues
+            - Invalid bytes are removed
+        """
+        parser = TextDocumentParser(None)
+
+        parser.parse(
+            self.SAMPLE_DIR / "decode_error.txt",
+            "text/plain",
+        )
+
+        self.assertEqual(parser.get_text(), "Pantothensure\n")
+        self.assertIsNone(parser.get_archive_path())