From 3205d52331c14951583580b9c85270437241da35 Mon Sep 17 00:00:00 2001 From: Trenton Holmes <797416+stumpylog@users.noreply.github.com> Date: Sat, 13 May 2023 07:47:21 -0700 Subject: [PATCH] Changes the error mode to replace instead of ignore, to better highlight where a problem happened --- src/documents/parsers.py | 2 +- src/paperless_text/tests/test_parser.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/documents/parsers.py b/src/documents/parsers.py index cb9b31cfd..93d78832e 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -329,7 +329,7 @@ class DocumentParser(LoggingMixin): text = filepath.read_text(encoding="utf-8") except UnicodeDecodeError as e: self.log("warning", f"Unicode error during text reading, continuing: {e}") - text = filepath.read_bytes().decode("utf-8", errors="ignore") + text = filepath.read_bytes().decode("utf-8", errors="replace") return text def extract_metadata(self, document_path, mime_type): diff --git a/src/paperless_text/tests/test_parser.py b/src/paperless_text/tests/test_parser.py index b6b331fce..cc5ce76fe 100644 --- a/src/paperless_text/tests/test_parser.py +++ b/src/paperless_text/tests/test_parser.py @@ -48,5 +48,5 @@ class TestTextParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): "text/plain", ) - self.assertEqual(parser.get_text(), "Pantothensure\n") + self.assertEqual(parser.get_text(), "Pantothens�ure\n") self.assertIsNone(parser.get_archive_path())