diff --git a/src/documents/parsers.py b/src/documents/parsers.py
index 80968912c..cb9b31cfd 100644
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -7,6 +7,7 @@ import shutil
 import subprocess
 import tempfile
 from functools import lru_cache
+from pathlib import Path
 from typing import Iterator
 from typing import Match
 from typing import Optional
@@ -319,6 +320,24 @@ class DocumentParser(LoggingMixin):
         if self.progress_callback:
             self.progress_callback(current_progress, max_progress)
 
+    def read_file_handle_unicode_errors(self, filepath: Path) -> str:
+        """
+        Helper utility for reading from a file, and handling a problem with its
+        unicode, falling back to ignoring the error to remove the invalid bytes
+
+        The path is coerced to a Path first, so a plain string path is accepted
+        as well
+        """
+        filepath = Path(filepath)
+        try:
+            text = filepath.read_text(encoding="utf-8")
+        except UnicodeDecodeError as e:
+            self.log("warning", f"Unicode error during text reading, continuing: {e}")
+            # Re-read through read_text so both paths decode the same way
+            # apart from dropping the undecodable bytes
+            text = filepath.read_text(encoding="utf-8", errors="ignore")
+        return text
+
     def extract_metadata(self, document_path, mime_type):
         return []
 
diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py
index 7657cb7e2..151af97dc 100644
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -122,8 +122,7 @@ class RasterisedDocumentParser(DocumentParser):
             and os.path.isfile(sidecar_file)
             and settings.OCR_MODE != "redo"
         ):
-            with open(sidecar_file) as f:
-                text = f.read()
+            text = self.read_file_handle_unicode_errors(sidecar_file)
 
             if "[OCR skipped on page" not in text:
                 # This happens when there's already text in the input file.
@@ -155,7 +154,7 @@ class RasterisedDocumentParser(DocumentParser):
                         tmp.name,
                     ],
                 )
-                text = tmp.read()
+                text = self.read_file_handle_unicode_errors(Path(tmp.name))
 
                 return post_process_text(text)
 
diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py
index 23cff29b7..7850ad4ef 100644
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@@ -2,6 +2,7 @@ import os
 import shutil
 import tempfile
 import uuid
+from pathlib import Path
 from typing import ContextManager
 from unittest import mock
 
@@ -39,7 +40,7 @@ class FakeImageFile(ContextManager):
 
 
 class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
-    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
+    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
 
     def assertContainsStrings(self, content, strings):
         # Asserts that all strings appear in content, in the given order.
@@ -77,7 +78,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
         parser = RasterisedDocumentParser(uuid.uuid4())
         text = parser.extract_text(
             None,
-            os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"),
+            self.SAMPLE_FILES / "simple-digital.pdf",
         )
 
         self.assertContainsStrings(text.strip(), ["This is a test document."])
diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py
index 37e4ca1a6..c017a3c0f 100644
--- a/src/paperless_text/parsers.py
+++ b/src/paperless_text/parsers.py
@@ -16,11 +16,10 @@ class TextDocumentParser(DocumentParser):
     logging_name = "paperless.parsing.text"
 
     def get_thumbnail(self, document_path, mime_type, file_name=None):
-        def read_text():
-            with open(document_path) as src:
-                lines = [line.strip() for line in src.readlines()]
-                text = "\n".join(lines[:50])
-                return text
+        text = self.read_file_handle_unicode_errors(document_path)
+        # Keep the thumbnail input capped at the first 50 stripped lines, as it
+        # was before the shared read helper was introduced
+        text = "\n".join(line.strip() for line in text.splitlines()[:50])
 
         img = Image.new("RGB", (500, 700), color="white")
         draw = ImageDraw.Draw(img)
@@ -29,7 +28,7 @@ class TextDocumentParser(DocumentParser):
             size=20,
             layout_engine=ImageFont.Layout.BASIC,
         )
-        draw.text((5, 5), read_text(), font=font, fill="black")
+        draw.text((5, 5), text, font=font, fill="black")
 
         out_path = os.path.join(self.tempdir, "thumb.webp")
         img.save(out_path, format="WEBP")
@@ -37,5 +36,4 @@ class TextDocumentParser(DocumentParser):
         return out_path
 
     def parse(self, document_path, mime_type, file_name=None):
-        with open(document_path) as f:
-            self.text = f.read()
+        self.text = self.read_file_handle_unicode_errors(document_path)
diff --git a/src/paperless_text/tests/samples/decode_error.txt b/src/paperless_text/tests/samples/decode_error.txt
new file mode 100644
index 000000000..2137cd2b7
--- /dev/null
+++ b/src/paperless_text/tests/samples/decode_error.txt
@@ -0,0 +1 @@
+Pantothensäure
diff --git a/src/paperless_text/tests/test_parser.py b/src/paperless_text/tests/test_parser.py
index 869a3a8ef..b6b331fce 100644
--- a/src/paperless_text/tests/test_parser.py
+++ b/src/paperless_text/tests/test_parser.py
@@ -1,4 +1,4 @@
-import os
+from pathlib import Path
 
 from django.test import TestCase
 
@@ -8,12 +8,14 @@ from paperless_text.parsers import TextDocumentParser
 
 
 class TestTextParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
+    SAMPLE_DIR = Path(__file__).resolve().parent / "samples"
+
     def test_thumbnail(self):
         parser = TextDocumentParser(None)
 
         # just make sure that it does not crash
         f = parser.get_thumbnail(
-            os.path.join(os.path.dirname(__file__), "samples", "test.txt"),
+            self.SAMPLE_DIR / "test.txt",
             "text/plain",
         )
         self.assertIsFile(f)
@@ -22,9 +24,29 @@ class TestTextParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
         parser = TextDocumentParser(None)
 
         parser.parse(
-            os.path.join(os.path.dirname(__file__), "samples", "test.txt"),
+            self.SAMPLE_DIR / "test.txt",
             "text/plain",
         )
 
         self.assertEqual(parser.get_text(), "This is a test file.\n")
         self.assertIsNone(parser.get_archive_path())
+
+    def test_parse_invalid_bytes(self):
+        """
+        GIVEN:
+            - Text file which contains invalid UTF bytes
+        WHEN:
+            - The file is parsed
+        THEN:
+            - Parsing continues
+            - Invalid bytes are removed
+        """
+        parser = TextDocumentParser(None)
+
+        parser.parse(
+            self.SAMPLE_DIR / "decode_error.txt",
+            "text/plain",
+        )
+
+        self.assertEqual(parser.get_text(), "Pantothensure\n")
+        self.assertIsNone(parser.get_archive_path())