Adds better handling for files with invalid UTF-8 content

This commit is contained in:
Trenton H
2023-05-12 14:21:32 -07:00
parent 350c20d6ab
commit 111960c530
6 changed files with 47 additions and 16 deletions

View File

@@ -0,0 +1 @@
Pantothens<EFBFBD>ure

View File

@@ -1,4 +1,4 @@
import os
from pathlib import Path
from django.test import TestCase
@@ -8,12 +8,14 @@ from paperless_text.parsers import TextDocumentParser
class TestTextParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
SAMPLE_DIR = Path(__file__).resolve().parent / "samples"
def test_thumbnail(self):
# Smoke test: ask the text parser for a thumbnail of the "test.txt" sample
# and check that the returned path is an existing file.
parser = TextDocumentParser(None)
# just make sure that it does not crash
f = parser.get_thumbnail(
# NOTE(review): the next two lines look like the removed (os.path.join) and
# added (SAMPLE_DIR) versions of the same path argument shown together by
# the diff view - confirm against the actual file before relying on either.
os.path.join(os.path.dirname(__file__), "samples", "test.txt"),
self.SAMPLE_DIR / "test.txt",
"text/plain",
)
self.assertIsFile(f)
@@ -22,9 +24,29 @@ class TestTextParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = TextDocumentParser(None)
parser.parse(
os.path.join(os.path.dirname(__file__), "samples", "test.txt"),
self.SAMPLE_DIR / "test.txt",
"text/plain",
)
self.assertEqual(parser.get_text(), "This is a test file.\n")
self.assertIsNone(parser.get_archive_path())
def test_parse_invalid_bytes(self):
    """
    GIVEN:
        - The "decode_error.txt" sample, a text file holding bytes
          that are not valid UTF
    WHEN:
        - That sample is handed to the text parser as "text/plain"
    THEN:
        - Parsing runs to completion
        - The extracted text has the invalid bytes stripped out
        - The parser reports no archive path
    """
    text_parser = TextDocumentParser(None)
    sample_file = self.SAMPLE_DIR / "decode_error.txt"

    text_parser.parse(sample_file, "text/plain")

    extracted = text_parser.get_text()
    self.assertEqual(extracted, "Pantothensure\n")
    self.assertIsNone(text_parser.get_archive_path())