mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-08-10 00:18:57 +00:00
Adds better handling for files with invalid UTF-8 content
This commit is contained in:
1
src/paperless_text/tests/samples/decode_error.txt
Normal file
1
src/paperless_text/tests/samples/decode_error.txt
Normal file
@@ -0,0 +1 @@
|
||||
Pantothens<EFBFBD>ure
|
@@ -1,4 +1,4 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from django.test import TestCase
|
||||
|
||||
@@ -8,12 +8,14 @@ from paperless_text.parsers import TextDocumentParser
|
||||
|
||||
|
||||
class TestTextParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
SAMPLE_DIR = Path(__file__).resolve().parent / "samples"
|
||||
|
||||
def test_thumbnail(self):
    """
    Smoke test for thumbnail generation from a plain-text sample.

    Requests a thumbnail for ``samples/test.txt`` and asserts (via
    ``assertIsFile``) that what comes back is a file.
    """
    parser = TextDocumentParser(None)

    # just make sure that it does not crash
    f = parser.get_thumbnail(
        # Only the SAMPLE_DIR-based path is passed; the earlier
        # os.path.join(...) spelling of the same file is dropped so the
        # call has exactly one document path followed by the MIME type.
        self.SAMPLE_DIR / "test.txt",
        "text/plain",
    )
    self.assertIsFile(f)
|
||||
@@ -22,9 +24,29 @@ class TestTextParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
parser = TextDocumentParser(None)
|
||||
|
||||
parser.parse(
|
||||
os.path.join(os.path.dirname(__file__), "samples", "test.txt"),
|
||||
self.SAMPLE_DIR / "test.txt",
|
||||
"text/plain",
|
||||
)
|
||||
|
||||
self.assertEqual(parser.get_text(), "This is a test file.\n")
|
||||
self.assertIsNone(parser.get_archive_path())
|
||||
|
||||
def test_parse_invalid_bytes(self):
    """
    GIVEN:
        - The sample decode_error.txt, a text file holding bytes that
          are not valid UTF
    WHEN:
        - That file is handed to the text parser
    THEN:
        - The parse call completes
        - The extracted text no longer contains the invalid bytes
        - No archive path is reported
    """
    text_parser = TextDocumentParser(None)
    broken_sample = self.SAMPLE_DIR / "decode_error.txt"

    text_parser.parse(broken_sample, "text/plain")

    extracted = text_parser.get_text()
    self.assertEqual(extracted, "Pantothensure\n")

    archive = text_parser.get_archive_path()
    self.assertIsNone(archive)
|
||||
|
Reference in New Issue
Block a user