mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-07-28 18:24:38 -05:00
Adds better handling for files with invalid UTF-8 content
This commit is contained in:
@@ -122,8 +122,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
and os.path.isfile(sidecar_file)
|
||||
and settings.OCR_MODE != "redo"
|
||||
):
|
||||
with open(sidecar_file) as f:
|
||||
text = f.read()
|
||||
text = self.read_file_handle_unicode_errors(sidecar_file)
|
||||
|
||||
if "[OCR skipped on page" not in text:
|
||||
# This happens when there's already text in the input file.
|
||||
@@ -155,7 +154,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
tmp.name,
|
||||
],
|
||||
)
|
||||
text = tmp.read()
|
||||
text = self.read_file_handle_unicode_errors(Path(tmp.name))
|
||||
|
||||
return post_process_text(text)
|
||||
|
||||
|
@@ -2,6 +2,7 @@ import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import ContextManager
|
||||
from unittest import mock
|
||||
|
||||
@@ -39,7 +40,7 @@ class FakeImageFile(ContextManager):
|
||||
|
||||
|
||||
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
|
||||
SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
|
||||
|
||||
def assertContainsStrings(self, content, strings):
|
||||
# Asserts that all strings appear in content, in the given order.
|
||||
@@ -77,7 +78,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
parser = RasterisedDocumentParser(uuid.uuid4())
|
||||
text = parser.extract_text(
|
||||
None,
|
||||
os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"),
|
||||
self.SAMPLE_FILES / "simple-digital.pdf",
|
||||
)
|
||||
|
||||
self.assertContainsStrings(text.strip(), ["This is a test document."])
|
||||
|
Reference in New Issue
Block a user