mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-09 09:58:20 -05:00
Adds a test to cover this edge case
This commit is contained in:
parent
b897d6de2e
commit
f015556562
BIN
src/paperless_tesseract/tests/samples/single-page-mixed.pdf
Normal file
BIN
src/paperless_tesseract/tests/samples/single-page-mixed.pdf
Normal file
Binary file not shown.
@ -37,6 +37,9 @@ class FakeImageFile(ContextManager):
|
|||||||
|
|
||||||
|
|
||||||
class TestParser(DirectoriesMixin, TestCase):
|
class TestParser(DirectoriesMixin, TestCase):
|
||||||
|
|
||||||
|
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
|
||||||
|
|
||||||
def assertContainsStrings(self, content, strings):
|
def assertContainsStrings(self, content, strings):
|
||||||
# Asserts that all strings appear in content, in the given order.
|
# Asserts that all strings appear in content, in the given order.
|
||||||
indices = []
|
indices = []
|
||||||
@ -47,14 +50,18 @@ class TestParser(DirectoriesMixin, TestCase):
|
|||||||
self.fail(f"'{s}' is not in '{content}'")
|
self.fail(f"'{s}' is not in '{content}'")
|
||||||
self.assertListEqual(indices, sorted(indices))
|
self.assertListEqual(indices, sorted(indices))
|
||||||
|
|
||||||
text_cases = [
|
|
||||||
("simple string", "simple string"),
|
|
||||||
("simple newline\n testing string", "simple newline\ntesting string"),
|
|
||||||
("utf-8 строка с пробелами в конце ", "utf-8 строка с пробелами в конце"),
|
|
||||||
]
|
|
||||||
|
|
||||||
def test_post_process_text(self):
|
def test_post_process_text(self):
|
||||||
for source, result in self.text_cases:
|
|
||||||
|
text_cases = [
|
||||||
|
("simple string", "simple string"),
|
||||||
|
("simple newline\n testing string", "simple newline\ntesting string"),
|
||||||
|
(
|
||||||
|
"utf-8 строка с пробелами в конце ",
|
||||||
|
"utf-8 строка с пробелами в конце",
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
for source, result in text_cases:
|
||||||
actual_result = post_process_text(source)
|
actual_result = post_process_text(source)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
result,
|
result,
|
||||||
@ -66,8 +73,6 @@ class TestParser(DirectoriesMixin, TestCase):
|
|||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
|
|
||||||
|
|
||||||
def test_get_text_from_pdf(self):
|
def test_get_text_from_pdf(self):
|
||||||
parser = RasterisedDocumentParser(uuid.uuid4())
|
parser = RasterisedDocumentParser(uuid.uuid4())
|
||||||
text = parser.extract_text(
|
text = parser.extract_text(
|
||||||
@ -461,6 +466,45 @@ class TestParser(DirectoriesMixin, TestCase):
|
|||||||
|
|
||||||
self.assertIn("[OCR skipped on page(s) 4-6]", sidecar)
|
self.assertIn("[OCR skipped on page(s) 4-6]", sidecar)
|
||||||
|
|
||||||
|
@override_settings(OCR_MODE="redo")
|
||||||
|
def test_single_page_mixed(self):
|
||||||
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- File with some text contained in images and some in text layer
|
||||||
|
- Text and images are mixed on the same page
|
||||||
|
- OCR mode set to redo
|
||||||
|
WHEN:
|
||||||
|
- Document is parsed
|
||||||
|
THEN:
|
||||||
|
- Text from images is extracted
|
||||||
|
- Full content of the file is parsed (not just the image text)
|
||||||
|
- An archive file is created with the OCRd text and the original text
|
||||||
|
"""
|
||||||
|
parser = RasterisedDocumentParser(None)
|
||||||
|
parser.parse(
|
||||||
|
os.path.join(self.SAMPLE_FILES, "single-page-mixed.pdf"),
|
||||||
|
"application/pdf",
|
||||||
|
)
|
||||||
|
self.assertIsNotNone(parser.archive_path)
|
||||||
|
self.assertTrue(os.path.isfile(parser.archive_path))
|
||||||
|
self.assertContainsStrings(
|
||||||
|
parser.get_text().lower(),
|
||||||
|
[
|
||||||
|
"this is some normal text, present on page 1 of the document.",
|
||||||
|
"this is some text, but in an image, also on page 1.",
|
||||||
|
"this is further text on page 1.",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
with open(os.path.join(parser.tempdir, "sidecar.txt")) as f:
|
||||||
|
sidecar = f.read().lower()
|
||||||
|
|
||||||
|
self.assertIn("this is some text, but in an image, also on page 1.", sidecar)
|
||||||
|
self.assertNotIn(
|
||||||
|
"this is some normal text, present on page 1 of the document.",
|
||||||
|
sidecar,
|
||||||
|
)
|
||||||
|
|
||||||
@override_settings(OCR_MODE="skip_noarchive")
|
@override_settings(OCR_MODE="skip_noarchive")
|
||||||
def test_multi_page_mixed_no_archive(self):
|
def test_multi_page_mixed_no_archive(self):
|
||||||
"""
|
"""
|
||||||
|
Loading…
x
Reference in New Issue
Block a user