diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index b181364aa..2289619f6 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -160,7 +160,9 @@ def strip_excess_whitespace(text): r"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces) no_trailing_whitespace = re.sub( r"([^\S\n\r]+)$", '', no_leading_whitespace) - return no_trailing_whitespace + + # TODO: this needs a rework + return no_trailing_whitespace.strip() def get_text_from_pdf(pdf_file): diff --git a/src/paperless_tesseract/tests/samples/multi-page-digital.pdf b/src/paperless_tesseract/tests/samples/multi-page-digital.pdf new file mode 100644 index 000000000..5e75266ca Binary files /dev/null and b/src/paperless_tesseract/tests/samples/multi-page-digital.pdf differ diff --git a/src/paperless_tesseract/tests/samples/multi-page-images.pdf b/src/paperless_tesseract/tests/samples/multi-page-images.pdf new file mode 100644 index 000000000..ea08363bf Binary files /dev/null and b/src/paperless_tesseract/tests/samples/multi-page-images.pdf differ diff --git a/src/paperless_tesseract/tests/samples/no-text-alpha.png b/src/paperless_tesseract/tests/samples/no-text-alpha.png new file mode 100644 index 000000000..e78b22bfb Binary files /dev/null and b/src/paperless_tesseract/tests/samples/no-text-alpha.png differ diff --git a/src/paperless_tesseract/tests/samples/simple-alpha.png b/src/paperless_tesseract/tests/samples/simple-alpha.png new file mode 100644 index 000000000..0a267db1f Binary files /dev/null and b/src/paperless_tesseract/tests/samples/simple-alpha.png differ diff --git a/src/paperless_tesseract/tests/samples/simple.pdf b/src/paperless_tesseract/tests/samples/simple-digital.pdf similarity index 100% rename from src/paperless_tesseract/tests/samples/simple.pdf rename to src/paperless_tesseract/tests/samples/simple-digital.pdf diff --git a/src/paperless_tesseract/tests/samples/simple-no-dpi.png 
b/src/paperless_tesseract/tests/samples/simple-no-dpi.png new file mode 100644 index 000000000..84b2dc29b Binary files /dev/null and b/src/paperless_tesseract/tests/samples/simple-no-dpi.png differ diff --git a/src/paperless_tesseract/tests/samples/simple.png b/src/paperless_tesseract/tests/samples/simple.png index a3a768401..6fa0490e4 100644 Binary files a/src/paperless_tesseract/tests/samples/simple.png and b/src/paperless_tesseract/tests/samples/simple.png differ diff --git a/src/paperless_tesseract/tests/samples/with-form.pdf b/src/paperless_tesseract/tests/samples/with-form.pdf new file mode 100644 index 000000000..afbeef5c8 Binary files /dev/null and b/src/paperless_tesseract/tests/samples/with-form.pdf differ diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index 84363a18d..70fb494ef 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -33,7 +33,7 @@ class FakeImageFile(ContextManager): return os.path.basename(self.fname) -class TestAuxilliaryFunctions(TestCase): +class TestParser(TestCase): def setUp(self): self.scratch = tempfile.mkdtemp() @@ -46,18 +46,13 @@ class TestAuxilliaryFunctions(TestCase): SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") def test_get_text_from_pdf(self): - text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.pdf')) + text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf')) self.assertEqual(text.strip(), "This is a test document.") - def test_get_text_from_pdf_error(self): - text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.png')) - - self.assertIsNone(text) - def test_thumbnail(self): parser = RasterisedDocumentParser(uuid.uuid4()) - parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), "application/pdf") + parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf") # dont really know how to test it, 
just call it and assert that it does not raise anything. @mock.patch("paperless_tesseract.parsers.run_convert") @@ -72,5 +67,144 @@ class TestAuxilliaryFunctions(TestCase): m.side_effect = call_convert parser = RasterisedDocumentParser(uuid.uuid4()) - parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), "application/pdf") + parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf") # dont really know how to test it, just call it and assert that it does not raise anything. + + def test_get_dpi(self): + parser = RasterisedDocumentParser(None) + + dpi = parser.get_dpi(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png")) + self.assertEqual(dpi, None) + + dpi = parser.get_dpi(os.path.join(self.SAMPLE_FILES, "simple.png")) + self.assertEqual(dpi, 72) + + def test_simple_digital(self): + parser = RasterisedDocumentParser(None) + + parser.parse(os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"), "application/pdf") + + self.assertTrue(os.path.isfile(parser.archive_path)) + + self.assertEqual(parser.get_text(), "This is a test document.") + + def test_with_form(self): + parser = RasterisedDocumentParser(None) + + parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") + + self.assertTrue(os.path.isfile(parser.archive_path)) + + self.assertEqual(parser.get_text(), "Please enter your name in here:\n\nThis is a PDF document with a form.") + + @override_settings(OCR_MODE="redo") + def test_with_form_error(self): + parser = RasterisedDocumentParser(None) + + parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") + + self.assertIsNone(parser.archive_path) + + self.assertEqual(parser.get_text(), "Please enter your name in here:\n\nThis is a PDF document with a form.") + + @override_settings(OCR_MODE="redo") + @mock.patch("paperless_tesseract.parsers.get_text_from_pdf", lambda _: None) + def test_with_form_error_notext(self): + parser = RasterisedDocumentParser(None) + + def 
f(): + parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") + + self.assertRaises(ParseError, f) + + @override_settings(OCR_MODE="force") + def test_with_form_force(self): + parser = RasterisedDocumentParser(None) + +# parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") + +# self.assertEqual(parser.get_text(), "Please enter your name in here:\n\nThis is a PDF document with a form.") + + def test_image_simple(self): + parser = RasterisedDocumentParser(None) + + parser.parse(os.path.join(self.SAMPLE_FILES, "simple.png"), "image/png") + + self.assertTrue(os.path.isfile(parser.archive_path)) + + self.assertEqual(parser.get_text(), "This is a test document.") + + def test_image_simple_alpha_fail(self): + parser = RasterisedDocumentParser(None) + + def f(): + parser.parse(os.path.join(self.SAMPLE_FILES, "simple-alpha.png"), "image/png") + + self.assertRaises(ParseError, f) + + + def test_image_no_dpi_fail(self): + parser = RasterisedDocumentParser(None) + + def f(): + parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png") + + self.assertRaises(ParseError, f) + + @override_settings(OCR_IMAGE_DPI=72) + def test_image_no_dpi_default(self): + parser = RasterisedDocumentParser(None) + + parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png") + + self.assertTrue(os.path.isfile(parser.archive_path)) + + self.assertEqual(parser.get_text(), "This is a test document.") + + def test_multi_page(self): + parser = RasterisedDocumentParser(None) + parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") + self.assertTrue(os.path.isfile(parser.archive_path)) + self.assertEqual(parser.get_text(), "This is a multi page document. Page 1.\n\nThis is a multi page document. Page 2.\n\nThis is a multi page document. 
Page 3.") + + @override_settings(OCR_PAGES=2, OCR_MODE="skip") + def test_multi_page_pages_skip(self): + parser = RasterisedDocumentParser(None) + parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") + self.assertTrue(os.path.isfile(parser.archive_path)) + self.assertEqual(parser.get_text(), "This is a multi page document. Page 1.\n\nThis is a multi page document. Page 2.\n\nThis is a multi page document. Page 3.") + + @override_settings(OCR_PAGES=2, OCR_MODE="redo") + def test_multi_page_pages_redo(self): + parser = RasterisedDocumentParser(None) + parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") + self.assertTrue(os.path.isfile(parser.archive_path)) + self.assertEqual(parser.get_text(), "This is a multi page document. Page 1.\n\nThis is a multi page document. Page 2.\n\nThis is a multi page document. Page 3.") + + @override_settings(OCR_PAGES=2, OCR_MODE="force") + def test_multi_page_pages_force(self): + parser = RasterisedDocumentParser(None) + parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") + self.assertTrue(os.path.isfile(parser.archive_path)) + self.assertEqual(parser.get_text(), "This is a multi page document. Page 1.\n\nThis is a multi page document. Page 2.\n\nThis is a multi page document. Page 3.") + + @override_settings(OCR_MODE="skip") + def test_multi_page_analog_pages_skip(self): + parser = RasterisedDocumentParser(None) + parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") + self.assertTrue(os.path.isfile(parser.archive_path)) + self.assertEqual(parser.get_text(), "This is a multi page document. Page 1.\n\nThis is a multi page document. Page 2.\n\nThis is a multi page document. 
Page 3.") + + @override_settings(OCR_PAGES=2, OCR_MODE="redo") + def test_multi_page_analog_pages_redo(self): + parser = RasterisedDocumentParser(None) + parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") + self.assertTrue(os.path.isfile(parser.archive_path)) + self.assertEqual(parser.get_text(), "This is a multi page document. Page 1.\n\nThis is a multi page document. Page 2.") + + @override_settings(OCR_PAGES=1, OCR_MODE="force") + def test_multi_page_analog_pages_force(self): + parser = RasterisedDocumentParser(None) + parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") + self.assertTrue(os.path.isfile(parser.archive_path)) + self.assertEqual(parser.get_text(), "This is a multi page document. Page 1.")