From bdcba570cb3c70391b4e05aac656864d2e8a809b Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Thu, 2 Feb 2023 12:46:49 -0800 Subject: [PATCH] Adding more test coverage, in particular around Tika and its parser --- src/documents/tests/test_api.py | 14 ++-- src/documents/tests/test_importer.py | 7 +- src/documents/tests/test_parsers.py | 88 +++++++++++++++++--- src/paperless/tests/test_websockets.py | 4 +- src/paperless/views.py | 2 +- src/paperless_tesseract/parsers.py | 2 +- src/paperless_tesseract/tests/test_parser.py | 15 +++- src/paperless_tika/tests/test_tika_parser.py | 62 ++++++++++++++ 8 files changed, 164 insertions(+), 30 deletions(-) diff --git a/src/documents/tests/test_api.py b/src/documents/tests/test_api.py index b6d817de1..1b8a71ded 100644 --- a/src/documents/tests/test_api.py +++ b/src/documents/tests/test_api.py @@ -121,28 +121,28 @@ class TestDocumentApi(DirectoriesMixin, APITestCase): response = self.client.get("/api/documents/", format="json") self.assertEqual(response.status_code, 200) results_full = response.data["results"] - self.assertTrue("content" in results_full[0]) - self.assertTrue("id" in results_full[0]) + self.assertIn("content", results_full[0]) + self.assertIn("id", results_full[0]) response = self.client.get("/api/documents/?fields=id", format="json") self.assertEqual(response.status_code, 200) results = response.data["results"] self.assertFalse("content" in results[0]) - self.assertTrue("id" in results[0]) + self.assertIn("id", results[0]) self.assertEqual(len(results[0]), 1) response = self.client.get("/api/documents/?fields=content", format="json") self.assertEqual(response.status_code, 200) results = response.data["results"] - self.assertTrue("content" in results[0]) + self.assertIn("content", results[0]) self.assertFalse("id" in results[0]) self.assertEqual(len(results[0]), 1) response = self.client.get("/api/documents/?fields=id,content", format="json") self.assertEqual(response.status_code, 200) results = response.data["results"] - self.assertTrue("content" in results[0]) - self.assertTrue("id" in results[0]) + self.assertIn("content", results[0]) + self.assertIn("id", results[0]) self.assertEqual(len(results[0]), 2) response = self.client.get( @@ -152,7 +152,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase): self.assertEqual(response.status_code, 200) results = response.data["results"] self.assertFalse("content" in results[0]) - self.assertTrue("id" in results[0]) + self.assertIn("id", results[0]) self.assertEqual(len(results[0]), 1) response = self.client.get("/api/documents/?fields=", format="json") diff --git a/src/documents/tests/test_importer.py b/src/documents/tests/test_importer.py index 5101a269f..10146ff30 100644 --- a/src/documents/tests/test_importer.py +++ b/src/documents/tests/test_importer.py @@ -25,7 +25,7 @@ class TestImporter(TestCase): cmd.manifest = [{"model": "documents.document"}] with self.assertRaises(CommandError) as cm: cmd._check_manifest() - self.assertTrue("The manifest file contains a record" in str(cm.exception)) + self.assertIn("The manifest file contains a record", str(cm.exception)) cmd.manifest = [ {"model": "documents.document", EXPORTER_FILE_NAME: "noexist.pdf"}, @@ -33,6 +33,7 @@ class TestImporter(TestCase): # self.assertRaises(CommandError, cmd._check_manifest) with self.assertRaises(CommandError) as cm: cmd._check_manifest() - self.assertTrue( - 'The manifest file refers to "noexist.pdf"' in str(cm.exception), + self.assertIn( + 'The manifest file refers to "noexist.pdf"', + str(cm.exception), ) diff --git a/src/documents/tests/test_parsers.py b/src/documents/tests/test_parsers.py index 8ba2c70ee..eda4bacf8 100644 --- a/src/documents/tests/test_parsers.py +++ b/src/documents/tests/test_parsers.py @@ -1,6 +1,8 @@ from tempfile import TemporaryDirectory from unittest import mock +from django.apps import apps +from django.test import override_settings from django.test import TestCase from documents.parsers import get_default_file_extension from documents.parsers import get_parser_class_for_mime_type @@ -8,6 +10,7 @@ from documents.parsers import get_supported_file_extensions from documents.parsers import is_file_ext_supported from paperless_tesseract.parsers import RasterisedDocumentParser from paperless_text.parsers import TextDocumentParser +from paperless_tika.parsers import TikaDocumentParser class TestParserDiscovery(TestCase): @@ -124,14 +127,43 @@ class TestParserDiscovery(TestCase): class TestParserAvailability(TestCase): - def test_file_extensions(self): - + def test_tesseract_parser(self): + """ + GIVEN: + - Various mime types + WHEN: + - The parser class is instantiated + THEN: + - The Tesseract based parser is return + """ supported_mimes_and_exts = [ ("application/pdf", ".pdf"), ("image/png", ".png"), ("image/jpeg", ".jpg"), ("image/tiff", ".tif"), ("image/webp", ".webp"), + ] + + supported_exts = get_supported_file_extensions() + + for mime_type, ext in supported_mimes_and_exts: + self.assertIn(ext, supported_exts) + self.assertEqual(get_default_file_extension(mime_type), ext) + self.assertIsInstance( + get_parser_class_for_mime_type(mime_type)(logging_group=None), + RasterisedDocumentParser, + ) + + def test_text_parser(self): + """ + GIVEN: + - Various mime types of a text form + WHEN: + - The parser class is instantiated + THEN: + - The text based parser is return + """ + supported_mimes_and_exts = [ ("text/plain", ".txt"), ("text/csv", ".csv"), ] @@ -141,23 +173,55 @@ class TestParserAvailability(TestCase): for mime_type, ext in supported_mimes_and_exts: self.assertIn(ext, supported_exts) self.assertEqual(get_default_file_extension(mime_type), ext) + self.assertIsInstance( + get_parser_class_for_mime_type(mime_type)(logging_group=None), + TextDocumentParser, + ) + def test_tika_parser(self): + """ + GIVEN: + - Various mime types of a office document form + WHEN: + - The parser class is instantiated + THEN: + - The Tika/Gotenberg based parser is return + """ + supported_mimes_and_exts = [ + ("application/vnd.oasis.opendocument.text", ".odt"), + ("text/rtf", ".rtf"), + ("application/msword", ".doc"), + ( + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ".docx", + ), + ] + + # Force the app ready to notice the settings override + with override_settings(TIKA_ENABLED=True, INSTALLED_APPS=["paperless_tika"]): + app = apps.get_app_config("paperless_tika") + app.ready() + supported_exts = get_supported_file_extensions() + + for mime_type, ext in supported_mimes_and_exts: + self.assertIn(ext, supported_exts) + self.assertEqual(get_default_file_extension(mime_type), ext) + self.assertIsInstance( + get_parser_class_for_mime_type(mime_type)(logging_group=None), + TikaDocumentParser, + ) + + def test_no_parser_for_mime(self): + self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf")) + + def test_default_extension(self): # Test no parser declared still returns a an extension self.assertEqual(get_default_file_extension("application/zip"), ".zip") # Test invalid mimetype returns no extension self.assertEqual(get_default_file_extension("aasdasd/dgfgf"), "") - self.assertIsInstance( - get_parser_class_for_mime_type("application/pdf")(logging_group=None), - RasterisedDocumentParser, - ) - self.assertIsInstance( - get_parser_class_for_mime_type("text/plain")(logging_group=None), - TextDocumentParser, - ) - self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf")) - + def test_file_extension_support(self): self.assertTrue(is_file_ext_supported(".pdf")) self.assertFalse(is_file_ext_supported(".hsdfh")) self.assertFalse(is_file_ext_supported("")) diff --git a/src/paperless/tests/test_websockets.py b/src/paperless/tests/test_websockets.py index 069bb644a..cebbddf39 100644 --- a/src/paperless/tests/test_websockets.py +++ b/src/paperless/tests/test_websockets.py @@ -14,15 +14,14 @@ TEST_CHANNEL_LAYERS = { } +@override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS) class TestWebSockets(TestCase): - @override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS) async def test_no_auth(self): communicator = WebsocketCommunicator(application, "/ws/status/") connected, subprotocol = await communicator.connect() self.assertFalse(connected) await communicator.disconnect() - @override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS) @mock.patch("paperless.consumers.StatusConsumer._authenticated") async def test_auth(self, _authenticated): _authenticated.return_value = True @@ -33,7 +32,6 @@ class TestWebSockets(TestCase): await communicator.disconnect() - @override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS) @mock.patch("paperless.consumers.StatusConsumer._authenticated") async def test_receive(self, _authenticated): _authenticated.return_value = True diff --git a/src/paperless/views.py b/src/paperless/views.py index 9f3d017a6..975df6601 100644 --- a/src/paperless/views.py +++ b/src/paperless/views.py @@ -12,7 +12,7 @@ class StandardPagination(PageNumberPagination): class FaviconView(View): - def get(self, request, *args, **kwargs): + def get(self, request, *args, **kwargs): # pragma: nocover favicon = os.path.join( os.path.dirname(__file__), "static", diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 14068cb26..4227583f8 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -161,7 +161,7 @@ class RasterisedDocumentParser(DocumentParser): except Exception: # TODO catch all for various issues with PDFminer.six. - # If PDFminer fails, fall back to OCR. + # If pdftotext fails, fall back to OCR. self.log( "warning", "Error while getting text from PDF document with " "pdfminer.six", diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index 7fa399c97..d22ce26a7 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -364,7 +364,7 @@ class TestParser(DirectoriesMixin, TestCase): ) self.assertTrue(os.path.isfile(parser.archive_path)) self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2"]) - self.assertFalse("page 3" in parser.get_text().lower()) + self.assertNotIn("page 3", parser.get_text().lower()) @override_settings(OCR_PAGES=1, OCR_MODE="force") def test_multi_page_analog_pages_force(self): @@ -386,8 +386,8 @@ class TestParser(DirectoriesMixin, TestCase): ) self.assertTrue(os.path.isfile(parser.archive_path)) self.assertContainsStrings(parser.get_text().lower(), ["page 1"]) - self.assertFalse("page 2" in parser.get_text().lower()) - self.assertFalse("page 3" in parser.get_text().lower()) + self.assertNotIn("page 2", parser.get_text().lower()) + self.assertNotIn("page 3", parser.get_text().lower()) @override_settings(OCR_MODE="skip_noarchive") def test_skip_noarchive_withtext(self): @@ -660,6 +660,15 @@ class TestParser(DirectoriesMixin, TestCase): params = parser.construct_ocrmypdf_parameters("", "", "", "") self.assertNotIn("deskew", params) + with override_settings(OCR_MAX_IMAGE_PIXELS=1_000_001.0): + params = parser.construct_ocrmypdf_parameters("", "", "", "") + self.assertIn("max_image_mpixels", params) + self.assertAlmostEqual(params["max_image_mpixels"], 1, places=4) + + with override_settings(OCR_MAX_IMAGE_PIXELS=-1_000_001.0): + params = parser.construct_ocrmypdf_parameters("", "", "", "") + self.assertNotIn("max_image_mpixels", params) + def test_rtl_language_detection(self): """ GIVEN: diff --git a/src/paperless_tika/tests/test_tika_parser.py b/src/paperless_tika/tests/test_tika_parser.py index bf6b4e7c8..058196581 100644 --- a/src/paperless_tika/tests/test_tika_parser.py +++ b/src/paperless_tika/tests/test_tika_parser.py @@ -3,7 +3,9 @@ import os from pathlib import Path from unittest import mock +from django.test import override_settings from django.test import TestCase +from documents.parsers import ParseError from paperless_tika.parsers import TikaDocumentParser from requests import Response @@ -54,3 +56,63 @@ class TestTikaParser(TestCase): self.assertTrue("Creation-Date" in [m["key"] for m in metadata]) self.assertTrue("Some-key" in [m["key"] for m in metadata]) + + @mock.patch("paperless_tika.parsers.parser.from_file") + @mock.patch("paperless_tika.parsers.requests.post") + def test_convert_failure(self, post, from_file): + """ + GIVEN: + - Document needs to be converted to PDF + WHEN: + - Gotenberg server returns an error + THEN: + - Parse error is raised + """ + from_file.return_value = { + "content": "the content", + "metadata": {"Creation-Date": "2020-11-21"}, + } + response = Response() + response._content = b"PDF document" + response.status_code = 500 + post.return_value = response + + file = os.path.join(self.parser.tempdir, "input.odt") + Path(file).touch() + + with self.assertRaises(ParseError): + self.parser.convert_to_pdf(file, None) + + @mock.patch("paperless_tika.parsers.requests.post") + def test_request_pdf_a_format(self, post: mock.Mock): + """ + GIVEN: + - Document needs to be converted to PDF + WHEN: + - Specific PDF/A format requested + THEN: + - Request to Gotenberg contains the expected PDF/A format string + """ + file = os.path.join(self.parser.tempdir, "input.odt") + Path(file).touch() + + response = Response() + response._content = b"PDF document" + response.status_code = 200 + post.return_value = response + + for setting, expected_key in [ + ("pdfa", "PDF/A-2b"), + ("pdfa-2", "PDF/A-2b"), + ("pdfa-1", "PDF/A-1a"), + ("pdfa-3", "PDF/A-3b"), + ]: + with override_settings(OCR_OUTPUT_TYPE=setting): + self.parser.convert_to_pdf(file, None) + + post.assert_called_once() + _, kwargs = post.call_args + + self.assertEqual(kwargs["data"]["pdfFormat"], expected_key) + + post.reset_mock()