def get_default_thumbnail():
    """
    Return the path of the placeholder thumbnail shipped with this package.

    The path points at ``resources/document.png`` next to this module and is
    used when no thumbnail could be rendered from the document itself.
    """
    return os.path.join(os.path.dirname(__file__), "resources", "document.png")


def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None):
    """
    Build a PDF thumbnail with Ghostscript after ImageMagick has failed.

    Ghostscript renders ``in_path`` to ``gs_out.png`` inside ``temp_dir``;
    that PNG is then passed through ``run_convert`` with the regular
    thumbnail settings to produce ``convert_gs.png``.

    :param in_path: path of the PDF to render.
    :param temp_dir: directory receiving the intermediate and final PNGs.
    :param logging_group: value attached to log records as ``group``.
    :return: path of the generated thumbnail, or the path returned by
        ``get_default_thumbnail()`` when Ghostscript or convert fails.
    """
    out_path = os.path.join(temp_dir, "convert_gs.png")

    # Callers only get here after convert failed on the PDF itself, so tell
    # the operator where to look before trying the Ghostscript route.
    logger.warning(
        "Thumbnail generation with ImageMagick failed, falling back "
        "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
        extra={'group': logging_group}
    )
    gs_out_path = os.path.join(temp_dir, "gs_out.png")
    # NOTE(review): no page range is passed to gs - confirm which page ends
    # up in the output file for multi-page PDFs.
    cmd = [settings.GS_BINARY,
           "-q",
           "-sDEVICE=pngalpha",
           "-o", gs_out_path,
           in_path]
    try:
        if subprocess.Popen(cmd).wait() != 0:
            raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
        # then run convert on the output from gs
        run_convert(density=300,
                    scale="500x5000>",
                    alpha="remove",
                    strip=True,
                    trim=False,
                    auto_orient=True,
                    input_file=gs_out_path,
                    output_file=out_path,
                    logging_group=logging_group)
    except (ParseError, OSError):
        # OSError covers a missing or non-executable binary: Popen raises it
        # before any exit code exists. This fallback is the last resort, so
        # it must not let that escape to the caller.
        logger.warning(
            "Thumbnail generation with ghostscript failed, using the "
            "default thumbnail.",
            extra={'group': logging_group}
        )
        return get_default_thumbnail()

    return out_path
Check your /etc/ImageMagick-x/policy.xml!", - extra={'group': logging_group} - ) - gs_out_path = os.path.join(temp_dir, "gs_out.png") - cmd = [settings.GS_BINARY, - "-q", - "-sDEVICE=pngalpha", - "-o", gs_out_path, - in_path] - if not subprocess.Popen(cmd).wait() == 0: - raise ParseError("Thumbnail (gs) failed at {}".format(cmd)) - # then run convert on the output from gs - run_convert(density=300, - scale="500x5000>", - alpha="remove", - strip=True, - trim=False, - auto_orient=True, - input_file=gs_out_path, - output_file=out_path, - logging_group=logging_group) + out_path = make_thumbnail_from_pdf_gs_fallback( + in_path, temp_dir, logging_group) return out_path diff --git a/src/documents/resources/document.png b/src/documents/resources/document.png new file mode 100644 index 000000000..8c24f9d6e Binary files /dev/null and b/src/documents/resources/document.png differ diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index deadf2234..589b25e37 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -291,6 +291,7 @@ class RasterisedDocumentParser(DocumentParser): f"No text was found in {document_path}, the content will " f"be empty." 
) + self.text = "" def strip_excess_whitespace(text): diff --git a/src/paperless_tesseract/tests/samples/encrypted.pdf b/src/paperless_tesseract/tests/samples/encrypted.pdf index 5996edeea..12c5a3070 100644 Binary files a/src/paperless_tesseract/tests/samples/encrypted.pdf and b/src/paperless_tesseract/tests/samples/encrypted.pdf differ diff --git a/src/paperless_tesseract/tests/samples/signed.pdf b/src/paperless_tesseract/tests/samples/signed.pdf new file mode 100644 index 000000000..5996edeea Binary files /dev/null and b/src/paperless_tesseract/tests/samples/signed.pdf differ diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index 21f1e140c..fe4e4733b 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -81,8 +81,8 @@ class TestParser(DirectoriesMixin, TestCase): def test_thumbnail(self): parser = RasterisedDocumentParser(uuid.uuid4()) - parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf") - # dont really know how to test it, just call it and assert that it does not raise anything. + thumb = parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf") + self.assertTrue(os.path.isfile(thumb)) @mock.patch("documents.parsers.run_convert") def test_thumbnail_fallback(self, m): @@ -96,8 +96,13 @@ class TestParser(DirectoriesMixin, TestCase): m.side_effect = call_convert parser = RasterisedDocumentParser(uuid.uuid4()) - parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf") - # dont really know how to test it, just call it and assert that it does not raise anything. 
+ thumb = parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf") + self.assertTrue(os.path.isfile(thumb)) + + def test_thumbnail_encrypted(self): + parser = RasterisedDocumentParser(uuid.uuid4()) + thumb = parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'encrypted.pdf'), "application/pdf") + self.assertTrue(os.path.isfile(thumb)) def test_get_dpi(self): parser = RasterisedDocumentParser(None) @@ -135,6 +140,15 @@ class TestParser(DirectoriesMixin, TestCase): self.assertIsNone(parser.archive_path) self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."]) + @override_settings(OCR_MODE="skip") + def test_signed(self): + parser = RasterisedDocumentParser(None) + + parser.parse(os.path.join(self.SAMPLE_FILES, "signed.pdf"), "application/pdf") + + self.assertIsNone(parser.archive_path) + self.assertContainsStrings(parser.get_text(), ["This is a digitally signed PDF, created with Acrobat Pro for the Paperless project to enable", "automated testing of signed/encrypted PDFs"]) + @override_settings(OCR_MODE="skip") def test_encrypted(self): parser = RasterisedDocumentParser(None) @@ -142,7 +156,8 @@ class TestParser(DirectoriesMixin, TestCase): parser.parse(os.path.join(self.SAMPLE_FILES, "encrypted.pdf"), "application/pdf") self.assertIsNone(parser.archive_path) - self.assertContainsStrings(parser.get_text(), ["This is a digitally signed PDF, created with Acrobat Pro for the Paperless project to enable", "automated testing of signed/encrypted PDFs"]) + self.assertEqual(parser.get_text(), "") + @override_settings(OCR_MODE="redo") def test_with_form_error_notext(self):