This commit is contained in:
jonaswinkler 2021-03-14 14:42:48 +01:00
parent 0ad2b05455
commit 40ce38254b
6 changed files with 63 additions and 30 deletions

View File

@ -143,6 +143,46 @@ def run_convert(input_file,
raise ParseError("Convert failed at {}".format(args)) raise ParseError("Convert failed at {}".format(args))
def get_default_thumbnail():
    """
    Return the path of the placeholder thumbnail image.

    The path points at ``resources/document.png`` inside the directory
    that contains this module.
    """
    module_dir = os.path.dirname(__file__)
    return os.path.join(module_dir, "resources", "document.png")
def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None):
    """
    Create a PDF thumbnail with Ghostscript after ImageMagick has failed.

    Ghostscript renders ``in_path`` to ``gs_out.png`` inside ``temp_dir``;
    ``run_convert`` then turns that PNG into ``convert_gs.png`` in the same
    directory.

    :param in_path: path of the PDF to render.
    :param temp_dir: directory that receives the intermediate and final PNG.
    :param logging_group: attached as the ``group`` extra on log records and
        forwarded to ``run_convert``.
    :return: path of the generated thumbnail, or the path returned by
        ``get_default_thumbnail()`` when Ghostscript cannot be started,
        exits with a non-zero status, or the convert step fails.
    """
    out_path = os.path.join(temp_dir, "convert_gs.png")

    # if convert fails, fall back to extracting
    # the first PDF page as a PNG using Ghostscript
    logger.warning(
        "Thumbnail generation with ImageMagick failed, falling back "
        "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
        extra={'group': logging_group}
    )
    gs_out_path = os.path.join(temp_dir, "gs_out.png")
    cmd = [settings.GS_BINARY,
           "-q",
           "-sDEVICE=pngalpha",
           "-o", gs_out_path,
           in_path]
    try:
        if subprocess.Popen(cmd).wait() != 0:
            raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
        # then run convert on the output from gs
        run_convert(density=300,
                    scale="500x5000>",
                    alpha="remove",
                    strip=True,
                    trim=False,
                    auto_orient=True,
                    input_file=gs_out_path,
                    output_file=out_path,
                    logging_group=logging_group)
        return out_path
    except (ParseError, OSError) as e:
        # Popen raises OSError (e.g. FileNotFoundError) when the binary is
        # missing or not executable; treat that like any other failure of
        # the fallback so the caller still receives a usable thumbnail.
        logger.warning(
            f"Thumbnail generation with ghostscript failed ({e}), "
            f"using the default thumbnail instead.",
            extra={'group': logging_group}
        )
        return get_default_thumbnail()
def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None): def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None):
""" """
The thumbnail of a PDF is just a 500px wide image of the first page. The thumbnail of a PDF is just a 500px wide image of the first page.
@ -161,31 +201,8 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None):
output_file=out_path, output_file=out_path,
logging_group=logging_group) logging_group=logging_group)
except ParseError: except ParseError:
# if convert fails, fall back to extracting out_path = make_thumbnail_from_pdf_gs_fallback(
# the first PDF page as a PNG using Ghostscript in_path, temp_dir, logging_group)
logger.warning(
"Thumbnail generation with ImageMagick failed, falling back "
"to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
extra={'group': logging_group}
)
gs_out_path = os.path.join(temp_dir, "gs_out.png")
cmd = [settings.GS_BINARY,
"-q",
"-sDEVICE=pngalpha",
"-o", gs_out_path,
in_path]
if not subprocess.Popen(cmd).wait() == 0:
raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
# then run convert on the output from gs
run_convert(density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file=gs_out_path,
output_file=out_path,
logging_group=logging_group)
return out_path return out_path

Binary file not shown.

After

Width:  |  Height:  |  Size: 10 KiB

View File

@ -291,6 +291,7 @@ class RasterisedDocumentParser(DocumentParser):
f"No text was found in {document_path}, the content will " f"No text was found in {document_path}, the content will "
f"be empty." f"be empty."
) )
self.text = ""
def strip_excess_whitespace(text): def strip_excess_whitespace(text):

Binary file not shown.

View File

@ -81,8 +81,8 @@ class TestParser(DirectoriesMixin, TestCase):
def test_thumbnail(self): def test_thumbnail(self):
parser = RasterisedDocumentParser(uuid.uuid4()) parser = RasterisedDocumentParser(uuid.uuid4())
parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf") thumb = parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf")
# dont really know how to test it, just call it and assert that it does not raise anything. self.assertTrue(os.path.isfile(thumb))
@mock.patch("documents.parsers.run_convert") @mock.patch("documents.parsers.run_convert")
def test_thumbnail_fallback(self, m): def test_thumbnail_fallback(self, m):
@ -96,8 +96,13 @@ class TestParser(DirectoriesMixin, TestCase):
m.side_effect = call_convert m.side_effect = call_convert
parser = RasterisedDocumentParser(uuid.uuid4()) parser = RasterisedDocumentParser(uuid.uuid4())
parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf") thumb = parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf")
# dont really know how to test it, just call it and assert that it does not raise anything. self.assertTrue(os.path.isfile(thumb))
def test_thumbnail_encrypted(self):
    # Requesting a thumbnail for the encrypted sample must hand back a
    # path that points at an existing file.
    parser = RasterisedDocumentParser(uuid.uuid4())
    encrypted_pdf = os.path.join(self.SAMPLE_FILES, 'encrypted.pdf')
    thumb_path = parser.get_thumbnail(encrypted_pdf, "application/pdf")
    self.assertTrue(os.path.isfile(thumb_path))
def test_get_dpi(self): def test_get_dpi(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
@ -135,6 +140,15 @@ class TestParser(DirectoriesMixin, TestCase):
self.assertIsNone(parser.archive_path) self.assertIsNone(parser.archive_path)
self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."]) self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."])
@override_settings(OCR_MODE="skip")
def test_signed(self):
    # Parsing the signed sample must not produce an archive file, and the
    # extracted text must contain the known phrases from the document.
    signed_pdf = os.path.join(self.SAMPLE_FILES, "signed.pdf")
    parser = RasterisedDocumentParser(None)
    parser.parse(signed_pdf, "application/pdf")

    self.assertIsNone(parser.archive_path)

    extracted_text = parser.get_text()
    expected_fragments = [
        "This is a digitally signed PDF, created with Acrobat Pro for the Paperless project to enable",
        "automated testing of signed/encrypted PDFs",
    ]
    self.assertContainsStrings(extracted_text, expected_fragments)
@override_settings(OCR_MODE="skip") @override_settings(OCR_MODE="skip")
def test_encrypted(self): def test_encrypted(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
@ -142,7 +156,8 @@ class TestParser(DirectoriesMixin, TestCase):
parser.parse(os.path.join(self.SAMPLE_FILES, "encrypted.pdf"), "application/pdf") parser.parse(os.path.join(self.SAMPLE_FILES, "encrypted.pdf"), "application/pdf")
self.assertIsNone(parser.archive_path) self.assertIsNone(parser.archive_path)
self.assertContainsStrings(parser.get_text(), ["This is a digitally signed PDF, created with Acrobat Pro for the Paperless project to enable", "automated testing of signed/encrypted PDFs"]) self.assertEqual(parser.get_text(), "")
@override_settings(OCR_MODE="redo") @override_settings(OCR_MODE="redo")
def test_with_form_error_notext(self): def test_with_form_error_notext(self):