From 41494ee689aaf8cd9fbdd6cb6ad1b8c4d403b85d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20M=C3=BCller?= Date: Mon, 21 Feb 2022 22:06:43 +0100 Subject: [PATCH 1/4] Remove alpha layer from PNG files for img2pdf Fixes issue #1254 --- src/paperless_tesseract/parsers.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index d53879542..edb3190ac 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -66,6 +66,16 @@ class RasterisedDocumentParser(DocumentParser): "image/gif", ] + def has_alpha(self, image): + try: + with Image.open(image) as im: + return im.mode in ('RGBA', 'LA') + except Exception as e: + self.log( + 'warning', + f"Error while check for alpha channel in image {image}: {e}") + return None + def get_dpi(self, image): try: with Image.open(image) as im: @@ -182,6 +192,18 @@ class RasterisedDocumentParser(DocumentParser): if self.is_image(mime_type): dpi = self.get_dpi(input_file) a4_dpi = self.calculate_a4_dpi(input_file) + + if self.has_alpha(input_file): + self.log( + "info", + f"Removing alpha layer from {input_file} for compatibility with img2pdf" + ) + with Image.open(input_file) as im: + background = Image.new('RGBA', im.size, (255, 255, 255)) + background.alpha_composite(im) + background = background.convert('RGB') + background.save(input_file, format=im.format) + if dpi: self.log( "debug", From 2a47b3f1a1e2fa7cc7499fe39abcc167a2c75a57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20M=C3=BCller?= Date: Mon, 21 Feb 2022 22:34:34 +0100 Subject: [PATCH 2/4] Fix code style (line too long) --- src/paperless_tesseract/parsers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index edb3190ac..02de914ce 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -196,7 +196,8 @@ class 
RasterisedDocumentParser(DocumentParser): if self.has_alpha(input_file): self.log( "info", - f"Removing alpha layer from {input_file} for compatibility with img2pdf" + f"Removing alpha layer from {input_file} " + "for compatibility with img2pdf" ) with Image.open(input_file) as im: background = Image.new('RGBA', im.size, (255, 255, 255)) From 73a8569d21113e6e16d2b133eed7c0b48d173156 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20M=C3=BCller?= Date: Mon, 21 Feb 2022 22:38:25 +0100 Subject: [PATCH 3/4] Modify test for PNG image with alpha --- src/paperless_tesseract/tests/test_parser.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index e39f87017..1ee295edf 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -181,13 +181,14 @@ class TestParser(DirectoriesMixin, TestCase): self.assertContainsStrings(parser.get_text(), ["This is a test document."]) - def test_image_simple_alpha_fail(self): + def test_image_simple_alpha(self): parser = RasterisedDocumentParser(None) - def f(): - parser.parse(os.path.join(self.SAMPLE_FILES, "simple-alpha.png"), "image/png") + parser.parse(os.path.join(self.SAMPLE_FILES, "simple-alpha.png"), "image/png") - self.assertRaises(ParseError, f) + self.assertTrue(os.path.isfile(parser.archive_path)) + + self.assertContainsStrings(parser.get_text(), ["This is a test document."]) def test_image_calc_a4_dpi(self): parser = RasterisedDocumentParser(None) From 1e288100a96315ff1fa4de2be7872fa42e68502c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20M=C3=BCller?= Date: Mon, 21 Feb 2022 22:58:19 +0100 Subject: [PATCH 4/4] Remove unneeded exception handler from has_alpha() --- src/paperless_tesseract/parsers.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 
02de914ce..c1eddcefe 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -67,14 +67,8 @@ class RasterisedDocumentParser(DocumentParser): ] def has_alpha(self, image): - try: - with Image.open(image) as im: - return im.mode in ('RGBA', 'LA') - except Exception as e: - self.log( - 'warning', - f"Error while check for alpha channel in image {image}: {e}") - return None + with Image.open(image) as im: + return im.mode in ('RGBA', 'LA') def get_dpi(self, image): try: