diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py
index d53879542..c1eddcefe 100644
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -66,6 +66,11 @@ class RasterisedDocumentParser(DocumentParser):
         "image/gif",
     ]
 
+    def has_alpha(self, image):
+        """Return True if the image file's mode is 'RGBA' or 'LA'."""
+        with Image.open(image) as im:
+            return im.mode in ('RGBA', 'LA')
+
     def get_dpi(self, image):
         try:
             with Image.open(image) as im:
@@ -182,6 +187,24 @@ class RasterisedDocumentParser(DocumentParser):
         if self.is_image(mime_type):
             dpi = self.get_dpi(input_file)
             a4_dpi = self.calculate_a4_dpi(input_file)
+
+            if self.has_alpha(input_file):
+                self.log(
+                    "info",
+                    f"Removing alpha layer from {input_file} "
+                    "for compatibility with img2pdf"
+                )
+                with Image.open(input_file) as im:
+                    image_format = im.format
+                    # alpha_composite() only accepts RGBA operands, so
+                    # promote 'LA' input before flattening it onto white.
+                    background = Image.new('RGBA', im.size, (255, 255, 255))
+                    background.alpha_composite(im.convert('RGBA'))
+                # Write back only once the source file handle is closed;
+                # this overwrites the input image in place.
+                background = background.convert('RGB')
+                background.save(input_file, format=image_format)
+
             if dpi:
                 self.log(
                     "debug",
diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py
index e39f87017..1ee295edf 100644
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@@ -181,13 +181,14 @@ class TestParser(DirectoriesMixin, TestCase):
 
         self.assertContainsStrings(parser.get_text(), ["This is a test document."])
 
-    def test_image_simple_alpha_fail(self):
+    def test_image_simple_alpha(self):
         parser = RasterisedDocumentParser(None)
 
-        def f():
-            parser.parse(os.path.join(self.SAMPLE_FILES, "simple-alpha.png"), "image/png")
+        parser.parse(os.path.join(self.SAMPLE_FILES, "simple-alpha.png"), "image/png")
 
-        self.assertRaises(ParseError, f)
+        self.assertTrue(os.path.isfile(parser.archive_path))
+
+        self.assertContainsStrings(parser.get_text(), ["This is a test document."])
 
     def test_image_calc_a4_dpi(self):
         parser = RasterisedDocumentParser(None)