mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
fixes #631
This commit is contained in:
parent
0ad2b05455
commit
40ce38254b
@ -143,6 +143,46 @@ def run_convert(input_file,
|
|||||||
raise ParseError("Convert failed at {}".format(args))
|
raise ParseError("Convert failed at {}".format(args))
|
||||||
|
|
||||||
|
|
||||||
|
def get_default_thumbnail():
|
||||||
|
return os.path.join(os.path.dirname(__file__), "resources", "document.png")
|
||||||
|
|
||||||
|
|
||||||
|
def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None):
|
||||||
|
out_path = os.path.join(temp_dir, "convert_gs.png")
|
||||||
|
|
||||||
|
# if convert fails, fall back to extracting
|
||||||
|
# the first PDF page as a PNG using Ghostscript
|
||||||
|
logger.warning(
|
||||||
|
"Thumbnail generation with ImageMagick failed, falling back "
|
||||||
|
"to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
|
||||||
|
extra={'group': logging_group}
|
||||||
|
)
|
||||||
|
gs_out_path = os.path.join(temp_dir, "gs_out.png")
|
||||||
|
cmd = [settings.GS_BINARY,
|
||||||
|
"-q",
|
||||||
|
"-sDEVICE=pngalpha",
|
||||||
|
"-o", gs_out_path,
|
||||||
|
in_path]
|
||||||
|
try:
|
||||||
|
if not subprocess.Popen(cmd).wait() == 0:
|
||||||
|
raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
|
||||||
|
# then run convert on the output from gs
|
||||||
|
run_convert(density=300,
|
||||||
|
scale="500x5000>",
|
||||||
|
alpha="remove",
|
||||||
|
strip=True,
|
||||||
|
trim=False,
|
||||||
|
auto_orient=True,
|
||||||
|
input_file=gs_out_path,
|
||||||
|
output_file=out_path,
|
||||||
|
logging_group=logging_group)
|
||||||
|
|
||||||
|
return out_path
|
||||||
|
|
||||||
|
except ParseError:
|
||||||
|
return get_default_thumbnail()
|
||||||
|
|
||||||
|
|
||||||
def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None):
|
def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None):
|
||||||
"""
|
"""
|
||||||
The thumbnail of a PDF is just a 500px wide image of the first page.
|
The thumbnail of a PDF is just a 500px wide image of the first page.
|
||||||
@ -161,31 +201,8 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None):
|
|||||||
output_file=out_path,
|
output_file=out_path,
|
||||||
logging_group=logging_group)
|
logging_group=logging_group)
|
||||||
except ParseError:
|
except ParseError:
|
||||||
# if convert fails, fall back to extracting
|
out_path = make_thumbnail_from_pdf_gs_fallback(
|
||||||
# the first PDF page as a PNG using Ghostscript
|
in_path, temp_dir, logging_group)
|
||||||
logger.warning(
|
|
||||||
"Thumbnail generation with ImageMagick failed, falling back "
|
|
||||||
"to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
|
|
||||||
extra={'group': logging_group}
|
|
||||||
)
|
|
||||||
gs_out_path = os.path.join(temp_dir, "gs_out.png")
|
|
||||||
cmd = [settings.GS_BINARY,
|
|
||||||
"-q",
|
|
||||||
"-sDEVICE=pngalpha",
|
|
||||||
"-o", gs_out_path,
|
|
||||||
in_path]
|
|
||||||
if not subprocess.Popen(cmd).wait() == 0:
|
|
||||||
raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
|
|
||||||
# then run convert on the output from gs
|
|
||||||
run_convert(density=300,
|
|
||||||
scale="500x5000>",
|
|
||||||
alpha="remove",
|
|
||||||
strip=True,
|
|
||||||
trim=False,
|
|
||||||
auto_orient=True,
|
|
||||||
input_file=gs_out_path,
|
|
||||||
output_file=out_path,
|
|
||||||
logging_group=logging_group)
|
|
||||||
|
|
||||||
return out_path
|
return out_path
|
||||||
|
|
||||||
|
BIN
src/documents/resources/document.png
Normal file
BIN
src/documents/resources/document.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 10 KiB |
@ -291,6 +291,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
f"No text was found in {document_path}, the content will "
|
f"No text was found in {document_path}, the content will "
|
||||||
f"be empty."
|
f"be empty."
|
||||||
)
|
)
|
||||||
|
self.text = ""
|
||||||
|
|
||||||
|
|
||||||
def strip_excess_whitespace(text):
|
def strip_excess_whitespace(text):
|
||||||
|
Binary file not shown.
BIN
src/paperless_tesseract/tests/samples/signed.pdf
Normal file
BIN
src/paperless_tesseract/tests/samples/signed.pdf
Normal file
Binary file not shown.
@ -81,8 +81,8 @@ class TestParser(DirectoriesMixin, TestCase):
|
|||||||
|
|
||||||
def test_thumbnail(self):
|
def test_thumbnail(self):
|
||||||
parser = RasterisedDocumentParser(uuid.uuid4())
|
parser = RasterisedDocumentParser(uuid.uuid4())
|
||||||
parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf")
|
thumb = parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf")
|
||||||
# dont really know how to test it, just call it and assert that it does not raise anything.
|
self.assertTrue(os.path.isfile(thumb))
|
||||||
|
|
||||||
@mock.patch("documents.parsers.run_convert")
|
@mock.patch("documents.parsers.run_convert")
|
||||||
def test_thumbnail_fallback(self, m):
|
def test_thumbnail_fallback(self, m):
|
||||||
@ -96,8 +96,13 @@ class TestParser(DirectoriesMixin, TestCase):
|
|||||||
m.side_effect = call_convert
|
m.side_effect = call_convert
|
||||||
|
|
||||||
parser = RasterisedDocumentParser(uuid.uuid4())
|
parser = RasterisedDocumentParser(uuid.uuid4())
|
||||||
parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf")
|
thumb = parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf")
|
||||||
# dont really know how to test it, just call it and assert that it does not raise anything.
|
self.assertTrue(os.path.isfile(thumb))
|
||||||
|
|
||||||
|
def test_thumbnail_encrypted(self):
|
||||||
|
parser = RasterisedDocumentParser(uuid.uuid4())
|
||||||
|
thumb = parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'encrypted.pdf'), "application/pdf")
|
||||||
|
self.assertTrue(os.path.isfile(thumb))
|
||||||
|
|
||||||
def test_get_dpi(self):
|
def test_get_dpi(self):
|
||||||
parser = RasterisedDocumentParser(None)
|
parser = RasterisedDocumentParser(None)
|
||||||
@ -135,6 +140,15 @@ class TestParser(DirectoriesMixin, TestCase):
|
|||||||
self.assertIsNone(parser.archive_path)
|
self.assertIsNone(parser.archive_path)
|
||||||
self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."])
|
self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."])
|
||||||
|
|
||||||
|
@override_settings(OCR_MODE="skip")
|
||||||
|
def test_signed(self):
|
||||||
|
parser = RasterisedDocumentParser(None)
|
||||||
|
|
||||||
|
parser.parse(os.path.join(self.SAMPLE_FILES, "signed.pdf"), "application/pdf")
|
||||||
|
|
||||||
|
self.assertIsNone(parser.archive_path)
|
||||||
|
self.assertContainsStrings(parser.get_text(), ["This is a digitally signed PDF, created with Acrobat Pro for the Paperless project to enable", "automated testing of signed/encrypted PDFs"])
|
||||||
|
|
||||||
@override_settings(OCR_MODE="skip")
|
@override_settings(OCR_MODE="skip")
|
||||||
def test_encrypted(self):
|
def test_encrypted(self):
|
||||||
parser = RasterisedDocumentParser(None)
|
parser = RasterisedDocumentParser(None)
|
||||||
@ -142,7 +156,8 @@ class TestParser(DirectoriesMixin, TestCase):
|
|||||||
parser.parse(os.path.join(self.SAMPLE_FILES, "encrypted.pdf"), "application/pdf")
|
parser.parse(os.path.join(self.SAMPLE_FILES, "encrypted.pdf"), "application/pdf")
|
||||||
|
|
||||||
self.assertIsNone(parser.archive_path)
|
self.assertIsNone(parser.archive_path)
|
||||||
self.assertContainsStrings(parser.get_text(), ["This is a digitally signed PDF, created with Acrobat Pro for the Paperless project to enable", "automated testing of signed/encrypted PDFs"])
|
self.assertEqual(parser.get_text(), "")
|
||||||
|
|
||||||
|
|
||||||
@override_settings(OCR_MODE="redo")
|
@override_settings(OCR_MODE="redo")
|
||||||
def test_with_form_error_notext(self):
|
def test_with_form_error_notext(self):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user