remove duplicate code

This commit is contained in:
jonaswinkler 2021-01-01 21:50:45 +01:00
parent 279e269a66
commit c05bfb894a
3 changed files with 60 additions and 103 deletions

View File

@ -144,6 +144,52 @@ def run_convert(input_file,
raise ParseError("Convert failed at {}".format(args)) raise ParseError("Convert failed at {}".format(args))
def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None):
    """
    Create a PNG thumbnail for a PDF and return the path of the PNG.

    ImageMagick (via run_convert) is asked to render page index 0 of
    ``in_path`` with the geometry "500x5000>". If that raises ParseError,
    the PDF is rasterised with Ghostscript into ``temp_dir`` first and
    run_convert is applied to Ghostscript's output instead.

    :param in_path: path of the PDF to create a thumbnail for.
    :param temp_dir: directory that receives "convert.png" (the result)
        and, on the fallback path, the intermediate "gs_out.png".
    :param logging_group: forwarded to run_convert and attached to the
        fallback warning as the 'group' logging extra.
    :return: path of the generated thumbnail inside ``temp_dir``.
    :raises ParseError: if Ghostscript exits with a non-zero status, or if
        the run_convert call on the fallback path fails.
    """
    thumbnail_path = os.path.join(temp_dir, "convert.png")

    def convert_to_thumbnail(source):
        # Both the primary and the fallback path use the exact same
        # convert settings; only the input differs.
        run_convert(density=300,
                    scale="500x5000>",
                    alpha="remove",
                    strip=True,
                    trim=False,
                    auto_orient=True,
                    input_file=source,
                    output_file=thumbnail_path,
                    logging_group=logging_group)

    try:
        # "[0]" selects the first page of the PDF for ImageMagick.
        convert_to_thumbnail("{}[0]".format(in_path))
    except ParseError:
        # ImageMagick could not handle the PDF directly (the warning hints
        # at its policy.xml as the usual suspect), so let Ghostscript
        # rasterise the document and feed the resulting PNG to convert.
        logger.warning(
            "Thumbnail generation with ImageMagick failed, falling back "
            "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
            extra={'group': logging_group}
        )

        ghostscript_png = os.path.join(temp_dir, "gs_out.png")
        # NOTE(review): no page range is passed to Ghostscript here;
        # confirm which page ends up in gs_out.png for multi-page PDFs.
        gs_command = [settings.GS_BINARY,
                      "-q",
                      "-sDEVICE=pngalpha",
                      "-o", ghostscript_png,
                      in_path]

        exit_status = subprocess.Popen(gs_command).wait()
        if exit_status != 0:
            raise ParseError(
                "Thumbnail (gs) failed at {}".format(gs_command))

        # then run convert on the output from gs
        convert_to_thumbnail(ghostscript_png)

    return thumbnail_path
def parse_date(filename, text): def parse_date(filename, text):
""" """
Returns the date of the document. Returns the date of the document.

View File

@ -1,7 +1,6 @@
import json import json
import os import os
import re import re
import subprocess
import ocrmypdf import ocrmypdf
import pdftotext import pdftotext
@ -10,7 +9,8 @@ from PIL import Image
from django.conf import settings from django.conf import settings
from ocrmypdf import InputFileError, EncryptedPdfError from ocrmypdf import InputFileError, EncryptedPdfError
from documents.parsers import DocumentParser, ParseError, run_convert from documents.parsers import DocumentParser, ParseError, \
make_thumbnail_from_pdf
class RasterisedDocumentParser(DocumentParser): class RasterisedDocumentParser(DocumentParser):
@ -47,50 +47,8 @@ class RasterisedDocumentParser(DocumentParser):
return result return result
def get_thumbnail(self, document_path, mime_type): def get_thumbnail(self, document_path, mime_type):
""" return make_thumbnail_from_pdf(
The thumbnail of a PDF is just a 500px wide image of the first page. document_path, self.tempdir, self.logging_group)
"""
out_path = os.path.join(self.tempdir, "convert.png")
# Run convert to get a decent thumbnail
try:
run_convert(density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file="{}[0]".format(document_path),
output_file=out_path,
logging_group=self.logging_group)
except ParseError:
# if convert fails, fall back to extracting
# the first PDF page as a PNG using Ghostscript
self.log(
'warning',
"Thumbnail generation with ImageMagick failed, falling back "
"to ghostscript. Check your /etc/ImageMagick-x/policy.xml!")
gs_out_path = os.path.join(self.tempdir, "gs_out.png")
cmd = [settings.GS_BINARY,
"-q",
"-sDEVICE=pngalpha",
"-o", gs_out_path,
document_path]
if not subprocess.Popen(cmd).wait() == 0:
raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
# then run convert on the output from gs
run_convert(density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file=gs_out_path,
output_file=out_path,
logging_group=self.logging_group)
return out_path
def is_image(self, mime_type): def is_image(self, mime_type):
return mime_type in [ return mime_type in [

View File

@ -1,14 +1,11 @@
import os import os
import subprocess
import tika
import requests import requests
import dateutil.parser import dateutil.parser
from PIL import ImageDraw, ImageFont, Image
from django.conf import settings from django.conf import settings
from documents.parsers import DocumentParser, ParseError, run_convert from documents.parsers import DocumentParser, ParseError, \
from paperless_tesseract.parsers import RasterisedDocumentParser make_thumbnail_from_pdf
from tika import parser from tika import parser
@ -18,55 +15,11 @@ class TikaDocumentParser(DocumentParser):
""" """
def get_thumbnail(self, document_path, mime_type): def get_thumbnail(self, document_path, mime_type):
self.log("info", f"[TIKA_THUMB] Generating thumbnail for{document_path}") if not self.archive_path:
archive_path = self.archive_path self.archive_path = self.convert_to_pdf(document_path)
out_path = os.path.join(self.tempdir, "convert.png") return make_thumbnail_from_pdf(
self.archive_path, self.tempdir, self.logging_group)
# Run convert to get a decent thumbnail
try:
run_convert(
density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
input_file="{}[0]".format(archive_path),
output_file=out_path,
logging_group=self.logging_group,
)
except ParseError:
# if convert fails, fall back to extracting
# the first PDF page as a PNG using Ghostscript
self.log(
"warning",
"Thumbnail generation with ImageMagick failed, falling back "
"to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
)
gs_out_path = os.path.join(self.tempdir, "gs_out.png")
cmd = [
settings.GS_BINARY,
"-q",
"-sDEVICE=pngalpha",
"-o",
gs_out_path,
archive_path,
]
if not subprocess.Popen(cmd).wait() == 0:
raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
# then run convert on the output from gs
run_convert(
density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
input_file=gs_out_path,
output_file=out_path,
logging_group=self.logging_group,
)
return out_path
def parse(self, document_path, mime_type): def parse(self, document_path, mime_type):
self.log("info", f"[TIKA_PARSE] Sending {document_path} to Tika server") self.log("info", f"[TIKA_PARSE] Sending {document_path} to Tika server")
@ -89,11 +42,9 @@ class TikaDocumentParser(DocumentParser):
except: except:
pass pass
archive_path = os.path.join(self.tempdir, "convert.pdf") self.archive_path = self.convert_to_pdf(document_path)
convert_to_pdf(document_path, archive_path)
self.archive_path = archive_path
def convert_to_pdf(document_path, pdf_path): def convert_to_pdf(self, document_path):
pdf_path = os.path.join(self.tempdir, "convert.pdf") pdf_path = os.path.join(self.tempdir, "convert.pdf")
gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT
url = gotenberg_server + "/convert/office" url = gotenberg_server + "/convert/office"
@ -113,3 +64,5 @@ class TikaDocumentParser(DocumentParser):
file = open(pdf_path, "wb") file = open(pdf_path, "wb")
file.write(response.content) file.write(response.content)
file.close() file.close()
return pdf_path