remove duplicate code

This commit is contained in:
jonaswinkler 2021-01-01 21:50:45 +01:00
parent 279e269a66
commit c05bfb894a
3 changed files with 60 additions and 103 deletions

View File

@ -144,6 +144,52 @@ def run_convert(input_file,
raise ParseError("Convert failed at {}".format(args))
def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None):
"""
The thumbnail of a PDF is just a 500px wide image of the first page.
"""
out_path = os.path.join(temp_dir, "convert.png")
# Run convert to get a decent thumbnail
try:
run_convert(density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file="{}[0]".format(in_path),
output_file=out_path,
logging_group=logging_group)
except ParseError:
# if convert fails, fall back to extracting
# the first PDF page as a PNG using Ghostscript
logger.warning(
"Thumbnail generation with ImageMagick failed, falling back "
"to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
extra={'group': logging_group}
)
gs_out_path = os.path.join(temp_dir, "gs_out.png")
cmd = [settings.GS_BINARY,
"-q",
"-sDEVICE=pngalpha",
"-o", gs_out_path,
in_path]
if not subprocess.Popen(cmd).wait() == 0:
raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
# then run convert on the output from gs
run_convert(density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file=gs_out_path,
output_file=out_path,
logging_group=logging_group)
return out_path
def parse_date(filename, text):
"""
Returns the date of the document.

View File

@ -1,7 +1,6 @@
import json
import os
import re
import subprocess
import ocrmypdf
import pdftotext
@ -10,7 +9,8 @@ from PIL import Image
from django.conf import settings
from ocrmypdf import InputFileError, EncryptedPdfError
from documents.parsers import DocumentParser, ParseError, run_convert
from documents.parsers import DocumentParser, ParseError, \
make_thumbnail_from_pdf
class RasterisedDocumentParser(DocumentParser):
@ -47,50 +47,8 @@ class RasterisedDocumentParser(DocumentParser):
return result
def get_thumbnail(self, document_path, mime_type):
"""
The thumbnail of a PDF is just a 500px wide image of the first page.
"""
out_path = os.path.join(self.tempdir, "convert.png")
# Run convert to get a decent thumbnail
try:
run_convert(density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file="{}[0]".format(document_path),
output_file=out_path,
logging_group=self.logging_group)
except ParseError:
# if convert fails, fall back to extracting
# the first PDF page as a PNG using Ghostscript
self.log(
'warning',
"Thumbnail generation with ImageMagick failed, falling back "
"to ghostscript. Check your /etc/ImageMagick-x/policy.xml!")
gs_out_path = os.path.join(self.tempdir, "gs_out.png")
cmd = [settings.GS_BINARY,
"-q",
"-sDEVICE=pngalpha",
"-o", gs_out_path,
document_path]
if not subprocess.Popen(cmd).wait() == 0:
raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
# then run convert on the output from gs
run_convert(density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file=gs_out_path,
output_file=out_path,
logging_group=self.logging_group)
return out_path
return make_thumbnail_from_pdf(
document_path, self.tempdir, self.logging_group)
def is_image(self, mime_type):
return mime_type in [

View File

@ -1,14 +1,11 @@
import os
import subprocess
import tika
import requests
import dateutil.parser
from PIL import ImageDraw, ImageFont, Image
from django.conf import settings
from documents.parsers import DocumentParser, ParseError, run_convert
from paperless_tesseract.parsers import RasterisedDocumentParser
from documents.parsers import DocumentParser, ParseError, \
make_thumbnail_from_pdf
from tika import parser
@ -18,55 +15,11 @@ class TikaDocumentParser(DocumentParser):
"""
def get_thumbnail(self, document_path, mime_type):
self.log("info", f"[TIKA_THUMB] Generating thumbnail for{document_path}")
archive_path = self.archive_path
if not self.archive_path:
self.archive_path = self.convert_to_pdf(document_path)
out_path = os.path.join(self.tempdir, "convert.png")
# Run convert to get a decent thumbnail
try:
run_convert(
density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
input_file="{}[0]".format(archive_path),
output_file=out_path,
logging_group=self.logging_group,
)
except ParseError:
# if convert fails, fall back to extracting
# the first PDF page as a PNG using Ghostscript
self.log(
"warning",
"Thumbnail generation with ImageMagick failed, falling back "
"to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
)
gs_out_path = os.path.join(self.tempdir, "gs_out.png")
cmd = [
settings.GS_BINARY,
"-q",
"-sDEVICE=pngalpha",
"-o",
gs_out_path,
archive_path,
]
if not subprocess.Popen(cmd).wait() == 0:
raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
# then run convert on the output from gs
run_convert(
density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
input_file=gs_out_path,
output_file=out_path,
logging_group=self.logging_group,
)
return out_path
return make_thumbnail_from_pdf(
self.archive_path, self.tempdir, self.logging_group)
def parse(self, document_path, mime_type):
self.log("info", f"[TIKA_PARSE] Sending {document_path} to Tika server")
@ -89,11 +42,9 @@ class TikaDocumentParser(DocumentParser):
except:
pass
archive_path = os.path.join(self.tempdir, "convert.pdf")
convert_to_pdf(document_path, archive_path)
self.archive_path = archive_path
self.archive_path = self.convert_to_pdf(document_path)
def convert_to_pdf(document_path, pdf_path):
def convert_to_pdf(self, document_path):
pdf_path = os.path.join(self.tempdir, "convert.pdf")
gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT
url = gotenberg_server + "/convert/office"
@ -113,3 +64,5 @@ class TikaDocumentParser(DocumentParser):
file = open(pdf_path, "wb")
file.write(response.content)
file.close()
return pdf_path