remove duplicate code

2025-07-30 18:27:45 -05:00 · 2021-01-01 21:50:45 +01:00
parent 279e269a66
commit c05bfb894a
3 changed files with 60 additions and 103 deletions
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -144,6 +144,52 @@ def run_convert(input_file,
        raise ParseError("Convert failed at {}".format(args))
 def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None):
    """
    The thumbnail of a PDF is just a 500px wide image of the first page.

    ImageMagick (via ``run_convert``) is tried first on page 0 of the PDF.
    If that raises ``ParseError``, the first page is rendered to a PNG with
    Ghostscript and that PNG is passed through ``run_convert`` instead.

    :param in_path: path of the PDF to create a thumbnail for.
    :param temp_dir: directory in which ``convert.png`` (the result) and,
        on fallback, ``gs_out.png`` (the intermediate image) are written.
    :param logging_group: forwarded to ``run_convert`` and attached to the
        fallback warning as the ``group`` log extra.
    :return: path of the generated thumbnail, ``<temp_dir>/convert.png``.
    :raises ParseError: if Ghostscript exits with a non-zero status, or if
        the second ``run_convert`` call fails as well.
    """
    out_path = os.path.join(temp_dir, "convert.png")

    # Both convert invocations use exactly the same settings; only the
    # input file differs between the direct attempt and the fallback.
    convert_options = dict(
        density=300,
        scale="500x5000>",
        alpha="remove",
        strip=True,
        trim=False,
        auto_orient=True,
        output_file=out_path,
        logging_group=logging_group,
    )

    # Run convert to get a decent thumbnail
    try:
        # The "[0]" suffix selects only the first page of the PDF.
        run_convert(input_file="{}[0]".format(in_path), **convert_options)
    except ParseError:
        # if convert fails, fall back to extracting
        # the first PDF page as a PNG using Ghostscript
        logger.warning(
            "Thumbnail generation with ImageMagick failed, falling back "
            "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
            extra={'group': logging_group}
        )
        gs_out_path = os.path.join(temp_dir, "gs_out.png")
        # Restrict Ghostscript to page 1. Without a page range it renders
        # every page into the same output file, so for multi-page documents
        # the thumbnail would end up showing the last page, not the first.
        cmd = [settings.GS_BINARY,
               "-q",
               "-sDEVICE=pngalpha",
               "-dFirstPage=1",
               "-dLastPage=1",
               "-o", gs_out_path,
               in_path]
        if subprocess.Popen(cmd).wait() != 0:
            raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
        # then run convert on the output from gs
        run_convert(input_file=gs_out_path, **convert_options)

    return out_path
 def parse_date(filename, text):
    """
    Returns the date of the document.
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -1,7 +1,6 @@
 import json
 import os
 import re
 import subprocess
 import ocrmypdf
 import pdftotext
@@ -10,7 +9,8 @@ from PIL import Image
 from django.conf import settings
 from ocrmypdf import InputFileError, EncryptedPdfError
-from documents.parsers import DocumentParser, ParseError, run_convert
+from documents.parsers import DocumentParser, ParseError, \
    make_thumbnail_from_pdf
 class RasterisedDocumentParser(DocumentParser):
@@ -47,50 +47,8 @@ class RasterisedDocumentParser(DocumentParser):
        return result
    def get_thumbnail(self, document_path, mime_type):
-        """
+        return make_thumbnail_from_pdf(
-        The thumbnail of a PDF is just a 500px wide image of the first page.
+            document_path, self.tempdir, self.logging_group)
        """
        out_path = os.path.join(self.tempdir, "convert.png")
        # Run convert to get a decent thumbnail
        try:
            run_convert(density=300,
                        scale="500x5000>",
                        alpha="remove",
                        strip=True,
                        trim=False,
                        auto_orient=True,
                        input_file="{}[0]".format(document_path),
                        output_file=out_path,
                        logging_group=self.logging_group)
        except ParseError:
            # if convert fails, fall back to extracting
            # the first PDF page as a PNG using Ghostscript
            self.log(
                'warning',
                "Thumbnail generation with ImageMagick failed, falling back "
                "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!")
            gs_out_path = os.path.join(self.tempdir, "gs_out.png")
            cmd = [settings.GS_BINARY,
                   "-q",
                   "-sDEVICE=pngalpha",
                   "-o", gs_out_path,
                   document_path]
            if not subprocess.Popen(cmd).wait() == 0:
                raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
            # then run convert on the output from gs
            run_convert(density=300,
                        scale="500x5000>",
                        alpha="remove",
                        strip=True,
                        trim=False,
                        auto_orient=True,
                        input_file=gs_out_path,
                        output_file=out_path,
                        logging_group=self.logging_group)
        return out_path
    def is_image(self, mime_type):
        return mime_type in [
--- a/src/paperless_tika/parsers.py
+++ b/src/paperless_tika/parsers.py
@@ -1,14 +1,11 @@
 import os
 import subprocess
 import tika
 import requests
 import dateutil.parser
 from PIL import ImageDraw, ImageFont, Image
 from django.conf import settings
-from documents.parsers import DocumentParser, ParseError, run_convert
+from documents.parsers import DocumentParser, ParseError, \
-from paperless_tesseract.parsers import RasterisedDocumentParser
+    make_thumbnail_from_pdf
 from tika import parser
@@ -18,55 +15,11 @@ class TikaDocumentParser(DocumentParser):
    """
    def get_thumbnail(self, document_path, mime_type):
-        self.log("info", f"[TIKA_THUMB] Generating thumbnail for{document_path}")
+        if not self.archive_path:
-        archive_path = self.archive_path
+            self.archive_path = self.convert_to_pdf(document_path)
-        out_path = os.path.join(self.tempdir, "convert.png")
+        return make_thumbnail_from_pdf(
-
+            self.archive_path, self.tempdir, self.logging_group)
        # Run convert to get a decent thumbnail
        try:
            run_convert(
                density=300,
                scale="500x5000>",
                alpha="remove",
                strip=True,
                trim=False,
                input_file="{}[0]".format(archive_path),
                output_file=out_path,
                logging_group=self.logging_group,
            )
        except ParseError:
            # if convert fails, fall back to extracting
            # the first PDF page as a PNG using Ghostscript
            self.log(
                "warning",
                "Thumbnail generation with ImageMagick failed, falling back "
                "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
            )
            gs_out_path = os.path.join(self.tempdir, "gs_out.png")
            cmd = [
                settings.GS_BINARY,
                "-q",
                "-sDEVICE=pngalpha",
                "-o",
                gs_out_path,
                archive_path,
            ]
            if not subprocess.Popen(cmd).wait() == 0:
                raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
            # then run convert on the output from gs
            run_convert(
                density=300,
                scale="500x5000>",
                alpha="remove",
                strip=True,
                trim=False,
                input_file=gs_out_path,
                output_file=out_path,
                logging_group=self.logging_group,
            )
        return out_path
    def parse(self, document_path, mime_type):
        self.log("info", f"[TIKA_PARSE] Sending {document_path} to Tika server")
@@ -89,11 +42,9 @@ class TikaDocumentParser(DocumentParser):
        except:
            pass
-        archive_path = os.path.join(self.tempdir, "convert.pdf")
+        self.archive_path = self.convert_to_pdf(document_path)
        convert_to_pdf(document_path, archive_path)
        self.archive_path = archive_path
-    def convert_to_pdf(document_path, pdf_path):
+    def convert_to_pdf(self, document_path):
        pdf_path = os.path.join(self.tempdir, "convert.pdf")
        gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT
        url = gotenberg_server + "/convert/office"
@@ -113,3 +64,5 @@ class TikaDocumentParser(DocumentParser):
        file = open(pdf_path, "wb")
        file.write(response.content)
        file.close()
        return pdf_path