remove duplicate code

2026-02-05 23:32:46 -06:00 · 2021-01-01 21:50:45 +01:00
parent 279e269a66
commit c05bfb894a
3 changed files with 60 additions and 103 deletions
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -144,6 +144,52 @@ def run_convert(input_file,
        raise ParseError("Convert failed at {}".format(args))


+def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None):
+    """
+    The thumbnail of a PDF is just a 500px wide image of the first page.
+    """
+    out_path = os.path.join(temp_dir, "convert.png")
+
+    # Run convert to get a decent thumbnail
+    try:
+        run_convert(density=300,
+                    scale="500x5000>",
+                    alpha="remove",
+                    strip=True,
+                    trim=False,
+                    auto_orient=True,
+                    input_file="{}[0]".format(in_path),
+                    output_file=out_path,
+                    logging_group=logging_group)
+    except ParseError:
+        # if convert fails, fall back to extracting
+        # the first PDF page as a PNG using Ghostscript
+        logger.warning(
+            "Thumbnail generation with ImageMagick failed, falling back "
+            "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
+            extra={'group': logging_group}
+        )
+        gs_out_path = os.path.join(temp_dir, "gs_out.png")
+        cmd = [settings.GS_BINARY,
+               "-q",
+               "-sDEVICE=pngalpha",
+               "-o", gs_out_path,
+               in_path]
+        if not subprocess.Popen(cmd).wait() == 0:
+            raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
+        # then run convert on the output from gs
+        run_convert(density=300,
+                    scale="500x5000>",
+                    alpha="remove",
+                    strip=True,
+                    trim=False,
+                    auto_orient=True,
+                    input_file=gs_out_path,
+                    output_file=out_path,
+                    logging_group=logging_group)
+
+    return out_path
+
 def parse_date(filename, text):
    """
    Returns the date of the document.
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -1,7 +1,6 @@
 import json
 import os
 import re
-import subprocess

 import ocrmypdf
 import pdftotext
@@ -10,7 +9,8 @@ from PIL import Image
 from django.conf import settings
 from ocrmypdf import InputFileError, EncryptedPdfError

-from documents.parsers import DocumentParser, ParseError, run_convert
+from documents.parsers import DocumentParser, ParseError, \
+    make_thumbnail_from_pdf


 class RasterisedDocumentParser(DocumentParser):
@@ -47,50 +47,8 @@ class RasterisedDocumentParser(DocumentParser):
        return result

    def get_thumbnail(self, document_path, mime_type):
-        """
-        The thumbnail of a PDF is just a 500px wide image of the first page.
-        """
-
-        out_path = os.path.join(self.tempdir, "convert.png")
-
-        # Run convert to get a decent thumbnail
-        try:
-            run_convert(density=300,
-                        scale="500x5000>",
-                        alpha="remove",
-                        strip=True,
-                        trim=False,
-                        auto_orient=True,
-                        input_file="{}[0]".format(document_path),
-                        output_file=out_path,
-                        logging_group=self.logging_group)
-        except ParseError:
-            # if convert fails, fall back to extracting
-            # the first PDF page as a PNG using Ghostscript
-            self.log(
-                'warning',
-                "Thumbnail generation with ImageMagick failed, falling back "
-                "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!")
-            gs_out_path = os.path.join(self.tempdir, "gs_out.png")
-            cmd = [settings.GS_BINARY,
-                   "-q",
-                   "-sDEVICE=pngalpha",
-                   "-o", gs_out_path,
-                   document_path]
-            if not subprocess.Popen(cmd).wait() == 0:
-                raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
-            # then run convert on the output from gs
-            run_convert(density=300,
-                        scale="500x5000>",
-                        alpha="remove",
-                        strip=True,
-                        trim=False,
-                        auto_orient=True,
-                        input_file=gs_out_path,
-                        output_file=out_path,
-                        logging_group=self.logging_group)
-
-        return out_path
+        return make_thumbnail_from_pdf(
+            document_path, self.tempdir, self.logging_group)

    def is_image(self, mime_type):
        return mime_type in [
--- a/src/paperless_tika/parsers.py
+++ b/src/paperless_tika/parsers.py
@@ -1,14 +1,11 @@
 import os
-import subprocess
-import tika
 import requests
 import dateutil.parser

-from PIL import ImageDraw, ImageFont, Image
 from django.conf import settings

-from documents.parsers import DocumentParser, ParseError, run_convert
-from paperless_tesseract.parsers import RasterisedDocumentParser
+from documents.parsers import DocumentParser, ParseError, \
+    make_thumbnail_from_pdf
 from tika import parser


@@ -18,55 +15,11 @@ class TikaDocumentParser(DocumentParser):
    """

    def get_thumbnail(self, document_path, mime_type):
-        self.log("info", f"[TIKA_THUMB] Generating thumbnail for{document_path}")
-        archive_path = self.archive_path
+        if not self.archive_path:
+            self.archive_path = self.convert_to_pdf(document_path)

-        out_path = os.path.join(self.tempdir, "convert.png")
-
-        # Run convert to get a decent thumbnail
-        try:
-            run_convert(
-                density=300,
-                scale="500x5000>",
-                alpha="remove",
-                strip=True,
-                trim=False,
-                input_file="{}[0]".format(archive_path),
-                output_file=out_path,
-                logging_group=self.logging_group,
-            )
-        except ParseError:
-            # if convert fails, fall back to extracting
-            # the first PDF page as a PNG using Ghostscript
-            self.log(
-                "warning",
-                "Thumbnail generation with ImageMagick failed, falling back "
-                "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
-            )
-            gs_out_path = os.path.join(self.tempdir, "gs_out.png")
-            cmd = [
-                settings.GS_BINARY,
-                "-q",
-                "-sDEVICE=pngalpha",
-                "-o",
-                gs_out_path,
-                archive_path,
-            ]
-            if not subprocess.Popen(cmd).wait() == 0:
-                raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
-            # then run convert on the output from gs
-            run_convert(
-                density=300,
-                scale="500x5000>",
-                alpha="remove",
-                strip=True,
-                trim=False,
-                input_file=gs_out_path,
-                output_file=out_path,
-                logging_group=self.logging_group,
-            )
-
-        return out_path
+        return make_thumbnail_from_pdf(
+            self.archive_path, self.tempdir, self.logging_group)

    def parse(self, document_path, mime_type):
        self.log("info", f"[TIKA_PARSE] Sending {document_path} to Tika server")
@@ -89,11 +42,9 @@ class TikaDocumentParser(DocumentParser):
        except:
            pass

-        archive_path = os.path.join(self.tempdir, "convert.pdf")
-        convert_to_pdf(document_path, archive_path)
-        self.archive_path = archive_path
+        self.archive_path = self.convert_to_pdf(document_path)

-    def convert_to_pdf(document_path, pdf_path):
+    def convert_to_pdf(self, document_path):
        pdf_path = os.path.join(self.tempdir, "convert.pdf")
        gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT
        url = gotenberg_server + "/convert/office"
@@ -113,3 +64,5 @@ class TikaDocumentParser(DocumentParser):
        file = open(pdf_path, "wb")
        file.write(response.content)
        file.close()
+
+        return pdf_path