Merge branch 'dev' into feature-localization

2026-02-07 23:42:46 -06:00 · 2021-01-02 00:15:03 +01:00
parent 564f3b9170 d9f5bc7681
commit fdf330276e
49 changed files with 3218 additions and 466 deletions
--- a/src/documents/apps.py
+++ b/src/documents/apps.py
@@ -6,29 +6,21 @@ class DocumentsConfig(AppConfig):
    name = "documents"

    def ready(self):
-
-        from .signals import document_consumption_started
        from .signals import document_consumption_finished
        from .signals.handlers import (
            add_inbox_tags,
-            run_pre_consume_script,
-            run_post_consume_script,
            set_log_entry,
            set_correspondent,
            set_document_type,
            set_tags,
            add_to_index
-
        )

-        document_consumption_started.connect(run_pre_consume_script)
-
        document_consumption_finished.connect(add_inbox_tags)
        document_consumption_finished.connect(set_correspondent)
        document_consumption_finished.connect(set_document_type)
        document_consumption_finished.connect(set_tags)
        document_consumption_finished.connect(set_log_entry)
        document_consumption_finished.connect(add_to_index)
-        document_consumption_finished.connect(run_post_consume_script)

        AppConfig.ready(self)
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -1,7 +1,7 @@
 import datetime
 import hashlib
-import logging
 import os
+from subprocess import Popen

 import magic
 from django.conf import settings
@@ -9,6 +9,7 @@ from django.db import transaction
 from django.db.models import Q
 from django.utils import timezone
 from filelock import FileLock
+from rest_framework.reverse import reverse

 from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
 from .file_handling import create_source_path_directory, \
@@ -66,6 +67,39 @@ class Consumer(LoggingMixin):
        os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
        os.makedirs(settings.ARCHIVE_DIR, exist_ok=True)

+    def run_pre_consume_script(self):
+        """
+        Run the configured pre-consume script on the file being consumed.
+
+        Does nothing when settings.PRE_CONSUME_SCRIPT is not set. The
+        script is started with self.path as its only argument and this
+        method blocks until it exits.
+
+        Raises ConsumerError if launching or waiting for the script fails.
+        """
+        script = settings.PRE_CONSUME_SCRIPT
+        if not script:
+            return
+
+        try:
+            process = Popen((script, self.path))
+            process.wait()
+        except Exception as e:
+            raise ConsumerError(
+                f"Error while executing pre-consume script: {e}"
+            )
+
+    def run_post_consume_script(self, document):
+        """
+        Run the configured post-consume script for a consumed document.
+
+        Does nothing when settings.POST_CONSUME_SCRIPT is not set. The
+        script receives, in order: the document pk, its public filename,
+        the normalised source and thumbnail paths, the download and
+        thumbnail URL paths, the correspondent and a comma-separated list
+        of tag names. This method blocks until the script exits.
+
+        Raises ConsumerError if launching or waiting for the script fails.
+        """
+        if not settings.POST_CONSUME_SCRIPT:
+            return
+
+        try:
+            Popen((
+                settings.POST_CONSUME_SCRIPT,
+                str(document.pk),
+                document.get_public_filename(),
+                os.path.normpath(document.source_path),
+                os.path.normpath(document.thumbnail_path),
+                reverse("document-download", kwargs={"pk": document.pk}),
+                reverse("document-thumb", kwargs={"pk": document.pk}),
+                str(document.correspondent),
+                str(",".join(document.tags.all().values_list(
+                    "name", flat=True)))
+            )).wait()
+        except Exception as e:
+            # Name the right script so failures are not mistaken for
+            # pre-consume errors.
+            raise ConsumerError(
+                f"Error while executing post-consume script: {e}"
+            ) from e
+
    def try_consume_file(self,
                         path,
                         override_filename=None,
@@ -119,6 +153,8 @@ class Consumer(LoggingMixin):
            logging_group=self.logging_group
        )

+        self.run_pre_consume_script()
+
        # This doesn't parse the document yet, but gives us a parser.

        document_parser = parser_class(self.logging_group)
@@ -130,7 +166,7 @@ class Consumer(LoggingMixin):

        try:
            self.log("debug", "Parsing {}...".format(self.filename))
-            document_parser.parse(self.path, mime_type)
+            document_parser.parse(self.path, mime_type, self.filename)

            self.log("debug", f"Generating thumbnail for {self.filename}...")
            thumbnail = document_parser.get_optimised_thumbnail(
@@ -215,6 +251,9 @@ class Consumer(LoggingMixin):
                # Delete the file only if it was successfully consumed
                self.log("debug", "Deleting file {}".format(self.path))
                os.unlink(self.path)
+
+                self.run_post_consume_script(document)
+
        except Exception as e:
            self.log(
                "error",
--- a/src/documents/migrations/1010_auto_20210101_2159.py
+++ b/src/documents/migrations/1010_auto_20210101_2159.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.1.4 on 2021-01-01 21:59
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    # Schema migration for the documents app: makes
+    # SavedViewFilterRule.value optional (blank=True, null=True) while
+    # keeping max_length=128.
+
+    # Applied on top of migration 1009_auto_20201216_2005.
+    dependencies = [
+        ('documents', '1009_auto_20201216_2005'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='savedviewfilterrule',
+            name='value',
+            field=models.CharField(blank=True, max_length=128, null=True),
+        ),
+    ]
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -404,7 +404,9 @@ class SavedViewFilterRule(models.Model):

    value = models.CharField(
        _("value"),
-        max_length=128)
+        max_length=128,
+        blank=True,
+        null=True)

    class Meta:
        verbose_name = _("filter rule")
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -144,6 +144,52 @@ def run_convert(input_file,
        raise ParseError("Convert failed at {}".format(args))


+def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None):
+    """
+    Create a PNG thumbnail for a PDF and return the path to it.
+
+    ImageMagick's convert is run on the first page of in_path. If that
+    raises ParseError, the PDF is rendered with Ghostscript first and
+    convert is run on the Ghostscript output instead. The result is
+    written to "convert.png" inside temp_dir.
+
+    Raises ParseError if Ghostscript exits with a non-zero status, or if
+    the second convert run fails.
+    """
+    out_path = os.path.join(temp_dir, "convert.png")
+
+    # Options shared by the direct attempt and the Ghostscript fallback.
+    convert_options = {
+        "density": 300,
+        "scale": "500x5000>",
+        "alpha": "remove",
+        "strip": True,
+        "trim": False,
+        "auto_orient": True,
+        "output_file": out_path,
+        "logging_group": logging_group,
+    }
+
+    try:
+        # Run convert to get a decent thumbnail
+        run_convert(input_file="{}[0]".format(in_path), **convert_options)
+    except ParseError:
+        # convert failed: render the PDF to a PNG with Ghostscript and
+        # feed that image to convert instead
+        logger.warning(
+            "Thumbnail generation with ImageMagick failed, falling back "
+            "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
+            extra={'group': logging_group}
+        )
+        gs_out_path = os.path.join(temp_dir, "gs_out.png")
+        cmd = [
+            settings.GS_BINARY,
+            "-q",
+            "-sDEVICE=pngalpha",
+            "-o", gs_out_path,
+            in_path,
+        ]
+        exit_code = subprocess.Popen(cmd).wait()
+        if exit_code != 0:
+            raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
+
+        run_convert(input_file=gs_out_path, **convert_options)
+
+    return out_path
+
 def parse_date(filename, text):
    """
    Returns the date of the document.
@@ -221,7 +267,7 @@ class DocumentParser(LoggingMixin):
    def extract_metadata(self, document_path, mime_type):
        return []

-    def parse(self, document_path, mime_type):
+    def parse(self, document_path, mime_type, file_name=None):
        raise NotImplementedError()

    def get_archive_path(self):
--- a/src/documents/signals/handlers.py
+++ b/src/documents/signals/handlers.py
@@ -11,7 +11,6 @@ from django.db.models import Q
 from django.dispatch import receiver
 from django.utils import timezone
 from filelock import FileLock
-from rest_framework.reverse import reverse

 from .. import index, matching
 from ..file_handling import delete_empty_directories, \
@@ -147,32 +146,6 @@ def set_tags(sender,
    document.tags.add(*relevant_tags)


-def run_pre_consume_script(sender, filename, **kwargs):
-
-    if not settings.PRE_CONSUME_SCRIPT:
-        return
-
-    Popen((settings.PRE_CONSUME_SCRIPT, filename)).wait()
-
-
-def run_post_consume_script(sender, document, **kwargs):
-
-    if not settings.POST_CONSUME_SCRIPT:
-        return
-
-    Popen((
-        settings.POST_CONSUME_SCRIPT,
-        str(document.pk),
-        document.get_public_filename(),
-        os.path.normpath(document.source_path),
-        os.path.normpath(document.thumbnail_path),
-        reverse("document-download", kwargs={"pk": document.pk}),
-        reverse("document-thumb", kwargs={"pk": document.pk}),
-        str(document.correspondent),
-        str(",".join(document.tags.all().values_list("name", flat=True)))
-    )).wait()
-
-
@receiver(models.signals.post_delete, sender=Document)
 def cleanup_document_deletion(sender, instance, using, **kwargs):
    with FileLock(settings.MEDIA_LOCK):
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -177,7 +177,7 @@ class DummyParser(DocumentParser):
    def get_optimised_thumbnail(self, document_path, mime_type):
        return self.fake_thumb

-    def parse(self, document_path, mime_type):
+    def parse(self, document_path, mime_type, file_name=None):
        self.text = "The Text"


@@ -194,7 +194,7 @@ class FaultyParser(DocumentParser):
    def get_optimised_thumbnail(self, document_path, mime_type):
        return self.fake_thumb

-    def parse(self, document_path, mime_type):
+    def parse(self, document_path, mime_type, file_name=None):
        raise ParseError("Does not compute.")


@@ -466,3 +466,53 @@ class TestConsumer(DirectoriesMixin, TestCase):
        self.assertTrue(os.path.isfile(dst))
        self.assertRaises(ConsumerError, self.consumer.try_consume_file, dst)
        self.assertTrue(os.path.isfile(dst))
+
+
+class PostConsumeTestCase(TestCase):
+    """
+    Tests for Consumer.run_post_consume_script.
+
+    Popen is patched in documents.consumer, the module where the method
+    looks the name up, so no real process is started.
+    """
+
+    @mock.patch("documents.consumer.Popen")
+    @override_settings(POST_CONSUME_SCRIPT=None)
+    def test_no_post_consume_script(self, m):
+        """No script configured: nothing is executed."""
+        doc = Document.objects.create(title="Test", mime_type="application/pdf")
+        tag1 = Tag.objects.create(name="a")
+        tag2 = Tag.objects.create(name="b")
+        doc.tags.add(tag1)
+        doc.tags.add(tag2)
+
+        Consumer().run_post_consume_script(doc)
+
+        m.assert_not_called()
+
+    @mock.patch("documents.consumer.Popen")
+    @override_settings(POST_CONSUME_SCRIPT="script")
+    def test_post_consume_script_simple(self, m):
+        """A configured script is started exactly once."""
+        doc = Document.objects.create(title="Test", mime_type="application/pdf")
+
+        Consumer().run_post_consume_script(doc)
+
+        m.assert_called_once()
+
+    @mock.patch("documents.consumer.Popen")
+    @override_settings(POST_CONSUME_SCRIPT="script")
+    def test_post_consume_script_with_correspondent(self, m):
+        """The script receives pk, URLs, correspondent and tag names."""
+        c = Correspondent.objects.create(name="my_bank")
+        doc = Document.objects.create(title="Test", mime_type="application/pdf", correspondent=c)
+        tag1 = Tag.objects.create(name="a")
+        tag2 = Tag.objects.create(name="b")
+        doc.tags.add(tag1)
+        doc.tags.add(tag2)
+
+        Consumer().run_post_consume_script(doc)
+
+        m.assert_called_once()
+
+        args, kwargs = m.call_args
+
+        # The command tuple is the first positional argument to Popen.
+        command = args[0]
+
+        self.assertEqual(command[0], "script")
+        self.assertEqual(command[1], str(doc.pk))
+        self.assertEqual(command[5], f"/api/documents/{doc.pk}/download/")
+        self.assertEqual(command[6], f"/api/documents/{doc.pk}/thumb/")
+        self.assertEqual(command[7], "my_bank")
+        self.assertCountEqual(command[8].split(","), ["a", "b"])
--- a/src/documents/tests/test_post_consume_handlers.py
+++ b/src/documents/tests/test_post_consume_handlers.py
@@ -1,56 +0,0 @@
-from unittest import mock
-
-from django.test import TestCase, override_settings
-
-from documents.models import Document, Tag, Correspondent
-from documents.signals.handlers import run_post_consume_script
-
-
-class PostConsumeTestCase(TestCase):
-
-    @mock.patch("documents.signals.handlers.Popen")
-    @override_settings(POST_CONSUME_SCRIPT=None)
-    def test_no_post_consume_script(self, m):
-        doc = Document.objects.create(title="Test", mime_type="application/pdf")
-        tag1 = Tag.objects.create(name="a")
-        tag2 = Tag.objects.create(name="b")
-        doc.tags.add(tag1)
-        doc.tags.add(tag2)
-
-        run_post_consume_script(None, doc)
-
-        m.assert_not_called()
-
-    @mock.patch("documents.signals.handlers.Popen")
-    @override_settings(POST_CONSUME_SCRIPT="script")
-    def test_post_consume_script_simple(self, m):
-        doc = Document.objects.create(title="Test", mime_type="application/pdf")
-
-        run_post_consume_script(None, doc)
-
-        m.assert_called_once()
-
-    @mock.patch("documents.signals.handlers.Popen")
-    @override_settings(POST_CONSUME_SCRIPT="script")
-    def test_post_consume_script_with_correspondent(self, m):
-        c = Correspondent.objects.create(name="my_bank")
-        doc = Document.objects.create(title="Test", mime_type="application/pdf", correspondent=c)
-        tag1 = Tag.objects.create(name="a")
-        tag2 = Tag.objects.create(name="b")
-        doc.tags.add(tag1)
-        doc.tags.add(tag2)
-
-        run_post_consume_script(None, doc)
-
-        m.assert_called_once()
-
-        args, kwargs = m.call_args
-
-        command = args[0]
-
-        self.assertEqual(command[0], "script")
-        self.assertEqual(command[1], str(doc.pk))
-        self.assertEqual(command[5], f"/api/documents/{doc.pk}/download/")
-        self.assertEqual(command[6], f"/api/documents/{doc.pk}/thumb/")
-        self.assertEqual(command[7], "my_bank")
-        self.assertCountEqual(command[8].split(","), ["a", "b"])
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -89,6 +89,7 @@ INSTALLED_APPS = [
    "documents.apps.DocumentsConfig",
    "paperless_tesseract.apps.PaperlessTesseractConfig",
    "paperless_text.apps.PaperlessTextConfig",
+    "paperless_tika.apps.PaperlessTikaConfig",
    "paperless_mail.apps.PaperlessMailConfig",

    "django.contrib.admin",
@@ -436,3 +437,10 @@ for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
 PAPERLESS_FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT")

 THUMBNAIL_FONT_NAME = os.getenv("PAPERLESS_THUMBNAIL_FONT_NAME", "/usr/share/fonts/liberation/LiberationSerif-Regular.ttf")
+
+# Tika settings
+# Whether the Tika parser is registered with the consumer (default "NO").
+PAPERLESS_TIKA_ENABLED = __get_boolean("PAPERLESS_TIKA_ENABLED", "NO")
+# Address of the Tika server the parser sends documents to.
+PAPERLESS_TIKA_ENDPOINT = os.getenv("PAPERLESS_TIKA_ENDPOINT", "http://localhost:9998")
+# Address of the Gotenberg server used to convert documents to PDF.
+PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv(
+    "PAPERLESS_TIKA_GOTENBERG_ENDPOINT", "http://localhost:3000"
+)
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -1,7 +1,6 @@
 import json
 import os
 import re
-import subprocess

 import ocrmypdf
 import pdftotext
@@ -10,7 +9,8 @@ from PIL import Image
 from django.conf import settings
 from ocrmypdf import InputFileError, EncryptedPdfError

-from documents.parsers import DocumentParser, ParseError, run_convert
+from documents.parsers import DocumentParser, ParseError, \
+    make_thumbnail_from_pdf


 class RasterisedDocumentParser(DocumentParser):
@@ -47,50 +47,8 @@ class RasterisedDocumentParser(DocumentParser):
        return result

    def get_thumbnail(self, document_path, mime_type):
-        """
-        The thumbnail of a PDF is just a 500px wide image of the first page.
-        """
-
-        out_path = os.path.join(self.tempdir, "convert.png")
-
-        # Run convert to get a decent thumbnail
-        try:
-            run_convert(density=300,
-                        scale="500x5000>",
-                        alpha="remove",
-                        strip=True,
-                        trim=False,
-                        auto_orient=True,
-                        input_file="{}[0]".format(document_path),
-                        output_file=out_path,
-                        logging_group=self.logging_group)
-        except ParseError:
-            # if convert fails, fall back to extracting
-            # the first PDF page as a PNG using Ghostscript
-            self.log(
-                'warning',
-                "Thumbnail generation with ImageMagick failed, falling back "
-                "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!")
-            gs_out_path = os.path.join(self.tempdir, "gs_out.png")
-            cmd = [settings.GS_BINARY,
-                   "-q",
-                   "-sDEVICE=pngalpha",
-                   "-o", gs_out_path,
-                   document_path]
-            if not subprocess.Popen(cmd).wait() == 0:
-                raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
-            # then run convert on the output from gs
-            run_convert(density=300,
-                        scale="500x5000>",
-                        alpha="remove",
-                        strip=True,
-                        trim=False,
-                        auto_orient=True,
-                        input_file=gs_out_path,
-                        output_file=out_path,
-                        logging_group=self.logging_group)
-
-        return out_path
+        return make_thumbnail_from_pdf(
+            document_path, self.tempdir, self.logging_group)

    def is_image(self, mime_type):
        return mime_type in [
@@ -130,7 +88,7 @@ class RasterisedDocumentParser(DocumentParser):
                f"Error while calculating DPI for image {image}: {e}")
            return None

-    def parse(self, document_path, mime_type):
+    def parse(self, document_path, mime_type, file_name=None):
        mode = settings.OCR_MODE

        text_original = get_text_from_pdf(document_path)
--- a/src/paperless_text/parsers.py
+++ b/src/paperless_text/parsers.py
@@ -32,6 +32,6 @@ class TextDocumentParser(DocumentParser):

        return out_path

-    def parse(self, document_path, mime_type):
+    def parse(self, document_path, mime_type, file_name=None):
        with open(document_path, 'r') as f:
            self.text = f.read()
--- a/src/paperless_tika/apps.py
+++ b/src/paperless_tika/apps.py
@@ -0,0 +1,14 @@
+from django.apps import AppConfig
+from django.conf import settings
+from paperless_tika.signals import tika_consumer_declaration
+
+
+class PaperlessTikaConfig(AppConfig):
+    """App config that registers the Tika parser when Tika is enabled."""
+
+    name = "paperless_tika"
+
+    def ready(self):
+        """Connect the Tika declaration handler if the setting is on."""
+        # Imported here rather than at module level, as in the original.
+        from documents.signals import document_consumer_declaration
+
+        tika_enabled = settings.PAPERLESS_TIKA_ENABLED
+        if tika_enabled:
+            document_consumer_declaration.connect(tika_consumer_declaration)
+
+        super().ready()
--- a/src/paperless_tika/parsers.py
+++ b/src/paperless_tika/parsers.py
@@ -0,0 +1,86 @@
+import os
+import requests
+import dateutil.parser
+
+from django.conf import settings
+
+from documents.parsers import DocumentParser, ParseError, \
+    make_thumbnail_from_pdf
+from tika import parser
+
+
+class TikaDocumentParser(DocumentParser):
+    """
+    Parser that extracts text and metadata through a Tika server and
+    produces the PDF version of a document through a Gotenberg server.
+    """
+
+    def get_thumbnail(self, document_path, mime_type):
+        """
+        Return the path of a thumbnail made from the document's PDF version.
+
+        The document is converted to PDF first if no archive version has
+        been produced yet.
+        """
+        if not self.archive_path:
+            self.archive_path = self.convert_to_pdf(document_path)
+
+        return make_thumbnail_from_pdf(
+            self.archive_path, self.tempdir, self.logging_group)
+
+    def extract_metadata(self, document_path, mime_type):
+        """
+        Return the metadata reported by Tika as a list of dicts with the
+        keys "namespace", "prefix", "key" and "value".
+
+        Failures are logged as a warning and yield an empty list.
+        """
+        tika_server = settings.PAPERLESS_TIKA_ENDPOINT
+        try:
+            parsed = parser.from_file(document_path, tika_server)
+        except Exception as e:
+            self.log("warning", f"Error while fetching document metadata for "
+                                f"{document_path}: {e}")
+            return []
+
+        return [
+            {
+                "namespace": "",
+                "prefix": "",
+                "key": key,
+                "value": parsed['metadata'][key]
+            } for key in parsed['metadata']
+        ]
+
+    def parse(self, document_path, mime_type, file_name=None):
+        """
+        Extract text and creation date via Tika, then convert to PDF.
+
+        Sets self.text, self.date (only when a creation date could be
+        parsed) and self.archive_path.
+
+        Raises ParseError if Tika cannot be reached or fails, or if the
+        PDF conversion fails.
+        """
+        self.log("info", f"Sending {document_path} to Tika server")
+        tika_server = settings.PAPERLESS_TIKA_ENDPOINT
+
+        try:
+            parsed = parser.from_file(document_path, tika_server)
+        except Exception as err:
+            raise ParseError(
+                f"Could not parse {document_path} with tika server at "
+                f"{tika_server}: {err}"
+            ) from err
+
+        # NOTE(review): Tika presumably reports None as content when a
+        # document has no extractable text - treat that as empty text.
+        self.text = (parsed["content"] or "").strip()
+
+        try:
+            self.date = dateutil.parser.isoparse(
+                parsed["metadata"]["Creation-Date"])
+        except Exception as e:
+            # A missing or malformed date is not fatal for consumption.
+            self.log("warning", f"Unable to extract date for document "
+                                f"{document_path}: {e}")
+
+        self.archive_path = self.convert_to_pdf(document_path, file_name)
+
+    def convert_to_pdf(self, document_path, file_name=None):
+        """
+        Convert a document to PDF through Gotenberg and return the path
+        of the resulting file ("convert.pdf" in self.tempdir).
+
+        file_name is the name sent along with the upload; when omitted,
+        the base name of document_path is used.
+
+        Raises ParseError if the request fails or returns an error status.
+        """
+        pdf_path = os.path.join(self.tempdir, "convert.pdf")
+        gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT
+        url = gotenberg_server + "/convert/office"
+
+        if file_name is None:
+            # get_thumbnail has no original file name to pass on.
+            file_name = os.path.basename(document_path)
+
+        self.log("info", f"Converting {document_path} to PDF as {pdf_path}")
+        headers = {}
+
+        # The context manager closes the upload handle in every case.
+        with open(document_path, "rb") as document:
+            files = {"files": (file_name, document)}
+            try:
+                response = requests.post(url, files=files, headers=headers)
+                response.raise_for_status()  # ensure we notice bad responses
+            except Exception as err:
+                raise ParseError(
+                    f"Error while converting document to PDF: {err}"
+                ) from err
+
+        with open(pdf_path, "wb") as pdf_file:
+            pdf_file.write(response.content)
+
+        return pdf_path
--- a/src/paperless_tika/signals.py
+++ b/src/paperless_tika/signals.py
@@ -0,0 +1,20 @@
+from .parsers import TikaDocumentParser
+
+
+def tika_consumer_declaration(sender, **kwargs):
+    """
+    Signal handler that declares the Tika parser.
+
+    Returns a dict with the parser class, its weight and the mime types
+    it handles, each mapped to a file extension.
+    """
+    office_mime_types = {
+        "application/msword": ".doc",
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
+        "application/vnd.ms-excel": ".xls",
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
+        "application/vnd.ms-powerpoint": ".ppt",
+        "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
+        "application/vnd.openxmlformats-officedocument.presentationml.slideshow": ".ppsx",
+        "application/vnd.oasis.opendocument.presentation": ".odp",
+        "application/vnd.oasis.opendocument.spreadsheet": ".ods",
+        "application/vnd.oasis.opendocument.text": ".odt",
+    }
+
+    declaration = {
+        "parser": TikaDocumentParser,
+        "weight": 10,
+        "mime_types": office_mime_types,
+    }
+    return declaration