Merge branch 'dev' into feature-permissions

2025-12-14 01:21:14 -06:00 · 2023-02-14 11:32:37 -08:00
parent 0e83c94832 8b3d01c49b
commit 32754defef
31 changed files with 327 additions and 90 deletions
--- a/src/documents/barcodes.py
+++ b/src/documents/barcodes.py
@@ -325,11 +325,10 @@ def save_to_dir(
    Optionally rename the file.
    """
    if os.path.isfile(filepath) and os.path.isdir(target_dir):
-        dst = shutil.copy(filepath, target_dir)
-        logging.debug(f"saved {str(filepath)} to {str(dst)}")
-        if newname:
-            dst_new = os.path.join(target_dir, newname)
-            logger.debug(f"moving {str(dst)} to {str(dst_new)}")
-            os.rename(dst, dst_new)
+        dest = target_dir
+        if newname is not None:
+            dest = os.path.join(dest, newname)
+        shutil.copy(filepath, dest)
+        logging.debug(f"saved {str(filepath)} to {str(dest)}")
    else:
        logger.warning(f"{str(filepath)} or {str(target_dir)} don't exist.")
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -346,6 +346,7 @@ class Consumer(LoggingMixin):
            mime_type,
        )
        if not parser_class:
+            tempdir.cleanup()
            self._fail(MESSAGE_UNSUPPORTED_TYPE, f"Unsupported mime type {mime_type}")

        # Notify all listeners that we're going to do some work.
@@ -404,6 +405,7 @@ class Consumer(LoggingMixin):

        except ParseError as e:
            document_parser.cleanup()
+            tempdir.cleanup()
            self._fail(
                str(e),
                f"Error while consuming document {self.filename}: {e}",
--- a/src/documents/serialisers.py
+++ b/src/documents/serialisers.py
@@ -779,11 +779,17 @@ class StoragePathSerializer(MatchingModelSerializer, OwnedObjectSerializer):
                document_type="document_type",
                created="created",
                created_year="created_year",
+                created_year_short="created_year_short",
                created_month="created_month",
+                created_month_name="created_month_name",
+                created_month_name_short="created_month_name_short",
                created_day="created_day",
                added="added",
                added_year="added_year",
+                added_year_short="added_year_short",
                added_month="added_month",
+                added_month_name="added_month_name",
+                added_month_name_short="added_month_name_short",
                added_day="added_day",
                asn="asn",
                tags="tags",
--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@@ -130,6 +130,18 @@ def consume_file(
                )

                if document_list:
+
+                    # If the file is an upload, it's in the scratch directory
+                    # Move it to consume directory to be picked up
+                    # Otherwise, use the current parent to keep possible tags
+                    # from subdirectories
+                    try:
+                        # is_relative_to would be nicer, but new in 3.9
+                        _ = path.relative_to(settings.SCRATCH_DIR)
+                        save_to_dir = settings.CONSUMPTION_DIR
+                    except ValueError:
+                        save_to_dir = path.parent
+
                    for n, document in enumerate(document_list):
                        # save to consumption dir
                        # rename it to the original filename  with number prefix
@@ -138,23 +150,18 @@ def consume_file(
                        else:
                            newname = None

-                        # If the file is an upload, it's in the scratch directory
-                        # Move it to consume directory to be picked up
-                        # Otherwise, use the current parent to keep possible tags
-                        # from subdirectories
-                        try:
-                            # is_relative_to would be nicer, but new in 3.9
-                            _ = path.relative_to(settings.SCRATCH_DIR)
-                            save_to_dir = settings.CONSUMPTION_DIR
-                        except ValueError:
-                            save_to_dir = path.parent
-
                        barcodes.save_to_dir(
                            document,
                            newname=newname,
                            target_dir=save_to_dir,
                        )

+                        # Split file has been copied safely, remove it
+                        os.remove(document)
+
+                    # And clean up the directory as well, now it's empty
+                    shutil.rmtree(os.path.dirname(document_list[0]))
+
                    # Delete the PDF file which was split
                    os.remove(doc_barcode_info.pdf_path)

--- a/src/documents/tests/test_api.py
+++ b/src/documents/tests/test_api.py
@@ -125,28 +125,28 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
        response = self.client.get("/api/documents/", format="json")
        self.assertEqual(response.status_code, 200)
        results_full = response.data["results"]
-        self.assertTrue("content" in results_full[0])
-        self.assertTrue("id" in results_full[0])
+        self.assertIn("content", results_full[0])
+        self.assertIn("id", results_full[0])

        response = self.client.get("/api/documents/?fields=id", format="json")
        self.assertEqual(response.status_code, 200)
        results = response.data["results"]
        self.assertFalse("content" in results[0])
-        self.assertTrue("id" in results[0])
+        self.assertIn("id", results[0])
        self.assertEqual(len(results[0]), 1)

        response = self.client.get("/api/documents/?fields=content", format="json")
        self.assertEqual(response.status_code, 200)
        results = response.data["results"]
-        self.assertTrue("content" in results[0])
+        self.assertIn("content", results[0])
        self.assertFalse("id" in results[0])
        self.assertEqual(len(results[0]), 1)

        response = self.client.get("/api/documents/?fields=id,content", format="json")
        self.assertEqual(response.status_code, 200)
        results = response.data["results"]
-        self.assertTrue("content" in results[0])
-        self.assertTrue("id" in results[0])
+        self.assertIn("content", results[0])
+        self.assertIn("id", results[0])
        self.assertEqual(len(results[0]), 2)

        response = self.client.get(
@@ -156,7 +156,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
        self.assertEqual(response.status_code, 200)
        results = response.data["results"]
        self.assertFalse("content" in results[0])
-        self.assertTrue("id" in results[0])
+        self.assertIn("id", results[0])
        self.assertEqual(len(results[0]), 1)

        response = self.client.get("/api/documents/?fields=", format="json")
@@ -3291,8 +3291,32 @@ class TestApiStoragePaths(DirectoriesMixin, APITestCase):
        self.assertEqual(response.status_code, 400)
        self.assertEqual(StoragePath.objects.count(), 1)

+    def test_api_storage_path_placeholders(self):
+        """
+        GIVEN:
+            - API request to create a storage path with placeholders
+            - Storage path is valid
+        WHEN:
+            - API is called
+        THEN:
+            - Correct HTTP response
+            - New storage path is created
+        """
+        response = self.client.post(
+            self.ENDPOINT,
+            json.dumps(
+                {
+                    "name": "Storage path with placeholders",
+                    "path": "{title}/{correspondent}/{document_type}/{created}/{created_year}/{created_year_short}/{created_month}/{created_month_name}/{created_month_name_short}/{created_day}/{added}/{added_year}/{added_year_short}/{added_month}/{added_month_name}/{added_month_name_short}/{added_day}/{asn}/{tags}/{tag_list}/",
+                },
+            ),
+            content_type="application/json",
+        )
+        self.assertEqual(response.status_code, 201)
+        self.assertEqual(StoragePath.objects.count(), 2)

-class TestTasks(APITestCase):
+
+class TestTasks(DirectoriesMixin, APITestCase):
    ENDPOINT = "/api/tasks/"
    ENDPOINT_ACKNOWLEDGE = "/api/acknowledge_tasks/"

--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -847,13 +847,11 @@ class PreConsumeTestCase(TestCase):
                self.assertEqual(command[0], script.name)
                self.assertEqual(command[1], "path-to-file")

-                self.assertDictContainsSubset(
-                    {
-                        "DOCUMENT_SOURCE_PATH": c.original_path,
-                        "DOCUMENT_WORKING_PATH": c.path,
-                    },
-                    environment,
-                )
+                subset = {
+                    "DOCUMENT_SOURCE_PATH": c.original_path,
+                    "DOCUMENT_WORKING_PATH": c.path,
+                }
+                self.assertDictEqual(environment, {**environment, **subset})

    @mock.patch("documents.consumer.Consumer.log")
    def test_script_with_output(self, mocked_log):
@@ -983,16 +981,15 @@ class PostConsumeTestCase(TestCase):
                self.assertEqual(command[7], "my_bank")
                self.assertCountEqual(command[8].split(","), ["a", "b"])

-                self.assertDictContainsSubset(
-                    {
-                        "DOCUMENT_ID": str(doc.pk),
-                        "DOCUMENT_DOWNLOAD_URL": f"/api/documents/{doc.pk}/download/",
-                        "DOCUMENT_THUMBNAIL_URL": f"/api/documents/{doc.pk}/thumb/",
-                        "DOCUMENT_CORRESPONDENT": "my_bank",
-                        "DOCUMENT_TAGS": "a,b",
-                    },
-                    environment,
-                )
+                subset = {
+                    "DOCUMENT_ID": str(doc.pk),
+                    "DOCUMENT_DOWNLOAD_URL": f"/api/documents/{doc.pk}/download/",
+                    "DOCUMENT_THUMBNAIL_URL": f"/api/documents/{doc.pk}/thumb/",
+                    "DOCUMENT_CORRESPONDENT": "my_bank",
+                    "DOCUMENT_TAGS": "a,b",
+                }
+
+                self.assertDictEqual(environment, {**environment, **subset})

    def test_script_exit_non_zero(self):
        """
--- a/src/documents/tests/test_importer.py
+++ b/src/documents/tests/test_importer.py
@@ -25,7 +25,7 @@ class TestImporter(TestCase):
        cmd.manifest = [{"model": "documents.document"}]
        with self.assertRaises(CommandError) as cm:
            cmd._check_manifest()
-        self.assertTrue("The manifest file contains a record" in str(cm.exception))
+        self.assertIn("The manifest file contains a record", str(cm.exception))

        cmd.manifest = [
            {"model": "documents.document", EXPORTER_FILE_NAME: "noexist.pdf"},
@@ -33,6 +33,7 @@ class TestImporter(TestCase):
        # self.assertRaises(CommandError, cmd._check_manifest)
        with self.assertRaises(CommandError) as cm:
            cmd._check_manifest()
-        self.assertTrue(
-            'The manifest file refers to "noexist.pdf"' in str(cm.exception),
+        self.assertIn(
+            'The manifest file refers to "noexist.pdf"',
+            str(cm.exception),
        )
--- a/src/documents/tests/test_parsers.py
+++ b/src/documents/tests/test_parsers.py
@@ -1,6 +1,8 @@
 from tempfile import TemporaryDirectory
 from unittest import mock

+from django.apps import apps
+from django.test import override_settings
 from django.test import TestCase
 from documents.parsers import get_default_file_extension
 from documents.parsers import get_parser_class_for_mime_type
@@ -8,6 +10,7 @@ from documents.parsers import get_supported_file_extensions
 from documents.parsers import is_file_ext_supported
 from paperless_tesseract.parsers import RasterisedDocumentParser
 from paperless_text.parsers import TextDocumentParser
+from paperless_tika.parsers import TikaDocumentParser


 class TestParserDiscovery(TestCase):
@@ -124,14 +127,43 @@ class TestParserDiscovery(TestCase):


 class TestParserAvailability(TestCase):
-    def test_file_extensions(self):
-
+    def test_tesseract_parser(self):
+        """
+        GIVEN:
+            - Various mime types
+        WHEN:
+            - The parser class is instantiated
+        THEN:
+            - The Tesseract based parser is returned
+        """
        supported_mimes_and_exts = [
            ("application/pdf", ".pdf"),
            ("image/png", ".png"),
            ("image/jpeg", ".jpg"),
            ("image/tiff", ".tif"),
            ("image/webp", ".webp"),
+        ]
+
+        supported_exts = get_supported_file_extensions()
+
+        for mime_type, ext in supported_mimes_and_exts:
+            self.assertIn(ext, supported_exts)
+            self.assertEqual(get_default_file_extension(mime_type), ext)
+            self.assertIsInstance(
+                get_parser_class_for_mime_type(mime_type)(logging_group=None),
+                RasterisedDocumentParser,
+            )
+
+    def test_text_parser(self):
+        """
+        GIVEN:
+            - Various mime types of a text form
+        WHEN:
+            - The parser class is instantiated
+        THEN:
+            - The text based parser is returned
+        """
+        supported_mimes_and_exts = [
            ("text/plain", ".txt"),
            ("text/csv", ".csv"),
        ]
@@ -141,23 +173,55 @@ class TestParserAvailability(TestCase):
        for mime_type, ext in supported_mimes_and_exts:
            self.assertIn(ext, supported_exts)
            self.assertEqual(get_default_file_extension(mime_type), ext)
+            self.assertIsInstance(
+                get_parser_class_for_mime_type(mime_type)(logging_group=None),
+                TextDocumentParser,
+            )

+    def test_tika_parser(self):
+        """
+        GIVEN:
+            - Various mime types of an office document form
+        WHEN:
+            - The parser class is instantiated
+        THEN:
+            - The Tika/Gotenberg based parser is returned
+        """
+        supported_mimes_and_exts = [
+            ("application/vnd.oasis.opendocument.text", ".odt"),
+            ("text/rtf", ".rtf"),
+            ("application/msword", ".doc"),
+            (
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                ".docx",
+            ),
+        ]
+
+        # Call the app's ready() explicitly so it notices the settings override
+        with override_settings(TIKA_ENABLED=True, INSTALLED_APPS=["paperless_tika"]):
+            app = apps.get_app_config("paperless_tika")
+            app.ready()
+            supported_exts = get_supported_file_extensions()
+
+        for mime_type, ext in supported_mimes_and_exts:
+            self.assertIn(ext, supported_exts)
+            self.assertEqual(get_default_file_extension(mime_type), ext)
+            self.assertIsInstance(
+                get_parser_class_for_mime_type(mime_type)(logging_group=None),
+                TikaDocumentParser,
+            )
+
+    def test_no_parser_for_mime(self):
+        self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
+
+    def test_default_extension(self):
        # Test no parser declared still returns a an extension
        self.assertEqual(get_default_file_extension("application/zip"), ".zip")

        # Test invalid mimetype returns no extension
        self.assertEqual(get_default_file_extension("aasdasd/dgfgf"), "")

-        self.assertIsInstance(
-            get_parser_class_for_mime_type("application/pdf")(logging_group=None),
-            RasterisedDocumentParser,
-        )
-        self.assertIsInstance(
-            get_parser_class_for_mime_type("text/plain")(logging_group=None),
-            TextDocumentParser,
-        )
-        self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
-
+    def test_file_extension_support(self):
        self.assertTrue(is_file_ext_supported(".pdf"))
        self.assertFalse(is_file_ext_supported(".hsdfh"))
        self.assertFalse(is_file_ext_supported(""))
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -109,6 +109,16 @@ def _parse_redis_url(env_redis: Optional[str]) -> Tuple[str]:


 def _parse_beat_schedule() -> Dict:
+    """
+    Configures the scheduled tasks according to their defaults or the
+    matching environment variables.  Each task is given an expiration so
+    that it expires (and does not run) shortly before the default schedule
+    would put another instance of the same task into the queue.
+
+
+    https://docs.celeryq.dev/en/stable/userguide/periodic-tasks.html#beat-entries
+    https://docs.celeryq.dev/en/latest/userguide/calling.html#expiration
+    """
    schedule = {}
    tasks = [
        {
@@ -117,6 +127,11 @@ def _parse_beat_schedule() -> Dict:
            # Default every ten minutes
            "env_default": "*/10 * * * *",
            "task": "paperless_mail.tasks.process_mail_accounts",
+            "options": {
+                # 1 minute before default schedule sends again
+                "expires": 9.0
+                * 60.0,
+            },
        },
        {
            "name": "Train the classifier",
@@ -124,6 +139,11 @@ def _parse_beat_schedule() -> Dict:
            # Default hourly at 5 minutes past the hour
            "env_default": "5 */1 * * *",
            "task": "documents.tasks.train_classifier",
+            "options": {
+                # 1 minute before default schedule sends again
+                "expires": 59.0
+                * 60.0,
+            },
        },
        {
            "name": "Optimize the index",
@@ -131,6 +151,12 @@ def _parse_beat_schedule() -> Dict:
            # Default daily at midnight
            "env_default": "0 0 * * *",
            "task": "documents.tasks.index_optimize",
+            "options": {
+                # 1 hour before default schedule sends again
+                "expires": 23.0
+                * 60.0
+                * 60.0,
+            },
        },
        {
            "name": "Perform sanity check",
@@ -138,6 +164,12 @@ def _parse_beat_schedule() -> Dict:
            # Default Sunday at 00:30
            "env_default": "30 0 * * sun",
            "task": "documents.tasks.sanity_check",
+            "options": {
+                # 1 hour before default schedule sends again
+                "expires": ((7.0 * 24.0) - 1.0)
+                * 60.0
+                * 60.0,
+            },
        },
    ]
    for task in tasks:
@@ -151,9 +183,11 @@ def _parse_beat_schedule() -> Dict:
        #   - five time-and-date fields
        #   - separated by at least one blank
        minute, hour, day_month, month, day_week = value.split(" ")
+
        schedule[task["name"]] = {
            "task": task["task"],
            "schedule": crontab(minute, hour, day_week, day_month, month),
+            "options": task["options"],
        }

    return schedule
@@ -564,22 +598,21 @@ LOGGING = {
 # Task queue                                                                  #
 ###############################################################################

-TASK_WORKERS = __get_int("PAPERLESS_TASK_WORKERS", 1)
-
-WORKER_TIMEOUT: Final[int] = __get_int("PAPERLESS_WORKER_TIMEOUT", 1800)
+# https://docs.celeryq.dev/en/stable/userguide/configuration.html

 CELERY_BROKER_URL = _CELERY_REDIS_URL
 CELERY_TIMEZONE = TIME_ZONE

 CELERY_WORKER_HIJACK_ROOT_LOGGER = False
-CELERY_WORKER_CONCURRENCY = TASK_WORKERS
+CELERY_WORKER_CONCURRENCY: Final[int] = __get_int("PAPERLESS_TASK_WORKERS", 1)
+TASK_WORKERS = CELERY_WORKER_CONCURRENCY
 CELERY_WORKER_MAX_TASKS_PER_CHILD = 1
 CELERY_WORKER_SEND_TASK_EVENTS = True
-
+CELERY_TASK_SEND_SENT_EVENT = True
 CELERY_SEND_TASK_SENT_EVENT = True

 CELERY_TASK_TRACK_STARTED = True
-CELERY_TASK_TIME_LIMIT = WORKER_TIMEOUT
+CELERY_TASK_TIME_LIMIT: Final[int] = __get_int("PAPERLESS_WORKER_TIMEOUT", 1800)

 CELERY_RESULT_EXTENDED = True
 CELERY_RESULT_BACKEND = "django-db"
@@ -611,7 +644,7 @@ def default_threads_per_worker(task_workers) -> int:

 THREADS_PER_WORKER = os.getenv(
    "PAPERLESS_THREADS_PER_WORKER",
-    default_threads_per_worker(TASK_WORKERS),
+    default_threads_per_worker(CELERY_WORKER_CONCURRENCY),
 )

 ###############################################################################
--- a/src/paperless/tests/test_settings.py
+++ b/src/paperless/tests/test_settings.py
@@ -149,6 +149,11 @@ class TestRedisSocketConversion(TestCase):


 class TestCeleryScheduleParsing(TestCase):
+    MAIL_EXPIRE_TIME = 9.0 * 60.0
+    CLASSIFIER_EXPIRE_TIME = 59.0 * 60.0
+    INDEX_EXPIRE_TIME = 23.0 * 60.0 * 60.0
+    SANITY_EXPIRE_TIME = ((7.0 * 24.0) - 1.0) * 60.0 * 60.0
+
    def test_schedule_configuration_default(self):
        """
        GIVEN:
@@ -165,18 +170,22 @@ class TestCeleryScheduleParsing(TestCase):
                "Check all e-mail accounts": {
                    "task": "paperless_mail.tasks.process_mail_accounts",
                    "schedule": crontab(minute="*/10"),
+                    "options": {"expires": self.MAIL_EXPIRE_TIME},
                },
                "Train the classifier": {
                    "task": "documents.tasks.train_classifier",
                    "schedule": crontab(minute="5", hour="*/1"),
+                    "options": {"expires": self.CLASSIFIER_EXPIRE_TIME},
                },
                "Optimize the index": {
                    "task": "documents.tasks.index_optimize",
                    "schedule": crontab(minute=0, hour=0),
+                    "options": {"expires": self.INDEX_EXPIRE_TIME},
                },
                "Perform sanity check": {
                    "task": "documents.tasks.sanity_check",
                    "schedule": crontab(minute=30, hour=0, day_of_week="sun"),
+                    "options": {"expires": self.SANITY_EXPIRE_TIME},
                },
            },
            schedule,
@@ -203,18 +212,22 @@ class TestCeleryScheduleParsing(TestCase):
                "Check all e-mail accounts": {
                    "task": "paperless_mail.tasks.process_mail_accounts",
                    "schedule": crontab(minute="*/50", day_of_week="mon"),
+                    "options": {"expires": self.MAIL_EXPIRE_TIME},
                },
                "Train the classifier": {
                    "task": "documents.tasks.train_classifier",
                    "schedule": crontab(minute="5", hour="*/1"),
+                    "options": {"expires": self.CLASSIFIER_EXPIRE_TIME},
                },
                "Optimize the index": {
                    "task": "documents.tasks.index_optimize",
                    "schedule": crontab(minute=0, hour=0),
+                    "options": {"expires": self.INDEX_EXPIRE_TIME},
                },
                "Perform sanity check": {
                    "task": "documents.tasks.sanity_check",
                    "schedule": crontab(minute=30, hour=0, day_of_week="sun"),
+                    "options": {"expires": self.SANITY_EXPIRE_TIME},
                },
            },
            schedule,
@@ -238,14 +251,17 @@ class TestCeleryScheduleParsing(TestCase):
                "Check all e-mail accounts": {
                    "task": "paperless_mail.tasks.process_mail_accounts",
                    "schedule": crontab(minute="*/10"),
+                    "options": {"expires": self.MAIL_EXPIRE_TIME},
                },
                "Train the classifier": {
                    "task": "documents.tasks.train_classifier",
                    "schedule": crontab(minute="5", hour="*/1"),
+                    "options": {"expires": self.CLASSIFIER_EXPIRE_TIME},
                },
                "Perform sanity check": {
                    "task": "documents.tasks.sanity_check",
                    "schedule": crontab(minute=30, hour=0, day_of_week="sun"),
+                    "options": {"expires": self.SANITY_EXPIRE_TIME},
                },
            },
            schedule,
--- a/src/paperless/tests/test_websockets.py
+++ b/src/paperless/tests/test_websockets.py
@@ -14,15 +14,14 @@ TEST_CHANNEL_LAYERS = {
 }


+@override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS)
 class TestWebSockets(TestCase):
-    @override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS)
    async def test_no_auth(self):
        communicator = WebsocketCommunicator(application, "/ws/status/")
        connected, subprotocol = await communicator.connect()
        self.assertFalse(connected)
        await communicator.disconnect()

-    @override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS)
    @mock.patch("paperless.consumers.StatusConsumer._authenticated")
    async def test_auth(self, _authenticated):
        _authenticated.return_value = True
@@ -33,7 +32,6 @@ class TestWebSockets(TestCase):

        await communicator.disconnect()

-    @override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS)
    @mock.patch("paperless.consumers.StatusConsumer._authenticated")
    async def test_receive(self, _authenticated):
        _authenticated.return_value = True
--- a/src/paperless/views.py
+++ b/src/paperless/views.py
@@ -24,7 +24,7 @@ class StandardPagination(PageNumberPagination):


 class FaviconView(View):
-    def get(self, request, *args, **kwargs):
+    def get(self, request, *args, **kwargs):  # pragma: nocover
        favicon = os.path.join(
            os.path.dirname(__file__),
            "static",
--- a/src/paperless_mail/tests/test_api.py
+++ b/src/paperless_mail/tests/test_api.py
@@ -2,12 +2,13 @@ from django.contrib.auth.models import User
 from documents.models import Correspondent
 from documents.models import DocumentType
 from documents.models import Tag
+from documents.tests.utils import DirectoriesMixin
 from paperless_mail.models import MailAccount
 from paperless_mail.models import MailRule
 from rest_framework.test import APITestCase


-class TestAPIMailAccounts(APITestCase):
+class TestAPIMailAccounts(DirectoriesMixin, APITestCase):
    ENDPOINT = "/api/mail_accounts/"

    def setUp(self):
@@ -165,7 +166,7 @@ class TestAPIMailAccounts(APITestCase):
        self.assertEqual(returned_account2.password, "123xyz")


-class TestAPIMailRules(APITestCase):
+class TestAPIMailRules(DirectoriesMixin, APITestCase):
    ENDPOINT = "/api/mail_rules/"

    def setUp(self):
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -161,7 +161,7 @@ class RasterisedDocumentParser(DocumentParser):

        except Exception:
            # TODO catch all for various issues with PDFminer.six.
-            #  If PDFminer fails, fall back to OCR.
+            #  If pdftotext fails, fall back to OCR.
            self.log(
                "warning",
                "Error while getting text from PDF document with " "pdfminer.six",
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@@ -364,7 +364,7 @@ class TestParser(DirectoriesMixin, TestCase):
        )
        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2"])
-        self.assertFalse("page 3" in parser.get_text().lower())
+        self.assertNotIn("page 3", parser.get_text().lower())

    @override_settings(OCR_PAGES=1, OCR_MODE="force")
    def test_multi_page_analog_pages_force(self):
@@ -386,8 +386,8 @@ class TestParser(DirectoriesMixin, TestCase):
        )
        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(parser.get_text().lower(), ["page 1"])
-        self.assertFalse("page 2" in parser.get_text().lower())
-        self.assertFalse("page 3" in parser.get_text().lower())
+        self.assertNotIn("page 2", parser.get_text().lower())
+        self.assertNotIn("page 3", parser.get_text().lower())

    @override_settings(OCR_MODE="skip_noarchive")
    def test_skip_noarchive_withtext(self):
@@ -660,6 +660,15 @@ class TestParser(DirectoriesMixin, TestCase):
            params = parser.construct_ocrmypdf_parameters("", "", "", "")
            self.assertNotIn("deskew", params)

+        with override_settings(OCR_MAX_IMAGE_PIXELS=1_000_001.0):
+            params = parser.construct_ocrmypdf_parameters("", "", "", "")
+            self.assertIn("max_image_mpixels", params)
+            self.assertAlmostEqual(params["max_image_mpixels"], 1, places=4)
+
+        with override_settings(OCR_MAX_IMAGE_PIXELS=-1_000_001.0):
+            params = parser.construct_ocrmypdf_parameters("", "", "", "")
+            self.assertNotIn("max_image_mpixels", params)
+
    def test_rtl_language_detection(self):
        """
        GIVEN:
--- a/src/paperless_tika/tests/test_tika_parser.py
+++ b/src/paperless_tika/tests/test_tika_parser.py
@@ -3,7 +3,9 @@ import os
 from pathlib import Path
 from unittest import mock

+from django.test import override_settings
 from django.test import TestCase
+from documents.parsers import ParseError
 from paperless_tika.parsers import TikaDocumentParser
 from requests import Response

@@ -54,3 +56,63 @@ class TestTikaParser(TestCase):

        self.assertTrue("Creation-Date" in [m["key"] for m in metadata])
        self.assertTrue("Some-key" in [m["key"] for m in metadata])
+
+    @mock.patch("paperless_tika.parsers.parser.from_file")
+    @mock.patch("paperless_tika.parsers.requests.post")
+    def test_convert_failure(self, post, from_file):
+        """
+        GIVEN:
+            - Document needs to be converted to PDF
+        WHEN:
+            - Gotenberg server returns an error
+        THEN:
+            - Parse error is raised
+        """
+        from_file.return_value = {
+            "content": "the content",
+            "metadata": {"Creation-Date": "2020-11-21"},
+        }
+        response = Response()
+        response._content = b"PDF document"
+        response.status_code = 500
+        post.return_value = response
+
+        file = os.path.join(self.parser.tempdir, "input.odt")
+        Path(file).touch()
+
+        with self.assertRaises(ParseError):
+            self.parser.convert_to_pdf(file, None)
+
+    @mock.patch("paperless_tika.parsers.requests.post")
+    def test_request_pdf_a_format(self, post: mock.Mock):
+        """
+        GIVEN:
+            - Document needs to be converted to PDF
+        WHEN:
+            - Specific PDF/A format requested
+        THEN:
+            - Request to Gotenberg contains the expected PDF/A format string
+        """
+        file = os.path.join(self.parser.tempdir, "input.odt")
+        Path(file).touch()
+
+        response = Response()
+        response._content = b"PDF document"
+        response.status_code = 200
+        post.return_value = response
+
+        for setting, expected_key in [
+            ("pdfa", "PDF/A-2b"),
+            ("pdfa-2", "PDF/A-2b"),
+            ("pdfa-1", "PDF/A-1a"),
+            ("pdfa-3", "PDF/A-3b"),
+        ]:
+            with override_settings(OCR_OUTPUT_TYPE=setting):
+                self.parser.convert_to_pdf(file, None)
+
+                post.assert_called_once()
+                _, kwargs = post.call_args
+
+                self.assertEqual(kwargs["data"]["pdfFormat"], expected_key)
+
+                post.reset_mock()