Merge branch 'dev'
Mirror of https://github.com/paperless-ngx/paperless-ngx.git (synced 2025-07-28 18:24:38 -05:00)
@@ -5,6 +5,7 @@ import tempfile
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path
from typing import Dict
from typing import List
from typing import Optional

@@ -201,16 +202,25 @@ def scan_file_for_barcodes(
return DocumentBarcodeInfo(pdf_filepath, barcodes)


def get_separating_barcodes(barcodes: List[Barcode]) -> List[int]:
def get_separating_barcodes(barcodes: List[Barcode]) -> Dict[int, bool]:
"""
Search the parsed barcodes for separators
and returns a list of page numbers, which
separate the file into new files.
and returns a dict of page numbers, which
separate the file into new files, together
with the information whether to keep the page.
"""
# filter all barcodes for the separator string
# get the page numbers of the separating barcodes
separator_pages = {bc.page: False for bc in barcodes if bc.is_separator}
if not settings.CONSUMER_ENABLE_ASN_BARCODE:
return separator_pages

return list({bc.page for bc in barcodes if bc.is_separator})
# add the page numbers of the ASN barcodes
# (except for first page, that might lead to infinite loops).
return {
**separator_pages,
**{bc.page: True for bc in barcodes if bc.is_asn and bc.page != 0},
}
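A minimal sketch (hypothetical values, not taken from the test samples) of the shape the function now returns:

    # Page index -> keep flag: a PATCHT separator page is dropped (False),
    # a page that only carries an ASN barcode is kept (True).
    pages_to_split_on = {2: False, 4: True, 8: True}
    drop_pages = [page for page, keep in pages_to_split_on.items() if not keep]
    assert drop_pages == [2]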
def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]:

@@ -242,10 +252,11 @@ def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]:
return asn

def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
def separate_pages(filepath: str, pages_to_split_on: Dict[int, bool]) -> List[str]:
"""
Separate the provided pdf file on the pages_to_split_on.
The pages which are defined by page_numbers will be removed.
The pages which are defined by the keys in page_numbers
will be removed if the corresponding value is false.
Returns a list of (temporary) filepaths to consume.
These will need to be deleted later.
"""

@@ -261,26 +272,28 @@ def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
fname = os.path.splitext(os.path.basename(filepath))[0]
pdf = Pdf.open(filepath)

# Start with an empty document
current_document: List[Page] = []
# A list of documents, ie a list of lists of pages
documents: List[List[Page]] = []
# A single document, ie a list of pages
document: List[Page] = []
documents: List[List[Page]] = [current_document]

for idx, page in enumerate(pdf.pages):
# Keep building the new PDF as long as it is not a
# separator index
if idx not in pages_to_split_on:
document.append(page)
# Make sure to append the very last document to the documents
if idx == (len(pdf.pages) - 1):
documents.append(document)
document = []
else:
# This is a split index, save the current PDF pages, and restart
# a new destination page listing
logger.debug(f"Starting new document at idx {idx}")
documents.append(document)
document = []
current_document.append(page)
continue

# This is a split index
# Start a new destination page listing
logger.debug(f"Starting new document at idx {idx}")
current_document = []
documents.append(current_document)
keep_page = pages_to_split_on[idx]
if keep_page:
# Keep the page
# (new document is started by asn barcode)
current_document.append(page)

documents = [x for x in documents if len(x)]
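A short usage sketch of the rewritten splitting logic, assuming a hypothetical six-page input file; the mapping is what get_separating_barcodes would produce for a separator at index 2 and an ASN barcode at index 4:

    # Index 2 is a separator page and is dropped; index 4 starts a new document
    # and is kept because it only carries an ASN barcode.
    split_docs = barcodes.separate_pages("/tmp/input.pdf", {2: False, 4: True})
    # Expected result: three temporary PDFs containing pages 0-1, page 3, and pages 4-5.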
@@ -312,11 +325,10 @@ def save_to_dir(
Optionally rename the file.
"""
if os.path.isfile(filepath) and os.path.isdir(target_dir):
dst = shutil.copy(filepath, target_dir)
logging.debug(f"saved {str(filepath)} to {str(dst)}")
if newname:
dst_new = os.path.join(target_dir, newname)
logger.debug(f"moving {str(dst)} to {str(dst_new)}")
os.rename(dst, dst_new)
dest = target_dir
if newname is not None:
dest = os.path.join(dest, newname)
shutil.copy(filepath, dest)
logging.debug(f"saved {str(filepath)} to {str(dest)}")
else:
logger.warning(f"{str(filepath)} or {str(target_dir)} don't exist.")
@@ -146,11 +146,16 @@ class Consumer(LoggingMixin):
return
# Validate the range is above zero and less than uint32_t max
# otherwise, Whoosh can't handle it in the index
if self.override_asn < 0 or self.override_asn > 0xFF_FF_FF_FF:
if (
self.override_asn < Document.ARCHIVE_SERIAL_NUMBER_MIN
or self.override_asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
):
self._fail(
MESSAGE_ASN_RANGE,
f"Not consuming {self.filename}: "
f"Given ASN {self.override_asn} is out of range [0, 4,294,967,295]",
f"Given ASN {self.override_asn} is out of range "
f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}]",
)
if Document.objects.filter(archive_serial_number=self.override_asn).exists():
self._fail(
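The bounds in the range check above come from the ARCHIVE_SERIAL_NUMBER_MIN/MAX constants added to the Document model later in this diff; a standalone sketch of the same test:

    ARCHIVE_SERIAL_NUMBER_MIN = 0
    ARCHIVE_SERIAL_NUMBER_MAX = 0xFF_FF_FF_FF  # 4,294,967,295, the largest value Whoosh can index

    def asn_in_range(asn: int) -> bool:
        # True for any ASN the consumer will accept as an override.
        return ARCHIVE_SERIAL_NUMBER_MIN <= asn <= ARCHIVE_SERIAL_NUMBER_MAX

    assert asn_in_range(0) and asn_in_range(0xFF_FF_FF_FF)
    assert not asn_in_range(0xFF_FF_FF_FF + 1)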
@@ -337,6 +342,7 @@ class Consumer(LoggingMixin):
mime_type,
)
if not parser_class:
tempdir.cleanup()
self._fail(MESSAGE_UNSUPPORTED_TYPE, f"Unsupported mime type {mime_type}")

# Notify all listeners that we're going to do some work.

@@ -395,6 +401,7 @@ class Consumer(LoggingMixin):

except ParseError as e:
document_parser.cleanup()
tempdir.cleanup()
self._fail(
str(e),
f"Error while consuming document {self.filename}: {e}",
@@ -5,6 +5,7 @@ from contextlib import contextmanager

from dateutil.parser import isoparse
from django.conf import settings
from django.utils import timezone
from documents.models import Comment
from documents.models import Document
from whoosh import classify

@@ -89,10 +90,22 @@ def open_index_searcher():
searcher.close()


def update_document(writer, doc):
def update_document(writer: AsyncWriter, doc: Document):
tags = ",".join([t.name for t in doc.tags.all()])
tags_ids = ",".join([str(t.id) for t in doc.tags.all()])
comments = ",".join([str(c.comment) for c in Comment.objects.filter(document=doc)])
asn = doc.archive_serial_number
if asn is not None and (
asn < Document.ARCHIVE_SERIAL_NUMBER_MIN
or asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
):
logger.error(
f"Not indexing Archive Serial Number {asn} of document {doc.pk}. "
f"ASN is out of range "
f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}.",
)
asn = 0
writer.update_document(
id=doc.pk,
title=doc.title,

@@ -108,7 +121,7 @@ def update_document(writer, doc):
has_type=doc.document_type is not None,
created=doc.created,
added=doc.added,
asn=doc.archive_serial_number,
asn=asn,
modified=doc.modified,
path=doc.storage_path.name if doc.storage_path else None,
path_id=doc.storage_path.id if doc.storage_path else None,

@@ -262,7 +275,7 @@ class DelayedFullTextQuery(DelayedQuery):
["content", "title", "correspondent", "tag", "type", "comments"],
self.searcher.ixreader.schema,
)
qp.add_plugin(DateParserPlugin())
qp.add_plugin(DateParserPlugin(basedate=timezone.now()))
q = qp.parse(q_str)

corrected = self.searcher.correct_query(q, q_str)
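A minimal sketch of why basedate matters in the change above (toy schema, not the project's real index): relative expressions such as "added:[-1 week to now]" are resolved against basedate, so passing an aware "now" anchors the range to the configured timezone instead of a naive default.

    from datetime import datetime, timezone as dt_timezone
    from whoosh.fields import DATETIME, TEXT, Schema
    from whoosh.qparser import QueryParser
    from whoosh.qparser.dateparse import DateParserPlugin

    schema = Schema(content=TEXT, added=DATETIME)
    parser = QueryParser("content", schema=schema)
    parser.add_plugin(DateParserPlugin(basedate=datetime.now(dt_timezone.utc)))
    print(parser.parse("added:[-1 week to now]"))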
@@ -1,5 +1,6 @@
import logging
import os
from fnmatch import filter
from pathlib import Path
from pathlib import PurePath
from threading import Event

@@ -7,6 +8,7 @@ from threading import Thread
from time import monotonic
from time import sleep
from typing import Final
from typing import Set

from django.conf import settings
from django.core.management.base import BaseCommand

@@ -25,15 +27,15 @@ except ImportError:  # pragma: nocover
logger = logging.getLogger("paperless.management.consumer")


def _tags_from_path(filepath):
"""Walk up the directory tree from filepath to CONSUMPTION_DIR
and get or create Tag IDs for every directory.
def _tags_from_path(filepath) -> Set[Tag]:
"""
Walk up the directory tree from filepath to CONSUMPTION_DIR
and get or create Tag IDs for every directory.

Returns set of Tag models
"""
normalized_consumption_dir = os.path.abspath(
os.path.normpath(settings.CONSUMPTION_DIR),
)
tag_ids = set()
path_parts = Path(filepath).relative_to(normalized_consumption_dir).parent.parts
path_parts = Path(filepath).relative_to(settings.CONSUMPTION_DIR).parent.parts
for part in path_parts:
tag_ids.add(
Tag.objects.get_or_create(name__iexact=part, defaults={"name": part})[0].pk,
@@ -43,14 +45,41 @@ def _tags_from_path(filepath):


def _is_ignored(filepath: str) -> bool:
normalized_consumption_dir = os.path.abspath(
os.path.normpath(settings.CONSUMPTION_DIR),
"""
Checks if the given file should be ignored, based on configured
patterns.

Returns True if the file is ignored, False otherwise
"""
filepath = os.path.abspath(
os.path.normpath(filepath),
)
filepath_relative = PurePath(filepath).relative_to(normalized_consumption_dir)
return any(filepath_relative.match(p) for p in settings.CONSUMER_IGNORE_PATTERNS)

# Trim out the consume directory, leaving only the filename and its
# path relative to the consume directory
filepath_relative = PurePath(filepath).relative_to(settings.CONSUMPTION_DIR)

# March through the components of the path, including directories and the filename
# looking for anything matching
# foo/bar/baz/file.pdf -> (foo, bar, baz, file.pdf)
parts = []
for part in filepath_relative.parts:
# If the part is not the name (ie, it's a dir)
# Need to append the trailing slash or fnmatch doesn't match
# fnmatch("dir", "dir/*") == False
# fnmatch("dir/", "dir/*") == True
if part != filepath_relative.name:
part = part + "/"
parts.append(part)

for pattern in settings.CONSUMER_IGNORE_PATTERNS:
if len(filter(parts, pattern)):
return True

return False
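A tiny demonstration of the trailing-slash detail the comments above describe, using the same fnmatch-based matching the loop relies on (the paths are illustrative):

    from fnmatch import filter as fnmatch_filter, fnmatch

    assert not fnmatch("dir", "dir/*")    # directory component without a slash does not match
    assert fnmatch("dir/", "dir/*")       # with the slash appended it does

    parts = [".DS_STORE/", "foo.pdf"]     # parts of ".DS_STORE/foo.pdf", directories get a slash
    assert len(fnmatch_filter(parts, ".DS_STORE/*")) == 1   # -> the file is ignored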
def _consume(filepath):
def _consume(filepath: str) -> None:
if os.path.isdir(filepath) or _is_ignored(filepath):
return

@@ -103,7 +132,13 @@ def _consume(filepath):
logger.exception("Error while consuming document")


def _consume_wait_unmodified(file):
def _consume_wait_unmodified(file: str) -> None:
"""
Waits for the given file to appear unmodified based on file size
and modification time. Will wait a configured number of seconds
and retry a configured number of times before either consuming or
giving up
"""
if _is_ignored(file):
return

@@ -311,8 +311,8 @@ class Command(BaseCommand):
|
||||
archive_target = None
|
||||
|
||||
# 3.4. write files to target folder
|
||||
t = int(time.mktime(document.created.timetuple()))
|
||||
if document.storage_type == Document.STORAGE_TYPE_GPG:
|
||||
t = int(time.mktime(document.created.timetuple()))
|
||||
|
||||
original_target.parent.mkdir(parents=True, exist_ok=True)
|
||||
with document.source_file as out_file:
|
||||
|
@@ -0,0 +1,23 @@
|
||||
# Generated by Django 4.1.5 on 2023-02-03 21:53
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
("documents", "1029_alter_document_archive_serial_number"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name="paperlesstask",
|
||||
name="task_file_name",
|
||||
field=models.CharField(
|
||||
help_text="Name of the file which the Task was run for",
|
||||
max_length=255,
|
||||
null=True,
|
||||
verbose_name="Task Filename",
|
||||
),
|
||||
),
|
||||
]
|
@@ -3,6 +3,7 @@ import logging
|
||||
import os
|
||||
import re
|
||||
from collections import OrderedDict
|
||||
from typing import Final
|
||||
from typing import Optional
|
||||
|
||||
import dateutil.parser
|
||||
@@ -229,6 +230,9 @@ class Document(models.Model):
|
||||
help_text=_("The original name of the file when it was uploaded"),
|
||||
)
|
||||
|
||||
ARCHIVE_SERIAL_NUMBER_MIN: Final[int] = 0
|
||||
ARCHIVE_SERIAL_NUMBER_MAX: Final[int] = 0xFF_FF_FF_FF
|
||||
|
||||
archive_serial_number = models.PositiveIntegerField(
|
||||
_("archive serial number"),
|
||||
blank=True,
|
||||
@@ -236,8 +240,8 @@ class Document(models.Model):
|
||||
unique=True,
|
||||
db_index=True,
|
||||
validators=[
|
||||
MaxValueValidator(0xFF_FF_FF_FF),
|
||||
MinValueValidator(0),
|
||||
MaxValueValidator(ARCHIVE_SERIAL_NUMBER_MAX),
|
||||
MinValueValidator(ARCHIVE_SERIAL_NUMBER_MIN),
|
||||
],
|
||||
help_text=_(
|
||||
"The position of this document in your physical document " "archive.",
|
||||
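The new validators only take effect when model validation runs (full_clean() or a DRF serializer); a bare save() can still store an out-of-range value, which is exactly what the index-side clamping earlier in this diff guards against. A small standalone sketch of the bound being enforced:

    from django.conf import settings
    from django.core.exceptions import ValidationError
    from django.core.validators import MaxValueValidator

    if not settings.configured:
        settings.configure()   # minimal config so the validator can be exercised standalone

    try:
        MaxValueValidator(0xFF_FF_FF_FF)(0xFF_FF_FF_FF + 1)
    except ValidationError:
        print("ASN above the uint32 maximum is rejected")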
@@ -555,7 +559,7 @@ class PaperlessTask(models.Model):
|
||||
task_file_name = models.CharField(
|
||||
null=True,
|
||||
max_length=255,
|
||||
verbose_name=_("Task Name"),
|
||||
verbose_name=_("Task Filename"),
|
||||
help_text=_("Name of the file which the Task was run for"),
|
||||
)
|
||||
|
||||
|
@@ -599,11 +599,17 @@ class StoragePathSerializer(MatchingModelSerializer):
|
||||
document_type="document_type",
|
||||
created="created",
|
||||
created_year="created_year",
|
||||
created_year_short="created_year_short",
|
||||
created_month="created_month",
|
||||
created_month_name="created_month_name",
|
||||
created_month_name_short="created_month_name_short",
|
||||
created_day="created_day",
|
||||
added="added",
|
||||
added_year="added_year",
|
||||
added_year_short="added_year_short",
|
||||
added_month="added_month",
|
||||
added_month_name="added_month_name",
|
||||
added_month_name_short="added_month_name_short",
|
||||
added_day="added_day",
|
||||
asn="asn",
|
||||
tags="tags",
|
||||
|
@@ -128,6 +128,18 @@ def consume_file(
|
||||
)
|
||||
|
||||
if document_list:
|
||||
|
||||
# If the file is an upload, it's in the scratch directory
|
||||
# Move it to consume directory to be picked up
|
||||
# Otherwise, use the current parent to keep possible tags
|
||||
# from subdirectories
|
||||
try:
|
||||
# is_relative_to would be nicer, but new in 3.9
|
||||
_ = path.relative_to(settings.SCRATCH_DIR)
|
||||
save_to_dir = settings.CONSUMPTION_DIR
|
||||
except ValueError:
|
||||
save_to_dir = path.parent
|
||||
|
||||
for n, document in enumerate(document_list):
|
||||
# save to consumption dir
|
||||
# rename it to the original filename with number prefix
|
||||
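The try/except above is a Python 3.8-compatible stand-in for Path.is_relative_to(); a sketch with hypothetical paths showing the equivalent decision:

    from pathlib import Path

    path = Path("/tmp/paperless/scratch/split-001.pdf")   # hypothetical split document
    scratch_dir = Path("/tmp/paperless/scratch")
    consumption_dir = Path("/tmp/paperless/consume")

    try:
        path.relative_to(scratch_dir)          # raises ValueError if not under scratch
        save_to_dir = consumption_dir          # uploads go back to the consume dir
    except ValueError:
        save_to_dir = path.parent              # otherwise keep directory-tag context
    print(save_to_dir)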
@@ -136,23 +148,18 @@ def consume_file(
|
||||
else:
|
||||
newname = None
|
||||
|
||||
# If the file is an upload, it's in the scratch directory
|
||||
# Move it to consume directory to be picked up
|
||||
# Otherwise, use the current parent to keep possible tags
|
||||
# from subdirectories
|
||||
try:
|
||||
# is_relative_to would be nicer, but new in 3.9
|
||||
_ = path.relative_to(settings.SCRATCH_DIR)
|
||||
save_to_dir = settings.CONSUMPTION_DIR
|
||||
except ValueError:
|
||||
save_to_dir = path.parent
|
||||
|
||||
barcodes.save_to_dir(
|
||||
document,
|
||||
newname=newname,
|
||||
target_dir=save_to_dir,
|
||||
)
|
||||
|
||||
# Split file has been copied safely, remove it
|
||||
os.remove(document)
|
||||
|
||||
# And clean up the directory as well, now it's empty
|
||||
shutil.rmtree(os.path.dirname(document_list[0]))
|
||||
|
||||
# Delete the PDF file which was split
|
||||
os.remove(doc_barcode_info.pdf_path)
|
||||
|
||||
@@ -164,7 +171,7 @@ def consume_file(
|
||||
# notify the sender, otherwise the progress bar
|
||||
# in the UI stays stuck
|
||||
payload = {
|
||||
"filename": override_filename,
|
||||
"filename": override_filename or path.name,
|
||||
"task_id": task_id,
|
||||
"current_progress": 100,
|
||||
"max_progress": 100,
|
||||
|
BIN src/documents/tests/samples/barcodes/split-by-asn-1.pdf (new file; binary file not shown)
BIN src/documents/tests/samples/barcodes/split-by-asn-2.pdf (new file; binary file not shown)
@@ -7,6 +7,7 @@ import tempfile
|
||||
import urllib.request
|
||||
import uuid
|
||||
import zipfile
|
||||
from datetime import timedelta
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
from unittest.mock import MagicMock
|
||||
@@ -23,6 +24,7 @@ from django.conf import settings
|
||||
from django.contrib.auth.models import User
|
||||
from django.test import override_settings
|
||||
from django.utils import timezone
|
||||
from dateutil.relativedelta import relativedelta
|
||||
from documents import bulk_edit
|
||||
from documents import index
|
||||
from documents.models import Correspondent
|
||||
@@ -119,28 +121,28 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
|
||||
response = self.client.get("/api/documents/", format="json")
|
||||
self.assertEqual(response.status_code, 200)
|
||||
results_full = response.data["results"]
|
||||
self.assertTrue("content" in results_full[0])
|
||||
self.assertTrue("id" in results_full[0])
|
||||
self.assertIn("content", results_full[0])
|
||||
self.assertIn("id", results_full[0])
|
||||
|
||||
response = self.client.get("/api/documents/?fields=id", format="json")
|
||||
self.assertEqual(response.status_code, 200)
|
||||
results = response.data["results"]
|
||||
self.assertFalse("content" in results[0])
|
||||
self.assertTrue("id" in results[0])
|
||||
self.assertIn("id", results[0])
|
||||
self.assertEqual(len(results[0]), 1)
|
||||
|
||||
response = self.client.get("/api/documents/?fields=content", format="json")
|
||||
self.assertEqual(response.status_code, 200)
|
||||
results = response.data["results"]
|
||||
self.assertTrue("content" in results[0])
|
||||
self.assertIn("content", results[0])
|
||||
self.assertFalse("id" in results[0])
|
||||
self.assertEqual(len(results[0]), 1)
|
||||
|
||||
response = self.client.get("/api/documents/?fields=id,content", format="json")
|
||||
self.assertEqual(response.status_code, 200)
|
||||
results = response.data["results"]
|
||||
self.assertTrue("content" in results[0])
|
||||
self.assertTrue("id" in results[0])
|
||||
self.assertIn("content", results[0])
|
||||
self.assertIn("id", results[0])
|
||||
self.assertEqual(len(results[0]), 2)
|
||||
|
||||
response = self.client.get(
|
||||
@@ -150,7 +152,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
|
||||
self.assertEqual(response.status_code, 200)
|
||||
results = response.data["results"]
|
||||
self.assertFalse("content" in results[0])
|
||||
self.assertTrue("id" in results[0])
|
||||
self.assertIn("id", results[0])
|
||||
self.assertEqual(len(results[0]), 1)
|
||||
|
||||
response = self.client.get("/api/documents/?fields=", format="json")
|
||||
@@ -505,6 +507,270 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
|
||||
response = self.client.get("/api/documents/?query=content&page=3&page_size=10")
|
||||
self.assertEqual(response.status_code, 404)
|
||||
|
||||
@override_settings(
|
||||
TIME_ZONE="UTC",
|
||||
)
|
||||
def test_search_added_in_last_week(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- Three documents added right now
|
||||
- The timezone is UTC time
|
||||
WHEN:
|
||||
- Query for documents added in the last 7 days
|
||||
THEN:
|
||||
- All three recent documents are returned
|
||||
"""
|
||||
d1 = Document.objects.create(
|
||||
title="invoice",
|
||||
content="the thing i bought at a shop and paid with bank account",
|
||||
checksum="A",
|
||||
pk=1,
|
||||
)
|
||||
d2 = Document.objects.create(
|
||||
title="bank statement 1",
|
||||
content="things i paid for in august",
|
||||
pk=2,
|
||||
checksum="B",
|
||||
)
|
||||
d3 = Document.objects.create(
|
||||
title="bank statement 3",
|
||||
content="things i paid for in september",
|
||||
pk=3,
|
||||
checksum="C",
|
||||
)
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, d1)
|
||||
index.update_document(writer, d2)
|
||||
index.update_document(writer, d3)
|
||||
|
||||
response = self.client.get("/api/documents/?query=added:[-1 week to now]")
|
||||
results = response.data["results"]
|
||||
# Expect 3 documents returned
|
||||
self.assertEqual(len(results), 3)
|
||||
|
||||
for idx, subset in enumerate(
|
||||
[
|
||||
{"id": 1, "title": "invoice"},
|
||||
{"id": 2, "title": "bank statement 1"},
|
||||
{"id": 3, "title": "bank statement 3"},
|
||||
],
|
||||
):
|
||||
result = results[idx]
|
||||
# Assert subset in results
|
||||
self.assertDictEqual(result, {**result, **subset})
|
||||
|
||||
@override_settings(
|
||||
TIME_ZONE="America/Chicago",
|
||||
)
|
||||
def test_search_added_in_last_week_with_timezone_behind(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- Two documents added right now
|
||||
- One document added over a week ago
|
||||
- The timezone is behind UTC time (-6)
|
||||
WHEN:
|
||||
- Query for documents added in the last 7 days
|
||||
THEN:
|
||||
- The two recent documents are returned
|
||||
"""
|
||||
d1 = Document.objects.create(
|
||||
title="invoice",
|
||||
content="the thing i bought at a shop and paid with bank account",
|
||||
checksum="A",
|
||||
pk=1,
|
||||
)
|
||||
d2 = Document.objects.create(
|
||||
title="bank statement 1",
|
||||
content="things i paid for in august",
|
||||
pk=2,
|
||||
checksum="B",
|
||||
)
|
||||
d3 = Document.objects.create(
|
||||
title="bank statement 3",
|
||||
content="things i paid for in september",
|
||||
pk=3,
|
||||
checksum="C",
|
||||
# 7 days, 1 hour and 1 minute ago
|
||||
added=timezone.now() - timedelta(days=7, hours=1, minutes=1),
|
||||
)
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, d1)
|
||||
index.update_document(writer, d2)
|
||||
index.update_document(writer, d3)
|
||||
|
||||
response = self.client.get("/api/documents/?query=added:[-1 week to now]")
|
||||
results = response.data["results"]
|
||||
|
||||
# Expect 2 documents returned
|
||||
self.assertEqual(len(results), 2)
|
||||
|
||||
for idx, subset in enumerate(
|
||||
[{"id": 1, "title": "invoice"}, {"id": 2, "title": "bank statement 1"}],
|
||||
):
|
||||
result = results[idx]
|
||||
# Assert subset in results
|
||||
self.assertDictEqual(result, {**result, **subset})
|
||||
|
||||
@override_settings(
|
||||
TIME_ZONE="Europe/Sofia",
|
||||
)
|
||||
def test_search_added_in_last_week_with_timezone_ahead(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- Two documents added right now
|
||||
- One document added over a week ago
|
||||
- The timezone is ahead of UTC time (+2)
|
||||
WHEN:
|
||||
- Query for documents added in the last 7 days
|
||||
THEN:
|
||||
- The two recent documents are returned
|
||||
"""
|
||||
d1 = Document.objects.create(
|
||||
title="invoice",
|
||||
content="the thing i bought at a shop and paid with bank account",
|
||||
checksum="A",
|
||||
pk=1,
|
||||
)
|
||||
d2 = Document.objects.create(
|
||||
title="bank statement 1",
|
||||
content="things i paid for in august",
|
||||
pk=2,
|
||||
checksum="B",
|
||||
)
|
||||
d3 = Document.objects.create(
|
||||
title="bank statement 3",
|
||||
content="things i paid for in september",
|
||||
pk=3,
|
||||
checksum="C",
|
||||
# 7 days, 1 hour and 1 minute ago
|
||||
added=timezone.now() - timedelta(days=7, hours=1, minutes=1),
|
||||
)
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, d1)
|
||||
index.update_document(writer, d2)
|
||||
index.update_document(writer, d3)
|
||||
|
||||
response = self.client.get("/api/documents/?query=added:[-1 week to now]")
|
||||
results = response.data["results"]
|
||||
|
||||
# Expect 2 documents returned
|
||||
self.assertEqual(len(results), 2)
|
||||
|
||||
for idx, subset in enumerate(
|
||||
[{"id": 1, "title": "invoice"}, {"id": 2, "title": "bank statement 1"}],
|
||||
):
|
||||
result = results[idx]
|
||||
# Assert subset in results
|
||||
self.assertDictEqual(result, {**result, **subset})
|
||||
|
||||
def test_search_added_in_last_month(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- One document added right now
|
||||
- One document added about a week ago
- One document added over 1 month ago
|
||||
WHEN:
|
||||
- Query for documents added in the last month
|
||||
THEN:
|
||||
- The two recent documents are returned
|
||||
"""
|
||||
d1 = Document.objects.create(
|
||||
title="invoice",
|
||||
content="the thing i bought at a shop and paid with bank account",
|
||||
checksum="A",
|
||||
pk=1,
|
||||
)
|
||||
d2 = Document.objects.create(
|
||||
title="bank statement 1",
|
||||
content="things i paid for in august",
|
||||
pk=2,
|
||||
checksum="B",
|
||||
# 1 month, 1 day ago
|
||||
added=timezone.now() - relativedelta(months=1, days=1),
|
||||
)
|
||||
d3 = Document.objects.create(
|
||||
title="bank statement 3",
|
||||
content="things i paid for in september",
|
||||
pk=3,
|
||||
checksum="C",
|
||||
# 7 days, 1 hour and 1 minute ago
|
||||
added=timezone.now() - timedelta(days=7, hours=1, minutes=1),
|
||||
)
|
||||
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, d1)
|
||||
index.update_document(writer, d2)
|
||||
index.update_document(writer, d3)
|
||||
|
||||
response = self.client.get("/api/documents/?query=added:[-1 month to now]")
|
||||
results = response.data["results"]
|
||||
|
||||
# Expect 2 documents returned
|
||||
self.assertEqual(len(results), 2)
|
||||
|
||||
for idx, subset in enumerate(
|
||||
[{"id": 1, "title": "invoice"}, {"id": 3, "title": "bank statement 3"}],
|
||||
):
|
||||
result = results[idx]
|
||||
# Assert subset in results
|
||||
self.assertDictEqual(result, {**result, **subset})
|
||||
|
||||
@override_settings(
|
||||
TIME_ZONE="America/Denver",
|
||||
)
|
||||
def test_search_added_in_last_month_timezone_behind(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- One document added right now
|
||||
- One document added about a week ago
- One document added over 1 month ago
|
||||
- The timezone is behind UTC time (-6 or -7)
|
||||
WHEN:
|
||||
- Query for documents added in the last month
|
||||
THEN:
|
||||
- The two recent documents are returned
|
||||
"""
|
||||
d1 = Document.objects.create(
|
||||
title="invoice",
|
||||
content="the thing i bought at a shop and paid with bank account",
|
||||
checksum="A",
|
||||
pk=1,
|
||||
)
|
||||
d2 = Document.objects.create(
|
||||
title="bank statement 1",
|
||||
content="things i paid for in august",
|
||||
pk=2,
|
||||
checksum="B",
|
||||
# 1 month, 1 day ago
|
||||
added=timezone.now() - relativedelta(months=1, days=1),
|
||||
)
|
||||
d3 = Document.objects.create(
|
||||
title="bank statement 3",
|
||||
content="things i paid for in september",
|
||||
pk=3,
|
||||
checksum="C",
|
||||
# 7 days, 1 hour and 1 minute ago
|
||||
added=timezone.now() - timedelta(days=7, hours=1, minutes=1),
|
||||
)
|
||||
|
||||
with index.open_index_writer() as writer:
|
||||
index.update_document(writer, d1)
|
||||
index.update_document(writer, d2)
|
||||
index.update_document(writer, d3)
|
||||
|
||||
response = self.client.get("/api/documents/?query=added:[-1 month to now]")
|
||||
results = response.data["results"]
|
||||
|
||||
# Expect 2 documents returned
|
||||
self.assertEqual(len(results), 2)
|
||||
|
||||
for idx, subset in enumerate(
|
||||
[{"id": 1, "title": "invoice"}, {"id": 3, "title": "bank statement 3"}],
|
||||
):
|
||||
result = results[idx]
|
||||
# Assert subset in results
|
||||
self.assertDictEqual(result, {**result, **subset})
|
||||
|
||||
@mock.patch("documents.index.autocomplete")
|
||||
def test_search_autocomplete(self, m):
|
||||
m.side_effect = lambda ix, term, limit: [term for _ in range(limit)]
|
||||
@@ -2933,8 +3199,32 @@ class TestApiStoragePaths(DirectoriesMixin, APITestCase):
|
||||
self.assertEqual(response.status_code, 400)
|
||||
self.assertEqual(StoragePath.objects.count(), 1)
|
||||
|
||||
def test_api_storage_path_placeholders(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- API request to create a storage path with placeholders
|
||||
- Storage path is valid
|
||||
WHEN:
|
||||
- API is called
|
||||
THEN:
|
||||
- Correct HTTP response
|
||||
- New storage path is created
|
||||
"""
|
||||
response = self.client.post(
|
||||
self.ENDPOINT,
|
||||
json.dumps(
|
||||
{
|
||||
"name": "Storage path with placeholders",
|
||||
"path": "{title}/{correspondent}/{document_type}/{created}/{created_year}/{created_year_short}/{created_month}/{created_month_name}/{created_month_name_short}/{created_day}/{added}/{added_year}/{added_year_short}/{added_month}/{added_month_name}/{added_month_name_short}/{added_day}/{asn}/{tags}/{tag_list}/",
|
||||
},
|
||||
),
|
||||
content_type="application/json",
|
||||
)
|
||||
self.assertEqual(response.status_code, 201)
|
||||
self.assertEqual(StoragePath.objects.count(), 2)
|
||||
|
||||
class TestTasks(APITestCase):
|
||||
|
||||
class TestTasks(DirectoriesMixin, APITestCase):
|
||||
ENDPOINT = "/api/tasks/"
|
||||
ENDPOINT_ACKNOWLEDGE = "/api/acknowledge_tasks/"
|
||||
|
||||
|
@@ -294,7 +294,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
)
|
||||
|
||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [0])
|
||||
self.assertDictEqual(separator_page_numbers, {0: False})
|
||||
|
||||
def test_scan_file_for_separating_barcodes_none_present(self):
|
||||
"""
|
||||
@@ -314,7 +314,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
)
|
||||
|
||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [])
|
||||
self.assertDictEqual(separator_page_numbers, {})
|
||||
|
||||
def test_scan_file_for_separating_barcodes_middle_page(self):
|
||||
"""
|
||||
@@ -337,7 +337,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
)
|
||||
|
||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [1])
|
||||
self.assertDictEqual(separator_page_numbers, {1: False})
|
||||
|
||||
def test_scan_file_for_separating_barcodes_multiple_pages(self):
|
||||
"""
|
||||
@@ -360,7 +360,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
)
|
||||
|
||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [2, 5])
|
||||
self.assertDictEqual(separator_page_numbers, {2: False, 5: False})
|
||||
|
||||
def test_scan_file_for_separating_barcodes_upside_down(self):
|
||||
"""
|
||||
@@ -384,7 +384,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
)
|
||||
|
||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [1])
|
||||
self.assertDictEqual(separator_page_numbers, {1: False})
|
||||
|
||||
def test_scan_file_for_separating_barcodes_fax_decode(self):
|
||||
"""
|
||||
@@ -407,7 +407,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
)
|
||||
|
||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [1])
|
||||
self.assertDictEqual(separator_page_numbers, {1: False})
|
||||
|
||||
def test_scan_file_for_separating_qr_barcodes(self):
|
||||
"""
|
||||
@@ -431,7 +431,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
)
|
||||
|
||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [0])
|
||||
self.assertDictEqual(separator_page_numbers, {0: False})
|
||||
|
||||
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
|
||||
def test_scan_file_for_separating_custom_barcodes(self):
|
||||
@@ -456,7 +456,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
)
|
||||
|
||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [0])
|
||||
self.assertDictEqual(separator_page_numbers, {0: False})
|
||||
|
||||
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
|
||||
def test_scan_file_for_separating_custom_qr_barcodes(self):
|
||||
@@ -482,7 +482,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
)
|
||||
|
||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [0])
|
||||
self.assertDictEqual(separator_page_numbers, {0: False})
|
||||
|
||||
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
|
||||
def test_scan_file_for_separating_custom_128_barcodes(self):
|
||||
@@ -508,7 +508,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
)
|
||||
|
||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [0])
|
||||
self.assertDictEqual(separator_page_numbers, {0: False})
|
||||
|
||||
def test_scan_file_for_separating_wrong_qr_barcodes(self):
|
||||
"""
|
||||
@@ -533,7 +533,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
)
|
||||
|
||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [])
|
||||
self.assertDictEqual(separator_page_numbers, {})
|
||||
|
||||
@override_settings(CONSUMER_BARCODE_STRING="ADAR-NEXTDOC")
|
||||
def test_scan_file_for_separating_qr_barcodes(self):
|
||||
@@ -558,7 +558,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
)
|
||||
|
||||
self.assertGreater(len(doc_barcode_info.barcodes), 0)
|
||||
self.assertListEqual(separator_page_numbers, [1])
|
||||
self.assertDictEqual(separator_page_numbers, {1: False})
|
||||
|
||||
def test_separate_pages(self):
|
||||
"""
|
||||
@@ -573,7 +573,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
self.BARCODE_SAMPLE_DIR,
|
||||
"patch-code-t-middle.pdf",
|
||||
)
|
||||
documents = barcodes.separate_pages(test_file, [1])
|
||||
documents = barcodes.separate_pages(test_file, {1: False})
|
||||
|
||||
self.assertEqual(len(documents), 2)
|
||||
|
||||
@@ -591,7 +591,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
self.BARCODE_SAMPLE_DIR,
|
||||
"patch-code-t-double.pdf",
|
||||
)
|
||||
pages = barcodes.separate_pages(test_file, [1, 2])
|
||||
pages = barcodes.separate_pages(test_file, {1: False, 2: False})
|
||||
|
||||
self.assertEqual(len(pages), 2)
|
||||
|
||||
@@ -610,7 +610,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
"patch-code-t-middle.pdf",
|
||||
)
|
||||
with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
|
||||
pages = barcodes.separate_pages(test_file, [])
|
||||
pages = barcodes.separate_pages(test_file, {})
|
||||
self.assertEqual(pages, [])
|
||||
self.assertEqual(
|
||||
cm.output,
|
||||
@@ -858,7 +858,88 @@ class TestBarcode(DirectoriesMixin, TestCase):
|
||||
)
|
||||
|
||||
self.assertEqual(doc_barcode_info.pdf_path, test_file)
|
||||
self.assertListEqual(separator_page_numbers, [])
|
||||
self.assertDictEqual(separator_page_numbers, {})
|
||||
|
||||
@override_settings(
|
||||
CONSUMER_ENABLE_BARCODES=True,
|
||||
CONSUMER_ENABLE_ASN_BARCODE=True,
|
||||
)
|
||||
def test_separate_pages_by_asn_barcodes_and_patcht(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- Input PDF with a patch code on page 3 and ASN barcodes on pages 1,5,6,9,11
|
||||
WHEN:
|
||||
- Input file is split on barcodes
|
||||
THEN:
|
||||
- Correct number of files produced, split correctly by correct pages
|
||||
"""
|
||||
test_file = os.path.join(
|
||||
os.path.dirname(__file__),
|
||||
self.BARCODE_SAMPLE_DIR,
|
||||
"split-by-asn-2.pdf",
|
||||
)
|
||||
|
||||
doc_barcode_info = barcodes.scan_file_for_barcodes(
|
||||
test_file,
|
||||
)
|
||||
separator_page_numbers = barcodes.get_separating_barcodes(
|
||||
doc_barcode_info.barcodes,
|
||||
)
|
||||
|
||||
self.assertEqual(test_file, doc_barcode_info.pdf_path)
|
||||
self.assertDictEqual(
|
||||
separator_page_numbers,
|
||||
{
|
||||
2: False,
|
||||
4: True,
|
||||
5: True,
|
||||
8: True,
|
||||
10: True,
|
||||
},
|
||||
)
|
||||
|
||||
document_list = barcodes.separate_pages(test_file, separator_page_numbers)
|
||||
self.assertEqual(len(document_list), 6)
|
||||
|
||||
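A quick check of how the expected mapping follows from the docstring above: pages are 1-based in the description but 0-based in the dict, and an ASN barcode on the very first page is deliberately skipped:

    asn_pages = [1, 5, 6, 9, 11]       # from the test description above
    patch_pages = [3]
    expected = {p - 1: False for p in patch_pages}
    expected.update({p - 1: True for p in asn_pages if p - 1 != 0})
    assert expected == {2: False, 4: True, 5: True, 8: True, 10: True}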
@override_settings(
|
||||
CONSUMER_ENABLE_BARCODES=True,
|
||||
CONSUMER_ENABLE_ASN_BARCODE=True,
|
||||
)
|
||||
def test_separate_pages_by_asn_barcodes(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- Input PDF with ASN barcodes on pages 1,3,4,7,9
|
||||
WHEN:
|
||||
- Input file is split on barcodes
|
||||
THEN:
|
||||
- Correct number of files produced, split correctly by correct pages
|
||||
"""
|
||||
test_file = os.path.join(
|
||||
os.path.dirname(__file__),
|
||||
self.BARCODE_SAMPLE_DIR,
|
||||
"split-by-asn-1.pdf",
|
||||
)
|
||||
|
||||
doc_barcode_info = barcodes.scan_file_for_barcodes(
|
||||
test_file,
|
||||
)
|
||||
separator_page_numbers = barcodes.get_separating_barcodes(
|
||||
doc_barcode_info.barcodes,
|
||||
)
|
||||
|
||||
self.assertEqual(test_file, doc_barcode_info.pdf_path)
|
||||
self.assertDictEqual(
|
||||
separator_page_numbers,
|
||||
{
|
||||
2: True,
|
||||
3: True,
|
||||
6: True,
|
||||
8: True,
|
||||
},
|
||||
)
|
||||
|
||||
document_list = barcodes.separate_pages(test_file, separator_page_numbers)
|
||||
self.assertEqual(len(document_list), 5)
|
||||
|
||||
|
||||
class TestAsnBarcodes(DirectoriesMixin, TestCase):
|
||||
|
@@ -847,13 +847,11 @@ class PreConsumeTestCase(TestCase):
|
||||
self.assertEqual(command[0], script.name)
|
||||
self.assertEqual(command[1], "path-to-file")
|
||||
|
||||
self.assertDictContainsSubset(
|
||||
{
|
||||
"DOCUMENT_SOURCE_PATH": c.original_path,
|
||||
"DOCUMENT_WORKING_PATH": c.path,
|
||||
},
|
||||
environment,
|
||||
)
|
||||
subset = {
|
||||
"DOCUMENT_SOURCE_PATH": c.original_path,
|
||||
"DOCUMENT_WORKING_PATH": c.path,
|
||||
}
|
||||
self.assertDictEqual(environment, {**environment, **subset})
|
||||
|
||||
@mock.patch("documents.consumer.Consumer.log")
|
||||
def test_script_with_output(self, mocked_log):
|
||||
@@ -983,16 +981,15 @@ class PostConsumeTestCase(TestCase):
|
||||
self.assertEqual(command[7], "my_bank")
|
||||
self.assertCountEqual(command[8].split(","), ["a", "b"])
|
||||
|
||||
self.assertDictContainsSubset(
|
||||
{
|
||||
"DOCUMENT_ID": str(doc.pk),
|
||||
"DOCUMENT_DOWNLOAD_URL": f"/api/documents/{doc.pk}/download/",
|
||||
"DOCUMENT_THUMBNAIL_URL": f"/api/documents/{doc.pk}/thumb/",
|
||||
"DOCUMENT_CORRESPONDENT": "my_bank",
|
||||
"DOCUMENT_TAGS": "a,b",
|
||||
},
|
||||
environment,
|
||||
)
|
||||
subset = {
|
||||
"DOCUMENT_ID": str(doc.pk),
|
||||
"DOCUMENT_DOWNLOAD_URL": f"/api/documents/{doc.pk}/download/",
|
||||
"DOCUMENT_THUMBNAIL_URL": f"/api/documents/{doc.pk}/thumb/",
|
||||
"DOCUMENT_CORRESPONDENT": "my_bank",
|
||||
"DOCUMENT_TAGS": "a,b",
|
||||
}
|
||||
|
||||
self.assertDictEqual(environment, {**environment, **subset})
|
||||
|
||||
def test_script_exit_non_zero(self):
|
||||
"""
|
||||
|
@@ -25,7 +25,7 @@ class TestImporter(TestCase):
|
||||
cmd.manifest = [{"model": "documents.document"}]
|
||||
with self.assertRaises(CommandError) as cm:
|
||||
cmd._check_manifest()
|
||||
self.assertTrue("The manifest file contains a record" in str(cm.exception))
|
||||
self.assertIn("The manifest file contains a record", str(cm.exception))
|
||||
|
||||
cmd.manifest = [
|
||||
{"model": "documents.document", EXPORTER_FILE_NAME: "noexist.pdf"},
|
||||
@@ -33,6 +33,7 @@ class TestImporter(TestCase):
|
||||
# self.assertRaises(CommandError, cmd._check_manifest)
|
||||
with self.assertRaises(CommandError) as cm:
|
||||
cmd._check_manifest()
|
||||
self.assertTrue(
|
||||
'The manifest file refers to "noexist.pdf"' in str(cm.exception),
|
||||
self.assertIn(
|
||||
'The manifest file refers to "noexist.pdf"',
|
||||
str(cm.exception),
|
||||
)
|
||||
|
@@ -1,3 +1,5 @@
|
||||
from unittest import mock
|
||||
|
||||
from django.test import TestCase
|
||||
from documents import index
|
||||
from documents.models import Document
|
||||
@@ -31,3 +33,60 @@ class TestAutoComplete(DirectoriesMixin, TestCase):
|
||||
)
|
||||
self.assertListEqual(index.autocomplete(ix, "tes", limit=1), [b"test3"])
|
||||
self.assertListEqual(index.autocomplete(ix, "tes", limit=0), [])
|
||||
|
||||
def test_archive_serial_number_ranging(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- Document with an archive serial number above schema allowed size
|
||||
WHEN:
|
||||
- Document is provided to the index
|
||||
THEN:
|
||||
- Error is logged
|
||||
- Document ASN is reset to 0 for the index
|
||||
"""
|
||||
doc1 = Document.objects.create(
|
||||
title="doc1",
|
||||
checksum="A",
|
||||
content="test test2 test3",
|
||||
# yes, this is allowed, unless full_clean is run
|
||||
# DRF does call the validators, this test won't
|
||||
archive_serial_number=Document.ARCHIVE_SERIAL_NUMBER_MAX + 1,
|
||||
)
|
||||
with self.assertLogs("paperless.index", level="ERROR") as cm:
|
||||
with mock.patch(
|
||||
"documents.index.AsyncWriter.update_document",
|
||||
) as mocked_update_doc:
|
||||
index.add_or_update_document(doc1)
|
||||
|
||||
mocked_update_doc.assert_called_once()
|
||||
_, kwargs = mocked_update_doc.call_args
|
||||
|
||||
self.assertEqual(kwargs["asn"], 0)
|
||||
|
||||
error_str = cm.output[0]
|
||||
expected_str = "ERROR:paperless.index:Not indexing Archive Serial Number 4294967296 of document 1"
|
||||
self.assertIn(expected_str, error_str)
|
||||
|
||||
def test_archive_serial_number_is_none(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- Document with no archive serial number
|
||||
WHEN:
|
||||
- Document is provided to the index
|
||||
THEN:
|
||||
- ASN isn't touched
|
||||
"""
|
||||
doc1 = Document.objects.create(
|
||||
title="doc1",
|
||||
checksum="A",
|
||||
content="test test2 test3",
|
||||
)
|
||||
with mock.patch(
|
||||
"documents.index.AsyncWriter.update_document",
|
||||
) as mocked_update_doc:
|
||||
index.add_or_update_document(doc1)
|
||||
|
||||
mocked_update_doc.assert_called_once()
|
||||
_, kwargs = mocked_update_doc.call_args
|
||||
|
||||
self.assertIsNone(kwargs["asn"])
|
||||
|
@@ -247,22 +247,85 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
|
||||
|
||||
def test_is_ignored(self):
|
||||
test_paths = [
|
||||
(os.path.join(self.dirs.consumption_dir, "foo.pdf"), False),
|
||||
(os.path.join(self.dirs.consumption_dir, "foo", "bar.pdf"), False),
|
||||
(os.path.join(self.dirs.consumption_dir, ".DS_STORE", "foo.pdf"), True),
|
||||
(
|
||||
os.path.join(self.dirs.consumption_dir, "foo", ".DS_STORE", "bar.pdf"),
|
||||
True,
|
||||
),
|
||||
(os.path.join(self.dirs.consumption_dir, ".stfolder", "foo.pdf"), True),
|
||||
(os.path.join(self.dirs.consumption_dir, "._foo.pdf"), True),
|
||||
(os.path.join(self.dirs.consumption_dir, "._foo", "bar.pdf"), False),
|
||||
{
|
||||
"path": os.path.join(self.dirs.consumption_dir, "foo.pdf"),
|
||||
"ignore": False,
|
||||
},
|
||||
{
|
||||
"path": os.path.join(self.dirs.consumption_dir, "foo", "bar.pdf"),
|
||||
"ignore": False,
|
||||
},
|
||||
{
|
||||
"path": os.path.join(self.dirs.consumption_dir, ".DS_STORE", "foo.pdf"),
|
||||
"ignore": True,
|
||||
},
|
||||
{
|
||||
"path": os.path.join(
|
||||
self.dirs.consumption_dir,
|
||||
"foo",
|
||||
".DS_STORE",
|
||||
"bar.pdf",
|
||||
),
|
||||
"ignore": True,
|
||||
},
|
||||
{
|
||||
"path": os.path.join(
|
||||
self.dirs.consumption_dir,
|
||||
".DS_STORE",
|
||||
"foo",
|
||||
"bar.pdf",
|
||||
),
|
||||
"ignore": True,
|
||||
},
|
||||
{
|
||||
"path": os.path.join(self.dirs.consumption_dir, ".stfolder", "foo.pdf"),
|
||||
"ignore": True,
|
||||
},
|
||||
{
|
||||
"path": os.path.join(self.dirs.consumption_dir, ".stfolder.pdf"),
|
||||
"ignore": False,
|
||||
},
|
||||
{
|
||||
"path": os.path.join(
|
||||
self.dirs.consumption_dir,
|
||||
".stversions",
|
||||
"foo.pdf",
|
||||
),
|
||||
"ignore": True,
|
||||
},
|
||||
{
|
||||
"path": os.path.join(self.dirs.consumption_dir, ".stversions.pdf"),
|
||||
"ignore": False,
|
||||
},
|
||||
{
|
||||
"path": os.path.join(self.dirs.consumption_dir, "._foo.pdf"),
|
||||
"ignore": True,
|
||||
},
|
||||
{
|
||||
"path": os.path.join(self.dirs.consumption_dir, "my_foo.pdf"),
|
||||
"ignore": False,
|
||||
},
|
||||
{
|
||||
"path": os.path.join(self.dirs.consumption_dir, "._foo", "bar.pdf"),
|
||||
"ignore": True,
|
||||
},
|
||||
{
|
||||
"path": os.path.join(
|
||||
self.dirs.consumption_dir,
|
||||
"@eaDir",
|
||||
"SYNO@.fileindexdb",
|
||||
"_1jk.fnm",
|
||||
),
|
||||
"ignore": True,
|
||||
},
|
||||
]
|
||||
for file_path, expected_ignored in test_paths:
|
||||
for test_setup in test_paths:
|
||||
filepath = test_setup["path"]
|
||||
expected_ignored_result = test_setup["ignore"]
|
||||
self.assertEqual(
|
||||
expected_ignored,
|
||||
document_consumer._is_ignored(file_path),
|
||||
f'_is_ignored("{file_path}") != {expected_ignored}',
|
||||
expected_ignored_result,
|
||||
document_consumer._is_ignored(filepath),
|
||||
f'_is_ignored("{filepath}") != {expected_ignored_result}',
|
||||
)
|
||||
|
||||
@mock.patch("documents.management.commands.document_consumer.open")
|
||||
|
@@ -1,6 +1,8 @@
|
||||
from tempfile import TemporaryDirectory
|
||||
from unittest import mock
|
||||
|
||||
from django.apps import apps
|
||||
from django.test import override_settings
|
||||
from django.test import TestCase
|
||||
from documents.parsers import get_default_file_extension
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
@@ -8,6 +10,7 @@ from documents.parsers import get_supported_file_extensions
|
||||
from documents.parsers import is_file_ext_supported
|
||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
||||
from paperless_text.parsers import TextDocumentParser
|
||||
from paperless_tika.parsers import TikaDocumentParser
|
||||
|
||||
|
||||
class TestParserDiscovery(TestCase):
|
||||
@@ -124,14 +127,43 @@ class TestParserDiscovery(TestCase):
|
||||
|
||||
|
||||
class TestParserAvailability(TestCase):
|
||||
def test_file_extensions(self):
|
||||
|
||||
def test_tesseract_parser(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- Various mime types
|
||||
WHEN:
|
||||
- The parser class is instantiated
|
||||
THEN:
|
||||
- The Tesseract based parser is returned
|
||||
"""
|
||||
supported_mimes_and_exts = [
|
||||
("application/pdf", ".pdf"),
|
||||
("image/png", ".png"),
|
||||
("image/jpeg", ".jpg"),
|
||||
("image/tiff", ".tif"),
|
||||
("image/webp", ".webp"),
|
||||
]
|
||||
|
||||
supported_exts = get_supported_file_extensions()
|
||||
|
||||
for mime_type, ext in supported_mimes_and_exts:
|
||||
self.assertIn(ext, supported_exts)
|
||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||
self.assertIsInstance(
|
||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
||||
RasterisedDocumentParser,
|
||||
)
|
||||
|
||||
def test_text_parser(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- Various mime types of a text form
|
||||
WHEN:
|
||||
- The parser class is instantiated
|
||||
THEN:
|
||||
- The text based parser is returned
|
||||
"""
|
||||
supported_mimes_and_exts = [
|
||||
("text/plain", ".txt"),
|
||||
("text/csv", ".csv"),
|
||||
]
|
||||
@@ -141,23 +173,55 @@ class TestParserAvailability(TestCase):
|
||||
for mime_type, ext in supported_mimes_and_exts:
|
||||
self.assertIn(ext, supported_exts)
|
||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||
self.assertIsInstance(
|
||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
||||
TextDocumentParser,
|
||||
)
|
||||
|
||||
def test_tika_parser(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- Various mime types of an office document form
|
||||
WHEN:
|
||||
- The parser class is instantiated
|
||||
THEN:
|
||||
- The Tika/Gotenberg based parser is returned
|
||||
"""
|
||||
supported_mimes_and_exts = [
|
||||
("application/vnd.oasis.opendocument.text", ".odt"),
|
||||
("text/rtf", ".rtf"),
|
||||
("application/msword", ".doc"),
|
||||
(
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
".docx",
|
||||
),
|
||||
]
|
||||
|
||||
# Force the app ready to notice the settings override
|
||||
with override_settings(TIKA_ENABLED=True, INSTALLED_APPS=["paperless_tika"]):
|
||||
app = apps.get_app_config("paperless_tika")
|
||||
app.ready()
|
||||
supported_exts = get_supported_file_extensions()
|
||||
|
||||
for mime_type, ext in supported_mimes_and_exts:
|
||||
self.assertIn(ext, supported_exts)
|
||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||
self.assertIsInstance(
|
||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
||||
TikaDocumentParser,
|
||||
)
|
||||
|
||||
def test_no_parser_for_mime(self):
|
||||
self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
|
||||
|
||||
def test_default_extension(self):
|
||||
# Test no parser declared still returns an extension
|
||||
self.assertEqual(get_default_file_extension("application/zip"), ".zip")
|
||||
|
||||
# Test invalid mimetype returns no extension
|
||||
self.assertEqual(get_default_file_extension("aasdasd/dgfgf"), "")
|
||||
|
||||
self.assertIsInstance(
|
||||
get_parser_class_for_mime_type("application/pdf")(logging_group=None),
|
||||
RasterisedDocumentParser,
|
||||
)
|
||||
self.assertIsInstance(
|
||||
get_parser_class_for_mime_type("text/plain")(logging_group=None),
|
||||
TextDocumentParser,
|
||||
)
|
||||
self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
|
||||
|
||||
def test_file_extension_support(self):
|
||||
self.assertTrue(is_file_ext_supported(".pdf"))
|
||||
self.assertFalse(is_file_ext_supported(".hsdfh"))
|
||||
self.assertFalse(is_file_ext_supported(""))
|
||||
|
@@ -3,7 +3,7 @@ msgstr ""
|
||||
"Project-Id-Version: paperless-ngx\n"
|
||||
"Report-Msgid-Bugs-To: \n"
|
||||
"POT-Creation-Date: 2022-11-09 21:50+0000\n"
|
||||
"PO-Revision-Date: 2023-01-23 12:37\n"
|
||||
"PO-Revision-Date: 2023-01-27 19:22\n"
|
||||
"Last-Translator: \n"
|
||||
"Language-Team: Dutch\n"
|
||||
"Language: nl_NL\n"
|
||||
@@ -368,15 +368,15 @@ msgstr "heeft tags in"
|
||||
|
||||
#: documents/models.py:410
|
||||
msgid "ASN greater than"
|
||||
msgstr ""
|
||||
msgstr "ASN groter dan"
|
||||
|
||||
#: documents/models.py:411
|
||||
msgid "ASN less than"
|
||||
msgstr ""
|
||||
msgstr "ASN kleiner dan"
|
||||
|
||||
#: documents/models.py:412
|
||||
msgid "storage path is"
|
||||
msgstr ""
|
||||
msgstr "opslagpad is"
|
||||
|
||||
#: documents/models.py:422
|
||||
msgid "rule type"
|
||||
@@ -396,99 +396,99 @@ msgstr "filterregels"
|
||||
|
||||
#: documents/models.py:536
|
||||
msgid "Task ID"
|
||||
msgstr ""
|
||||
msgstr "Taak ID"
|
||||
|
||||
#: documents/models.py:537
|
||||
msgid "Celery ID for the Task that was run"
|
||||
msgstr ""
|
||||
msgstr "Celery ID voor de taak die werd uitgevoerd"
|
||||
|
||||
#: documents/models.py:542
|
||||
msgid "Acknowledged"
|
||||
msgstr ""
|
||||
msgstr "Bevestigd"
|
||||
|
||||
#: documents/models.py:543
|
||||
msgid "If the task is acknowledged via the frontend or API"
|
||||
msgstr ""
|
||||
msgstr "Of de taak is bevestigd via de frontend of de API"
|
||||
|
||||
#: documents/models.py:549 documents/models.py:556
|
||||
msgid "Task Name"
|
||||
msgstr ""
|
||||
msgstr "Taaknaam"
|
||||
|
||||
#: documents/models.py:550
|
||||
msgid "Name of the file which the Task was run for"
|
||||
msgstr ""
|
||||
msgstr "Naam van het bestand waarvoor de taak werd uitgevoerd"
|
||||
|
||||
#: documents/models.py:557
|
||||
msgid "Name of the Task which was run"
|
||||
msgstr ""
|
||||
msgstr "Naam van de uitgevoerde taak"
|
||||
|
||||
#: documents/models.py:562
|
||||
msgid "Task Positional Arguments"
|
||||
msgstr ""
|
||||
msgstr "Positionele argumenten voor taak"
|
||||
|
||||
#: documents/models.py:564
|
||||
msgid "JSON representation of the positional arguments used with the task"
|
||||
msgstr ""
|
||||
msgstr "JSON weergave van de positionele argumenten die gebruikt worden voor de taak"
|
||||
|
||||
#: documents/models.py:569
|
||||
msgid "Task Named Arguments"
|
||||
msgstr ""
|
||||
msgstr "Argumenten met naam voor taak"
|
||||
|
||||
#: documents/models.py:571
|
||||
msgid "JSON representation of the named arguments used with the task"
|
||||
msgstr ""
|
||||
msgstr "JSON weergave van de argumenten met naam die gebruikt worden voor de taak"
|
||||
|
||||
#: documents/models.py:578
|
||||
msgid "Task State"
|
||||
msgstr ""
|
||||
msgstr "Taakstatus"
|
||||
|
||||
#: documents/models.py:579
|
||||
msgid "Current state of the task being run"
|
||||
msgstr ""
|
||||
msgstr "Huidige status van de taak die wordt uitgevoerd"
|
||||
|
||||
#: documents/models.py:584
|
||||
msgid "Created DateTime"
|
||||
msgstr ""
|
||||
msgstr "Aangemaakt DateTime"
|
||||
|
||||
#: documents/models.py:585
|
||||
msgid "Datetime field when the task result was created in UTC"
|
||||
msgstr ""
|
||||
msgstr "Datetime veld wanneer het resultaat van de taak werd aangemaakt in UTC"
|
||||
|
||||
#: documents/models.py:590
|
||||
msgid "Started DateTime"
|
||||
msgstr ""
|
||||
msgstr "Gestart DateTime"
|
||||
|
||||
#: documents/models.py:591
|
||||
msgid "Datetime field when the task was started in UTC"
|
||||
msgstr ""
|
||||
msgstr "Datetime veld wanneer de taak werd gestart in UTC"
|
||||
|
||||
#: documents/models.py:596
|
||||
msgid "Completed DateTime"
|
||||
msgstr ""
|
||||
msgstr "Voltooid DateTime"
|
||||
|
||||
#: documents/models.py:597
|
||||
msgid "Datetime field when the task was completed in UTC"
|
||||
msgstr ""
|
||||
msgstr "Datetime veld wanneer de taak werd voltooid in UTC"
|
||||
|
||||
#: documents/models.py:602
|
||||
msgid "Result Data"
|
||||
msgstr ""
|
||||
msgstr "Resultaatgegevens"
|
||||
|
||||
#: documents/models.py:604
|
||||
msgid "The data returned by the task"
|
||||
msgstr ""
|
||||
msgstr "Gegevens geretourneerd door de taak"
|
||||
|
||||
#: documents/models.py:613
|
||||
msgid "Comment for the document"
|
||||
msgstr ""
|
||||
msgstr "Commentaar op het document"
|
||||
|
||||
#: documents/models.py:642
|
||||
msgid "comment"
|
||||
msgstr ""
|
||||
msgstr "opmerking"
|
||||
|
||||
#: documents/models.py:643
|
||||
msgid "comments"
|
||||
msgstr ""
|
||||
msgstr "opmerkingen"
|
||||
|
||||
#: documents/serialisers.py:72
|
||||
#, python-format
|
||||
|

@@ -109,6 +109,16 @@ def _parse_redis_url(env_redis: Optional[str]) -> Tuple[str]:


def _parse_beat_schedule() -> Dict:
    """
    Configures the scheduled tasks, according to default or
    environment variables. Task expiration is configured so the task will
    expire (and not run), shortly before the default frequency will put another
    of the same task into the queue


    https://docs.celeryq.dev/en/stable/userguide/periodic-tasks.html#beat-entries
    https://docs.celeryq.dev/en/latest/userguide/calling.html#expiration
    """
    schedule = {}
    tasks = [
        {
@@ -117,6 +127,11 @@ def _parse_beat_schedule() -> Dict:
            # Default every ten minutes
            "env_default": "*/10 * * * *",
            "task": "paperless_mail.tasks.process_mail_accounts",
            "options": {
                # 1 minute before default schedule sends again
                "expires": 9.0
                * 60.0,
            },
        },
        {
            "name": "Train the classifier",
@@ -124,6 +139,11 @@ def _parse_beat_schedule() -> Dict:
            # Default hourly at 5 minutes past the hour
            "env_default": "5 */1 * * *",
            "task": "documents.tasks.train_classifier",
            "options": {
                # 1 minute before default schedule sends again
                "expires": 59.0
                * 60.0,
            },
        },
        {
            "name": "Optimize the index",
@@ -131,6 +151,12 @@ def _parse_beat_schedule() -> Dict:
            # Default daily at midnight
            "env_default": "0 0 * * *",
            "task": "documents.tasks.index_optimize",
            "options": {
                # 1 hour before default schedule sends again
                "expires": 23.0
                * 60.0
                * 60.0,
            },
        },
        {
            "name": "Perform sanity check",
@@ -138,6 +164,12 @@ def _parse_beat_schedule() -> Dict:
            # Default Sunday at 00:30
            "env_default": "30 0 * * sun",
            "task": "documents.tasks.sanity_check",
            "options": {
                # 1 hour before default schedule sends again
                "expires": ((7.0 * 24.0) - 1.0)
                * 60.0
                * 60.0,
            },
        },
    ]
    for task in tasks:
@@ -151,9 +183,11 @@ def _parse_beat_schedule() -> Dict:
        # - five time-and-date fields
        # - separated by at least one blank
        minute, hour, day_month, month, day_week = value.split(" ")

        schedule[task["name"]] = {
            "task": task["task"],
            "schedule": crontab(minute, hour, day_week, day_month, month),
            "options": task["options"],
        }

    return schedule
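
The hunk above builds the Celery beat schedule from cron-style strings: each default (or environment override) is split into the five standard cron fields, handed to celery.schedules.crontab, and given an "expires" just under the task's default period so a run that never started is dropped rather than executed late. A minimal sketch of one entry, assuming the same field order as above; the variable names are illustrative only:

from celery.schedules import crontab

env_default = "*/10 * * * *"  # mail check: every ten minutes by default
minute, hour, day_month, month, day_week = env_default.split(" ")

entry = {
    "task": "paperless_mail.tasks.process_mail_accounts",
    # crontab() positional order: minute, hour, day_of_week, day_of_month, month_of_year
    "schedule": crontab(minute, hour, day_week, day_month, month),
    # expire one minute before the next run is queued: 9.0 * 60.0 = 540 seconds
    "options": {"expires": 9.0 * 60.0},
}

With the ten-minute default, a queued mail check that has not started within nine minutes simply expires, so overlapping runs are avoided.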

@@ -263,6 +297,10 @@ MIDDLEWARE = [
    "django.middleware.clickjacking.XFrameOptionsMiddleware",
]

# Optional to enable compression
if __get_boolean("PAPERLESS_ENABLE_COMPRESSION", "yes"):  # pragma: nocover
    MIDDLEWARE.insert(0, "compression_middleware.middleware.CompressionMiddleware")

ROOT_URLCONF = "paperless.urls"

FORCE_SCRIPT_NAME = os.getenv("PAPERLESS_FORCE_SCRIPT_NAME")

@@ -280,7 +318,6 @@ _CELERY_REDIS_URL, _CHANNELS_REDIS_URL = _parse_redis_url(
    os.getenv("PAPERLESS_REDIS", None),
)

# TODO: what is this used for?
TEMPLATES = [
    {
        "BACKEND": "django.template.backends.django.DjangoTemplates",

@@ -561,22 +598,21 @@ LOGGING = {
# Task queue #
###############################################################################

TASK_WORKERS = __get_int("PAPERLESS_TASK_WORKERS", 1)

WORKER_TIMEOUT: Final[int] = __get_int("PAPERLESS_WORKER_TIMEOUT", 1800)
# https://docs.celeryq.dev/en/stable/userguide/configuration.html

CELERY_BROKER_URL = _CELERY_REDIS_URL
CELERY_TIMEZONE = TIME_ZONE

CELERY_WORKER_HIJACK_ROOT_LOGGER = False
CELERY_WORKER_CONCURRENCY = TASK_WORKERS
CELERY_WORKER_CONCURRENCY: Final[int] = __get_int("PAPERLESS_TASK_WORKERS", 1)
TASK_WORKERS = CELERY_WORKER_CONCURRENCY
CELERY_WORKER_MAX_TASKS_PER_CHILD = 1
CELERY_WORKER_SEND_TASK_EVENTS = True

CELERY_TASK_SEND_SENT_EVENT = True
CELERY_SEND_TASK_SENT_EVENT = True

CELERY_TASK_TRACK_STARTED = True
CELERY_TASK_TIME_LIMIT = WORKER_TIMEOUT
CELERY_TASK_TIME_LIMIT: Final[int] = __get_int("PAPERLESS_WORKER_TIMEOUT", 1800)

CELERY_RESULT_EXTENDED = True
CELERY_RESULT_BACKEND = "django-db"

@@ -608,7 +644,7 @@ def default_threads_per_worker(task_workers) -> int:

THREADS_PER_WORKER = os.getenv(
    "PAPERLESS_THREADS_PER_WORKER",
    default_threads_per_worker(TASK_WORKERS),
    default_threads_per_worker(CELERY_WORKER_CONCURRENCY),
)

###############################################################################

@@ -637,7 +673,7 @@ CONSUMER_IGNORE_PATTERNS = list(
    json.loads(
        os.getenv(
            "PAPERLESS_CONSUMER_IGNORE_PATTERNS",
            '[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini"]',  # noqa: E501
            '[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini", "@eaDir/*"]',  # noqa: E501
        ),
    ),
)
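
An aside on the hunk above (illustrative, not part of the commit): PAPERLESS_CONSUMER_IGNORE_PATTERNS is parsed as a JSON array of shell-style patterns, and the change only appends "@eaDir/*" to the default list so Synology metadata directories are skipped out of the box. A minimal sketch of how an override would be read, assuming fnmatch-style matching of consumption-directory paths; the matching check here is hypothetical:

import json
import os
from fnmatch import fnmatch

# Same parsing as the setting above: the value must be a JSON list of patterns.
os.environ["PAPERLESS_CONSUMER_IGNORE_PATTERNS"] = '["desktop.ini", "@eaDir/*"]'
ignore_patterns = json.loads(os.environ["PAPERLESS_CONSUMER_IGNORE_PATTERNS"])

# Hypothetical check of an incoming path against the configured patterns.
relative_path = "@eaDir/SYNO_THUMB_M.jpg"
ignored = any(fnmatch(relative_path, pattern) for pattern in ignore_patterns)
print(ignored)  # True: the Synology thumbnail directory is not consumed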

@@ -149,6 +149,11 @@ class TestRedisSocketConversion(TestCase):


class TestCeleryScheduleParsing(TestCase):
    MAIL_EXPIRE_TIME = 9.0 * 60.0
    CLASSIFIER_EXPIRE_TIME = 59.0 * 60.0
    INDEX_EXPIRE_TIME = 23.0 * 60.0 * 60.0
    SANITY_EXPIRE_TIME = ((7.0 * 24.0) - 1.0) * 60.0 * 60.0

    def test_schedule_configuration_default(self):
        """
        GIVEN:
@@ -165,18 +170,22 @@ class TestCeleryScheduleParsing(TestCase):
                "Check all e-mail accounts": {
                    "task": "paperless_mail.tasks.process_mail_accounts",
                    "schedule": crontab(minute="*/10"),
                    "options": {"expires": self.MAIL_EXPIRE_TIME},
                },
                "Train the classifier": {
                    "task": "documents.tasks.train_classifier",
                    "schedule": crontab(minute="5", hour="*/1"),
                    "options": {"expires": self.CLASSIFIER_EXPIRE_TIME},
                },
                "Optimize the index": {
                    "task": "documents.tasks.index_optimize",
                    "schedule": crontab(minute=0, hour=0),
                    "options": {"expires": self.INDEX_EXPIRE_TIME},
                },
                "Perform sanity check": {
                    "task": "documents.tasks.sanity_check",
                    "schedule": crontab(minute=30, hour=0, day_of_week="sun"),
                    "options": {"expires": self.SANITY_EXPIRE_TIME},
                },
            },
            schedule,
@@ -203,18 +212,22 @@ class TestCeleryScheduleParsing(TestCase):
                "Check all e-mail accounts": {
                    "task": "paperless_mail.tasks.process_mail_accounts",
                    "schedule": crontab(minute="*/50", day_of_week="mon"),
                    "options": {"expires": self.MAIL_EXPIRE_TIME},
                },
                "Train the classifier": {
                    "task": "documents.tasks.train_classifier",
                    "schedule": crontab(minute="5", hour="*/1"),
                    "options": {"expires": self.CLASSIFIER_EXPIRE_TIME},
                },
                "Optimize the index": {
                    "task": "documents.tasks.index_optimize",
                    "schedule": crontab(minute=0, hour=0),
                    "options": {"expires": self.INDEX_EXPIRE_TIME},
                },
                "Perform sanity check": {
                    "task": "documents.tasks.sanity_check",
                    "schedule": crontab(minute=30, hour=0, day_of_week="sun"),
                    "options": {"expires": self.SANITY_EXPIRE_TIME},
                },
            },
            schedule,
@@ -238,14 +251,17 @@ class TestCeleryScheduleParsing(TestCase):
                "Check all e-mail accounts": {
                    "task": "paperless_mail.tasks.process_mail_accounts",
                    "schedule": crontab(minute="*/10"),
                    "options": {"expires": self.MAIL_EXPIRE_TIME},
                },
                "Train the classifier": {
                    "task": "documents.tasks.train_classifier",
                    "schedule": crontab(minute="5", hour="*/1"),
                    "options": {"expires": self.CLASSIFIER_EXPIRE_TIME},
                },
                "Perform sanity check": {
                    "task": "documents.tasks.sanity_check",
                    "schedule": crontab(minute=30, hour=0, day_of_week="sun"),
                    "options": {"expires": self.SANITY_EXPIRE_TIME},
                },
            },
            schedule,
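
A side note on the tests above (illustrative, not part of the diff): the second expected block corresponds to overriding the mail task's cron expression before the schedule is parsed. A rough sketch of that kind of setup; the environment key name PAPERLESS_EMAIL_TASK_CRON is an assumption here, since it is not visible in this excerpt:

import os
from unittest import mock

from paperless.settings import _parse_beat_schedule

# Hypothetical override of the mail-check schedule to every 50 minutes on Mondays.
with mock.patch.dict(os.environ, {"PAPERLESS_EMAIL_TASK_CRON": "*/50 * * * mon"}):
    schedule = _parse_beat_schedule()

# The "Check all e-mail accounts" entry should then carry
# crontab(minute="*/50", day_of_week="mon"), as asserted in the second test.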

@@ -14,15 +14,14 @@ TEST_CHANNEL_LAYERS = {
}


@override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS)
class TestWebSockets(TestCase):
    @override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS)
    async def test_no_auth(self):
        communicator = WebsocketCommunicator(application, "/ws/status/")
        connected, subprotocol = await communicator.connect()
        self.assertFalse(connected)
        await communicator.disconnect()

    @override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS)
    @mock.patch("paperless.consumers.StatusConsumer._authenticated")
    async def test_auth(self, _authenticated):
        _authenticated.return_value = True
@@ -33,7 +32,6 @@ class TestWebSockets(TestCase):

        await communicator.disconnect()

    @override_settings(CHANNEL_LAYERS=TEST_CHANNEL_LAYERS)
    @mock.patch("paperless.consumers.StatusConsumer._authenticated")
    async def test_receive(self, _authenticated):
        _authenticated.return_value = True

@@ -12,7 +12,7 @@ class StandardPagination(PageNumberPagination):


class FaviconView(View):
    def get(self, request, *args, **kwargs):
    def get(self, request, *args, **kwargs):  # pragma: nocover
        favicon = os.path.join(
            os.path.dirname(__file__),
            "static",

@@ -2,12 +2,13 @@ from django.contrib.auth.models import User
from documents.models import Correspondent
from documents.models import DocumentType
from documents.models import Tag
from documents.tests.utils import DirectoriesMixin
from paperless_mail.models import MailAccount
from paperless_mail.models import MailRule
from rest_framework.test import APITestCase


class TestAPIMailAccounts(APITestCase):
class TestAPIMailAccounts(DirectoriesMixin, APITestCase):
    ENDPOINT = "/api/mail_accounts/"

    def setUp(self):
@@ -165,7 +166,7 @@ class TestAPIMailAccounts(APITestCase):
        self.assertEqual(returned_account2.password, "123xyz")


class TestAPIMailRules(APITestCase):
class TestAPIMailRules(DirectoriesMixin, APITestCase):
    ENDPOINT = "/api/mail_rules/"

    def setUp(self):

@@ -67,11 +67,6 @@ class TestParserLive(TestCase):

        return result

    # Only run if convert is available
    @pytest.mark.skipif(
        "PAPERLESS_TEST_SKIP_CONVERT" in os.environ,
        reason="PAPERLESS_TEST_SKIP_CONVERT set, skipping Test",
    )
    @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
    def test_get_thumbnail(self, mock_generate_pdf: mock.MagicMock):
        """
@@ -204,11 +199,6 @@ class TestParserLive(TestCase):
        "GOTENBERG_LIVE" not in os.environ,
        reason="No gotenberg server",
    )
    # Only run if convert is available
    @pytest.mark.skipif(
        "PAPERLESS_TEST_SKIP_CONVERT" in os.environ,
        reason="PAPERLESS_TEST_SKIP_CONVERT set, skipping Test",
    )
    def test_generate_pdf_from_mail(self):
        """
        GIVEN:
@@ -301,11 +291,6 @@ class TestParserLive(TestCase):
        "GOTENBERG_LIVE" not in os.environ,
        reason="No gotenberg server",
    )
    # Only run if convert is available
    @pytest.mark.skipif(
        "PAPERLESS_TEST_SKIP_CONVERT" in os.environ,
        reason="PAPERLESS_TEST_SKIP_CONVERT set, skipping Test",
    )
    def test_generate_pdf_from_html(self):
        """
        GIVEN:

@@ -161,7 +161,7 @@ class RasterisedDocumentParser(DocumentParser):

        except Exception:
            # TODO catch all for various issues with PDFminer.six.
            # If PDFminer fails, fall back to OCR.
            # If pdftotext fails, fall back to OCR.
            self.log(
                "warning",
                "Error while getting text from PDF document with " "pdfminer.six",

@@ -364,7 +364,7 @@ class TestParser(DirectoriesMixin, TestCase):
        )
        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2"])
        self.assertFalse("page 3" in parser.get_text().lower())
        self.assertNotIn("page 3", parser.get_text().lower())

    @override_settings(OCR_PAGES=1, OCR_MODE="force")
    def test_multi_page_analog_pages_force(self):
@@ -386,8 +386,8 @@ class TestParser(DirectoriesMixin, TestCase):
        )
        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(parser.get_text().lower(), ["page 1"])
        self.assertFalse("page 2" in parser.get_text().lower())
        self.assertFalse("page 3" in parser.get_text().lower())
        self.assertNotIn("page 2", parser.get_text().lower())
        self.assertNotIn("page 3", parser.get_text().lower())

    @override_settings(OCR_MODE="skip_noarchive")
    def test_skip_noarchive_withtext(self):
@@ -660,6 +660,15 @@ class TestParser(DirectoriesMixin, TestCase):
        params = parser.construct_ocrmypdf_parameters("", "", "", "")
        self.assertNotIn("deskew", params)

        with override_settings(OCR_MAX_IMAGE_PIXELS=1_000_001.0):
            params = parser.construct_ocrmypdf_parameters("", "", "", "")
            self.assertIn("max_image_mpixels", params)
            self.assertAlmostEqual(params["max_image_mpixels"], 1, places=4)

        with override_settings(OCR_MAX_IMAGE_PIXELS=-1_000_001.0):
            params = parser.construct_ocrmypdf_parameters("", "", "", "")
            self.assertNotIn("max_image_mpixels", params)

    def test_rtl_language_detection(self):
        """
        GIVEN:

@@ -90,7 +90,7 @@ class TikaDocumentParser(DocumentParser):
        with open(document_path, "rb") as document_handle:
            files = {
                "files": (
                    file_name or os.path.basename(document_path),
                    "convert" + os.path.splitext(document_path)[-1],
                    document_handle,
                ),
            }
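
An illustrative aside (not part of the diff): the tuple above is the (filename, file object) pair that requests sends as a multipart upload, so after this change the conversion service always receives a neutral name such as "convert.odt", with the original extension kept for format detection. A rough sketch of the surrounding call, assuming Gotenberg's LibreOffice conversion route; the server address and route are assumptions for illustration:

import os

import requests

gotenberg_server = "http://localhost:3000"   # assumed address
document_path = "/tmp/Jahresabrechnung 2022.odt"

with open(document_path, "rb") as document_handle:
    files = {
        "files": (
            # neutral filename, original extension preserved for conversion
            "convert" + os.path.splitext(document_path)[-1],
            document_handle,
        ),
    }
    # assumed conversion endpoint on a Gotenberg 7 server
    response = requests.post(gotenberg_server + "/forms/libreoffice/convert", files=files)
    response.raise_for_status()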

@@ -3,7 +3,9 @@ import os
from pathlib import Path
from unittest import mock

from django.test import override_settings
from django.test import TestCase
from documents.parsers import ParseError
from paperless_tika.parsers import TikaDocumentParser
from requests import Response

@@ -54,3 +56,63 @@ class TestTikaParser(TestCase):

        self.assertTrue("Creation-Date" in [m["key"] for m in metadata])
        self.assertTrue("Some-key" in [m["key"] for m in metadata])

    @mock.patch("paperless_tika.parsers.parser.from_file")
    @mock.patch("paperless_tika.parsers.requests.post")
    def test_convert_failure(self, post, from_file):
        """
        GIVEN:
            - Document needs to be converted to PDF
        WHEN:
            - Gotenberg server returns an error
        THEN:
            - Parse error is raised
        """
        from_file.return_value = {
            "content": "the content",
            "metadata": {"Creation-Date": "2020-11-21"},
        }
        response = Response()
        response._content = b"PDF document"
        response.status_code = 500
        post.return_value = response

        file = os.path.join(self.parser.tempdir, "input.odt")
        Path(file).touch()

        with self.assertRaises(ParseError):
            self.parser.convert_to_pdf(file, None)

    @mock.patch("paperless_tika.parsers.requests.post")
    def test_request_pdf_a_format(self, post: mock.Mock):
        """
        GIVEN:
            - Document needs to be converted to PDF
        WHEN:
            - Specific PDF/A format requested
        THEN:
            - Request to Gotenberg contains the expected PDF/A format string
        """
        file = os.path.join(self.parser.tempdir, "input.odt")
        Path(file).touch()

        response = Response()
        response._content = b"PDF document"
        response.status_code = 200
        post.return_value = response

        for setting, expected_key in [
            ("pdfa", "PDF/A-2b"),
            ("pdfa-2", "PDF/A-2b"),
            ("pdfa-1", "PDF/A-1a"),
            ("pdfa-3", "PDF/A-3b"),
        ]:
            with override_settings(OCR_OUTPUT_TYPE=setting):
                self.parser.convert_to_pdf(file, None)

                post.assert_called_once()
                _, kwargs = post.call_args

                self.assertEqual(kwargs["data"]["pdfFormat"], expected_key)

                post.reset_mock()

@@ -7,7 +7,7 @@ max-line-length = 88

[tool:pytest]
DJANGO_SETTINGS_MODULE=paperless.settings
addopts = --pythonwarnings=all --cov --cov-report=html --numprocesses auto --quiet
addopts = --pythonwarnings=all --cov --cov-report=html --cov-report=xml --numprocesses auto --quiet
env =
    PAPERLESS_DISABLE_DBHANDLER=true