Merge remote-tracking branch 'paperless-ngx/dev' into dev

This commit is contained in:
Trenton Holmes
2023-08-03 10:00:14 -07:00
93 changed files with 4444 additions and 5187 deletions

View File

@@ -1,15 +1,12 @@
import logging
import shutil
import tempfile
from dataclasses import dataclass
from pathlib import Path
from subprocess import run
from typing import Dict
from typing import Final
from typing import List
from typing import Optional
import img2pdf
from django.conf import settings
from pdf2image import convert_from_path
from pdf2image.exceptions import PDFPageCountError
@@ -17,7 +14,10 @@ from pikepdf import Page
from pikepdf import Pdf
from PIL import Image
from documents.converters import convert_from_tiff_to_pdf
from documents.data_models import DocumentSource
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
logger = logging.getLogger("paperless.barcodes")
@@ -54,7 +54,7 @@ class BarcodeReader:
self.mime: Final[str] = mime_type
self.pdf_file: Path = self.file
self.barcodes: List[Barcode] = []
self.temp_dir: Optional[Path] = None
self.temp_dir: Optional[tempfile.TemporaryDirectory] = None
if settings.CONSUMER_BARCODE_TIFF_SUPPORT:
self.SUPPORTED_FILE_MIMES = {"application/pdf", "image/tiff"}
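The annotation fix above reflects what the attribute really holds: a tempfile.TemporaryDirectory object, whose .name attribute is the path, rather than a Path itself. A minimal standalone sketch of that distinction:

import tempfile
from pathlib import Path

# TemporaryDirectory is an object with cleanup(); .name holds the path string.
temp_dir = tempfile.TemporaryDirectory()
scratch = Path(temp_dir.name) / "scratch.pdf"  # build Paths from .name
scratch.touch()
temp_dir.cleanup()  # removes the directory and its contents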
@@ -154,34 +154,7 @@ class BarcodeReader:
if self.mime != "image/tiff":
return
with Image.open(self.file) as im:
has_alpha_layer = im.mode in ("RGBA", "LA")
if has_alpha_layer:
# Note the save into the temp folder, so as not to trigger a new
# consume
scratch_image = Path(self.temp_dir.name) / Path(self.file.name)
run(
[
settings.CONVERT_BINARY,
"-alpha",
"off",
self.file,
scratch_image,
],
)
else:
# Not modifying the original, safe to use in place
scratch_image = self.file
self.pdf_file = Path(self.temp_dir.name) / Path(self.file.name).with_suffix(
".pdf",
)
with scratch_image.open("rb") as img_file, self.pdf_file.open("wb") as pdf_file:
pdf_file.write(img2pdf.convert(img_file))
# Copy what file stat is possible
shutil.copystat(self.file, self.pdf_file)
self.pdf_file = convert_from_tiff_to_pdf(self.file, Path(self.temp_dir.name))
def detect(self) -> None:
"""
@@ -306,7 +279,7 @@ class BarcodeReader:
with open(savepath, "wb") as out:
dst.save(out)
shutil.copystat(self.file, savepath)
copy_basic_file_stats(self.file, savepath)
document_paths.append(savepath)
@@ -363,5 +336,5 @@ class BarcodeReader:
else:
dest = save_to_dir
logger.info(f"Saving {document_path} to {dest}")
shutil.copy2(document_path, dest)
copy_file_with_basic_stats(document_path, dest)
return True
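The substitution above recurs throughout this commit: shutil.copy2 and shutil.copystat are replaced with copy_file_with_basic_stats and copy_basic_file_stats from documents.utils. The utils module itself is not part of this diff, so the following is only a plausible minimal sketch of the assumed semantics: copy the contents, then carry over just the basic timestamps, since a full copystat can fail on some filesystems.

import shutil
from os import utime
from pathlib import Path

# Hypothetical sketch; the real documents.utils implementation may differ.
def copy_basic_file_stats(source: Path, dest: Path) -> None:
    # Only atime/mtime, skipping the permission bits and xattrs that
    # shutil.copystat would also attempt.
    stats = source.stat()
    utime(dest, ns=(stats.st_atime_ns, stats.st_mtime_ns))

def copy_file_with_basic_stats(source: Path, dest: Path) -> None:
    shutil.copy(source, dest)  # contents only, unlike shutil.copy2
    copy_basic_file_stats(source, dest)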

View File

@@ -5,6 +5,7 @@ import re
import warnings
from datetime import datetime
from hashlib import sha256
from pathlib import Path
from typing import Iterator
from typing import List
from typing import Optional
@@ -81,7 +82,7 @@ class DocumentClassifier:
self._stemmer = None
self._stop_words = None
def load(self):
def load(self) -> None:
# Catch warnings for processing
with warnings.catch_warnings(record=True) as w:
with open(settings.MODEL_FILE, "rb") as f:
@@ -120,19 +121,20 @@ class DocumentClassifier:
raise IncompatibleClassifierVersionError
def save(self):
target_file = settings.MODEL_FILE
target_file_temp = settings.MODEL_FILE.with_suffix(".pickle.part")
target_file: Path = settings.MODEL_FILE
target_file_temp = target_file.with_suffix(".pickle.part")
with open(target_file_temp, "wb") as f:
pickle.dump(self.FORMAT_VERSION, f)
pickle.dump(self.last_doc_change_time, f)
pickle.dump(self.last_auto_type_hash, f)
pickle.dump(self.data_vectorizer, f)
pickle.dump(self.tags_binarizer, f)
pickle.dump(self.tags_classifier, f)
pickle.dump(self.correspondent_classifier, f)
pickle.dump(self.document_type_classifier, f)
pickle.dump(self.storage_path_classifier, f)
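save() emits a fixed sequence of pickles, so load() has to call pickle.load the same number of times in exactly the same order, starting with FORMAT_VERSION so an incompatible model file can be rejected before anything else is unpickled. A sketch of the matching read side, assuming the field order above:

import pickle

# Sketch only: mirrors the dump order in save(); the real load() assigns
# these to attributes and raises IncompatibleClassifierVersionError.
def load_sketch(path, expected_version: int) -> dict:
    with open(path, "rb") as f:
        if pickle.load(f) != expected_version:
            raise ValueError("incompatible classifier format version")
        fields = [
            "last_doc_change_time",
            "last_auto_type_hash",
            "data_vectorizer",
            "tags_binarizer",
            "tags_classifier",
            "correspondent_classifier",
            "document_type_classifier",
            "storage_path_classifier",
        ]
        return {name: pickle.load(f) for name in fields}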
@@ -247,7 +249,7 @@ class DocumentClassifier:
data_vectorized = self.data_vectorizer.fit_transform(content_generator())
# See the notes here:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html # noqa: 501
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html # noqa: E501
# This attribute isn't needed to function and can be large
self.data_vectorizer.stop_words_ = None
@@ -380,7 +382,7 @@ class DocumentClassifier:
return content
def predict_correspondent(self, content: str):
def predict_correspondent(self, content: str) -> Optional[int]:
if self.correspondent_classifier:
X = self.data_vectorizer.transform([self.preprocess_content(content)])
correspondent_id = self.correspondent_classifier.predict(X)
@@ -391,7 +393,7 @@ class DocumentClassifier:
else:
return None
def predict_document_type(self, content: str):
def predict_document_type(self, content: str) -> Optional[int]:
if self.document_type_classifier:
X = self.data_vectorizer.transform([self.preprocess_content(content)])
document_type_id = self.document_type_classifier.predict(X)
@@ -402,7 +404,7 @@ class DocumentClassifier:
else:
return None
def predict_tags(self, content: str):
def predict_tags(self, content: str) -> List[int]:
from sklearn.utils.multiclass import type_of_target
if self.tags_classifier:
@@ -423,7 +425,7 @@ class DocumentClassifier:
else:
return []
def predict_storage_path(self, content: str):
def predict_storage_path(self, content: str) -> Optional[int]:
if self.storage_path_classifier:
X = self.data_vectorizer.transform([self.preprocess_content(content)])
storage_path_id = self.storage_path_classifier.predict(X)

View File

@@ -1,9 +1,9 @@
import datetime
import hashlib
import os
import shutil
import tempfile
import uuid
from enum import Enum
from pathlib import Path
from subprocess import CompletedProcess
from subprocess import run
@@ -21,6 +21,9 @@ from django.utils import timezone
from filelock import FileLock
from rest_framework.reverse import reverse
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
from .classifier import load_classifier
from .file_handling import create_source_path_directory
from .file_handling import generate_unique_filename
@@ -42,21 +45,30 @@ class ConsumerError(Exception):
pass
MESSAGE_DOCUMENT_ALREADY_EXISTS = "document_already_exists"
MESSAGE_ASN_ALREADY_EXISTS = "asn_already_exists"
MESSAGE_ASN_RANGE = "asn_value_out_of_range"
MESSAGE_FILE_NOT_FOUND = "file_not_found"
MESSAGE_PRE_CONSUME_SCRIPT_NOT_FOUND = "pre_consume_script_not_found"
MESSAGE_PRE_CONSUME_SCRIPT_ERROR = "pre_consume_script_error"
MESSAGE_POST_CONSUME_SCRIPT_NOT_FOUND = "post_consume_script_not_found"
MESSAGE_POST_CONSUME_SCRIPT_ERROR = "post_consume_script_error"
MESSAGE_NEW_FILE = "new_file"
MESSAGE_UNSUPPORTED_TYPE = "unsupported_type"
MESSAGE_PARSING_DOCUMENT = "parsing_document"
MESSAGE_GENERATING_THUMBNAIL = "generating_thumbnail"
MESSAGE_PARSE_DATE = "parse_date"
MESSAGE_SAVE_DOCUMENT = "save_document"
MESSAGE_FINISHED = "finished"
class ConsumerStatusShortMessage(str, Enum):
DOCUMENT_ALREADY_EXISTS = "document_already_exists"
ASN_ALREADY_EXISTS = "asn_already_exists"
ASN_RANGE = "asn_value_out_of_range"
FILE_NOT_FOUND = "file_not_found"
PRE_CONSUME_SCRIPT_NOT_FOUND = "pre_consume_script_not_found"
PRE_CONSUME_SCRIPT_ERROR = "pre_consume_script_error"
POST_CONSUME_SCRIPT_NOT_FOUND = "post_consume_script_not_found"
POST_CONSUME_SCRIPT_ERROR = "post_consume_script_error"
NEW_FILE = "new_file"
UNSUPPORTED_TYPE = "unsupported_type"
PARSING_DOCUMENT = "parsing_document"
GENERATING_THUMBNAIL = "generating_thumbnail"
PARSE_DATE = "parse_date"
SAVE_DOCUMENT = "save_document"
FINISHED = "finished"
FAILED = "failed"
class ConsumerFilePhase(str, Enum):
STARTED = "STARTED"
WORKING = "WORKING"
SUCCESS = "SUCCESS"
FAILED = "FAILED"
class Consumer(LoggingMixin):
@@ -64,10 +76,10 @@ class Consumer(LoggingMixin):
def _send_progress(
self,
current_progress,
max_progress,
status,
message=None,
current_progress: int,
max_progress: int,
status: ConsumerFilePhase,
message: Optional[ConsumerStatusShortMessage] = None,
document_id=None,
): # pragma: no cover
payload = {
@@ -86,12 +98,12 @@ class Consumer(LoggingMixin):
def _fail(
self,
message,
log_message=None,
message: ConsumerStatusShortMessage,
log_message: Optional[str] = None,
exc_info=None,
exception: Optional[Exception] = None,
):
self._send_progress(100, 100, "FAILED", message)
self._send_progress(100, 100, ConsumerFilePhase.FAILED, message)
self.log.error(log_message or message, exc_info=exc_info)
raise ConsumerError(f"{self.filename}: {log_message or message}") from exception
@@ -111,13 +123,19 @@ class Consumer(LoggingMixin):
self.channel_layer = get_channel_layer()
def pre_check_file_exists(self):
"""
Confirm the input file still exists where it should
"""
if not os.path.isfile(self.path):
self._fail(
MESSAGE_FILE_NOT_FOUND,
ConsumerStatusShortMessage.FILE_NOT_FOUND,
f"Cannot consume {self.path}: File not found.",
)
def pre_check_duplicate(self):
"""
Using the MD5 of the file, check this exact file doesn't already exist
"""
with open(self.path, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
existing_doc = Document.objects.filter(
@@ -127,12 +145,15 @@ class Consumer(LoggingMixin):
if settings.CONSUMER_DELETE_DUPLICATES:
os.unlink(self.path)
self._fail(
MESSAGE_DOCUMENT_ALREADY_EXISTS,
ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS,
f"Not consuming {self.filename}: It is a duplicate of"
f" {existing_doc.get().title} (#{existing_doc.get().pk})",
)
def pre_check_directories(self):
"""
Ensure all required directories exist before attempting to use them
"""
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
@@ -152,7 +173,7 @@ class Consumer(LoggingMixin):
or self.override_asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
):
self._fail(
MESSAGE_ASN_RANGE,
ConsumerStatusShortMessage.ASN_RANGE,
f"Not consuming {self.filename}: "
f"Given ASN {self.override_asn} is out of range "
f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
@@ -160,17 +181,21 @@ class Consumer(LoggingMixin):
)
if Document.objects.filter(archive_serial_number=self.override_asn).exists():
self._fail(
MESSAGE_ASN_ALREADY_EXISTS,
ConsumerStatusShortMessage.ASN_ALREADY_EXISTS,
f"Not consuming {self.filename}: Given ASN already exists!",
)
def run_pre_consume_script(self):
"""
If one is configured and exists, run the pre-consume script and
handle its output and/or errors
"""
if not settings.PRE_CONSUME_SCRIPT:
return
if not os.path.isfile(settings.PRE_CONSUME_SCRIPT):
self._fail(
MESSAGE_PRE_CONSUME_SCRIPT_NOT_FOUND,
ConsumerStatusShortMessage.PRE_CONSUME_SCRIPT_NOT_FOUND,
f"Configured pre-consume script "
f"{settings.PRE_CONSUME_SCRIPT} does not exist.",
)
@@ -201,19 +226,23 @@ class Consumer(LoggingMixin):
except Exception as e:
self._fail(
MESSAGE_PRE_CONSUME_SCRIPT_ERROR,
ConsumerStatusShortMessage.PRE_CONSUME_SCRIPT_ERROR,
f"Error while executing pre-consume script: {e}",
exc_info=True,
exception=e,
)
def run_post_consume_script(self, document: Document):
"""
If one is configured and exists, run the post-consume script and
handle its output and/or errors
"""
if not settings.POST_CONSUME_SCRIPT:
return
if not os.path.isfile(settings.POST_CONSUME_SCRIPT):
self._fail(
MESSAGE_POST_CONSUME_SCRIPT_NOT_FOUND,
ConsumerStatusShortMessage.POST_CONSUME_SCRIPT_NOT_FOUND,
f"Configured post-consume script "
f"{settings.POST_CONSUME_SCRIPT} does not exist.",
)
@@ -274,7 +303,7 @@ class Consumer(LoggingMixin):
except Exception as e:
self._fail(
MESSAGE_POST_CONSUME_SCRIPT_ERROR,
ConsumerStatusShortMessage.POST_CONSUME_SCRIPT_ERROR,
f"Error while executing post-consume script: {e}",
exc_info=True,
exception=e,
@@ -308,7 +337,12 @@ class Consumer(LoggingMixin):
self.override_asn = override_asn
self.override_owner_id = override_owner_id
self._send_progress(0, 100, "STARTING", MESSAGE_NEW_FILE)
self._send_progress(
0,
100,
ConsumerFilePhase.STARTED,
ConsumerStatusShortMessage.NEW_FILE,
)
# Make sure that preconditions for consuming the file are met.
@@ -326,7 +360,7 @@ class Consumer(LoggingMixin):
dir=settings.SCRATCH_DIR,
)
self.path = Path(tempdir.name) / Path(self.filename)
shutil.copy2(self.original_path, self.path)
copy_file_with_basic_stats(self.original_path, self.path)
# Determine the parser class.
@@ -340,7 +374,10 @@ class Consumer(LoggingMixin):
)
if not parser_class:
tempdir.cleanup()
self._fail(MESSAGE_UNSUPPORTED_TYPE, f"Unsupported mime type {mime_type}")
self._fail(
ConsumerStatusShortMessage.UNSUPPORTED_TYPE,
f"Unsupported mime type {mime_type}",
)
# Notify all listeners that we're going to do some work.
@@ -355,7 +392,7 @@ class Consumer(LoggingMixin):
def progress_callback(current_progress, max_progress): # pragma: no cover
# recalculate progress to be within 20 and 80
p = int((current_progress / max_progress) * 50 + 20)
self._send_progress(p, 100, "WORKING")
self._send_progress(p, 100, ConsumerFilePhase.WORKING)
# This doesn't parse the document yet, but gives us a parser.
@@ -377,12 +414,22 @@ class Consumer(LoggingMixin):
archive_path = None
try:
self._send_progress(20, 100, "WORKING", MESSAGE_PARSING_DOCUMENT)
self._send_progress(
20,
100,
ConsumerFilePhase.WORKING,
ConsumerStatusShortMessage.PARSING_DOCUMENT,
)
self.log.debug(f"Parsing {self.filename}...")
document_parser.parse(self.path, mime_type, self.filename)
self.log.debug(f"Generating thumbnail for {self.filename}...")
self._send_progress(70, 100, "WORKING", MESSAGE_GENERATING_THUMBNAIL)
self._send_progress(
70,
100,
ConsumerFilePhase.WORKING,
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
)
thumbnail = document_parser.get_thumbnail(
self.path,
mime_type,
@@ -392,7 +439,12 @@ class Consumer(LoggingMixin):
text = document_parser.get_text()
date = document_parser.get_date()
if date is None:
self._send_progress(90, 100, "WORKING", MESSAGE_PARSE_DATE)
self._send_progress(
90,
100,
ConsumerFilePhase.WORKING,
ConsumerStatusShortMessage.PARSE_DATE,
)
date = parse_date(self.filename, text)
archive_path = document_parser.get_archive_path()
@@ -414,7 +466,12 @@ class Consumer(LoggingMixin):
classifier = load_classifier()
self._send_progress(95, 100, "WORKING", MESSAGE_SAVE_DOCUMENT)
self._send_progress(
95,
100,
ConsumerFilePhase.WORKING,
ConsumerStatusShortMessage.SAVE_DOCUMENT,
)
# now that everything is done, we can start to store the document
# in the system. This will be a transaction and reasonably fast.
try:
@@ -499,7 +556,13 @@ class Consumer(LoggingMixin):
self.log.info(f"Document {document} consumption finished")
self._send_progress(100, 100, "SUCCESS", MESSAGE_FINISHED, document.id)
self._send_progress(
100,
100,
ConsumerFilePhase.SUCCESS,
ConsumerStatusShortMessage.FINISHED,
document.id,
)
# Return the most up to date fields
document.refresh_from_db()
@@ -585,7 +648,7 @@ class Consumer(LoggingMixin):
# Attempt to copy file's original stats, but it's ok if we can't
try:
shutil.copystat(source, target)
copy_basic_file_stats(source, target)
except Exception: # pragma: no cover
pass

View File

@@ -0,0 +1,46 @@
from pathlib import Path
from subprocess import run
import img2pdf
from django.conf import settings
from PIL import Image
from documents.utils import copy_basic_file_stats
def convert_from_tiff_to_pdf(tiff_path: Path, target_directory: Path) -> Path:
"""
Converts a TIFF file into a PDF file.
The PDF will be created in the given target_directory and share the name of
the original TIFF file, as well as its stats (mtime etc.).
Returns the path of the PDF created.
"""
with Image.open(tiff_path) as im:
has_alpha_layer = im.mode in ("RGBA", "LA")
if has_alpha_layer:
# Note the save into the temp folder, so as not to trigger a new
# consume
scratch_image = target_directory / tiff_path.name
run(
[
settings.CONVERT_BINARY,
"-alpha",
"off",
tiff_path,
scratch_image,
],
)
else:
# Not modifying the original, safe to use in place
scratch_image = tiff_path
pdf_path = (target_directory / tiff_path.name).with_suffix(".pdf")
with scratch_image.open("rb") as img_file, pdf_path.open("wb") as pdf_file:
pdf_file.write(img2pdf.convert(img_file))
# Copy what file stat is possible
copy_basic_file_stats(tiff_path, pdf_path)
return pdf_path
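A usage sketch of the extracted helper, assuming a hypothetical scan.tiff and a scratch directory the consumer does not watch (so the intermediate files cannot trigger a new consume):

import tempfile
from pathlib import Path

from documents.converters import convert_from_tiff_to_pdf

with tempfile.TemporaryDirectory() as scratch:
    pdf = convert_from_tiff_to_pdf(Path("scan.tiff"), Path(scratch))
    print(pdf)  # <scratch>/scan.pdf, carrying the TIFF's basic stats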

View File

@@ -0,0 +1,131 @@
import datetime as dt
import logging
import os
import shutil
from pathlib import Path
from django.conf import settings
from pikepdf import Pdf
from documents.consumer import ConsumerError
from documents.converters import convert_from_tiff_to_pdf
from documents.data_models import ConsumableDocument
logger = logging.getLogger("paperless.double_sided")
# Hardcoded for now, could be made a configurable setting if needed
TIMEOUT_MINUTES = 30
# Used by test cases
STAGING_FILE_NAME = "double-sided-staging.pdf"
def collate(input_doc: ConsumableDocument) -> str:
"""
Tries to collate pages from two single-sided scans of a double-sided
document.
When called with a file, it checks whether a staging file already
exists; if not, the current file becomes that staging file, holding the
odd numbered pages.
If a staging file exists and is not too old, the current file is taken
to be the second part (the even numbered pages) and the pages of both
are collated. The pages of the second file are inserted in reverse
order, since the ADF will have scanned them from bottom to top.
Returns a status message on success, or raises a ConsumerError in case
of failure.
"""
# Make sure scratch dir exists, Consumer might not have run yet
settings.SCRATCH_DIR.mkdir(exist_ok=True)
if input_doc.mime_type == "application/pdf":
pdf_file = input_doc.original_file
elif (
input_doc.mime_type == "image/tiff"
and settings.CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT
):
pdf_file = convert_from_tiff_to_pdf(
input_doc.original_file,
settings.SCRATCH_DIR,
)
input_doc.original_file.unlink()
else:
raise ConsumerError("Unsupported file type for collation of double-sided scans")
staging = settings.SCRATCH_DIR / STAGING_FILE_NAME
valid_staging_exists = False
if staging.exists():
stats = os.stat(str(staging))
# if the file is older than the timeout, we don't consider
# it valid
if dt.datetime.now().timestamp() - stats.st_mtime > TIMEOUT_MINUTES * 60:
logger.warning("Outdated double sided staging file exists, deleting it")
os.unlink(str(staging))
else:
valid_staging_exists = True
if valid_staging_exists:
try:
# Collate pages from second PDF in reverse order
with Pdf.open(staging) as pdf1, Pdf.open(pdf_file) as pdf2:
pdf2.pages.reverse()
try:
for i, page in enumerate(pdf2.pages):
pdf1.pages.insert(2 * i + 1, page)
except IndexError:
raise ConsumerError(
"This second file (even numbered pages) contains more "
"pages than the first/odd numbered one. This means the "
"two uploaded files don't belong to the same double-"
"sided scan. Please retry, starting with the odd "
"numbered pages again.",
)
# Merged file has the same path, but without the
# double-sided subdir. Therefore, it is also in the
# consumption dir and will be picked up for processing
old_file = input_doc.original_file
new_file = Path(
*(
part
for part in old_file.with_name(
f"{old_file.stem}-collated.pdf",
).parts
if part != settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
),
)
# If the user didn't create the subdirs yet, do it for them
new_file.parent.mkdir(parents=True, exist_ok=True)
pdf1.save(new_file)
logger.info("Collated documents into new file %s", new_file)
return (
"Success. Even numbered pages of double sided scan collated "
"with odd pages"
)
finally:
# Delete staging and recently uploaded file no matter what.
# If any error occurs, the user needs to be able to restart
# the process from scratch; after all, the staging file
# with the odd numbered pages might be the culprit
pdf_file.unlink()
staging.unlink()
else:
# In Python 3.9 move supports Path objects directly,
# but for now we have to be compatible with 3.8
shutil.move(str(pdf_file), str(staging))
# set access and modification times to now, so we can tell whether the
# staging file is outdated when another file gets uploaded
os.utime(str(staging), (dt.datetime.now().timestamp(),) * 2)
logger.info(
"Got scan with odd numbered pages of double-sided scan, moved it to %s",
staging,
)
return (
"Received odd numbered pages of double sided scan, waiting up to "
f"{TIMEOUT_MINUTES} minutes for even numbered pages"
)
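The collation itself is plain index arithmetic: once the second (bottom-to-top) scan is reversed, inserting its page i at position 2 * i + 1 alternates the two sequences. The same logic on plain lists, with hypothetical page labels:

odd_pages = ["p1", "p3", "p5"]   # first upload, as scanned
even_pages = ["p6", "p4", "p2"]  # second upload, ADF scans bottom to top

even_pages.reverse()  # p2, p4, p6
for i, page in enumerate(even_pages):
    odd_pages.insert(2 * i + 1, page)

assert odd_pages == ["p1", "p2", "p3", "p4", "p5", "p6"]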

View File

@@ -218,6 +218,7 @@ def generate_filename(
tag_list=tag_list,
owner_username=owner_username_str,
original_name=original_name,
doc_pk=f"{doc.pk:07}",
).strip()
if settings.FILENAME_FORMAT_REMOVE_NONE:
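The new doc_pk placeholder is the document's primary key zero-padded to seven digits via Python's format spec:

# The :07 spec left-pads with zeros to a minimum width of 7.
assert f"{1:07}" == "0000001"
assert f"{13579:07}" == "0013579"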

View File

@@ -11,13 +11,17 @@ from typing import Set
import tqdm
from django.conf import settings
from django.contrib.auth.models import Group
from django.contrib.auth.models import Permission
from django.contrib.auth.models import User
from django.contrib.contenttypes.models import ContentType
from django.core import serializers
from django.core.management.base import BaseCommand
from django.core.management.base import CommandError
from django.db import transaction
from django.utils import timezone
from filelock import FileLock
from guardian.models import GroupObjectPermission
from guardian.models import UserObjectPermission
from documents.file_handling import delete_empty_directories
from documents.file_handling import generate_filename
@@ -33,6 +37,7 @@ from documents.models import UiSettings
from documents.settings import EXPORTER_ARCHIVE_NAME
from documents.settings import EXPORTER_FILE_NAME
from documents.settings import EXPORTER_THUMBNAIL_NAME
from documents.utils import copy_file_with_basic_stats
from paperless import version
from paperless.db import GnuPG
from paperless_mail.models import MailAccount
@@ -261,6 +266,22 @@ class Command(BaseCommand):
serializers.serialize("json", UiSettings.objects.all()),
)
manifest += json.loads(
serializers.serialize("json", ContentType.objects.all()),
)
manifest += json.loads(
serializers.serialize("json", Permission.objects.all()),
)
manifest += json.loads(
serializers.serialize("json", UserObjectPermission.objects.all()),
)
manifest += json.loads(
serializers.serialize("json", GroupObjectPermission.objects.all()),
)
# 3. Export files from each document
for index, document_dict in tqdm.tqdm(
enumerate(document_manifest),
@@ -417,4 +438,4 @@ class Command(BaseCommand):
if perform_copy:
target.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(source, target)
copy_file_with_basic_stats(source, target)

View File

@@ -1,17 +1,20 @@
import json
import logging
import os
import shutil
from contextlib import contextmanager
from pathlib import Path
import tqdm
from django.conf import settings
from django.contrib.auth.models import Permission
from django.contrib.contenttypes.models import ContentType
from django.core.exceptions import FieldDoesNotExist
from django.core.management import call_command
from django.core.management.base import BaseCommand
from django.core.management.base import CommandError
from django.core.serializers.base import DeserializationError
from django.db import IntegrityError
from django.db import transaction
from django.db.models.signals import m2m_changed
from django.db.models.signals import post_save
from filelock import FileLock
@@ -23,6 +26,7 @@ from documents.settings import EXPORTER_ARCHIVE_NAME
from documents.settings import EXPORTER_FILE_NAME
from documents.settings import EXPORTER_THUMBNAIL_NAME
from documents.signals.handlers import update_filename_and_move_files
from documents.utils import copy_file_with_basic_stats
from paperless import version
@@ -116,9 +120,13 @@ class Command(BaseCommand):
):
# Fill up the database with whatever is in the manifest
try:
for manifest_path in manifest_paths:
call_command("loaddata", manifest_path)
except (FieldDoesNotExist, DeserializationError) as e:
with transaction.atomic():
for manifest_path in manifest_paths:
# delete these since pk can change, re-created from import
ContentType.objects.all().delete()
Permission.objects.all().delete()
call_command("loaddata", manifest_path)
except (FieldDoesNotExist, DeserializationError, IntegrityError) as e:
self.stdout.write(self.style.ERROR("Database import failed"))
if (
self.version is not None
@@ -238,7 +246,7 @@ class Command(BaseCommand):
create_source_path_directory(document.source_path)
shutil.copy2(document_path, document.source_path)
copy_file_with_basic_stats(document_path, document.source_path)
if thumbnail_path:
if thumbnail_path.suffix in {".png", ".PNG"}:
@@ -253,13 +261,16 @@ class Command(BaseCommand):
output_file=str(document.thumbnail_path),
)
else:
shutil.copy2(thumbnail_path, document.thumbnail_path)
copy_file_with_basic_stats(
thumbnail_path,
document.thumbnail_path,
)
if archive_path:
create_source_path_directory(document.archive_path)
# TODO: this assumes that the export is valid and
# archive_filename is present on all documents with
# archived files
shutil.copy2(archive_path, document.archive_path)
copy_file_with_basic_stats(archive_path, document.archive_path)
document.save()

View File

@@ -1,7 +1,9 @@
import logging
import re
from documents.classifier import DocumentClassifier
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import MatchingModel
from documents.models import StoragePath
@@ -11,7 +13,7 @@ from documents.permissions import get_objects_for_user_owner_aware
logger = logging.getLogger("paperless.matching")
def log_reason(matching_model, document, reason):
def log_reason(matching_model: MatchingModel, document: Document, reason: str):
class_name = type(matching_model).__name__
logger.debug(
f"{class_name} {matching_model.name} matched on document "
@@ -19,7 +21,7 @@ def log_reason(matching_model, document, reason):
)
def match_correspondents(document, classifier, user=None):
def match_correspondents(document: Document, classifier: DocumentClassifier, user=None):
pred_id = classifier.predict_correspondent(document.content) if classifier else None
if user is None and document.owner is not None:
@@ -35,11 +37,15 @@ def match_correspondents(document, classifier, user=None):
correspondents = Correspondent.objects.all()
return list(
filter(lambda o: matches(o, document) or o.pk == pred_id, correspondents),
filter(
lambda o: matches(o, document)
or (o.pk == pred_id and o.matching_algorithm == MatchingModel.MATCH_AUTO),
correspondents,
),
)
def match_document_types(document, classifier, user=None):
def match_document_types(document: Document, classifier: DocumentClassifier, user=None):
pred_id = classifier.predict_document_type(document.content) if classifier else None
if user is None and document.owner is not None:
@@ -55,11 +61,15 @@ def match_document_types(document, classifier, user=None):
document_types = DocumentType.objects.all()
return list(
filter(lambda o: matches(o, document) or o.pk == pred_id, document_types),
filter(
lambda o: matches(o, document)
or (o.pk == pred_id and o.matching_algorithm == MatchingModel.MATCH_AUTO),
document_types,
),
)
def match_tags(document, classifier, user=None):
def match_tags(document: Document, classifier: DocumentClassifier, user=None):
predicted_tag_ids = classifier.predict_tags(document.content) if classifier else []
if user is None and document.owner is not None:
@@ -71,11 +81,18 @@ def match_tags(document, classifier, user=None):
tags = Tag.objects.all()
return list(
filter(lambda o: matches(o, document) or o.pk in predicted_tag_ids, tags),
filter(
lambda o: matches(o, document)
or (
o.matching_algorithm == MatchingModel.MATCH_AUTO
and o.pk in predicted_tag_ids
),
tags,
),
)
def match_storage_paths(document, classifier, user=None):
def match_storage_paths(document: Document, classifier: DocumentClassifier, user=None):
pred_id = classifier.predict_storage_path(document.content) if classifier else None
if user is None and document.owner is not None:
@@ -92,13 +109,14 @@ def match_storage_paths(document, classifier, user=None):
return list(
filter(
lambda o: matches(o, document) or o.pk == pred_id,
lambda o: matches(o, document)
or (o.pk == pred_id and o.matching_algorithm == MatchingModel.MATCH_AUTO),
storage_paths,
),
)
def matches(matching_model, document):
def matches(matching_model: MatchingModel, document: Document):
search_kwargs = {}
document_content = document.content
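All four matchers now share the same guard: a classifier prediction only selects an object when that object is configured for automatic matching, so switching a correspondent, document type, tag, or storage path away from Auto stops a previously trained model from still assigning it. A self-contained sketch of the predicate (MATCH_AUTO below is a stand-in for MatchingModel.MATCH_AUTO):

from dataclasses import dataclass

MATCH_AUTO = 6  # stand-in value for MatchingModel.MATCH_AUTO

@dataclass
class Candidate:
    pk: int
    matching_algorithm: int

def selected(obj: Candidate, rule_hit: bool, pred_id: int) -> bool:
    # Rule hits always count; predictions count only for Auto objects.
    return rule_hit or (obj.pk == pred_id and obj.matching_algorithm == MATCH_AUTO)

assert not selected(Candidate(pk=3, matching_algorithm=1), False, pred_id=3)
assert selected(Candidate(pk=3, matching_algorithm=MATCH_AUTO), False, pred_id=3)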

View File

@@ -0,0 +1,162 @@
# Generated by Django 4.1.9 on 2023-06-29 19:29
import logging
import multiprocessing.pool
import shutil
import tempfile
import time
from pathlib import Path
import gnupg
from django.conf import settings
from django.db import migrations
from documents.parsers import run_convert
logger = logging.getLogger("paperless.migrations")
def _do_convert(work_package):
(
existing_encrypted_thumbnail,
converted_encrypted_thumbnail,
passphrase,
) = work_package
try:
gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)
logger.info(f"Decrypting thumbnail: {existing_encrypted_thumbnail}")
# Decrypt png
decrypted_thumbnail = existing_encrypted_thumbnail.with_suffix("").resolve()
with open(existing_encrypted_thumbnail, "rb") as existing_encrypted_file:
raw_thumb = gpg.decrypt_file(
existing_encrypted_file,
passphrase=passphrase,
always_trust=True,
).data
with open(decrypted_thumbnail, "wb") as decrypted_file:
decrypted_file.write(raw_thumb)
converted_decrypted_thumbnail = Path(
str(converted_encrypted_thumbnail).replace("webp.gpg", "webp"),
).resolve()
logger.info(f"Converting decrypted thumbnail: {decrypted_thumbnail}")
# Convert to webp
run_convert(
density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file=f"{decrypted_thumbnail}[0]",
output_file=str(converted_decrypted_thumbnail),
)
logger.info(
f"Encrypting converted thumbnail: {converted_decrypted_thumbnail}",
)
# Encrypt webp
with open(converted_decrypted_thumbnail, "rb") as converted_decrypted_file:
encrypted = gpg.encrypt_file(
fileobj_or_path=converted_decrypted_file,
recipients=None,
passphrase=passphrase,
symmetric=True,
always_trust=True,
).data
with open(converted_encrypted_thumbnail, "wb") as converted_encrypted_file:
converted_encrypted_file.write(encrypted)
# Copy newly created thumbnail to thumbnail directory
shutil.copy(converted_encrypted_thumbnail, existing_encrypted_thumbnail.parent)
# Remove the existing encrypted PNG version
existing_encrypted_thumbnail.unlink()
# Remove the decrypted PNG version
decrypted_thumbnail.unlink()
# Remove the decrypted WebP version
converted_decrypted_thumbnail.unlink()
logger.info(
"Conversion to WebP completed, "
f"replaced {existing_encrypted_thumbnail.name} with {converted_encrypted_thumbnail.name}",
)
except Exception as e:
logger.error(f"Error converting thumbnail (existing file unchanged): {e}")
def _convert_encrypted_thumbnails_to_webp(apps, schema_editor):
start = time.time()
with tempfile.TemporaryDirectory() as tempdir:
work_packages = []
if len(list(Path(settings.THUMBNAIL_DIR).glob("*.png.gpg"))) > 0:
passphrase = settings.PASSPHRASE
if not passphrase:
raise Exception(
"Passphrase not defined, encrypted thumbnails cannot be migrated"
"without this",
)
for file in Path(settings.THUMBNAIL_DIR).glob("*.png.gpg"):
existing_thumbnail = file.resolve()
# Change the existing filename suffix from png to webp
converted_thumbnail_name = Path(
str(existing_thumbnail).replace(".png.gpg", ".webp.gpg"),
).name
# Create the expected output filename in the tempdir
converted_thumbnail = (
Path(tempdir) / Path(converted_thumbnail_name)
).resolve()
# Package up the necessary info
work_packages.append(
(existing_thumbnail, converted_thumbnail, passphrase),
)
if len(work_packages):
logger.info(
"\n\n"
" This is a one-time only migration to convert thumbnails for all of your\n"
" *encrypted* documents into WebP format. If you have a lot of encrypted documents, \n"
" this may take a while, so a coffee break may be in order."
"\n",
)
with multiprocessing.pool.Pool(
processes=min(multiprocessing.cpu_count(), 4),
maxtasksperchild=4,
) as pool:
pool.map(_do_convert, work_packages)
end = time.time()
duration = end - start
logger.info(f"Conversion completed in {duration:.3f}s")
class Migration(migrations.Migration):
dependencies = [
("documents", "1036_alter_savedviewfilterrule_rule_type"),
]
operations = [
migrations.RunPython(
code=_convert_encrypted_thumbnails_to_webp,
reverse_code=migrations.RunPython.noop,
),
]

View File

@@ -18,6 +18,7 @@ from django.utils import timezone
from documents.loggers import LoggingMixin
from documents.signals import document_consumer_declaration
from documents.utils import copy_file_with_basic_stats
# This regular expression will try to find dates in the document at
# hand and will match the following formats:
@@ -31,16 +32,18 @@ from documents.signals import document_consumer_declaration
# - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
# - XX MON ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits. MONTH is 3 letters
# - XXPP MONTH ZZZZ with XX being 1 or 2 and PP being 2 letters and ZZZZ being 4 digits
# TODO: isn't there a date parsing library for this?
DATE_REGEX = re.compile(
r"(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|" # noqa: E501
r"(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|" # noqa: E501
r"(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|" # noqa: E501
r"(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[a-zA-Z]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|" # noqa: E501
r"(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|"
r"(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))|"
r"(\b|(?!=([_-])))(\b[0-9]{1,2}[ \.\/-][A-Z]{3}[ \.\/-][0-9]{4})(\b|(?=([_-])))", # noqa: E501
r"(\b|(?!=([_-])))([0-9]{1,2}[^ ]{2}[\. ]+[^ ]{3,9}[ \.\/-][0-9]{4})(\b|(?=([_-])))|" # noqa: E501
r"(\b|(?!=([_-])))(\b[0-9]{1,2}[ \.\/-][a-zA-Z]{3}[ \.\/-][0-9]{4})(\b|(?=([_-])))", # noqa: E501
)
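The two added alternatives teach the regex ordinal day suffixes ("21st", "25TH") and lowercase month abbreviations ("21-mar-2022"), which the previous [A-Z]{3} branch could not match. A reduced, self-contained sketch of just the ordinal branch, reusing the diff's [^ ]{2} suffix idea without the surrounding boundary groups:

import re

ordinal_date = re.compile(r"\b[0-9]{1,2}[^ ]{2}[\. ]+[^ ]{3,9}[ \.\/-][0-9]{4}\b")

assert ordinal_date.search("Currency 21st MAR 2022 Credit Card")
assert ordinal_date.search("Currency 25TH MAR 2022 Credit Card")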
@@ -206,7 +209,7 @@ def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -
# so we need to copy it before it gets moved.
# https://github.com/paperless-ngx/paperless-ngx/issues/3631
default_thumbnail_path = os.path.join(temp_dir, "document.png")
shutil.copy2(get_default_thumbnail(), default_thumbnail_path)
copy_file_with_basic_stats(get_default_thumbnail(), default_thumbnail_path)
return default_thumbnail_path

View File

@@ -1,6 +1,7 @@
import logging
import os
import shutil
from typing import Optional
from celery import states
from celery.signals import before_task_publish
@@ -21,6 +22,7 @@ from django.utils import timezone
from filelock import FileLock
from documents import matching
from documents.classifier import DocumentClassifier
from documents.file_handling import create_source_path_directory
from documents.file_handling import delete_empty_directories
from documents.file_handling import generate_unique_filename
@@ -33,7 +35,7 @@ from documents.permissions import get_objects_for_user_owner_aware
logger = logging.getLogger("paperless.handlers")
def add_inbox_tags(sender, document=None, logging_group=None, **kwargs):
def add_inbox_tags(sender, document: Document, logging_group=None, **kwargs):
if document.owner is not None:
tags = get_objects_for_user_owner_aware(
document.owner,
@@ -48,9 +50,9 @@ def add_inbox_tags(sender, document=None, logging_group=None, **kwargs):
def set_correspondent(
sender,
document=None,
document: Document,
logging_group=None,
classifier=None,
classifier: Optional[DocumentClassifier] = None,
replace=False,
use_first=True,
suggest=False,
@@ -111,9 +113,9 @@ def set_correspondent(
def set_document_type(
sender,
document=None,
document: Document,
logging_group=None,
classifier=None,
classifier: Optional[DocumentClassifier] = None,
replace=False,
use_first=True,
suggest=False,
@@ -175,9 +177,9 @@ def set_document_type(
def set_tags(
sender,
document=None,
document: Document,
logging_group=None,
classifier=None,
classifier: Optional[DocumentClassifier] = None,
replace=False,
suggest=False,
base_url=None,
@@ -239,9 +241,9 @@ def set_tags(
def set_storage_path(
sender,
document=None,
document: Document,
logging_group=None,
classifier=None,
classifier: Optional[DocumentClassifier] = None,
replace=False,
use_first=True,
suggest=False,
@@ -491,7 +493,7 @@ def update_filename_and_move_files(sender, instance: Document, **kwargs):
)
def set_log_entry(sender, document=None, logging_group=None, **kwargs):
def set_log_entry(sender, document: Document, logging_group=None, **kwargs):
ct = ContentType.objects.get(model="document")
user = User.objects.get(username="consumer")

View File

@@ -25,6 +25,7 @@ from documents.consumer import Consumer
from documents.consumer import ConsumerError
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.double_sided import collate
from documents.file_handling import create_source_path_directory
from documents.file_handling import generate_unique_filename
from documents.models import Correspondent
@@ -64,6 +65,12 @@ def train_classifier():
and not Correspondent.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
and not StoragePath.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
):
logger.info("No automatic matching items, not training")
# Special case, items were once auto and trained, so remove the model
# and prevent its use again
if settings.MODEL_FILE.exists():
logger.info(f"Removing {settings.MODEL_FILE} so it won't be used")
settings.MODEL_FILE.unlink()
return
classifier = load_classifier()
@@ -89,10 +96,40 @@ def consume_file(
input_doc: ConsumableDocument,
overrides: Optional[DocumentMetadataOverrides] = None,
):
def send_progress(status="SUCCESS", message="finished"):
payload = {
"filename": overrides.filename or input_doc.original_file.name,
"task_id": None,
"current_progress": 100,
"max_progress": 100,
"status": status,
"message": message,
}
try:
async_to_sync(get_channel_layer().group_send)(
"status_updates",
{"type": "status_update", "data": payload},
)
except ConnectionError as e:
logger.warning(f"ConnectionError on status send: {e!s}")
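send_progress wraps channels' asynchronous group_send so this synchronous task can push a final status update, deduplicating the payload code that used to be inlined further below. The underlying pattern, as a sketch assuming a configured channel layer:

from asgiref.sync import async_to_sync
from channels.layers import get_channel_layer

def notify(payload: dict) -> None:
    # Drive the async group_send to completion from synchronous code.
    layer = get_channel_layer()
    try:
        async_to_sync(layer.group_send)(
            "status_updates",
            {"type": "status_update", "data": payload},
        )
    except ConnectionError:
        pass  # e.g. broker down; losing a progress update is non-fatal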
# Default no overrides
if overrides is None:
overrides = DocumentMetadataOverrides()
# Handle collation of double-sided documents scanned in two parts
if settings.CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED and (
settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
in input_doc.original_file.parts
):
try:
msg = collate(input_doc)
send_progress(message=msg)
return msg
except ConsumerError as e:
send_progress(status="FAILURE", message=e.args[0])
raise e
# read all barcodes in the current document
if settings.CONSUMER_ENABLE_BARCODES or settings.CONSUMER_ENABLE_ASN_BARCODE:
with BarcodeReader(input_doc.original_file, input_doc.mime_type) as reader:
@@ -102,32 +139,18 @@ def consume_file(
):
# notify the sender, otherwise the progress bar
# in the UI stays stuck
payload = {
"filename": overrides.filename or input_doc.original_file.name,
"task_id": None,
"current_progress": 100,
"max_progress": 100,
"status": "SUCCESS",
"message": "finished",
}
try:
async_to_sync(get_channel_layer().group_send)(
"status_updates",
{"type": "status_update", "data": payload},
)
except ConnectionError as e:
logger.warning(f"ConnectionError on status send: {e!s}")
send_progress()
# consuming stops here, since the original document with
# the barcodes has been split and will be consumed separately
input_doc.original_file.unlink()
return "File successfully split"
# try reading the ASN from barcode
if settings.CONSUMER_ENABLE_ASN_BARCODE:
if settings.CONSUMER_ENABLE_ASN_BARCODE and reader.asn is not None:
# Note this will take precedence over an API provided ASN
# But it's from a physical barcode, so that's good
overrides.asn = reader.asn
if overrides.asn:
logger.info(f"Found ASN in barcode: {overrides.asn}")
logger.info(f"Found ASN in barcode: {overrides.asn}")
# continue with consumption if no barcode was found
document = Consumer().try_consume_file(

Binary file not shown.

Binary file not shown.

View File

@@ -2369,6 +2369,62 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
self.assertEqual(resp_data["note"], "this is a posted note")
def test_notes_permissions_aware(self):
"""
GIVEN:
- Existing document owned by user2 but with granted view perms for user1
WHEN:
- API request is made by user1 to add or delete a note
THEN:
- Notes are neither created nor deleted
"""
user1 = User.objects.create_user(username="test1")
user1.user_permissions.add(*Permission.objects.all())
user1.save()
user2 = User.objects.create_user(username="test2")
user2.save()
doc = Document.objects.create(
title="test",
mime_type="application/pdf",
content="this is a document which will have notes added",
)
doc.owner = user2
doc.save()
self.client.force_authenticate(user1)
resp = self.client.get(
f"/api/documents/{doc.pk}/notes/",
format="json",
)
self.assertEqual(resp.content, b"Insufficient permissions to view")
self.assertEqual(resp.status_code, status.HTTP_403_FORBIDDEN)
assign_perm("view_document", user1, doc)
resp = self.client.post(
f"/api/documents/{doc.pk}/notes/",
data={"note": "this is a posted note"},
)
self.assertEqual(resp.content, b"Insufficient permissions to create")
self.assertEqual(resp.status_code, status.HTTP_403_FORBIDDEN)
note = Note.objects.create(
note="This is a note.",
document=doc,
user=user2,
)
response = self.client.delete(
f"/api/documents/{doc.pk}/notes/?id={note.pk}",
format="json",
)
self.assertEqual(response.content, b"Insufficient permissions to delete")
self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
def test_delete_note(self):
"""
GIVEN:

View File

@@ -21,6 +21,7 @@ from django.utils import timezone
from documents.consumer import Consumer
from documents.consumer import ConsumerError
from documents.consumer import ConsumerFilePhase
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
@@ -228,8 +229,8 @@ def fake_magic_from_file(file, mime=False):
class TestConsumer(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def _assert_first_last_send_progress(
self,
first_status="STARTING",
last_status="SUCCESS",
first_status=ConsumerFilePhase.STARTED,
last_status=ConsumerFilePhase.SUCCESS,
first_progress=0,
first_progress_max=100,
last_progress=100,
@@ -561,10 +562,16 @@ class TestConsumer(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
@mock.patch("documents.consumer.load_classifier")
def testClassifyDocument(self, m):
correspondent = Correspondent.objects.create(name="test")
dtype = DocumentType.objects.create(name="test")
t1 = Tag.objects.create(name="t1")
t2 = Tag.objects.create(name="t2")
correspondent = Correspondent.objects.create(
name="test",
matching_algorithm=Correspondent.MATCH_AUTO,
)
dtype = DocumentType.objects.create(
name="test",
matching_algorithm=DocumentType.MATCH_AUTO,
)
t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO)
t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_AUTO)
m.return_value = MagicMock()
m.return_value.predict_correspondent.return_value = correspondent.pk

View File

@@ -152,6 +152,55 @@ class TestDate(TestCase):
text = "Customer Number Currency 22 MAR,2022 Credit Card 1934829304"
self.assertIsNone(parse_date("", text), None)
def test_date_format_19(self):
text = "Customer Number Currency 21st MAR 2022 Credit Card 1934829304"
self.assertEqual(
parse_date("", text),
datetime.datetime(2022, 3, 21, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
def test_date_format_20(self):
text = "Customer Number Currency 22nd March 2022 Credit Card 1934829304"
self.assertEqual(
parse_date("", text),
datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
def test_date_format_21(self):
text = "Customer Number Currency 2nd MAR 2022 Credit Card 1934829304"
self.assertEqual(
parse_date("", text),
datetime.datetime(2022, 3, 2, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
def test_date_format_22(self):
text = "Customer Number Currency 23rd MAR 2022 Credit Card 1934829304"
self.assertEqual(
parse_date("", text),
datetime.datetime(2022, 3, 23, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
def test_date_format_23(self):
text = "Customer Number Currency 24th MAR 2022 Credit Card 1934829304"
self.assertEqual(
parse_date("", text),
datetime.datetime(2022, 3, 24, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
def test_date_format_24(self):
text = "Customer Number Currency 21-MAR-2022 Credit Card 1934829304"
self.assertEqual(
parse_date("", text),
datetime.datetime(2022, 3, 21, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
def test_date_format_25(self):
text = "Customer Number Currency 25TH MAR 2022 Credit Card 1934829304"
self.assertEqual(
parse_date("", text),
datetime.datetime(2022, 3, 25, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
def test_crazy_date_past(self, *args):
self.assertIsNone(parse_date("", "01-07-0590 00:00:00"))

View File

@@ -0,0 +1,253 @@
import datetime as dt
import os
import shutil
from pathlib import Path
from typing import Union
from unittest import mock
from django.test import TestCase
from django.test import override_settings
from pdfminer.high_level import extract_text
from pikepdf import Pdf
from documents import tasks
from documents.consumer import ConsumerError
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentSource
from documents.double_sided import STAGING_FILE_NAME
from documents.double_sided import TIMEOUT_MINUTES
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
@override_settings(
CONSUMER_RECURSIVE=True,
CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED=True,
)
class TestDoubleSided(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
SAMPLE_DIR = Path(__file__).parent / "samples"
def setUp(self):
super().setUp()
self.dirs.double_sided_dir = self.dirs.consumption_dir / "double-sided"
self.dirs.double_sided_dir.mkdir()
self.staging_file = self.dirs.scratch_dir / STAGING_FILE_NAME
def consume_file(self, srcname, dstname: Union[str, Path] = "foo.pdf"):
"""
Starts the consume process and also ensures the
destination file does not exist afterwards
"""
src = self.SAMPLE_DIR / srcname
dst = self.dirs.double_sided_dir / dstname
dst.parent.mkdir(parents=True, exist_ok=True)
shutil.copy(src, dst)
with mock.patch("documents.tasks.async_to_sync"), mock.patch(
"documents.consumer.async_to_sync",
):
msg = tasks.consume_file(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
original_file=dst,
),
None,
)
self.assertIsNotFile(dst)
return msg
def create_staging_file(self, src="double-sided-odd.pdf", datetime=None):
shutil.copy(self.SAMPLE_DIR / src, self.staging_file)
if datetime is None:
datetime = dt.datetime.now()
os.utime(str(self.staging_file), (datetime.timestamp(),) * 2)
def test_odd_numbered_moved_to_staging(self):
"""
GIVEN:
- No staging file exists
WHEN:
- A file is copied into the double-sided consume directory
THEN:
- The file becomes the new staging file
- The file in the consume directory gets removed
- The staging file has the st_mtime set to now
- The user gets informed
"""
msg = self.consume_file("double-sided-odd.pdf")
self.assertIsFile(self.staging_file)
self.assertAlmostEqual(
dt.datetime.fromtimestamp(self.staging_file.stat().st_mtime),
dt.datetime.now(),
delta=dt.timedelta(seconds=5),
)
self.assertIn("Received odd numbered pages", msg)
def test_collation(self):
"""
GIVEN:
- A staging file not older than TIMEOUT_MINUTES with odd pages exists
WHEN:
- A file is copied into the double-sided consume directory
THEN:
- A new file containing the collated staging and uploaded file is
created and put into the consume directory
- The new file is named "foo-collated.pdf", where foo is the name of
the second file
- Both staging and uploaded file get deleted
- The new file contains the pages in the correct order
"""
self.create_staging_file()
self.consume_file("double-sided-even.pdf", "some-random-name.pdf")
target = self.dirs.consumption_dir / "some-random-name-collated.pdf"
self.assertIsFile(target)
self.assertIsNotFile(self.staging_file)
self.assertRegex(
extract_text(str(target)),
r"(?s)"
r"This is page 1.*This is page 2.*This is page 3.*"
r"This is page 4.*This is page 5",
)
def test_staging_file_expiration(self):
"""
GIVEN:
- A staging file older than TIMEOUT_MINUTES exists
WHEN:
- A file is copied into the double-sided consume directory
THEN:
- It becomes the new staging file
"""
self.create_staging_file(
datetime=dt.datetime.now()
- dt.timedelta(minutes=TIMEOUT_MINUTES, seconds=1),
)
msg = self.consume_file("double-sided-odd.pdf")
self.assertIsFile(self.staging_file)
self.assertIn("Received odd numbered pages", msg)
def test_less_odd_pages_then_even_fails(self):
"""
GIVEN:
- A valid staging file
WHEN:
- A file is copied into the double-sided consume directory
that has more pages than the staging file
THEN:
- Both files get removed
- A ConsumerError exception is thrown
"""
self.create_staging_file("simple.pdf")
self.assertRaises(
ConsumerError,
self.consume_file,
"double-sided-even.pdf",
)
self.assertIsNotFile(self.staging_file)
@override_settings(CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT=True)
def test_tiff_upload_enabled(self):
"""
GIVEN:
- CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT is true
- No staging file exists
WHEN:
- A TIFF file gets uploaded into the double-sided
consume dir
THEN:
- The file is converted into a PDF and moved to
the staging file
"""
self.consume_file("simple.tiff", "simple.tiff")
self.assertIsFile(self.staging_file)
# Ensure the file is a valid PDF by trying to read it
Pdf.open(self.staging_file)
@override_settings(CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT=False)
def test_tiff_upload_disabled(self):
"""
GIVEN:
- CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT is false
- No staging file exists
WHEN:
- A TIFF file gets uploaded into the double-sided
consume dir
THEN:
- A ConsumerError is raised
"""
self.assertRaises(
ConsumerError,
self.consume_file,
"simple.tiff",
"simple.tiff",
)
@override_settings(CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME="quux")
def test_different_upload_dir_name(self):
"""
GIVEN:
- No staging file exists
- CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME is set to quux
WHEN:
- A file is uploaded into the quux dir
THEN:
- A staging file is created
"""
self.consume_file("double-sided-odd.pdf", Path("..") / "quux" / "foo.pdf")
self.assertIsFile(self.staging_file)
def test_only_double_sided_dir_is_handled(self):
"""
GIVEN:
- No staging file exists
WHEN:
- A file is uploaded into the normal consumption dir
THEN:
- The file is processed as normal
"""
msg = self.consume_file("simple.pdf", Path("..") / "simple.pdf")
self.assertIsNotFile(self.staging_file)
self.assertRegex(msg, "Success. New document .* created")
def test_subdirectory_upload(self):
"""
GIVEN:
- A staging file exists
WHEN:
- A file gets uploaded into foo/bar/double-sided
or double-sided/foo/bar
THEN:
- The collated file gets put into foo/bar
"""
for path in [
Path("foo") / "bar" / "double-sided",
Path("double-sided") / "foo" / "bar",
]:
with self.subTest(path=path):
# Ensure we get fresh directories for each run
self.tearDown()
self.setUp()
self.create_staging_file()
self.consume_file("double-sided-odd.pdf", path / "foo.pdf")
self.assertIsFile(
self.dirs.consumption_dir / "foo" / "bar" / "foo-collated.pdf",
)
@override_settings(CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED=False)
def test_disabled_double_sided_dir_upload(self):
"""
GIVEN:
- CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED is false
WHEN:
- A file is uploaded into the double-sided directory
THEN:
- The file is processed like a normal upload
"""
msg = self.consume_file("simple.pdf")
self.assertIsNotFile(self.staging_file)
self.assertRegex(msg, "Success. New document .* created")

View File

@@ -446,6 +446,19 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertIsNotDir(os.path.join(settings.ORIGINALS_DIR, "none"))
self.assertIsDir(settings.ORIGINALS_DIR)
@override_settings(FILENAME_FORMAT="{doc_pk}")
def test_format_doc_pk(self):
document = Document()
document.pk = 1
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
self.assertEqual(generate_filename(document), "0000001.pdf")
document.pk = 13579
self.assertEqual(generate_filename(document), "0013579.pdf")
@override_settings(FILENAME_FORMAT=None)
def test_format_none(self):
document = Document()

View File

@@ -7,11 +7,18 @@ from pathlib import Path
from unittest import mock
from zipfile import ZipFile
from django.contrib.auth.models import Group
from django.contrib.auth.models import Permission
from django.contrib.contenttypes.models import ContentType
from django.core.management import call_command
from django.core.management.base import CommandError
from django.db import IntegrityError
from django.test import TestCase
from django.test import override_settings
from django.utils import timezone
from guardian.models import GroupObjectPermission
from guardian.models import UserObjectPermission
from guardian.shortcuts import assign_perm
from documents.management.commands import document_exporter
from documents.models import Correspondent
@@ -34,6 +41,8 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.addCleanup(shutil.rmtree, self.target)
self.user = User.objects.create(username="temp_admin")
self.user2 = User.objects.create(username="user2")
self.group1 = Group.objects.create(name="group1")
self.d1 = Document.objects.create(
content="Content",
@@ -73,6 +82,9 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
user=self.user,
)
assign_perm("view_document", self.user2, self.d2)
assign_perm("view_document", self.group1, self.d3)
self.t1 = Tag.objects.create(name="t")
self.dt1 = DocumentType.objects.create(name="dt")
self.c1 = Correspondent.objects.create(name="c")
@@ -141,12 +153,12 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
manifest = self._do_export(use_filename_format=use_filename_format)
self.assertEqual(len(manifest), 10)
self.assertEqual(len(manifest), 149)
# don't include consumer or AnonymousUser users
self.assertEqual(
len(list(filter(lambda e: e["model"] == "auth.user", manifest))),
1,
2,
)
self.assertEqual(
@@ -218,6 +230,9 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
Correspondent.objects.all().delete()
DocumentType.objects.all().delete()
Tag.objects.all().delete()
Permission.objects.all().delete()
UserObjectPermission.objects.all().delete()
GroupObjectPermission.objects.all().delete()
self.assertEqual(Document.objects.count(), 0)
call_command("document_importer", "--no-progress-bar", self.target)
@@ -230,6 +245,9 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertEqual(Document.objects.get(id=self.d2.id).title, "wow2")
self.assertEqual(Document.objects.get(id=self.d3.id).title, "wow2")
self.assertEqual(Document.objects.get(id=self.d4.id).title, "wow_dec")
self.assertEqual(GroupObjectPermission.objects.count(), 1)
self.assertEqual(UserObjectPermission.objects.count(), 1)
self.assertEqual(Permission.objects.count(), 108)
messages = check_sanity()
# everything is alright after the test
self.assertEqual(len(messages), 0)
@@ -259,7 +277,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
st_mtime_1 = os.stat(os.path.join(self.target, "manifest.json")).st_mtime
with mock.patch(
"documents.management.commands.document_exporter.shutil.copy2",
"documents.management.commands.document_exporter.copy_file_with_basic_stats",
) as m:
self._do_export()
m.assert_not_called()
@@ -270,7 +288,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
Path(self.d1.source_path).touch()
with mock.patch(
"documents.management.commands.document_exporter.shutil.copy2",
"documents.management.commands.document_exporter.copy_file_with_basic_stats",
) as m:
self._do_export()
self.assertEqual(m.call_count, 1)
@@ -293,7 +311,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertIsFile(os.path.join(self.target, "manifest.json"))
with mock.patch(
"documents.management.commands.document_exporter.shutil.copy2",
"documents.management.commands.document_exporter.copy_file_with_basic_stats",
) as m:
self._do_export()
m.assert_not_called()
@@ -304,7 +322,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.d2.save()
with mock.patch(
"documents.management.commands.document_exporter.shutil.copy2",
"documents.management.commands.document_exporter.copy_file_with_basic_stats",
) as m:
self._do_export(compare_checksums=True)
self.assertEqual(m.call_count, 1)
@@ -641,3 +659,47 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertEqual(Document.objects.count(), 0)
call_command("document_importer", "--no-progress-bar", self.target)
self.assertEqual(Document.objects.count(), 4)
def test_import_db_transaction_failed(self):
"""
GIVEN:
- Import from manifest started
WHEN:
- Import of database fails
THEN:
- ContentType & Permission objects are not deleted; the database transaction is rolled back
"""
shutil.rmtree(os.path.join(self.dirs.media_dir, "documents"))
shutil.copytree(
os.path.join(os.path.dirname(__file__), "samples", "documents"),
os.path.join(self.dirs.media_dir, "documents"),
)
self.assertEqual(ContentType.objects.count(), 27)
self.assertEqual(Permission.objects.count(), 108)
manifest = self._do_export()
with paperless_environment():
self.assertEqual(
len(list(filter(lambda e: e["model"] == "auth.permission", manifest))),
108,
)
# add 1 more to db to show objects are not re-created by import
Permission.objects.create(
name="test",
codename="test_perm",
content_type_id=1,
)
self.assertEqual(Permission.objects.count(), 109)
# will cause an import error
self.user.delete()
self.user = User.objects.create(username="temp_admin")
with self.assertRaises(IntegrityError):
call_command("document_importer", "--no-progress-bar", self.target)
self.assertEqual(ContentType.objects.count(), 27)
self.assertEqual(Permission.objects.count(), 109)
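The rollback asserted here is standard Django behaviour when the manifest load runs inside a transaction; a minimal sketch of the pattern (function and variable names are illustrative, not the importer's actual code):

from django.core import serializers
from django.db import transaction

def import_manifest(manifest_json: str) -> None:
    # If any record fails to save (e.g. with an IntegrityError), the whole
    # block rolls back, leaving pre-existing rows such as the extra
    # Permission above untouched.
    with transaction.atomic():
        for deserialized in serializers.deserialize("json", manifest_json):
            deserialized.save()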

View File

@@ -2,6 +2,7 @@ import hashlib
import os
import shutil
from pathlib import Path
from typing import Optional
from unittest import mock
from django.conf import settings
@@ -60,8 +61,8 @@ def make_test_document(
mime_type: str,
original: str,
original_filename: str,
archive: str = None,
archive_filename: str = None,
archive: Optional[str] = None,
archive_filename: Optional[str] = None,
):
doc = document_class()
doc.filename = original_filename

View File

@@ -0,0 +1,276 @@
import shutil
import tempfile
from pathlib import Path
from typing import Callable
from typing import Iterable
from typing import Union
from unittest import mock
from django.test import override_settings
from documents.tests.utils import TestMigrations
@override_settings(PASSPHRASE="test")
@mock.patch(
"documents.migrations.1037_webp_encrypted_thumbnail_conversion.multiprocessing.pool.Pool.map",
)
@mock.patch("documents.migrations.1037_webp_encrypted_thumbnail_conversion.run_convert")
class TestMigrateToEncrytpedWebPThumbnails(TestMigrations):
migrate_from = "1036_alter_savedviewfilterrule_rule_type"
migrate_to = "1037_webp_encrypted_thumbnail_conversion"
auto_migrate = False
def pretend_convert_output(self, *args, **kwargs):
"""
Pretends to do the conversion by copying the input file
to the output file
"""
shutil.copy2(
Path(kwargs["input_file"].rstrip("[0]")),
Path(kwargs["output_file"]),
)
def pretend_map(self, func: Callable, iterable: Iterable):
"""
Pretends to be the map of a multiprocessing.Pool, but secretly runs
everything serially
"""
for item in iterable:
func(item)
def create_dummy_thumbnails(
self,
thumb_dir: Path,
ext: str,
count: int,
start_count: int = 0,
):
"""
Helper to create a given count of files with the given extension in a given directory
"""
for idx in range(count):
(Path(thumb_dir) / Path(f"{start_count + idx:07}.{ext}")).touch()
# Triple check expected files exist
self.assert_file_count_by_extension(ext, thumb_dir, count)
def create_webp_thumbnail_files(
self,
thumb_dir: Path,
count: int,
start_count: int = 0,
):
"""
Creates dummy WebP thumbnail files in the given directory
"""
self.create_dummy_thumbnails(thumb_dir, "webp", count, start_count)
def create_encrypted_webp_thumbnail_files(
self,
thumb_dir: Path,
count: int,
start_count: int = 0,
):
"""
Creates dummy encrypted WebP thumbnail files in the given directory
"""
self.create_dummy_thumbnails(thumb_dir, "webp.gpg", count, start_count)
def create_png_thumbnail_files(
self,
thumb_dir: Path,
count: int,
start_count: int = 0,
):
"""
Creates dummy PNG thumbnail files in the given directory
"""
self.create_dummy_thumbnails(thumb_dir, "png", count, start_count)
def create_encrypted_png_thumbnail_files(
self,
thumb_dir: Path,
count: int,
start_count: int = 0,
):
"""
Creates dummy encrypted PNG thumbnail files in the given directory
"""
self.create_dummy_thumbnails(thumb_dir, "png.gpg", count, start_count)
def assert_file_count_by_extension(
self,
ext: str,
dir: Union[str, Path],
expected_count: int,
):
"""
Helper to assert a given count of files with the given extension in the given directory
"""
if not isinstance(dir, Path):
dir = Path(dir)
matching_files = list(dir.glob(f"*.{ext}"))
self.assertEqual(len(matching_files), expected_count)
def assert_encrypted_png_file_count(self, dir: Path, expected_count: int):
"""
Helper to assert a given count of encrypted PNG files in the given directory
"""
self.assert_file_count_by_extension("png.gpg", dir, expected_count)
def assert_encrypted_webp_file_count(self, dir: Path, expected_count: int):
"""
Helper to assert a given count of encrypted WebP files in the given directory
"""
self.assert_file_count_by_extension("webp.gpg", dir, expected_count)
def assert_webp_file_count(self, dir: Path, expected_count: int):
"""
Helper to assert a given count of WebP files in the given directory
"""
self.assert_file_count_by_extension("webp", dir, expected_count)
def assert_png_file_count(self, dir: Path, expected_count: int):
"""
Helper to assert a given count of PNG files in the given directory
"""
self.assert_file_count_by_extension("png", dir, expected_count)
def setUp(self):
self.thumbnail_dir = Path(tempfile.mkdtemp()).resolve()
return super().setUp()
def tearDown(self) -> None:
shutil.rmtree(self.thumbnail_dir)
return super().tearDown()
def test_do_nothing_if_converted(
self,
run_convert_mock: mock.MagicMock,
map_mock: mock.MagicMock,
):
"""
GIVEN:
- Encrypted document exists with existing encrypted WebP thumbnail path
WHEN:
- Migration is attempted
THEN:
- Nothing is converted
"""
map_mock.side_effect = self.pretend_map
with override_settings(
THUMBNAIL_DIR=self.thumbnail_dir,
):
self.create_encrypted_webp_thumbnail_files(self.thumbnail_dir, 3)
self.performMigration()
run_convert_mock.assert_not_called()
self.assert_encrypted_webp_file_count(self.thumbnail_dir, 3)
def test_convert_thumbnails(
self,
run_convert_mock: mock.MagicMock,
map_mock: mock.MagicMock,
):
"""
GIVEN:
- Encrypted documents exist with PNG thumbnail
WHEN:
- Migration is attempted
THEN:
- Thumbnails are converted to webp & re-encrypted
"""
map_mock.side_effect = self.pretend_map
run_convert_mock.side_effect = self.pretend_convert_output
with override_settings(
THUMBNAIL_DIR=self.thumbnail_dir,
):
self.create_encrypted_png_thumbnail_files(self.thumbnail_dir, 3)
self.performMigration()
run_convert_mock.assert_called()
self.assertEqual(run_convert_mock.call_count, 3)
self.assert_encrypted_webp_file_count(self.thumbnail_dir, 3)
def test_convert_errors_out(
self,
run_convert_mock: mock.MagicMock,
map_mock: mock.MagicMock,
):
"""
GIVEN:
- Encrypted document exists with PNG thumbnail
WHEN:
- Migration is attempted, but every conversion raises an exception
THEN:
- No thumbnails are converted; the encrypted PNG files remain
"""
map_mock.side_effect = self.pretend_map
run_convert_mock.side_effect = OSError
with override_settings(
THUMBNAIL_DIR=self.thumbnail_dir,
):
self.create_encrypted_png_thumbnail_files(self.thumbnail_dir, 3)
self.performMigration()
run_convert_mock.assert_called()
self.assertEqual(run_convert_mock.call_count, 3)
self.assert_encrypted_png_file_count(self.thumbnail_dir, 3)
def test_convert_mixed(
self,
run_convert_mock: mock.MagicMock,
map_mock: mock.MagicMock,
):
"""
GIVEN:
- Documents exist with PNG, encrypted PNG and WebP thumbnails
WHEN:
- Migration is attempted
THEN:
- Only encrypted PNG thumbnails are converted
"""
map_mock.side_effect = self.pretend_map
run_convert_mock.side_effect = self.pretend_convert_output
with override_settings(
THUMBNAIL_DIR=self.thumbnail_dir,
):
self.create_png_thumbnail_files(self.thumbnail_dir, 3)
self.create_encrypted_png_thumbnail_files(
self.thumbnail_dir,
3,
start_count=3,
)
self.create_webp_thumbnail_files(self.thumbnail_dir, 2, start_count=6)
self.create_encrypted_webp_thumbnail_files(
self.thumbnail_dir,
3,
start_count=8,
)
self.performMigration()
run_convert_mock.assert_called()
self.assertEqual(run_convert_mock.call_count, 3)
self.assert_png_file_count(self.thumbnail_dir, 3)
self.assert_encrypted_webp_file_count(self.thumbnail_dir, 6)
self.assert_webp_file_count(self.thumbnail_dir, 2)
self.assert_encrypted_png_file_count(self.thumbnail_dir, 0)
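Since the mocks target run_convert and multiprocessing.pool.Pool.map, the migration presumably fans the conversions out over a process pool. A self-contained sketch of that pattern (the helper bodies and item shape are assumptions, not the migration's actual code):

import shutil
from multiprocessing import Pool
from pathlib import Path

def run_convert(*, input_file: str, output_file: str) -> None:
    # Stand-in for the migration's ImageMagick wrapper; here it just copies,
    # exactly like pretend_convert_output above.
    shutil.copy(input_file.rstrip("[0]"), output_file)

def convert_one(paths) -> None:
    source, dest = paths
    # "[0]" selects the first page/frame, matching the rstrip("[0]") above.
    run_convert(input_file=f"{source}[0]", output_file=str(dest))

if __name__ == "__main__":
    work_items = [(Path("0000001.png"), Path("0000001.webp"))]  # example pair
    with Pool(processes=4) as pool:
        pool.map(convert_one, work_items)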

src/documents/utils.py (new file)
View File

@@ -0,0 +1,43 @@
import shutil
from os import utime
from pathlib import Path
from typing import Tuple
from typing import Union
def _coerce_to_path(
source: Union[Path, str],
dest: Union[Path, str],
) -> Tuple[Path, Path]:
return Path(source).resolve(), Path(dest).resolve()
def copy_basic_file_stats(source: Union[Path, str], dest: Union[Path, str]) -> None:
"""
Copies only the mtime and atime attributes from source to destination.
Both are expected to exist.
The extended attribute copy done by copystat does weird things with SELinux
and files copied from temporary directories, and copystat doesn't allow
disabling those copies.
"""
source, dest = _coerce_to_path(source, dest)
src_stat = source.stat()
utime(dest, ns=(src_stat.st_atime_ns, src_stat.st_mtime_ns))
def copy_file_with_basic_stats(
source: Union[Path, str],
dest: Union[Path, str],
) -> None:
"""
A sort of simpler copy2 that doesn't copy extended file attributes,
only the access time and modified times from source to dest.
The extended attribute copy does weird things with SELinux and files
copied from temporary directories.
"""
source, dest = _coerce_to_path(source, dest)
shutil.copy(source, dest)
copy_basic_file_stats(source, dest)
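Usage mirrors shutil.copy2 minus the extended-attribute handling; a quick example with the helpers above (the paths are illustrative):

from pathlib import Path
from documents.utils import copy_basic_file_stats, copy_file_with_basic_stats

src = Path("/tmp/source.pdf")  # example paths
dst = Path("/tmp/dest.pdf")

copy_file_with_basic_stats(src, dst)  # copies content plus atime/mtime only
assert src.stat().st_mtime_ns == dst.stat().st_mtime_ns

# When the destination already exists and only timestamps should follow:
copy_basic_file_stats(src, dst)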

View File

@@ -502,19 +502,18 @@ class DocumentViewSet(
@action(methods=["get", "post", "delete"], detail=True)
def notes(self, request, pk=None):
currentUser = request.user
try:
doc = Document.objects.get(pk=pk)
if request.user is not None and not has_perms_owner_aware(
request.user,
if currentUser is not None and not has_perms_owner_aware(
currentUser,
"view_document",
doc,
):
return HttpResponseForbidden("Insufficient permissions")
return HttpResponseForbidden("Insufficient permissions to view")
except Document.DoesNotExist:
raise Http404
currentUser = request.user
if request.method == "GET":
try:
return Response(self.getNotes(doc))
@@ -525,6 +524,13 @@ class DocumentViewSet(
)
elif request.method == "POST":
try:
if currentUser is not None and not has_perms_owner_aware(
currentUser,
"change_document",
doc,
):
return HttpResponseForbidden("Insufficient permissions to create")
c = Note.objects.create(
document=doc,
note=request.data["note"],
@@ -545,6 +551,13 @@ class DocumentViewSet(
},
)
elif request.method == "DELETE":
if currentUser is not None and not has_perms_owner_aware(
currentUser,
"change_document",
doc,
):
return HttpResponseForbidden("Insufficient permissions to delete")
note = Note.objects.get(id=int(request.GET.get("id")))
note.delete()
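The net effect is that reading notes requires view_document, while creating or deleting them now requires change_document. A hedged sketch of how that splits in practice (the users, document, and endpoint path are assumptions based on the view above):

from rest_framework.test import APIClient

client = APIClient()
client.force_authenticate(user=viewer)  # `viewer` holds view_document only

assert client.get(f"/api/documents/{doc.pk}/notes/").status_code == 200
response = client.post(f"/api/documents/{doc.pk}/notes/", data={"note": "hi"})
assert response.status_code == 403  # lacks change_document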

View File

CONSUMER_BARCODE_DPI: Final[int] = int(
os.getenv("PAPERLESS_CONSUMER_BARCODE_DPI", 300),
)
CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED",
)
CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME: Final[str] = os.getenv(
"PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME",
"double-sided",
)
CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT",
)
OCR_PAGES = int(os.getenv("PAPERLESS_OCR_PAGES", 0))
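In deployment terms these map straight to environment variables; a hypothetical .env fragment enabling the collation feature ("double-sided" is the default shown in the getenv call above, and the boolean values are illustrative):

PAPERLESS_CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED=true
PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME=double-sided
PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT=false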

View File

@@ -1,6 +1,7 @@
from django import forms
from django.contrib import admin
from django.utils.translation import gettext_lazy as _
from guardian.admin import GuardedModelAdmin
from paperless_mail.models import MailAccount
from paperless_mail.models import MailRule
@@ -31,7 +32,7 @@ class MailAccountAdminForm(forms.ModelForm):
]
class MailAccountAdmin(admin.ModelAdmin):
class MailAccountAdmin(GuardedModelAdmin):
list_display = ("name", "imap_server", "username")
fieldsets = [
@@ -45,7 +46,7 @@ class MailAccountAdmin(admin.ModelAdmin):
form = MailAccountAdminForm
class MailRuleAdmin(admin.ModelAdmin):
class MailRuleAdmin(GuardedModelAdmin):
radio_fields = {
"attachment_type": admin.VERTICAL,
"action": admin.VERTICAL,

View File

@@ -2,6 +2,7 @@ import datetime
import itertools
import logging
import os
import ssl
import tempfile
import traceback
from datetime import date
@@ -394,13 +395,12 @@ def get_mailbox(server, port, security) -> MailBox:
"""
Returns the correct MailBox instance for the given configuration.
"""
if security == MailAccount.ImapSecurity.NONE:
mailbox = MailBoxUnencrypted(server, port)
elif security == MailAccount.ImapSecurity.STARTTLS:
mailbox = MailBoxTls(server, port)
mailbox = MailBoxTls(server, port, ssl_context=ssl.create_default_context())
elif security == MailAccount.ImapSecurity.SSL:
mailbox = MailBox(server, port)
mailbox = MailBox(server, port, ssl_context=ssl.create_default_context())
else:
raise NotImplementedError("Unknown IMAP security") # pragma: nocover
return mailbox
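Passing ssl.create_default_context() makes the TLS posture explicit: the default context loads the system trust store and enables certificate and hostname verification. A standalone check, plus hypothetical usage:

import ssl
from imap_tools import MailBox

ctx = ssl.create_default_context()
assert ctx.verify_mode == ssl.CERT_REQUIRED
assert ctx.check_hostname

# Hypothetical connection (server details assumed):
# with MailBox("imap.example.com", 993, ssl_context=ctx).login(user, pw) as mb:
#     ...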

View File

@@ -25,7 +25,6 @@ class MailAccountSerializer(OwnedObjectSerializer):
class Meta:
model = MailAccount
depth = 1
fields = [
"id",
"name",
@@ -36,6 +35,10 @@ class MailAccountSerializer(OwnedObjectSerializer):
"password",
"character_set",
"is_token",
"owner",
"user_can_change",
"permissions",
"set_permissions",
]
def update(self, instance, validated_data):
@@ -67,7 +70,6 @@ class MailRuleSerializer(OwnedObjectSerializer):
class Meta:
model = MailRule
depth = 1
fields = [
"id",
"name",
@@ -89,6 +91,10 @@ class MailRuleSerializer(OwnedObjectSerializer):
"order",
"attachment_type",
"consumption_scope",
"owner",
"user_can_change",
"permissions",
"set_permissions",
]
def update(self, instance, validated_data):
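With depth removed and the ownership fields added, a mail-rule payload now carries owner and permission data; a hypothetical response shape (the exact permissions structure is an assumption, not read from the serializer):

{
    "id": 1,
    "name": "Rule1",
    "order": 0,
    "owner": 2,
    "user_can_change": true,
    "permissions": {
        "view": {"users": [3], "groups": []},
        "change": {"users": [], "groups": []}
    }
}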

View File

@@ -1,7 +1,9 @@
import json
from unittest import mock
from django.contrib.auth.models import Permission
from django.contrib.auth.models import User
from guardian.shortcuts import assign_perm
from rest_framework import status
from rest_framework.test import APITestCase
@@ -27,7 +29,9 @@ class TestAPIMailAccounts(DirectoriesMixin, APITestCase):
super().setUp()
self.user = User.objects.create_superuser(username="temp_admin")
self.user = User.objects.create_user(username="temp_admin")
self.user.user_permissions.add(*Permission.objects.all())
self.user.save()
self.client.force_authenticate(user=self.user)
def test_get_mail_accounts(self):
@@ -266,6 +270,73 @@ class TestAPIMailAccounts(DirectoriesMixin, APITestCase):
self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(response.data["success"], True)
def test_get_mail_accounts_owner_aware(self):
"""
GIVEN:
- Configured accounts with different users
WHEN:
- API call is made to get mail accounts
THEN:
- Only unowned, owned by user or granted accounts are provided
"""
user2 = User.objects.create_user(username="temp_admin2")
account1 = MailAccount.objects.create(
name="Email1",
username="username1",
password="password1",
imap_server="server.example.com",
imap_port=443,
imap_security=MailAccount.ImapSecurity.SSL,
character_set="UTF-8",
)
account2 = MailAccount.objects.create(
name="Email2",
username="username2",
password="password2",
imap_server="server.example.com",
imap_port=443,
imap_security=MailAccount.ImapSecurity.SSL,
character_set="UTF-8",
)
account2.owner = self.user
account2.save()
account3 = MailAccount.objects.create(
name="Email3",
username="username3",
password="password3",
imap_server="server.example.com",
imap_port=443,
imap_security=MailAccount.ImapSecurity.SSL,
character_set="UTF-8",
)
account3.owner = user2
account3.save()
account4 = MailAccount.objects.create(
name="Email4",
username="username4",
password="password4",
imap_server="server.example.com",
imap_port=443,
imap_security=MailAccount.ImapSecurity.SSL,
character_set="UTF-8",
)
account4.owner = user2
account4.save()
assign_perm("view_mailaccount", self.user, account4)
response = self.client.get(self.ENDPOINT)
self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(response.data["count"], 3)
self.assertEqual(response.data["results"][0]["name"], account1.name)
self.assertEqual(response.data["results"][1]["name"], account2.name)
self.assertEqual(response.data["results"][2]["name"], account4.name)
class TestAPIMailRules(DirectoriesMixin, APITestCase):
ENDPOINT = "/api/mail_rules/"
@@ -273,7 +344,9 @@ class TestAPIMailRules(DirectoriesMixin, APITestCase):
def setUp(self):
super().setUp()
self.user = User.objects.create_superuser(username="temp_admin")
self.user = User.objects.create_user(username="temp_admin")
self.user.user_permissions.add(*Permission.objects.all())
self.user.save()
self.client.force_authenticate(user=self.user)
def test_get_mail_rules(self):
@@ -533,3 +606,72 @@ class TestAPIMailRules(DirectoriesMixin, APITestCase):
returned_rule1 = MailRule.objects.get(pk=rule1.pk)
self.assertEqual(returned_rule1.name, "Updated Name 1")
self.assertEqual(returned_rule1.action, MailRule.MailAction.DELETE)
def test_get_mail_rules_owner_aware(self):
"""
GIVEN:
- Configured rules with different users
WHEN:
- API call is made to get mail rules
THEN:
- Only unowned, owned by user or granted mail rules are provided
"""
user2 = User.objects.create_user(username="temp_admin2")
account1 = MailAccount.objects.create(
name="Email1",
username="username1",
password="password1",
imap_server="server.example.com",
imap_port=443,
imap_security=MailAccount.ImapSecurity.SSL,
character_set="UTF-8",
)
rule1 = MailRule.objects.create(
name="Rule1",
account=account1,
folder="INBOX",
filter_from="from@example1.com",
order=0,
)
rule2 = MailRule.objects.create(
name="Rule2",
account=account1,
folder="INBOX",
filter_from="from@example2.com",
order=1,
)
rule2.owner = self.user
rule2.save()
rule3 = MailRule.objects.create(
name="Rule3",
account=account1,
folder="INBOX",
filter_from="from@example3.com",
order=2,
)
rule3.owner = user2
rule3.save()
rule4 = MailRule.objects.create(
name="Rule4",
account=account1,
folder="INBOX",
filter_from="from@example4.com",
order=3,
)
rule4.owner = user2
rule4.save()
assign_perm("view_mailrule", self.user, rule4)
response = self.client.get(self.ENDPOINT)
self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(response.data["count"], 3)
self.assertEqual(response.data["results"][0]["name"], rule1.name)
self.assertEqual(response.data["results"][1]["name"], rule2.name)
self.assertEqual(response.data["results"][2]["name"], rule4.name)

View File

@@ -7,6 +7,8 @@ from rest_framework.permissions import IsAuthenticated
from rest_framework.response import Response
from rest_framework.viewsets import ModelViewSet
from documents.filters import ObjectOwnedOrGrantedPermissionsFilter
from documents.permissions import PaperlessObjectPermissions
from documents.views import PassUserMixin
from paperless.views import StandardPagination
from paperless_mail.mail import MailError
@@ -24,7 +26,8 @@ class MailAccountViewSet(ModelViewSet, PassUserMixin):
queryset = MailAccount.objects.all().order_by("pk")
serializer_class = MailAccountSerializer
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
permission_classes = (IsAuthenticated, PaperlessObjectPermissions)
filter_backends = (ObjectOwnedOrGrantedPermissionsFilter,)
class MailRuleViewSet(ModelViewSet, PassUserMixin):
@@ -33,7 +36,8 @@ class MailRuleViewSet(ModelViewSet, PassUserMixin):
queryset = MailRule.objects.all().order_by("order")
serializer_class = MailRuleSerializer
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
permission_classes = (IsAuthenticated, PaperlessObjectPermissions)
filter_backends = (ObjectOwnedOrGrantedPermissionsFilter,)
class MailAccountTestView(GenericAPIView):
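Conceptually, ObjectOwnedOrGrantedPermissionsFilter narrows each queryset to objects that are unowned, owned by the requester, or explicitly granted via guardian. A rough sketch of the idea (an illustration, not the filter's actual implementation):

from django.db.models import Q
from guardian.shortcuts import get_objects_for_user

def owned_or_granted(queryset, user, perm):
    # e.g. perm = "paperless_mail.view_mailaccount" (assumed codename)
    granted = get_objects_for_user(user, perm, klass=queryset)
    return queryset.filter(Q(owner__isnull=True) | Q(owner=user)) | granted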

View File

@@ -861,8 +861,9 @@ class TestParserFileTypes(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "document.webp"), "image/webp")
self.assertIsFile(parser.archive_path)
# OCR consistently mangles this space, oh well
self.assertIn(
"this is awebp document, created 11/14/2022.",
# Older tesseracts consistently mangle the space between "a webp",
# tesseract 5.3.0 seems to do a better job, so we're accepting both
self.assertRegex(
parser.get_text().lower(),
r"this is a ?webp document, created 11/14/2022.",
)
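The "a ?webp" pattern makes the space optional, so output from both tesseract generations passes:

import re

pattern = r"this is a ?webp document, created 11/14/2022."
assert re.search(pattern, "this is awebp document, created 11/14/2022.")
assert re.search(pattern, "this is a webp document, created 11/14/2022.")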