Creates a data model for the document consumption, allowing stronger typing of arguments and setting of some information about the file only once

2025-07-22 17:54:40 -05:00 · 2023-01-23 15:55:49 -08:00 · 2023-01-23 15:55:49 -08:00 · 3c2bbf244d
commit 3c2bbf244d
parent fa60251c18
14 changed files with 596 additions and 433 deletions
--- a/src/documents/barcodes.py
+++ b/src/documents/barcodes.py
@ -11,7 +11,6 @@ from typing import List
 from typing import Optional

 import img2pdf
-import magic
 from django.conf import settings
 from pdf2image import convert_from_path
 from pdf2image.exceptions import PDFPageCountError
@ -63,7 +62,7 @@ class DocumentBarcodeInfo:


@lru_cache(maxsize=8)
-def supported_file_type(mime_type) -> bool:
+def supported_file_type(mime_type: str) -> bool:
    """
    Determines if the file is valid for barcode
    processing, based on MIME type and settings
@ -115,33 +114,16 @@ def barcode_reader(image: Image) -> List[str]:
    return barcodes


-def get_file_mime_type(path: Path) -> str:
-    """
-    Determines the file type, based on MIME type.
-
-    Returns the MIME type.
-    """
-    mime_type = magic.from_file(path, mime=True)
-    logger.debug(f"Detected mime type: {mime_type}")
-    return mime_type
-
-
 def convert_from_tiff_to_pdf(filepath: Path) -> Path:
    """
    converts a given TIFF image file to pdf into a temporary directory.

    Returns the new pdf file.
    """
-    mime_type = get_file_mime_type(filepath)
    tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
    # use old file name with pdf extension
-    if mime_type == "image/tiff":
-        newpath = Path(tempdir) / Path(filepath.name).with_suffix(".pdf")
-    else:
-        logger.warning(
-            f"Cannot convert mime type {mime_type} from {filepath} to pdf.",
-        )
-        return None
+    newpath = Path(tempdir) / Path(filepath.name).with_suffix(".pdf")
+
    with Image.open(filepath) as im:
        has_alpha_layer = im.mode in ("RGBA", "LA")
    if has_alpha_layer:
@ -162,6 +144,7 @@ def convert_from_tiff_to_pdf(filepath: Path) -> Path:

 def scan_file_for_barcodes(
    filepath: Path,
+    mime_type: str,
 ) -> DocumentBarcodeInfo:
    """
    Scan the provided pdf file for any barcodes
@ -186,7 +169,6 @@ def scan_file_for_barcodes(
        return detected_barcodes

    pdf_filepath = None
-    mime_type = get_file_mime_type(filepath)
    barcodes = []

    if supported_file_type(mime_type):
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@ -284,7 +284,7 @@ class Consumer(LoggingMixin):

    def try_consume_file(
        self,
-        path,
+        path: Path,
        override_filename=None,
        override_title=None,
        override_correspondent_id=None,
--- a/src/documents/data_models.py
+++ b/src/documents/data_models.py
@ -0,0 +1,62 @@
+import dataclasses
+import datetime
+import enum
+from pathlib import Path
+from typing import List
+from typing import Optional
+
+import magic
+
+
+@dataclasses.dataclass
+class DocumentMetadataOverrides:
+    """
+    Manages overrides for document fields which normally would
+    be set from content or matching.  All fields default to None,
+    meaning no override is happening
+    """
+
+    filename: Optional[str] = None
+    title: Optional[str] = None
+    correspondent_id: Optional[int] = None
+    document_type_id: Optional[int] = None
+    tag_ids: Optional[List[int]] = None
+    created: Optional[datetime.datetime] = None
+    asn: Optional[int] = None
+    owner_id: Optional[int] = None
+
+
+class DocumentSource(enum.IntEnum):
+    """
+    The source of an incoming document.  May have other uses in the future
+    """
+
+    ConsumeFolder = enum.auto()
+    ApiUpload = enum.auto()
+    MailFetch = enum.auto()
+
+
+@dataclasses.dataclass
+class ConsumableDocument:
+    """
+    Encapsulates an incoming document, either from consume folder, API upload
+    or mail fetching and certain useful operations on it.
+    """
+
+    source: DocumentSource
+    original_file: Path
+    mime_type: str = dataclasses.field(init=False, default=None)
+
+    def __post_init__(self):
+        """
+        After a dataclass is initialized, this is called to finalize some data
+        1. Make sure the original path is an absolute, fully qualified path
+        2. Get the mime type of the file
+        """
+        # Always fully qualify the path first thing
+        # Just in case, convert to a path if it's a str
+        self.original_file = Path(self.original_file).resolve()
+
+        # Get the file type once at init
+        # Note this function isn't called when the object is unpickled
+        self.mime_type = magic.from_file(self.original_file, mime=True)
--- a/src/documents/management/commands/document_consumer.py
+++ b/src/documents/management/commands/document_consumer.py
@ -13,6 +13,9 @@ from typing import Set
 from django.conf import settings
 from django.core.management.base import BaseCommand
 from django.core.management.base import CommandError
+from documents.data_models import ConsumableDocument
+from documents.data_models import DocumentMetadataOverrides
+from documents.data_models import DocumentSource
 from documents.models import Tag
 from documents.parsers import is_file_ext_supported
 from documents.tasks import consume_file
@ -122,8 +125,11 @@ def _consume(filepath: str) -> None:
    try:
        logger.info(f"Adding {filepath} to the task queue.")
        consume_file.delay(
-            filepath,
-            override_tag_ids=list(tag_ids) if tag_ids else None,
+            ConsumableDocument(
+                source=DocumentSource.ConsumeFolder,
+                original_file=filepath,
+            ),
+            DocumentMetadataOverrides(tag_ids=tag_ids),
        )
    except Exception:
        # Catch all so that the consumer won't crash.
--- a/src/documents/signals/handlers.py
+++ b/src/documents/signals/handlers.py
@ -1,7 +1,6 @@
 import logging
 import os
 import shutil
-from pathlib import Path

 from celery import states
 from celery.signals import before_task_publish
@ -533,17 +532,9 @@ def before_task_publish_handler(sender=None, headers=None, body=None, **kwargs):

    try:
        task_args = body[0]
-        task_kwargs = body[1]
+        input_doc, _ = task_args

-        task_file_name = ""
-        if "override_filename" in task_kwargs:
-            task_file_name = task_kwargs["override_filename"]
-
-        # Nothing was found, report the task first argument
-        if not len(task_file_name):
-            # There are always some arguments to the consume, first is always filename
-            filepath = Path(task_args[0])
-            task_file_name = filepath.name
+        task_file_name = input_doc.original_file.name

        PaperlessTask.objects.create(
            task_id=headers["id"],
--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@ -1,13 +1,10 @@
 import hashlib
 import logging
-import os
 import shutil
 import uuid
-from pathlib import Path
 from typing import Optional
 from typing import Type

-import dateutil.parser
 import tqdm
 from asgiref.sync import async_to_sync
 from celery import shared_task
@ -22,6 +19,9 @@ from documents.classifier import DocumentClassifier
 from documents.classifier import load_classifier
 from documents.consumer import Consumer
 from documents.consumer import ConsumerError
+from documents.data_models import ConsumableDocument
+from documents.data_models import DocumentMetadataOverrides
+from documents.data_models import DocumentSource
 from documents.file_handling import create_source_path_directory
 from documents.file_handling import generate_unique_filename
 from documents.models import Correspondent
@ -88,34 +88,20 @@ def train_classifier():

@shared_task
 def consume_file(
-    path,
-    override_filename=None,
-    override_title=None,
-    override_correspondent_id=None,
-    override_document_type_id=None,
-    override_tag_ids=None,
-    task_id=None,
-    override_created=None,
-    override_owner_id=None,
-    override_archive_serial_num: Optional[int] = None,
+    input_doc: ConsumableDocument,
+    overrides: Optional[DocumentMetadataOverrides] = None,
 ):

-    path = Path(path).resolve()
-    asn = None
-
-    # Celery converts this to a string, but everything expects a datetime
-    # Long term solution is to not use JSON for the serializer but pickle instead
-    # TODO: This will be resolved in kombu 5.3, expected with celery 5.3
-    # More types will be retained through JSON encode/decode
-    if override_created is not None and isinstance(override_created, str):
-        try:
-            override_created = dateutil.parser.isoparse(override_created)
-        except Exception:
-            pass
+    # Default no overrides
+    if overrides is None:
+        overrides = DocumentMetadataOverrides()

    # read all barcodes in the current document
    if settings.CONSUMER_ENABLE_BARCODES or settings.CONSUMER_ENABLE_ASN_BARCODE:
-        doc_barcode_info = barcodes.scan_file_for_barcodes(path)
+        doc_barcode_info = barcodes.scan_file_for_barcodes(
+            input_doc.original_file,
+            input_doc.mime_type,
+        )

        # split document by separator pages, if enabled
        if settings.CONSUMER_ENABLE_BARCODES:
@ -123,7 +109,7 @@ def consume_file(

            if len(separators) > 0:
                logger.debug(
-                    f"Pages with separators found in: {str(path)}",
+                    f"Pages with separators found in: {input_doc.original_file}",
                )
                document_list = barcodes.separate_pages(
                    doc_barcode_info.pdf_path,
@ -136,18 +122,20 @@ def consume_file(
                    # Move it to consume directory to be picked up
                    # Otherwise, use the current parent to keep possible tags
                    # from subdirectories
-                    try:
-                        # is_relative_to would be nicer, but new in 3.9
-                        _ = path.relative_to(settings.SCRATCH_DIR)
+                    if input_doc.source != DocumentSource.ConsumeFolder:
                        save_to_dir = settings.CONSUMPTION_DIR
-                    except ValueError:
-                        save_to_dir = path.parent
+                    else:
+                        # Note this uses the original file, because it's in the
+                        # consume folder already and may include additional path
+                        # components for tagging
+                        # the .path is somewhere in scratch in this case
+                        save_to_dir = input_doc.original_file.parent

                    for n, document in enumerate(document_list):
                        # save to consumption dir
                        # rename it to the original filename  with number prefix
-                        if override_filename:
-                            newname = f"{str(n)}_" + override_filename
+                        if overrides.filename is not None:
+                            newname = f"{str(n)}_{overrides.filename}"
                        else:
                            newname = None

@ -158,24 +146,27 @@ def consume_file(
                        )

                        # Split file has been copied safely, remove it
-                        os.remove(document)
+                        document.unlink()

                    # And clean up the directory as well, now it's empty
-                    shutil.rmtree(os.path.dirname(document_list[0]))
+                    shutil.rmtree(document_list[0].parent)

-                    # Delete the PDF file which was split
-                    os.remove(doc_barcode_info.pdf_path)
+                    # This file has been split into multiple files without issue
+                    # remove the original and working copy
+                    input_doc.original_file.unlink()

-                    # If the original was a TIFF, remove the original file as well
-                    if str(doc_barcode_info.pdf_path) != str(path):
-                        logger.debug(f"Deleting file {path}")
-                        os.unlink(path)
+                    # If the original file was a TIFF, remove the PDF generated from it
+                    if input_doc.mime_type == "image/tiff":
+                        logger.debug(
+                            f"Deleting file {doc_barcode_info.pdf_path}",
+                        )
+                        doc_barcode_info.pdf_path.unlink()

                    # notify the sender, otherwise the progress bar
                    # in the UI stays stuck
                    payload = {
-                        "filename": override_filename or path.name,
-                        "task_id": task_id,
+                        "filename": overrides.filename or input_doc.original_file.name,
+                        "task_id": None,
                        "current_progress": 100,
                        "max_progress": 100,
                        "status": "SUCCESS",
@ -194,22 +185,21 @@ def consume_file(

        # try reading the ASN from barcode
        if settings.CONSUMER_ENABLE_ASN_BARCODE:
-            asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes)
-            if asn:
-                logger.info(f"Found ASN in barcode: {asn}")
+            overrides.asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes)
+            if overrides.asn:
+                logger.info(f"Found ASN in barcode: {overrides.asn}")

    # continue with consumption if no barcode was found
    document = Consumer().try_consume_file(
-        path,
-        override_filename=override_filename,
-        override_title=override_title,
-        override_correspondent_id=override_correspondent_id,
-        override_document_type_id=override_document_type_id,
-        override_tag_ids=override_tag_ids,
-        task_id=task_id,
-        override_created=override_created,
-        override_asn=override_archive_serial_num or asn,
-        override_owner_id=override_owner_id,
+        input_doc.original_file,
+        override_filename=overrides.filename,
+        override_title=overrides.title,
+        override_correspondent_id=overrides.correspondent_id,
+        override_document_type_id=overrides.document_type_id,
+        override_tag_ids=overrides.tag_ids,
+        override_created=overrides.created,
+        override_asn=overrides.asn,
+        override_owner_id=overrides.owner_id,
    )

    if document:
--- a/src/documents/tests/test_api.py
+++ b/src/documents/tests/test_api.py
@ -32,6 +32,7 @@ from documents import bulk_edit
 from documents import index
 from documents.models import Correspondent
 from documents.models import Document
+from documents.tests.utils import DocumentConsumeDelayMixin
 from documents.models import DocumentType
 from documents.models import MatchingModel
 from documents.models import PaperlessTask
@ -45,7 +46,7 @@ from rest_framework.test import APITestCase
 from whoosh.writing import AsyncWriter


-class TestDocumentApi(DirectoriesMixin, APITestCase):
+class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
    def setUp(self):
        super().setUp()

@ -1085,10 +1086,11 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
        self.assertEqual(response.data["documents_inbox"], None)
        self.assertEqual(response.data["inbox_tag"], None)

-    @mock.patch("documents.views.consume_file.delay")
-    def test_upload(self, m):
+    def test_upload(self):

-        m.return_value = celery.result.AsyncResult(id=str(uuid.uuid4()))
+        self.consume_file_mock.return_value = celery.result.AsyncResult(
+            id=str(uuid.uuid4()),
+        )

        with open(
            os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"),
@ -1101,21 +1103,22 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):

        self.assertEqual(response.status_code, status.HTTP_200_OK)

-        m.assert_called_once()
+        self.consume_file_mock.assert_called_once()

-        args, kwargs = m.call_args
-        file_path = Path(args[0])
-        self.assertEqual(file_path.name, "simple.pdf")
-        self.assertIn(Path(settings.SCRATCH_DIR), file_path.parents)
-        self.assertIsNone(kwargs["override_title"])
-        self.assertIsNone(kwargs["override_correspondent_id"])
-        self.assertIsNone(kwargs["override_document_type_id"])
-        self.assertIsNone(kwargs["override_tag_ids"])
+        input_doc, overrides = self.get_last_consume_delay_call_args()

-    @mock.patch("documents.views.consume_file.delay")
-    def test_upload_empty_metadata(self, m):
+        self.assertEqual(input_doc.original_file.name, "simple.pdf")
+        self.assertIn(Path(settings.SCRATCH_DIR), input_doc.original_file.parents)
+        self.assertIsNone(overrides.title)
+        self.assertIsNone(overrides.correspondent_id)
+        self.assertIsNone(overrides.document_type_id)
+        self.assertIsNone(overrides.tag_ids)

-        m.return_value = celery.result.AsyncResult(id=str(uuid.uuid4()))
+    def test_upload_empty_metadata(self):
+
+        self.consume_file_mock.return_value = celery.result.AsyncResult(
+            id=str(uuid.uuid4()),
+        )

        with open(
            os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"),
@ -1128,21 +1131,22 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):

        self.assertEqual(response.status_code, status.HTTP_200_OK)

-        m.assert_called_once()
+        self.consume_file_mock.assert_called_once()

-        args, kwargs = m.call_args
-        file_path = Path(args[0])
-        self.assertEqual(file_path.name, "simple.pdf")
-        self.assertIn(Path(settings.SCRATCH_DIR), file_path.parents)
-        self.assertIsNone(kwargs["override_title"])
-        self.assertIsNone(kwargs["override_correspondent_id"])
-        self.assertIsNone(kwargs["override_document_type_id"])
-        self.assertIsNone(kwargs["override_tag_ids"])
+        input_doc, overrides = self.get_last_consume_delay_call_args()

-    @mock.patch("documents.views.consume_file.delay")
-    def test_upload_invalid_form(self, m):
+        self.assertEqual(input_doc.original_file.name, "simple.pdf")
+        self.assertIn(Path(settings.SCRATCH_DIR), input_doc.original_file.parents)
+        self.assertIsNone(overrides.title)
+        self.assertIsNone(overrides.correspondent_id)
+        self.assertIsNone(overrides.document_type_id)
+        self.assertIsNone(overrides.tag_ids)

-        m.return_value = celery.result.AsyncResult(id=str(uuid.uuid4()))
+    def test_upload_invalid_form(self):
+
+        self.consume_file_mock.return_value = celery.result.AsyncResult(
+            id=str(uuid.uuid4()),
+        )

        with open(
            os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"),
@ -1153,12 +1157,13 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
                {"documenst": f},
            )
        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
-        m.assert_not_called()
+        self.consume_file_mock.assert_not_called()

-    @mock.patch("documents.views.consume_file.delay")
-    def test_upload_invalid_file(self, m):
+    def test_upload_invalid_file(self):

-        m.return_value = celery.result.AsyncResult(id=str(uuid.uuid4()))
+        self.consume_file_mock.return_value = celery.result.AsyncResult(
+            id=str(uuid.uuid4()),
+        )

        with open(
            os.path.join(os.path.dirname(__file__), "samples", "simple.zip"),
@ -1169,12 +1174,13 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
                {"document": f},
            )
        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
-        m.assert_not_called()
+        self.consume_file_mock.assert_not_called()

-    @mock.patch("documents.views.consume_file.delay")
-    def test_upload_with_title(self, async_task):
+    def test_upload_with_title(self):

-        async_task.return_value = celery.result.AsyncResult(id=str(uuid.uuid4()))
+        self.consume_file_mock.return_value = celery.result.AsyncResult(
+            id=str(uuid.uuid4()),
+        )

        with open(
            os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"),
@ -1186,16 +1192,20 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
            )
        self.assertEqual(response.status_code, status.HTTP_200_OK)

-        async_task.assert_called_once()
+        self.consume_file_mock.assert_called_once()

-        args, kwargs = async_task.call_args
+        _, overrides = self.get_last_consume_delay_call_args()

-        self.assertEqual(kwargs["override_title"], "my custom title")
+        self.assertEqual(overrides.title, "my custom title")
+        self.assertIsNone(overrides.correspondent_id)
+        self.assertIsNone(overrides.document_type_id)
+        self.assertIsNone(overrides.tag_ids)

-    @mock.patch("documents.views.consume_file.delay")
-    def test_upload_with_correspondent(self, async_task):
+    def test_upload_with_correspondent(self):

-        async_task.return_value = celery.result.AsyncResult(id=str(uuid.uuid4()))
+        self.consume_file_mock.return_value = celery.result.AsyncResult(
+            id=str(uuid.uuid4()),
+        )

        c = Correspondent.objects.create(name="test-corres")
        with open(
@ -1208,16 +1218,20 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
            )
        self.assertEqual(response.status_code, status.HTTP_200_OK)

-        async_task.assert_called_once()
+        self.consume_file_mock.assert_called_once()

-        args, kwargs = async_task.call_args
+        _, overrides = self.get_last_consume_delay_call_args()

-        self.assertEqual(kwargs["override_correspondent_id"], c.id)
+        self.assertEqual(overrides.correspondent_id, c.id)
+        self.assertIsNone(overrides.title)
+        self.assertIsNone(overrides.document_type_id)
+        self.assertIsNone(overrides.tag_ids)

-    @mock.patch("documents.views.consume_file.delay")
-    def test_upload_with_invalid_correspondent(self, async_task):
+    def test_upload_with_invalid_correspondent(self):

-        async_task.return_value = celery.result.AsyncResult(id=str(uuid.uuid4()))
+        self.consume_file_mock.return_value = celery.result.AsyncResult(
+            id=str(uuid.uuid4()),
+        )

        with open(
            os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"),
@ -1229,12 +1243,13 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
            )
        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)

-        async_task.assert_not_called()
+        self.consume_file_mock.assert_not_called()

-    @mock.patch("documents.views.consume_file.delay")
-    def test_upload_with_document_type(self, async_task):
+    def test_upload_with_document_type(self):

-        async_task.return_value = celery.result.AsyncResult(id=str(uuid.uuid4()))
+        self.consume_file_mock.return_value = celery.result.AsyncResult(
+            id=str(uuid.uuid4()),
+        )

        dt = DocumentType.objects.create(name="invoice")
        with open(
@ -1247,16 +1262,20 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
            )
        self.assertEqual(response.status_code, status.HTTP_200_OK)

-        async_task.assert_called_once()
+        self.consume_file_mock.assert_called_once()

-        args, kwargs = async_task.call_args
+        _, overrides = self.get_last_consume_delay_call_args()

-        self.assertEqual(kwargs["override_document_type_id"], dt.id)
+        self.assertEqual(overrides.document_type_id, dt.id)
+        self.assertIsNone(overrides.correspondent_id)
+        self.assertIsNone(overrides.title)
+        self.assertIsNone(overrides.tag_ids)

-    @mock.patch("documents.views.consume_file.delay")
-    def test_upload_with_invalid_document_type(self, async_task):
+    def test_upload_with_invalid_document_type(self):

-        async_task.return_value = celery.result.AsyncResult(id=str(uuid.uuid4()))
+        self.consume_file_mock.return_value = celery.result.AsyncResult(
+            id=str(uuid.uuid4()),
+        )

        with open(
            os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"),
@ -1268,12 +1287,13 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
            )
        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)

-        async_task.assert_not_called()
+        self.consume_file_mock.assert_not_called()

-    @mock.patch("documents.views.consume_file.delay")
-    def test_upload_with_tags(self, async_task):
+    def test_upload_with_tags(self):

-        async_task.return_value = celery.result.AsyncResult(id=str(uuid.uuid4()))
+        self.consume_file_mock.return_value = celery.result.AsyncResult(
+            id=str(uuid.uuid4()),
+        )

        t1 = Tag.objects.create(name="tag1")
        t2 = Tag.objects.create(name="tag2")
@ -1287,16 +1307,20 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
            )
        self.assertEqual(response.status_code, status.HTTP_200_OK)

-        async_task.assert_called_once()
+        self.consume_file_mock.assert_called_once()

-        args, kwargs = async_task.call_args
+        _, overrides = self.get_last_consume_delay_call_args()

-        self.assertCountEqual(kwargs["override_tag_ids"], [t1.id, t2.id])
+        self.assertCountEqual(overrides.tag_ids, [t1.id, t2.id])
+        self.assertIsNone(overrides.document_type_id)
+        self.assertIsNone(overrides.correspondent_id)
+        self.assertIsNone(overrides.title)

-    @mock.patch("documents.views.consume_file.delay")
-    def test_upload_with_invalid_tags(self, async_task):
+    def test_upload_with_invalid_tags(self):

-        async_task.return_value = celery.result.AsyncResult(id=str(uuid.uuid4()))
+        self.consume_file_mock.return_value = celery.result.AsyncResult(
+            id=str(uuid.uuid4()),
+        )

        t1 = Tag.objects.create(name="tag1")
        t2 = Tag.objects.create(name="tag2")
@ -1310,12 +1334,13 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
            )
        self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)

-        async_task.assert_not_called()
+        self.consume_file_mock.assert_not_called()

-    @mock.patch("documents.views.consume_file.delay")
-    def test_upload_with_created(self, async_task):
+    def test_upload_with_created(self):

-        async_task.return_value = celery.result.AsyncResult(id=str(uuid.uuid4()))
+        self.consume_file_mock.return_value = celery.result.AsyncResult(
+            id=str(uuid.uuid4()),
+        )

        created = datetime.datetime(
            2022,
@ -1337,16 +1362,17 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
            )
        self.assertEqual(response.status_code, status.HTTP_200_OK)

-        async_task.assert_called_once()
+        self.consume_file_mock.assert_called_once()

-        args, kwargs = async_task.call_args
+        _, overrides = self.get_last_consume_delay_call_args()

-        self.assertEqual(kwargs["override_created"], created)
+        self.assertEqual(overrides.created, created)

-    @mock.patch("documents.views.consume_file.delay")
-    def test_upload_with_asn(self, m):
+    def test_upload_with_asn(self):

-        m.return_value = celery.result.AsyncResult(id=str(uuid.uuid4()))
+        self.consume_file_mock.return_value = celery.result.AsyncResult(
+            id=str(uuid.uuid4()),
+        )

        with open(
            os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"),
@ -1359,17 +1385,16 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):

        self.assertEqual(response.status_code, status.HTTP_200_OK)

-        m.assert_called_once()
+        self.consume_file_mock.assert_called_once()

-        args, kwargs = m.call_args
-        file_path = Path(args[0])
-        self.assertEqual(file_path.name, "simple.pdf")
-        self.assertIn(Path(settings.SCRATCH_DIR), file_path.parents)
-        self.assertIsNone(kwargs["override_title"])
-        self.assertIsNone(kwargs["override_correspondent_id"])
-        self.assertIsNone(kwargs["override_document_type_id"])
-        self.assertIsNone(kwargs["override_tag_ids"])
-        self.assertEqual(500, kwargs["override_archive_serial_num"])
+        input_doc, overrides = self.get_last_consume_delay_call_args()
+
+        self.assertEqual(input_doc.original_file.name, "simple.pdf")
+        self.assertEqual(overrides.filename, "simple.pdf")
+        self.assertIsNone(overrides.correspondent_id)
+        self.assertIsNone(overrides.document_type_id)
+        self.assertIsNone(overrides.tag_ids)
+        self.assertEqual(500, overrides.asn)

    def test_get_metadata(self):
        doc = Document.objects.create(
--- a/src/documents/tests/test_barcodes.py
+++ b/src/documents/tests/test_barcodes.py
@ -10,6 +10,9 @@ from django.test import TestCase
 from documents import barcodes
 from documents import tasks
 from documents.consumer import ConsumerError
+from documents.data_models import ConsumableDocument
+from documents.data_models import DocumentMetadataOverrides
+from documents.data_models import DocumentSource
 from documents.tests.utils import DirectoriesMixin
 from documents.tests.utils import FileSystemAssertsMixin
 from PIL import Image
@ -183,46 +186,14 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
        img = Image.open(test_file)
        self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM BARCODE"])

-    def test_get_mime_type(self):
-        """
-        GIVEN:
-            -
-        WHEN:
-            -
-        THEN:
-            -
-        """
-        tiff_file = self.SAMPLE_DIR / "simple.tiff"
-
-        pdf_file = self.SAMPLE_DIR / "simple.pdf"
-
-        png_file = self.BARCODE_SAMPLE_DIR / "barcode-128-custom.png"
-
-        tiff_file_no_extension = settings.SCRATCH_DIR / "testfile1"
-        pdf_file_no_extension = settings.SCRATCH_DIR / "testfile2"
-        shutil.copy(tiff_file, tiff_file_no_extension)
-        shutil.copy(pdf_file, pdf_file_no_extension)
-
-        self.assertEqual(barcodes.get_file_mime_type(tiff_file), "image/tiff")
-        self.assertEqual(barcodes.get_file_mime_type(pdf_file), "application/pdf")
-        self.assertEqual(
-            barcodes.get_file_mime_type(tiff_file_no_extension),
-            "image/tiff",
-        )
-        self.assertEqual(
-            barcodes.get_file_mime_type(pdf_file_no_extension),
-            "application/pdf",
-        )
-        self.assertEqual(barcodes.get_file_mime_type(png_file), "image/png")
-
    def test_convert_from_tiff_to_pdf(self):
        """
        GIVEN:
-            -
+            - Multi-page TIFF image
        WHEN:
-            -
+            - Conversion to PDF
        THEN:
-            -
+            - The file converts without error
        """
        test_file = self.SAMPLE_DIR / "simple.tiff"

@ -233,34 +204,20 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
        self.assertIsFile(target_file)
        self.assertEqual(target_file.suffix, ".pdf")

-    def test_convert_error_from_pdf_to_pdf(self):
-        """
-        GIVEN:
-            -
-        WHEN:
-            -
-        THEN:
-            -
-        """
-        test_file = self.SAMPLE_DIR / "simple.pdf"
-
-        dst = settings.SCRATCH_DIR / "simple.pdf"
-        shutil.copy(test_file, dst)
-        self.assertIsNone(barcodes.convert_from_tiff_to_pdf(dst))
-
    def test_scan_file_for_separating_barcodes(self):
        """
        GIVEN:
-            -
+            - PDF containing barcodes
        WHEN:
-            -
+            - File is scanned for barcodes
        THEN:
-            -
+            - Correct page index located
        """
        test_file = self.BARCODE_SAMPLE_DIR / "patch-code-t.pdf"

        doc_barcode_info = barcodes.scan_file_for_barcodes(
            test_file,
+            "application/pdf",
        )
        separator_page_numbers = barcodes.get_separating_barcodes(
            doc_barcode_info.barcodes,
@ -272,15 +229,17 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
    def test_scan_file_for_separating_barcodes_none_present(self):
        """
        GIVEN:
-            -
+            - File with no barcodes
        WHEN:
-            -
+            - File is scanned
        THEN:
-            -
+            - No barcodes detected
+            - No pages to split on
        """
        test_file = self.SAMPLE_DIR / "simple.pdf"
        doc_barcode_info = barcodes.scan_file_for_barcodes(
            test_file,
+            "application/pdf",
        )
        separator_page_numbers = barcodes.get_separating_barcodes(
            doc_barcode_info.barcodes,
@ -302,6 +261,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):

        doc_barcode_info = barcodes.scan_file_for_barcodes(
            test_file,
+            "application/pdf",
        )
        separator_page_numbers = barcodes.get_separating_barcodes(
            doc_barcode_info.barcodes,
@ -323,6 +283,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):

        doc_barcode_info = barcodes.scan_file_for_barcodes(
            test_file,
+            "application/pdf",
        )
        separator_page_numbers = barcodes.get_separating_barcodes(
            doc_barcode_info.barcodes,
@ -345,6 +306,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):

        doc_barcode_info = barcodes.scan_file_for_barcodes(
            test_file,
+            "application/pdf",
        )
        separator_page_numbers = barcodes.get_separating_barcodes(
            doc_barcode_info.barcodes,
@ -366,6 +328,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):

        doc_barcode_info = barcodes.scan_file_for_barcodes(
            test_file,
+            "application/pdf",
        )
        separator_page_numbers = barcodes.get_separating_barcodes(
            doc_barcode_info.barcodes,
@ -388,6 +351,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):

        doc_barcode_info = barcodes.scan_file_for_barcodes(
            test_file,
+            "application/pdf",
        )
        separator_page_numbers = barcodes.get_separating_barcodes(
            doc_barcode_info.barcodes,
@ -411,6 +375,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):

        doc_barcode_info = barcodes.scan_file_for_barcodes(
            test_file,
+            "application/pdf",
        )
        separator_page_numbers = barcodes.get_separating_barcodes(
            doc_barcode_info.barcodes,
@ -435,6 +400,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):

        doc_barcode_info = barcodes.scan_file_for_barcodes(
            test_file,
+            "application/pdf",
        )
        separator_page_numbers = barcodes.get_separating_barcodes(
            doc_barcode_info.barcodes,
@ -459,6 +425,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):

        doc_barcode_info = barcodes.scan_file_for_barcodes(
            test_file,
+            "application/pdf",
        )
        separator_page_numbers = barcodes.get_separating_barcodes(
            doc_barcode_info.barcodes,
@ -482,6 +449,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):

        doc_barcode_info = barcodes.scan_file_for_barcodes(
            test_file,
+            "application/pdf",
        )
        separator_page_numbers = barcodes.get_separating_barcodes(
            doc_barcode_info.barcodes,
@ -504,6 +472,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):

        doc_barcode_info = barcodes.scan_file_for_barcodes(
            test_file,
+            "application/pdf",
        )
        separator_page_numbers = barcodes.get_separating_barcodes(
            doc_barcode_info.barcodes,
@ -636,6 +605,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):

        doc_barcode_info = barcodes.scan_file_for_barcodes(
            test_file,
+            "application/pdf",
        )
        separator_page_numbers = barcodes.get_separating_barcodes(
            doc_barcode_info.barcodes,
@ -673,7 +643,16 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
        shutil.copy(test_file, dst)

        with mock.patch("documents.tasks.async_to_sync"):
-            self.assertEqual(tasks.consume_file(dst), "File successfully split")
+            self.assertEqual(
+                tasks.consume_file(
+                    ConsumableDocument(
+                        source=DocumentSource.ConsumeFolder,
+                        original_file=dst,
+                    ),
+                    None,
+                ),
+                "File successfully split",
+            )

    @override_settings(
        CONSUMER_ENABLE_BARCODES=True,
@ -694,7 +673,17 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
        shutil.copy(test_file, dst)

        with mock.patch("documents.tasks.async_to_sync"):
-            self.assertEqual(tasks.consume_file(dst), "File successfully split")
+            self.assertEqual(
+                tasks.consume_file(
+                    ConsumableDocument(
+                        source=DocumentSource.ConsumeFolder,
+                        original_file=dst,
+                    ),
+                    None,
+                ),
+                "File successfully split",
+            )
+        self.assertFalse(dst.exists())

    @override_settings(
        CONSUMER_ENABLE_BARCODES=True,
@ -717,7 +706,16 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
        shutil.copy(test_file, dst)

        with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
-            self.assertIn("Success", tasks.consume_file(dst))
+            self.assertIn(
+                "Success",
+                tasks.consume_file(
+                    ConsumableDocument(
+                        source=DocumentSource.ConsumeFolder,
+                        original_file=dst,
+                    ),
+                    None,
+                ),
+            )

        self.assertListEqual(
            cm.output,
@ -754,7 +752,17 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
        shutil.copy(test_file, dst)

        with mock.patch("documents.tasks.async_to_sync"):
-            self.assertEqual(tasks.consume_file(dst), "File successfully split")
+            self.assertEqual(
+                tasks.consume_file(
+                    ConsumableDocument(
+                        source=DocumentSource.ConsumeFolder,
+                        original_file=dst,
+                    ),
+                    None,
+                ),
+                "File successfully split",
+            )
+        self.assertFalse(dst.exists())

    def test_scan_file_for_separating_barcodes_password(self):
        """
@ -769,6 +777,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
        with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
            doc_barcode_info = barcodes.scan_file_for_barcodes(
                test_file,
+                "application/pdf",
            )
            warning = cm.output[0]
            expected_str = "WARNING:paperless.barcodes:File is likely password protected, not checking for barcodes"
@ -798,6 +807,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):

        doc_barcode_info = barcodes.scan_file_for_barcodes(
            test_file,
+            "application/pdf",
        )
        separator_page_numbers = barcodes.get_separating_barcodes(
            doc_barcode_info.barcodes,
@ -835,6 +845,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):

        doc_barcode_info = barcodes.scan_file_for_barcodes(
            test_file,
+            "application/pdf",
        )
        separator_page_numbers = barcodes.get_separating_barcodes(
            doc_barcode_info.barcodes,
@ -855,7 +866,7 @@ class TestBarcode(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
        self.assertEqual(len(document_list), 5)


-class TestAsnBarcodes(DirectoriesMixin, TestCase):
+class TestAsnBarcode(DirectoriesMixin, TestCase):

    SAMPLE_DIR = Path(__file__).parent / "samples"

@ -923,6 +934,7 @@ class TestAsnBarcodes(DirectoriesMixin, TestCase):

        doc_barcode_info = barcodes.scan_file_for_barcodes(
            test_file,
+            "application/pdf",
        )
        asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes)

@ -944,6 +956,7 @@ class TestAsnBarcodes(DirectoriesMixin, TestCase):

        doc_barcode_info = barcodes.scan_file_for_barcodes(
            test_file,
+            "application/pdf",
        )

        asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes)
@ -970,7 +983,13 @@ class TestAsnBarcodes(DirectoriesMixin, TestCase):
        shutil.copy(test_file, dst)

        with mock.patch("documents.consumer.Consumer.try_consume_file") as mocked_call:
-            tasks.consume_file(dst)
+            tasks.consume_file(
+                ConsumableDocument(
+                    source=DocumentSource.ConsumeFolder,
+                    original_file=dst,
+                ),
+                None,
+            )

            args, kwargs = mocked_call.call_args

@ -991,6 +1010,7 @@ class TestAsnBarcodes(DirectoriesMixin, TestCase):

        doc_barcode_info = barcodes.scan_file_for_barcodes(
            test_file,
+            "application/pdf",
        )
        asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes)

@ -1010,6 +1030,7 @@ class TestAsnBarcodes(DirectoriesMixin, TestCase):

        doc_barcode_info = barcodes.scan_file_for_barcodes(
            test_file,
+            "application/pdf",
        )
        asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes)

@ -1032,12 +1053,17 @@ class TestAsnBarcodes(DirectoriesMixin, TestCase):
        dst = self.dirs.scratch_dir / "barcode-128-asn-too-large.pdf"
        shutil.copy(src, dst)

+        input_doc = ConsumableDocument(
+            source=DocumentSource.ConsumeFolder,
+            original_file=dst,
+        )
+
        with mock.patch("documents.consumer.Consumer._send_progress"):
            self.assertRaisesMessage(
                ConsumerError,
                "Given ASN 4294967296 is out of range [0, 4,294,967,295]",
                tasks.consume_file,
-                dst,
+                input_doc,
            )


@ -1055,5 +1081,5 @@ class TestBarcodeZxing(TestBarcode):
    reason="No zxingcpp",
 )
@override_settings(CONSUMER_BARCODE_SCANNER="ZXING")
-class TestAsnBarcodesZxing(TestAsnBarcodes):
+class TestAsnBarcodesZxing(TestAsnBarcode):
    pass
--- a/src/documents/tests/test_management_consumer.py
+++ b/src/documents/tests/test_management_consumer.py
@ -1,6 +1,7 @@
 import filecmp
 import os
 import shutil
+from pathlib import Path
 from threading import Thread
 from time import sleep
 from unittest import mock
@ -11,9 +12,12 @@ from django.core.management import CommandError
 from django.test import override_settings
 from django.test import TransactionTestCase
 from documents.consumer import ConsumerError
+from documents.data_models import ConsumableDocument
+from documents.data_models import DocumentMetadataOverrides
 from documents.management.commands import document_consumer
 from documents.models import Tag
 from documents.tests.utils import DirectoriesMixin
+from documents.tests.utils import DocumentConsumeDelayMixin


 class ConsumerThread(Thread):
@ -35,18 +39,19 @@ def chunked(size, source):
        yield source[i : i + size]


-class ConsumerMixin:
+class ConsumerThreadMixin(DocumentConsumeDelayMixin):
+    """
+    Provides a thread which runs the consumer management command at setUp
+    and stops it at tearDown
+    """

-    sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
+    sample_file: Path = (
+        Path(__file__).parent / Path("samples") / Path("simple.pdf")
+    ).resolve()

    def setUp(self) -> None:
        super().setUp()
        self.t = None
-        patcher = mock.patch(
-            "documents.tasks.consume_file.delay",
-        )
-        self.task_mock = patcher.start()
-        self.addCleanup(patcher.stop)

    def t_start(self):
        self.t = ConsumerThread()
@ -67,7 +72,7 @@ class ConsumerMixin:
    def wait_for_task_mock_call(self, expected_call_count=1):
        n = 0
        while n < 50:
-            if self.task_mock.call_count >= expected_call_count:
+            if self.consume_file_mock.call_count >= expected_call_count:
                # give task_mock some time to finish and raise errors
                sleep(1)
                return
@ -76,8 +81,12 @@ class ConsumerMixin:

    # A bogus async_task that will simply check the file for
    # completeness and raise an exception otherwise.
-    def bogus_task(self, filename, **kwargs):
-        eq = filecmp.cmp(filename, self.sample_file, shallow=False)
+    def bogus_task(
+        self,
+        input_doc: ConsumableDocument,
+        overrides=None,
+    ):
+        eq = filecmp.cmp(input_doc.original_file, self.sample_file, shallow=False)
        if not eq:
            print("Consumed an INVALID file.")
            raise ConsumerError("Incomplete File READ FAILED")
@ -103,19 +112,20 @@ class ConsumerMixin:
@override_settings(
    CONSUMER_INOTIFY_DELAY=0.01,
 )
-class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
+class TestConsumer(DirectoriesMixin, ConsumerThreadMixin, TransactionTestCase):
    def test_consume_file(self):
        self.t_start()

-        f = os.path.join(self.dirs.consumption_dir, "my_file.pdf")
+        f = Path(os.path.join(self.dirs.consumption_dir, "my_file.pdf"))
        shutil.copy(self.sample_file, f)

        self.wait_for_task_mock_call()

-        self.task_mock.assert_called_once()
+        self.consume_file_mock.assert_called_once()

-        args, kwargs = self.task_mock.call_args
-        self.assertEqual(args[0], f)
+        input_doc, _ = self.get_last_consume_delay_call_args()
+
+        self.assertEqual(input_doc.original_file, f)

    def test_consume_file_invalid_ext(self):
        self.t_start()
@ -125,26 +135,27 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):

        self.wait_for_task_mock_call()

-        self.task_mock.assert_not_called()
+        self.consume_file_mock.assert_not_called()

    def test_consume_existing_file(self):
-        f = os.path.join(self.dirs.consumption_dir, "my_file.pdf")
+        f = Path(os.path.join(self.dirs.consumption_dir, "my_file.pdf"))
        shutil.copy(self.sample_file, f)

        self.t_start()
-        self.task_mock.assert_called_once()
+        self.consume_file_mock.assert_called_once()

-        args, kwargs = self.task_mock.call_args
-        self.assertEqual(args[0], f)
+        input_doc, _ = self.get_last_consume_delay_call_args()
+
+        self.assertEqual(input_doc.original_file, f)

    @mock.patch("documents.management.commands.document_consumer.logger.error")
    def test_slow_write_pdf(self, error_logger):

-        self.task_mock.side_effect = self.bogus_task
+        self.consume_file_mock.side_effect = self.bogus_task

        self.t_start()

-        fname = os.path.join(self.dirs.consumption_dir, "my_file.pdf")
+        fname = Path(os.path.join(self.dirs.consumption_dir, "my_file.pdf"))

        self.slow_write_file(fname)

@ -152,48 +163,52 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):

        error_logger.assert_not_called()

-        self.task_mock.assert_called_once()
+        self.consume_file_mock.assert_called_once()

-        args, kwargs = self.task_mock.call_args
-        self.assertEqual(args[0], fname)
+        input_doc, _ = self.get_last_consume_delay_call_args()
+
+        self.assertEqual(input_doc.original_file, fname)

    @mock.patch("documents.management.commands.document_consumer.logger.error")
    def test_slow_write_and_move(self, error_logger):

-        self.task_mock.side_effect = self.bogus_task
+        self.consume_file_mock.side_effect = self.bogus_task

        self.t_start()

-        fname = os.path.join(self.dirs.consumption_dir, "my_file.~df")
-        fname2 = os.path.join(self.dirs.consumption_dir, "my_file.pdf")
+        fname = Path(os.path.join(self.dirs.consumption_dir, "my_file.~df"))
+        fname2 = Path(os.path.join(self.dirs.consumption_dir, "my_file.pdf"))

        self.slow_write_file(fname)
        shutil.move(fname, fname2)

        self.wait_for_task_mock_call()

-        self.task_mock.assert_called_once()
+        self.consume_file_mock.assert_called_once()

-        args, kwargs = self.task_mock.call_args
-        self.assertEqual(args[0], fname2)
+        input_doc, _ = self.get_last_consume_delay_call_args()
+
+        self.assertEqual(input_doc.original_file, fname2)

        error_logger.assert_not_called()

    @mock.patch("documents.management.commands.document_consumer.logger.error")
    def test_slow_write_incomplete(self, error_logger):

-        self.task_mock.side_effect = self.bogus_task
+        self.consume_file_mock.side_effect = self.bogus_task

        self.t_start()

-        fname = os.path.join(self.dirs.consumption_dir, "my_file.pdf")
+        fname = Path(os.path.join(self.dirs.consumption_dir, "my_file.pdf"))
        self.slow_write_file(fname, incomplete=True)

        self.wait_for_task_mock_call()

-        self.task_mock.assert_called_once()
-        args, kwargs = self.task_mock.call_args
-        self.assertEqual(args[0], fname)
+        self.consume_file_mock.assert_called_once()
+
+        input_doc, _ = self.get_last_consume_delay_call_args()
+
+        self.assertEqual(input_doc.original_file, fname)

        # assert that we have an error logged with this invalid file.
        error_logger.assert_called_once()
@ -209,7 +224,7 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
        self.assertRaises(CommandError, call_command, "document_consumer", "--oneshot")

    def test_mac_write(self):
-        self.task_mock.side_effect = self.bogus_task
+        self.consume_file_mock.side_effect = self.bogus_task

        self.t_start()

@ -238,12 +253,13 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):

        self.wait_for_task_mock_call(expected_call_count=2)

-        self.assertEqual(2, self.task_mock.call_count)
+        self.assertEqual(2, self.consume_file_mock.call_count)

-        fnames = [
-            os.path.basename(args[0]) for args, _ in self.task_mock.call_args_list
-        ]
-        self.assertCountEqual(fnames, ["my_file.pdf", "my_second_file.pdf"])
+        consumed_files = []
+        for input_doc, _ in self.get_all_consume_delay_call_args():
+            consumed_files.append(input_doc.original_file.name)
+
+        self.assertCountEqual(consumed_files, ["my_file.pdf", "my_second_file.pdf"])

    def test_is_ignored(self):
        test_paths = [
@ -341,7 +357,7 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):

        self.wait_for_task_mock_call()

-        self.task_mock.assert_not_called()
+        self.consume_file_mock.assert_not_called()


@override_settings(
@ -373,7 +389,7 @@ class TestConsumerRecursivePolling(TestConsumer):
    pass


-class TestConsumerTags(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
+class TestConsumerTags(DirectoriesMixin, ConsumerThreadMixin, TransactionTestCase):
    @override_settings(CONSUMER_RECURSIVE=True, CONSUMER_SUBDIRS_AS_TAGS=True)
    def test_consume_file_with_path_tags(self):

@ -387,7 +403,7 @@ class TestConsumerTags(DirectoriesMixin, ConsumerMixin, TransactionTestCase):

        path = os.path.join(self.dirs.consumption_dir, *tag_names)
        os.makedirs(path, exist_ok=True)
-        f = os.path.join(path, "my_file.pdf")
+        f = Path(os.path.join(path, "my_file.pdf"))
        # Wait at least inotify read_delay for recursive watchers
        # to be created for the new directories
        sleep(1)
@ -395,18 +411,19 @@ class TestConsumerTags(DirectoriesMixin, ConsumerMixin, TransactionTestCase):

        self.wait_for_task_mock_call()

-        self.task_mock.assert_called_once()
+        self.consume_file_mock.assert_called_once()

        # Add the pk of the Tag created by _consume()
        tag_ids.append(Tag.objects.get(name=tag_names[1]).pk)

-        args, kwargs = self.task_mock.call_args
-        self.assertEqual(args[0], f)
+        input_doc, overrides = self.get_last_consume_delay_call_args()
+
+        self.assertEqual(input_doc.original_file, f)

        # assertCountEqual has a bad name, but test that the first
        # sequence contains the same elements as second, regardless of
        # their order.
-        self.assertCountEqual(kwargs["override_tag_ids"], tag_ids)
+        self.assertCountEqual(overrides.tag_ids, tag_ids)

    @override_settings(
        CONSUMER_POLLING=1,
--- a/src/documents/tests/test_task_signals.py
+++ b/src/documents/tests/test_task_signals.py
@ -1,76 +1,21 @@
+import uuid
+from unittest import mock
+
 import celery
 from django.test import TestCase
+from documents.data_models import ConsumableDocument
+from documents.data_models import DocumentMetadataOverrides
+from documents.data_models import DocumentSource
 from documents.models import PaperlessTask
 from documents.signals.handlers import before_task_publish_handler
 from documents.signals.handlers import task_postrun_handler
 from documents.signals.handlers import task_prerun_handler
+from documents.tests.test_consumer import fake_magic_from_file
 from documents.tests.utils import DirectoriesMixin


+@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
 class TestTaskSignalHandler(DirectoriesMixin, TestCase):
-
-    HEADERS_CONSUME = {
-        "lang": "py",
-        "task": "documents.tasks.consume_file",
-        "id": "52d31e24-9dcc-4c32-9e16-76007e9add5e",
-        "shadow": None,
-        "eta": None,
-        "expires": None,
-        "group": None,
-        "group_index": None,
-        "retries": 0,
-        "timelimit": [None, None],
-        "root_id": "52d31e24-9dcc-4c32-9e16-76007e9add5e",
-        "parent_id": None,
-        "argsrepr": "('/consume/hello-999.pdf',)",
-        "kwargsrepr": "{'override_tag_ids': None}",
-        "origin": "gen260@paperless-ngx-dev-webserver",
-        "ignore_result": False,
-    }
-
-    BODY_CONSUME = (
-        # args
-        ("/consume/hello-999.pdf",),
-        # kwargs
-        {"override_tag_ids": None},
-        {"callbacks": None, "errbacks": None, "chain": None, "chord": None},
-    )
-
-    HEADERS_WEB_UI = {
-        "lang": "py",
-        "task": "documents.tasks.consume_file",
-        "id": "6e88a41c-e5f8-4631-9972-68c314512498",
-        "shadow": None,
-        "eta": None,
-        "expires": None,
-        "group": None,
-        "group_index": None,
-        "retries": 0,
-        "timelimit": [None, None],
-        "root_id": "6e88a41c-e5f8-4631-9972-68c314512498",
-        "parent_id": None,
-        "argsrepr": "('/tmp/paperless/paperless-upload-st9lmbvx',)",
-        "kwargsrepr": "{'override_filename': 'statement.pdf', 'override_title': None, 'override_correspondent_id': None, 'override_document_type_id': None, 'override_tag_ids': None, 'task_id': 'f5622ca9-3707-4ed0-b418-9680b912572f', 'override_created': None}",
-        "origin": "gen342@paperless-ngx-dev-webserver",
-        "ignore_result": False,
-    }
-
-    BODY_WEB_UI = (
-        # args
-        ("/tmp/paperless/paperless-upload-st9lmbvx",),
-        # kwargs
-        {
-            "override_filename": "statement.pdf",
-            "override_title": None,
-            "override_correspondent_id": None,
-            "override_document_type_id": None,
-            "override_tag_ids": None,
-            "task_id": "f5622ca9-3707-4ed0-b418-9680b912572f",
-            "override_created": None,
-        },
-        {"callbacks": None, "errbacks": None, "chain": None, "chord": None},
-    )
-
    def util_call_before_task_publish_handler(self, headers_to_use, body_to_use):
        """
        Simple utility to call the pre-run handle and ensure it created a single task
@ -91,41 +36,36 @@ class TestTaskSignalHandler(DirectoriesMixin, TestCase):
        THEN:
            - The task is created and marked as pending
        """
+        headers = {
+            "id": str(uuid.uuid4()),
+            "task": "documents.tasks.consume_file",
+        }
+        body = (
+            # args
+            (
+                ConsumableDocument(
+                    source=DocumentSource.ConsumeFolder,
+                    original_file="/consume/hello-999.pdf",
+                ),
+                None,
+            ),
+            # kwargs
+            {},
+            # celery stuff
+            {"callbacks": None, "errbacks": None, "chain": None, "chord": None},
+        )
        self.util_call_before_task_publish_handler(
-            headers_to_use=self.HEADERS_CONSUME,
-            body_to_use=self.BODY_CONSUME,
+            headers_to_use=headers,
+            body_to_use=body,
        )

        task = PaperlessTask.objects.get()
        self.assertIsNotNone(task)
-        self.assertEqual(self.HEADERS_CONSUME["id"], task.task_id)
+        self.assertEqual(headers["id"], task.task_id)
        self.assertEqual("hello-999.pdf", task.task_file_name)
        self.assertEqual("documents.tasks.consume_file", task.task_name)
        self.assertEqual(celery.states.PENDING, task.status)

-    def test_before_task_publish_handler_webui(self):
-        """
-        GIVEN:
-            - A celery task is started via the web ui
-        WHEN:
-            - Task before publish handler is called
-        THEN:
-            - The task is created and marked as pending
-        """
-        self.util_call_before_task_publish_handler(
-            headers_to_use=self.HEADERS_WEB_UI,
-            body_to_use=self.BODY_WEB_UI,
-        )
-
-        task = PaperlessTask.objects.get()
-
-        self.assertIsNotNone(task)
-
-        self.assertEqual(self.HEADERS_WEB_UI["id"], task.task_id)
-        self.assertEqual("statement.pdf", task.task_file_name)
-        self.assertEqual("documents.tasks.consume_file", task.task_name)
-        self.assertEqual(celery.states.PENDING, task.status)
-
    def test_task_prerun_handler(self):
        """
        GIVEN:
@ -135,12 +75,32 @@ class TestTaskSignalHandler(DirectoriesMixin, TestCase):
        THEN:
            - The task is marked as started
        """
-        self.util_call_before_task_publish_handler(
-            headers_to_use=self.HEADERS_CONSUME,
-            body_to_use=self.BODY_CONSUME,
+
+        headers = {
+            "id": str(uuid.uuid4()),
+            "task": "documents.tasks.consume_file",
+        }
+        body = (
+            # args
+            (
+                ConsumableDocument(
+                    source=DocumentSource.ConsumeFolder,
+                    original_file="/consume/hello-99.pdf",
+                ),
+                None,
+            ),
+            # kwargs
+            {},
+            # celery stuff
+            {"callbacks": None, "errbacks": None, "chain": None, "chord": None},
        )

-        task_prerun_handler(task_id=self.HEADERS_CONSUME["id"])
+        self.util_call_before_task_publish_handler(
+            headers_to_use=headers,
+            body_to_use=body,
+        )
+
+        task_prerun_handler(task_id=headers["id"])

        task = PaperlessTask.objects.get()

@ -155,13 +115,31 @@ class TestTaskSignalHandler(DirectoriesMixin, TestCase):
        THEN:
            - The task is marked as started
        """
+        headers = {
+            "id": str(uuid.uuid4()),
+            "task": "documents.tasks.consume_file",
+        }
+        body = (
+            # args
+            (
+                ConsumableDocument(
+                    source=DocumentSource.ConsumeFolder,
+                    original_file="/consume/hello-9.pdf",
+                ),
+                None,
+            ),
+            # kwargs
+            {},
+            # celery stuff
+            {"callbacks": None, "errbacks": None, "chain": None, "chord": None},
+        )
        self.util_call_before_task_publish_handler(
-            headers_to_use=self.HEADERS_CONSUME,
-            body_to_use=self.BODY_CONSUME,
+            headers_to_use=headers,
+            body_to_use=body,
        )

        task_postrun_handler(
-            task_id=self.HEADERS_CONSUME["id"],
+            task_id=headers["id"],
            retval="Success. New document id 1 created",
            state=celery.states.SUCCESS,
        )
--- a/src/documents/tests/utils.py
+++ b/src/documents/tests/utils.py
@ -4,6 +4,8 @@ from collections import namedtuple
 from contextlib import contextmanager
 from os import PathLike
 from pathlib import Path
+from typing import Iterator
+from typing import Tuple
 from typing import Union
 from unittest import mock

@ -12,6 +14,8 @@ from django.db import connection
 from django.db.migrations.executor import MigrationExecutor
 from django.test import override_settings
 from django.test import TransactionTestCase
+from documents.data_models import ConsumableDocument
+from documents.data_models import DocumentMetadataOverrides


 def setup_directories():
@ -116,6 +120,11 @@ class ConsumerProgressMixin:


 class DocumentConsumeDelayMixin:
+    """
+    Provides mocking of the consume_file asynchronous task and useful utilities
+    for decoding its arguments
+    """
+
    def setUp(self) -> None:
        self.consume_file_patcher = mock.patch("documents.tasks.consume_file.delay")
        self.consume_file_mock = self.consume_file_patcher.start()
@ -125,6 +134,47 @@ class DocumentConsumeDelayMixin:
        super().tearDown()
        self.consume_file_patcher.stop()

+    def get_last_consume_delay_call_args(
+        self,
+    ) -> Tuple[ConsumableDocument, DocumentMetadataOverrides]:
+        """
+        Returns the most recent arguments to the async task
+        """
+        # Must be at least 1 call
+        self.consume_file_mock.assert_called()
+
+        args, _ = self.consume_file_mock.call_args
+        input_doc, overrides = args
+
+        return (input_doc, overrides)
+
+    def get_all_consume_delay_call_args(
+        self,
+    ) -> Iterator[Tuple[ConsumableDocument, DocumentMetadataOverrides]]:
+        """
+        Iterates over all calls to the async task and returns the arguments
+        """
+
+        for args, _ in self.consume_file_mock.call_args_list:
+            input_doc, overrides = args
+
+            yield (input_doc, overrides)
+
+    def get_specific_consume_delay_call_args(
+        self,
+        index: int,
+    ) -> Iterator[Tuple[ConsumableDocument, DocumentMetadataOverrides]]:
+        """
+        Returns the arguments of a specific call to the async task
+        """
+        # Must be at least 1 call
+        self.consume_file_mock.assert_called()
+
+        args, _ = self.consume_file_mock.call_args_list[index]
+        input_doc, overrides = args
+
+        return (input_doc, overrides)
+

 class TestMigrations(TransactionTestCase):
    @property
@ -140,7 +190,7 @@ class TestMigrations(TransactionTestCase):

        assert (
            self.migrate_from and self.migrate_to
-        ), "TestCase '{}' must define migrate_from and migrate_to     properties".format(
+        ), "TestCase '{}' must define migrate_from and migrate_to properties".format(
            type(self).__name__,
        )
        self.migrate_from = [(self.app, self.migrate_from)]
--- a/src/documents/views.py
+++ b/src/documents/views.py
@ -5,7 +5,6 @@ import os
 import re
 import tempfile
 import urllib
-import uuid
 import zipfile
 from datetime import datetime
 from pathlib import Path
@ -65,6 +64,9 @@ from .bulk_download import ArchiveOnlyStrategy
 from .bulk_download import OriginalAndArchiveStrategy
 from .bulk_download import OriginalsOnlyStrategy
 from .classifier import load_classifier
+from .data_models import ConsumableDocument
+from .data_models import DocumentMetadataOverrides
+from .data_models import DocumentSource
 from .filters import CorrespondentFilterSet
 from .filters import DocumentFilterSet
 from .filters import DocumentTypeFilterSet
@ -692,19 +694,24 @@ class PostDocumentView(GenericAPIView):

        os.utime(temp_file_path, times=(t, t))

-        task_id = str(uuid.uuid4())
+        input_doc = ConsumableDocument(
+            source=DocumentSource.ApiUpload,
+            original_file=temp_file_path,
+        )
+        input_doc_overrides = DocumentMetadataOverrides(
+            filename=doc_name,
+            title=title,
+            correspondent_id=correspondent_id,
+            document_type_id=document_type_id,
+            tag_ids=tag_ids,
+            created=created,
+            asn=archive_serial_number,
+            owner_id=request.user.id,
+        )

        async_task = consume_file.delay(
-            # Paths are not JSON friendly
-            str(temp_file_path),
-            override_title=title,
-            override_correspondent_id=correspondent_id,
-            override_document_type_id=document_type_id,
-            override_tag_ids=tag_ids,
-            task_id=task_id,
-            override_created=created,
-            override_owner_id=request.user.id,
-            override_archive_serial_num=archive_serial_number,
+            input_doc,
+            input_doc_overrides,
        )

        return Response(async_task.id)
--- a/src/paperless_mail/mail.py
+++ b/src/paperless_mail/mail.py
@ -21,6 +21,9 @@ from django.conf import settings
 from django.db import DatabaseError
 from django.utils.timezone import is_naive
 from django.utils.timezone import make_aware
+from documents.data_models import ConsumableDocument
+from documents.data_models import DocumentMetadataOverrides
+from documents.data_models import DocumentSource
 from documents.loggers import LoggingMixin
 from documents.models import Correspondent
 from documents.parsers import is_mime_type_supported
@ -694,18 +697,22 @@ class MailAccountHandler(LoggingMixin):
                    f"{message.subject} from {message.from_}",
                )

+                input_doc = ConsumableDocument(
+                    source=DocumentSource.MailFetch,
+                    original_file=temp_filename,
+                )
+                doc_overrides = DocumentMetadataOverrides(
+                    title=title,
+                    filename=pathvalidate.sanitize_filename(att.filename),
+                    correspondent_id=correspondent.id if correspondent else None,
+                    document_type_id=doc_type.id if doc_type else None,
+                    tag_ids=tag_ids,
+                    owner_id=rule.owner.id if rule.owner else None,
+                )
+
                consume_task = consume_file.s(
-                    path=temp_filename,
-                    override_filename=pathvalidate.sanitize_filename(
-                        att.filename,
-                    ),
-                    override_title=title,
-                    override_correspondent_id=correspondent.id
-                    if correspondent
-                    else None,
-                    override_document_type_id=doc_type.id if doc_type else None,
-                    override_tag_ids=tag_ids,
-                    override_owner_id=rule.owner.id if rule.owner else None,
+                    input_doc,
+                    doc_overrides,
                )

                consume_tasks.append(consume_task)
@ -770,16 +777,22 @@ class MailAccountHandler(LoggingMixin):
            f"{message.subject} from {message.from_}",
        )

+        input_doc = ConsumableDocument(
+            source=DocumentSource.MailFetch,
+            original_file=temp_filename,
+        )
+        doc_overrides = DocumentMetadataOverrides(
+            title=message.subject,
+            filename=pathvalidate.sanitize_filename(f"{message.subject}.eml"),
+            correspondent_id=correspondent.id if correspondent else None,
+            document_type_id=doc_type.id if doc_type else None,
+            tag_ids=tag_ids,
+            owner_id=rule.owner.id if rule.owner else None,
+        )
+
        consume_task = consume_file.s(
-            path=temp_filename,
-            override_filename=pathvalidate.sanitize_filename(
-                message.subject + ".eml",
-            ),
-            override_title=message.subject,
-            override_correspondent_id=correspondent.id if correspondent else None,
-            override_document_type_id=doc_type.id if doc_type else None,
-            override_tag_ids=tag_ids,
-            override_owner_id=rule.owner.id if rule.owner else None,
+            input_doc,
+            doc_overrides,
        )

        queue_consumption_tasks(
--- a/src/paperless_mail/tests/test_mail.py
+++ b/src/paperless_mail/tests/test_mail.py
@ -12,8 +12,11 @@ from unittest import mock
 from django.core.management import call_command
 from django.db import DatabaseError
 from django.test import TestCase
+from documents.data_models import ConsumableDocument
+from documents.data_models import DocumentMetadataOverrides
 from documents.models import Correspondent
 from documents.tests.utils import DirectoriesMixin
+from documents.tests.utils import DocumentConsumeDelayMixin
 from documents.tests.utils import FileSystemAssertsMixin
 from imap_tools import EmailAddress
 from imap_tools import FolderInfo
@ -194,7 +197,11 @@ def fake_magic_from_buffer(buffer, mime=False):


@mock.patch("paperless_mail.mail.magic.from_buffer", fake_magic_from_buffer)
-class TestMail(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
+class TestMail(
+    DirectoriesMixin,
+    FileSystemAssertsMixin,
+    TestCase,
+):
    def setUp(self):
        self._used_uids = set()

@ -409,6 +416,8 @@ class TestMail(DirectoriesMixin, FileSystemAssertsMixin, TestCase):

        self.assertEqual(result, 2)

+        self._queue_consumption_tasks_mock.assert_called()
+
        self.assert_queue_consumption_tasks_call_args(
            [
                [
@ -426,7 +435,7 @@ class TestMail(DirectoriesMixin, FileSystemAssertsMixin, TestCase):

        result = self.mail_account_handler._handle_message(message, rule)

-        self.assertFalse(self._queue_consumption_tasks_mock.called)
+        self._queue_consumption_tasks_mock.assert_not_called()
        self.assertEqual(result, 0)

    def test_handle_unknown_mime_type(self):
@ -541,7 +550,6 @@ class TestMail(DirectoriesMixin, FileSystemAssertsMixin, TestCase):

        for (pattern, matches) in tests:
            with self.subTest(msg=pattern):
-                print(f"PATTERN {pattern}")
                self._queue_consumption_tasks_mock.reset_mock()
                account = MailAccount(name=str(uuid.uuid4()))
                account.save()
@ -855,7 +863,7 @@ class TestMail(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
        self.mail_account_handler.handle_mail_account(account)

        self.bogus_mailbox.folder.list.assert_called_once()
-        self.assertEqual(self._queue_consumption_tasks_mock.call_count, 0)
+        self._queue_consumption_tasks_mock.assert_not_called()

    def test_error_folder_set_error_listing(self):
        """
@ -888,7 +896,7 @@ class TestMail(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
        self.mail_account_handler.handle_mail_account(account)

        self.bogus_mailbox.folder.list.assert_called_once()
-        self.assertEqual(self._queue_consumption_tasks_mock.call_count, 0)
+        self._queue_consumption_tasks_mock.assert_not_called()

    @mock.patch("paperless_mail.mail.MailAccountHandler._get_correspondent")
    def test_error_skip_mail(self, m):
@ -1002,7 +1010,7 @@ class TestMail(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
                self.reset_bogus_mailbox()
                self._queue_consumption_tasks_mock.reset_mock()

-                self.assertEqual(self._queue_consumption_tasks_mock.call_count, 0)
+                self._queue_consumption_tasks_mock.assert_not_called()
                self.assertEqual(len(self.bogus_mailbox.messages), 3)

                self.mail_account_handler.handle_mail_account(account)
@ -1041,7 +1049,7 @@ class TestMail(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
        )

        self.assertEqual(len(self.bogus_mailbox.messages), 3)
-        self.assertEqual(self._queue_consumption_tasks_mock.call_count, 0)
+        self._queue_consumption_tasks_mock.assert_not_called()
        self.assertEqual(len(self.bogus_mailbox.fetch("UNSEEN", False)), 2)

        self.mail_account_handler.handle_mail_account(account)
@ -1148,13 +1156,21 @@ class TestMail(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
                consume_tasks,
                expected_signatures,
            ):
+                input_doc, overrides = consume_task.args
+
                # assert the file exists
-                self.assertIsFile(consume_task.kwargs["path"])
+                self.assertIsFile(input_doc.original_file)

                # assert all expected arguments are present in the signature
                for key, value in expected_signature.items():
-                    self.assertIn(key, consume_task.kwargs)
-                    self.assertEqual(consume_task.kwargs[key], value)
+                    if key == "override_correspondent_id":
+                        self.assertEqual(overrides.correspondent_id, value)
+                    elif key == "override_filename":
+                        self.assertEqual(overrides.filename, value)
+                    elif key == "override_title":
+                        self.assertEqual(overrides.title, value)
+                    else:
+                        self.fail("No match for expected arg")

    def apply_mail_actions(self):
        """