Refactor file consumption task to allow beginnings of a plugin system (#5367)

2026-02-11 23:59:31 -06:00 · 2024-01-13 08:11:14 -08:00
parent 4dbf8d7969
commit 2da5e46386
11 changed files with 767 additions and 531 deletions
--- a/src/documents/double_sided.py
+++ b/src/documents/double_sided.py
@@ -3,127 +3,145 @@ import logging
 import os
 import shutil
 from pathlib import Path
+from typing import Final
+from typing import Optional

 from django.conf import settings
 from pikepdf import Pdf

 from documents.consumer import ConsumerError
 from documents.converters import convert_from_tiff_to_pdf
-from documents.data_models import ConsumableDocument
+from documents.plugins.base import ConsumeTaskPlugin
+from documents.plugins.base import NoCleanupPluginMixin
+from documents.plugins.base import NoSetupPluginMixin
+from documents.plugins.base import StopConsumeTaskError

 logger = logging.getLogger("paperless.double_sided")

 # Hardcoded for now, could be made a configurable setting if needed
-TIMEOUT_MINUTES = 30
+TIMEOUT_MINUTES: Final[int] = 30
+TIMEOUT_SECONDS: Final[int] = TIMEOUT_MINUTES * 60

 # Used by test cases
 STAGING_FILE_NAME = "double-sided-staging.pdf"


-def collate(input_doc: ConsumableDocument) -> str:
-    """
-    Tries to collate pages from 2 single sided scans of a double sided
-    document.
+class CollatePlugin(NoCleanupPluginMixin, NoSetupPluginMixin, ConsumeTaskPlugin):
+    NAME: str = "CollatePlugin"

-    When called with a file, it checks whether or not a staging file
-    exists, if not, the current file is turned into that staging file
-    containing the odd numbered pages.
-
-    If a staging file exists, and it is not too old, the current file is
-    considered to be the second part (the even numbered pages) and it will
-    collate the pages of both, the pages of the second file will be added
-    in reverse order, since the ADF will have scanned the pages from bottom
-    to top.
-
-    Returns a status message on success, or raises a ConsumerError
-    in case of failure.
-    """
-
-    # Make sure scratch dir exists, Consumer might not have run yet
-    settings.SCRATCH_DIR.mkdir(exist_ok=True)
-
-    if input_doc.mime_type == "application/pdf":
-        pdf_file = input_doc.original_file
-    elif (
-        input_doc.mime_type == "image/tiff"
-        and settings.CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT
-    ):
-        pdf_file = convert_from_tiff_to_pdf(
-            input_doc.original_file,
-            settings.SCRATCH_DIR,
-        )
-        input_doc.original_file.unlink()
-    else:
-        raise ConsumerError("Unsupported file type for collation of double-sided scans")
-
-    staging = settings.SCRATCH_DIR / STAGING_FILE_NAME
-
-    valid_staging_exists = False
-    if staging.exists():
-        stats = os.stat(str(staging))
-        # if the file is older than the timeout, we don't consider
-        # it valid
-        if dt.datetime.now().timestamp() - stats.st_mtime > TIMEOUT_MINUTES * 60:
-            logger.warning("Outdated double sided staging file exists, deleting it")
-            os.unlink(str(staging))
-        else:
-            valid_staging_exists = True
-
-    if valid_staging_exists:
-        try:
-            # Collate pages from second PDF in reverse order
-            with Pdf.open(staging) as pdf1, Pdf.open(pdf_file) as pdf2:
-                pdf2.pages.reverse()
-                try:
-                    for i, page in enumerate(pdf2.pages):
-                        pdf1.pages.insert(2 * i + 1, page)
-                except IndexError:
-                    raise ConsumerError(
-                        "This second file (even numbered pages) contains more "
-                        "pages than the first/odd numbered one. This means the "
-                        "two uploaded files don't belong to the same double-"
-                        "sided scan. Please retry, starting with the odd "
-                        "numbered pages again.",
-                    )
-                # Merged file has the same path, but without the
-                # double-sided subdir. Therefore, it is also in the
-                # consumption dir and will be picked up for processing
-                old_file = input_doc.original_file
-                new_file = Path(
-                    *(
-                        part
-                        for part in old_file.with_name(
-                            f"{old_file.stem}-collated.pdf",
-                        ).parts
-                        if part != settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
-                    ),
-                )
-                # If the user didn't create the subdirs yet, do it for them
-                new_file.parent.mkdir(parents=True, exist_ok=True)
-                pdf1.save(new_file)
-            logger.info("Collated documents into new file %s", new_file)
-            return (
-                "Success. Even numbered pages of double sided scan collated "
-                "with odd pages"
-            )
-        finally:
-            # Delete staging and recently uploaded file no matter what.
-            # If any error occurs, the user needs to be able to restart
-            # the process from scratch; after all, the staging file
-            # with the odd numbered pages might be the culprit
-            pdf_file.unlink()
-            staging.unlink()
-
-    else:
-        shutil.move(pdf_file, staging)
-        # update access to modification time so we know if the file
-        # is outdated when another file gets uploaded
-        os.utime(staging, (dt.datetime.now().timestamp(),) * 2)
-        logger.info(
-            "Got scan with odd numbered pages of double-sided scan, moved it to %s",
-            staging,
-        )
+    @property
+    def able_to_run(self) -> bool:
        return (
-            "Received odd numbered pages of double sided scan, waiting up to "
-            f"{TIMEOUT_MINUTES} minutes for even numbered pages"
+            settings.CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED
+            and settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
+            in self.input_doc.original_file.parts
        )
+
+    def run(self) -> Optional[str]:
+        """
+        Tries to collate pages from 2 single sided scans of a double sided
+        document.
+
+        When called with a file, it checks whether or not a staging file
+        exists, if not, the current file is turned into that staging file
+        containing the odd numbered pages.
+
+        If a staging file exists, and it is not too old, the current file is
+        considered to be the second part (the even numbered pages) and it will
+        collate the pages of both, the pages of the second file will be added
+        in reverse order, since the ADF will have scanned the pages from bottom
+        to top.
+
+        Never returns normally: raises StopConsumeTaskError carrying a status
+        message on success, or a ConsumerError in case of failure.
+        """
+
+        if self.input_doc.mime_type == "application/pdf":
+            pdf_file = self.input_doc.original_file
+        elif (
+            self.input_doc.mime_type == "image/tiff"
+            and settings.CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT
+        ):
+            pdf_file = convert_from_tiff_to_pdf(
+                self.input_doc.original_file,
+                self.base_tmp_dir,
+            )
+            self.input_doc.original_file.unlink()
+        else:
+            raise ConsumerError(
+                "Unsupported file type for collation of double-sided scans",
+            )
+
+        staging: Path = settings.SCRATCH_DIR / STAGING_FILE_NAME
+
+        valid_staging_exists = False
+        if staging.exists():
+            stats = staging.stat()
+            # if the file is older than the timeout, we don't consider
+            # it valid
+            if (dt.datetime.now().timestamp() - stats.st_mtime) > TIMEOUT_SECONDS:
+                logger.warning("Outdated double sided staging file exists, deleting it")
+                staging.unlink()
+            else:
+                valid_staging_exists = True
+
+        if valid_staging_exists:
+            try:
+                # Collate pages from second PDF in reverse order
+                with Pdf.open(staging) as pdf1, Pdf.open(pdf_file) as pdf2:
+                    pdf2.pages.reverse()
+                    try:
+                        for i, page in enumerate(pdf2.pages):
+                            pdf1.pages.insert(2 * i + 1, page)
+                    except IndexError:
+                        raise ConsumerError(
+                            "This second file (even numbered pages) contains more "
+                            "pages than the first/odd numbered one. This means the "
+                            "two uploaded files don't belong to the same double-"
+                            "sided scan. Please retry, starting with the odd "
+                            "numbered pages again.",
+                        )
+                    # Merged file has the same path, but without the
+                    # double-sided subdir. Therefore, it is also in the
+                    # consumption dir and will be picked up for processing
+                    old_file = self.input_doc.original_file
+                    new_file = Path(
+                        *(
+                            part
+                            for part in old_file.with_name(
+                                f"{old_file.stem}-collated.pdf",
+                            ).parts
+                            if part
+                            != settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
+                        ),
+                    )
+                    # If the user didn't create the subdirs yet, do it for them
+                    new_file.parent.mkdir(parents=True, exist_ok=True)
+                    pdf1.save(new_file)
+                logger.info("Collated documents into new file %s", new_file)
+                raise StopConsumeTaskError(
+                    "Success. Even numbered pages of double sided scan collated "
+                    "with odd pages",
+                )
+            finally:
+                # Delete staging and recently uploaded file no matter what.
+                # If any error occurs, the user needs to be able to restart
+                # the process from scratch; after all, the staging file
+                # with the odd numbered pages might be the culprit
+                pdf_file.unlink()
+                staging.unlink()
+
+        else:
+            shutil.move(pdf_file, staging)
+            # update access and modification times so we know if the file
+            # is outdated when another file gets uploaded
+            timestamp = dt.datetime.now().timestamp()
+            os.utime(staging, (timestamp, timestamp))
+            logger.info(
+                "Got scan with odd numbered pages of double-sided scan, moved it to %s",
+                staging,
+            )
+            raise StopConsumeTaskError(
+                "Received odd numbered pages of double sided scan, waiting up to "
+                f"{TIMEOUT_MINUTES} minutes for even numbered pages",
+            )