Feature: collate two single-sided multipage scans (#3784)

* Feature: collate two single-sided scans Some ADF only support single-sided scans, making scanning double-sided documents a bit annoying. This new feature enables Paperless to do most of the work, by merging two seperate scans into a single one, collating the even and odd numbered pages. * Documentation: clarify that collation is disabled by default * Apply suggestions from code review Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com> * Address code review remarks * Grammar fixes --------- Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
2026-02-11 23:59:31 -06:00 · 2023-07-24 09:29:04 +02:00
parent 9f5d47c320
commit 8c7554e081
11 changed files with 584 additions and 48 deletions
--- a/src/documents/double_sided.py
+++ b/src/documents/double_sided.py
@@ -0,0 +1,131 @@
+import datetime as dt
+import logging
+import os
+import shutil
+from pathlib import Path
+
+from django.conf import settings
+from pikepdf import Pdf
+
+from documents.consumer import ConsumerError
+from documents.converters import convert_from_tiff_to_pdf
+from documents.data_models import ConsumableDocument
+
+logger = logging.getLogger("paperless.double_sided")
+
+# Hardcoded for now, could be made a configurable setting if needed
+TIMEOUT_MINUTES = 30
+
+# Used by test cases
+STAGING_FILE_NAME = "double-sided-staging.pdf"
+
+
+def collate(input_doc: ConsumableDocument) -> str:
+    """
+    Tries to collate pages from 2 single sided scans of a double sided
+    document.
+
+    When called with a file, it checks whether or not a staging file
+    exists, if not, the current file is turned into that staging file
+    containing the odd numbered pages.
+
+    If a staging file exists, and it is not too old, the current file is
+    considered to be the second part (the even numbered pages) and it will
+    collate the pages of both, the pages of the second file will be added
+    in reverse order, since the ADF will have scanned the pages from bottom
+    to top.
+
+    Returns a status message on succcess, or raises a ConsumerError
+    in case of failure.
+    """
+
+    # Make sure scratch dir exists, Consumer might not have run yet
+    settings.SCRATCH_DIR.mkdir(exist_ok=True)
+
+    if input_doc.mime_type == "application/pdf":
+        pdf_file = input_doc.original_file
+    elif (
+        input_doc.mime_type == "image/tiff"
+        and settings.CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT
+    ):
+        pdf_file = convert_from_tiff_to_pdf(
+            input_doc.original_file,
+            settings.SCRATCH_DIR,
+        )
+        input_doc.original_file.unlink()
+    else:
+        raise ConsumerError("Unsupported file type for collation of double-sided scans")
+
+    staging = settings.SCRATCH_DIR / STAGING_FILE_NAME
+
+    valid_staging_exists = False
+    if staging.exists():
+        stats = os.stat(str(staging))
+        # if the file is older than the timeout, we don't consider
+        # it valid
+        if dt.datetime.now().timestamp() - stats.st_mtime > TIMEOUT_MINUTES * 60:
+            logger.warning("Outdated double sided staging file exists, deleting it")
+            os.unlink(str(staging))
+        else:
+            valid_staging_exists = True
+
+    if valid_staging_exists:
+        try:
+            # Collate pages from second PDF in reverse order
+            with Pdf.open(staging) as pdf1, Pdf.open(pdf_file) as pdf2:
+                pdf2.pages.reverse()
+                try:
+                    for i, page in enumerate(pdf2.pages):
+                        pdf1.pages.insert(2 * i + 1, page)
+                except IndexError:
+                    raise ConsumerError(
+                        "This second file (even numbered pages) contains more "
+                        "pages than the first/odd numbered one. This means the "
+                        "two uploaded files don't belong to the same double-"
+                        "sided scan. Please retry, starting with the odd "
+                        "numbered pages again.",
+                    )
+                # Merged file has the same path, but without the
+                # double-sided subdir. Therefore, it is also in the
+                # consumption dir and will be picked up for processing
+                old_file = input_doc.original_file
+                new_file = Path(
+                    *(
+                        part
+                        for part in old_file.with_name(
+                            f"{old_file.stem}-collated.pdf",
+                        ).parts
+                        if part != settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
+                    ),
+                )
+                # If the user didn't create the subdirs yet, do it for them
+                new_file.parent.mkdir(parents=True, exist_ok=True)
+                pdf1.save(new_file)
+            logger.info("Collated documents into new file %s", new_file)
+            return (
+                "Success. Even numbered pages of double sided scan collated "
+                "with odd pages"
+            )
+        finally:
+            # Delete staging and recently uploaded file no matter what.
+            # If any error occurs, the user needs to be able to restart
+            # the process from scratch; after all, the staging file
+            # with the odd numbered pages might be the culprit
+            pdf_file.unlink()
+            staging.unlink()
+
+    else:
+        # In Python 3.9 move supports Path objects directly,
+        # but for now we have to be compatible with 3.8
+        shutil.move(str(pdf_file), str(staging))
+        # update access to modification time so we know if the file
+        # is outdated when another file gets uploaded
+        os.utime(str(staging), (dt.datetime.now().timestamp(),) * 2)
+        logger.info(
+            "Got scan with odd numbered pages of double-sided scan, moved it to %s",
+            staging,
+        )
+        return (
+            "Received odd numbered pages of double sided scan, waiting up to "
+            f"{TIMEOUT_MINUTES} minutes for even numbered pages"
+        )