Merge double sided

This commit is contained in:
shamoon
2025-04-08 16:31:51 -07:00
parent 774070c783
commit a5c75313f5
3 changed files with 3 additions and 3 deletions

View File

@@ -1,146 +0,0 @@
import datetime as dt
import logging
import os
import shutil
from pathlib import Path
from typing import Final
from django.conf import settings
from pikepdf import Pdf
from documents.converters import convert_from_tiff_to_pdf
from documents.plugins.base import ConsumeTaskPlugin
from documents.plugins.base import NoCleanupPluginMixin
from documents.plugins.base import NoSetupPluginMixin
from documents.plugins.base import StopConsumeTaskError
from paperless.consumer import ConsumerError
# Module level logger for the double sided collation plugin
logger = logging.getLogger("paperless.double_sided")
# Maximum age of the staging file (the first, odd numbered scan) before it
# is considered outdated and discarded.
# Hardcoded for now, could be made a configurable setting if needed
TIMEOUT_MINUTES: Final[int] = 30
# Same timeout in seconds, for comparison against the staging file's mtime
TIMEOUT_SECONDS: Final[int] = TIMEOUT_MINUTES * 60
# Name of the staging file, which is created inside settings.SCRATCH_DIR.
# Module level so it can also be imported by the test cases
STAGING_FILE_NAME = "double-sided-staging.pdf"
class CollatePlugin(NoCleanupPluginMixin, NoSetupPluginMixin, ConsumeTaskPlugin):
    """
    Consume task plugin which collates two single sided scans of a double
    sided document into one PDF.

    The first file received is kept as a staging file (odd numbered pages),
    the second file (even numbered pages, scanned in reverse order) is merged
    into it and the result is written next to the double-sided subdirectory.
    """

    NAME: str = "CollatePlugin"

    @property
    def able_to_run(self) -> bool:
        """
        The plugin runs only if collation is enabled in the settings and the
        configured double-sided subdirectory name is one of the path parts of
        the input file.
        """
        return (
            settings.CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED
            and settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
            in self.input_doc.original_file.parts
        )

    def run(self) -> str | None:
        """
        Tries to collate pages from 2 single sided scans of a double sided
        document.

        When called with a file, it checks whether or not a staging file
        exists, if not, the current file is turned into that staging file
        containing the odd numbered pages.

        If a staging file exists, and it is not too old, the current file is
        considered to be the second part (the even numbered pages) and it will
        collate the pages of both, the pages of the second file will be added
        in reverse order, since the ADF will have scanned the pages from bottom
        to top.

        This method never returns normally. Both successful outcomes (file
        staged, or files collated) are signalled by raising
        StopConsumeTaskError with a status message, so the incoming file
        itself is not consumed any further.

        Raises:
            StopConsumeTaskError: The file was staged or collated.
            ConsumerError: The file type is not supported, or the second
                file contains more pages than the staged one.
        """
        if self.input_doc.mime_type == "application/pdf":
            pdf_file = self.input_doc.original_file
        elif (
            self.input_doc.mime_type == "image/tiff"
            and settings.CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT
        ):
            pdf_file = convert_from_tiff_to_pdf(
                self.input_doc.original_file,
                self.base_tmp_dir,
            )
            # Only the converted PDF is used from here on
            self.input_doc.original_file.unlink()
        else:
            raise ConsumerError(
                "Unsupported file type for collation of double-sided scans",
            )

        staging: Path = settings.SCRATCH_DIR / STAGING_FILE_NAME

        valid_staging_exists = False
        if staging.exists():
            stats = staging.stat()
            # if the file is older than the timeout, we don't consider
            # it valid
            if (dt.datetime.now().timestamp() - stats.st_mtime) > TIMEOUT_SECONDS:
                logger.warning("Outdated double sided staging file exists, deleting it")
                staging.unlink()
            else:
                valid_staging_exists = True

        if valid_staging_exists:
            try:
                # Collate pages from second PDF in reverse order
                with Pdf.open(staging) as pdf1, Pdf.open(pdf_file) as pdf2:
                    pdf2.pages.reverse()
                    try:
                        # Even pages go in between the odd pages already
                        # present, i.e. at indices 1, 3, 5, ...
                        for i, page in enumerate(pdf2.pages):
                            pdf1.pages.insert(2 * i + 1, page)
                    except IndexError as err:
                        # Chain the cause so the traceback shows which
                        # insert failed
                        raise ConsumerError(
                            "This second file (even numbered pages) contains more "
                            "pages than the first/odd numbered one. This means the "
                            "two uploaded files don't belong to the same double-"
                            "sided scan. Please retry, starting with the odd "
                            "numbered pages again.",
                        ) from err
                    # Merged file has the same path, but without the
                    # double-sided subdir. Therefore, it is also in the
                    # consumption dir and will be picked up for processing
                    old_file = self.input_doc.original_file
                    new_file = Path(
                        *(
                            part
                            for part in old_file.with_name(
                                f"{old_file.stem}-collated.pdf",
                            ).parts
                            if part
                            != settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
                        ),
                    )
                    # If the user didn't create the subdirs yet, do it for them
                    new_file.parent.mkdir(parents=True, exist_ok=True)
                    pdf1.save(new_file)
                logger.info("Collated documents into new file %s", new_file)
                raise StopConsumeTaskError(
                    "Success. Even numbered pages of double sided scan collated "
                    "with odd pages",
                )
            finally:
                # Delete staging and recently uploaded file no matter what.
                # If any error occurs, the user needs to be able to restart
                # the process from scratch; after all, the staging file
                # with the odd numbered pages might be the culprit.
                # An already missing file must not mask the actual error,
                # and a failure to remove the uploaded file must not keep
                # the staging file around.
                try:
                    pdf_file.unlink(missing_ok=True)
                finally:
                    staging.unlink(missing_ok=True)
        else:
            shutil.move(pdf_file, staging)
            # update access to modification time so we know if the file
            # is outdated when another file gets uploaded
            timestamp = dt.datetime.now().timestamp()
            os.utime(staging, (timestamp, timestamp))
            logger.info(
                "Got scan with odd numbered pages of double-sided scan, moved it to %s",
                staging,
            )
            raise StopConsumeTaskError(
                "Received odd numbered pages of double sided scan, waiting up to "
                f"{TIMEOUT_MINUTES} minutes for even numbered pages",
            )

View File

@@ -24,7 +24,6 @@ from documents.barcodes import BarcodePlugin
from documents.caching import clear_document_caches
from documents.classifier import DocumentClassifier
from documents.classifier import load_classifier
from documents.double_sided import CollatePlugin
from documents.file_handling import create_source_path_directory
from documents.file_handling import generate_unique_filename
from documents.parsers import DocumentParser
@@ -42,6 +41,7 @@ from paperless.consumer import ConsumerPlugin
from paperless.consumer import WorkflowTriggerPlugin
from paperless.data_models import ConsumableDocument
from paperless.data_models import DocumentMetadataOverrides
from paperless.double_sided import CollatePlugin
from paperless.models import Correspondent
from paperless.models import CustomFieldInstance
from paperless.models import Document

View File

@@ -10,14 +10,14 @@ from pdfminer.high_level import extract_text
from pikepdf import Pdf
from documents import tasks
from documents.double_sided import STAGING_FILE_NAME
from documents.double_sided import TIMEOUT_MINUTES
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import DummyProgressManager
from documents.tests.utils import FileSystemAssertsMixin
from paperless.consumer import ConsumerError
from paperless.data_models import ConsumableDocument
from paperless.data_models import DocumentSource
from paperless.double_sided import STAGING_FILE_NAME
from paperless.double_sided import TIMEOUT_MINUTES
@override_settings(