mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-08-26 01:16:16 +00:00
Feature: collate two single-sided multipage scans (#3784)
* Feature: collate two single-sided scans Some ADF only support single-sided scans, making scanning double-sided documents a bit annoying. This new feature enables Paperless to do most of the work, by merging two seperate scans into a single one, collating the even and odd numbered pages. * Documentation: clarify that collation is disabled by default * Apply suggestions from code review Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com> * Address code review remarks * Grammar fixes --------- Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
This commit is contained in:
131
src/documents/double_sided.py
Normal file
131
src/documents/double_sided.py
Normal file
@@ -0,0 +1,131 @@
|
||||
import datetime as dt
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
from django.conf import settings
|
||||
from pikepdf import Pdf
|
||||
|
||||
from documents.consumer import ConsumerError
|
||||
from documents.converters import convert_from_tiff_to_pdf
|
||||
from documents.data_models import ConsumableDocument
|
||||
|
||||
logger = logging.getLogger("paperless.double_sided")
|
||||
|
||||
# Hardcoded for now, could be made a configurable setting if needed
|
||||
TIMEOUT_MINUTES = 30
|
||||
|
||||
# Used by test cases
|
||||
STAGING_FILE_NAME = "double-sided-staging.pdf"
|
||||
|
||||
|
||||
def collate(input_doc: ConsumableDocument) -> str:
|
||||
"""
|
||||
Tries to collate pages from 2 single sided scans of a double sided
|
||||
document.
|
||||
|
||||
When called with a file, it checks whether or not a staging file
|
||||
exists, if not, the current file is turned into that staging file
|
||||
containing the odd numbered pages.
|
||||
|
||||
If a staging file exists, and it is not too old, the current file is
|
||||
considered to be the second part (the even numbered pages) and it will
|
||||
collate the pages of both, the pages of the second file will be added
|
||||
in reverse order, since the ADF will have scanned the pages from bottom
|
||||
to top.
|
||||
|
||||
Returns a status message on succcess, or raises a ConsumerError
|
||||
in case of failure.
|
||||
"""
|
||||
|
||||
# Make sure scratch dir exists, Consumer might not have run yet
|
||||
settings.SCRATCH_DIR.mkdir(exist_ok=True)
|
||||
|
||||
if input_doc.mime_type == "application/pdf":
|
||||
pdf_file = input_doc.original_file
|
||||
elif (
|
||||
input_doc.mime_type == "image/tiff"
|
||||
and settings.CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT
|
||||
):
|
||||
pdf_file = convert_from_tiff_to_pdf(
|
||||
input_doc.original_file,
|
||||
settings.SCRATCH_DIR,
|
||||
)
|
||||
input_doc.original_file.unlink()
|
||||
else:
|
||||
raise ConsumerError("Unsupported file type for collation of double-sided scans")
|
||||
|
||||
staging = settings.SCRATCH_DIR / STAGING_FILE_NAME
|
||||
|
||||
valid_staging_exists = False
|
||||
if staging.exists():
|
||||
stats = os.stat(str(staging))
|
||||
# if the file is older than the timeout, we don't consider
|
||||
# it valid
|
||||
if dt.datetime.now().timestamp() - stats.st_mtime > TIMEOUT_MINUTES * 60:
|
||||
logger.warning("Outdated double sided staging file exists, deleting it")
|
||||
os.unlink(str(staging))
|
||||
else:
|
||||
valid_staging_exists = True
|
||||
|
||||
if valid_staging_exists:
|
||||
try:
|
||||
# Collate pages from second PDF in reverse order
|
||||
with Pdf.open(staging) as pdf1, Pdf.open(pdf_file) as pdf2:
|
||||
pdf2.pages.reverse()
|
||||
try:
|
||||
for i, page in enumerate(pdf2.pages):
|
||||
pdf1.pages.insert(2 * i + 1, page)
|
||||
except IndexError:
|
||||
raise ConsumerError(
|
||||
"This second file (even numbered pages) contains more "
|
||||
"pages than the first/odd numbered one. This means the "
|
||||
"two uploaded files don't belong to the same double-"
|
||||
"sided scan. Please retry, starting with the odd "
|
||||
"numbered pages again.",
|
||||
)
|
||||
# Merged file has the same path, but without the
|
||||
# double-sided subdir. Therefore, it is also in the
|
||||
# consumption dir and will be picked up for processing
|
||||
old_file = input_doc.original_file
|
||||
new_file = Path(
|
||||
*(
|
||||
part
|
||||
for part in old_file.with_name(
|
||||
f"{old_file.stem}-collated.pdf",
|
||||
).parts
|
||||
if part != settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
|
||||
),
|
||||
)
|
||||
# If the user didn't create the subdirs yet, do it for them
|
||||
new_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
pdf1.save(new_file)
|
||||
logger.info("Collated documents into new file %s", new_file)
|
||||
return (
|
||||
"Success. Even numbered pages of double sided scan collated "
|
||||
"with odd pages"
|
||||
)
|
||||
finally:
|
||||
# Delete staging and recently uploaded file no matter what.
|
||||
# If any error occurs, the user needs to be able to restart
|
||||
# the process from scratch; after all, the staging file
|
||||
# with the odd numbered pages might be the culprit
|
||||
pdf_file.unlink()
|
||||
staging.unlink()
|
||||
|
||||
else:
|
||||
# In Python 3.9 move supports Path objects directly,
|
||||
# but for now we have to be compatible with 3.8
|
||||
shutil.move(str(pdf_file), str(staging))
|
||||
# update access to modification time so we know if the file
|
||||
# is outdated when another file gets uploaded
|
||||
os.utime(str(staging), (dt.datetime.now().timestamp(),) * 2)
|
||||
logger.info(
|
||||
"Got scan with odd numbered pages of double-sided scan, moved it to %s",
|
||||
staging,
|
||||
)
|
||||
return (
|
||||
"Received odd numbered pages of double sided scan, waiting up to "
|
||||
f"{TIMEOUT_MINUTES} minutes for even numbered pages"
|
||||
)
|
Reference in New Issue
Block a user