paperless-ngx/src/documents/tests/test_double_sided.py
Dennis Brakhane 8c7554e081
Feature: collate two single-sided multipage scans (#3784)
* Feature: collate two single-sided scans

Some ADFs only support single-sided scans, which makes scanning
double-sided documents a bit annoying.

This new feature enables Paperless to do most of the work
by merging two separate scans into a single one, collating
the even and odd numbered pages.

* Documentation: clarify that collation is disabled by default

* Apply suggestions from code review

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>

* Address code review remarks

* Grammar fixes

---------

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
2023-07-24 00:29:04 -07:00

254 lines
8.6 KiB
Python

import datetime as dt
import os
import shutil
from pathlib import Path
from typing import Union
from unittest import mock
from django.test import TestCase
from django.test import override_settings
from pdfminer.high_level import extract_text
from pikepdf import Pdf
from documents import tasks
from documents.consumer import ConsumerError
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentSource
from documents.double_sided import STAGING_FILE_NAME
from documents.double_sided import TIMEOUT_MINUTES
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
@override_settings(
    CONSUMER_RECURSIVE=True,
    CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED=True,
)
class TestDoubleSided(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
    """
    Tests for collating two single-sided scans that are dropped into the
    "double-sided" sub-directory of the consumption directory.

    Every test runs with recursive consumption and double-sided collation
    enabled unless the individual test overrides those settings.
    """

    # Directory holding the sample documents used by these tests
    SAMPLE_DIR = Path(__file__).parent / "samples"

    def setUp(self):
        """
        Creates the double-sided upload directory and remembers where the
        staging file is expected to live
        """
        super().setUp()
        self.dirs.double_sided_dir = self.dirs.consumption_dir / "double-sided"
        self.dirs.double_sided_dir.mkdir()
        self.staging_file = self.dirs.scratch_dir / STAGING_FILE_NAME

    def consume_file(self, srcname, dstname: Union[str, Path] = "foo.pdf"):
        """
        Starts the consume process and also ensures the
        destination file does not exist afterwards

        srcname is the name of a file inside SAMPLE_DIR, dstname is the
        path (relative to the double-sided directory) it is copied to before
        being consumed. Returns whatever tasks.consume_file returns.
        """
        src = self.SAMPLE_DIR / srcname
        dst = self.dirs.double_sided_dir / dstname
        # dstname may point into (or outside of) nested directories
        dst.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(src, dst)
        # NOTE(review): async_to_sync is presumably only used for status
        # notifications; it is patched out in both modules for the test run
        with mock.patch("documents.tasks.async_to_sync"), mock.patch(
            "documents.consumer.async_to_sync",
        ):
            msg = tasks.consume_file(
                ConsumableDocument(
                    source=DocumentSource.ConsumeFolder,
                    original_file=dst,
                ),
                None,
            )
        # Only reached if consumption did not raise
        self.assertIsNotFile(dst)
        return msg

    def create_staging_file(self, src="double-sided-odd.pdf", datetime=None):
        """
        Copies the given sample file to the staging file location and sets
        its access and modification time to datetime (defaults to now)
        """
        shutil.copy(self.SAMPLE_DIR / src, self.staging_file)
        if datetime is None:
            datetime = dt.datetime.now()
        # Same timestamp for both atime and mtime
        os.utime(str(self.staging_file), (datetime.timestamp(),) * 2)

    def test_odd_numbered_moved_to_staging(self):
        """
        GIVEN:
            - No staging file exists
        WHEN:
            - A file is copied into the double-sided consume directory
        THEN:
            - The file becomes the new staging file
            - The file in the consume directory gets removed
            - The staging file has the st_mtime set to now
            - The user gets informed
        """
        msg = self.consume_file("double-sided-odd.pdf")

        self.assertIsFile(self.staging_file)
        self.assertAlmostEqual(
            dt.datetime.fromtimestamp(self.staging_file.stat().st_mtime),
            dt.datetime.now(),
            delta=dt.timedelta(seconds=5),
        )
        self.assertIn("Received odd numbered pages", msg)

    def test_collation(self):
        """
        GIVEN:
            - A staging file not older than TIMEOUT_MINUTES with odd pages exists
        WHEN:
            - A file is copied into the double-sided consume directory
        THEN:
            - A new file containing the collated staging and uploaded file is
              created and put into the consume directory
            - The new file is named "foo-collated.pdf", where foo is the name of
              the second file
            - Both staging and uploaded file get deleted
            - The new file contains the pages in the correct order
        """
        self.create_staging_file()
        self.consume_file("double-sided-even.pdf", "some-random-name.pdf")

        target = self.dirs.consumption_dir / "some-random-name-collated.pdf"
        self.assertIsFile(target)
        self.assertIsNotFile(self.staging_file)
        self.assertRegex(
            extract_text(str(target)),
            r"(?s)"
            r"This is page 1.*This is page 2.*This is page 3.*"
            r"This is page 4.*This is page 5",
        )

    def test_staging_file_expiration(self):
        """
        GIVEN:
            - A staging file older than TIMEOUT_MINUTES exists
        WHEN:
            - A file is copied into the double-sided consume directory
        THEN:
            - It becomes the new staging file
        """
        # One second past the timeout
        self.create_staging_file(
            datetime=dt.datetime.now()
            - dt.timedelta(minutes=TIMEOUT_MINUTES, seconds=1),
        )
        msg = self.consume_file("double-sided-odd.pdf")
        self.assertIsFile(self.staging_file)
        self.assertIn("Received odd numbered pages", msg)

    def test_less_odd_pages_then_even_fails(self):
        """
        GIVEN:
            - A valid staging file
        WHEN:
            - A file is copied into the double-sided consume directory
              that has more pages than the staging file
        THEN:
            - Both files get removed
            - A ConsumerError exception is thrown
        """
        self.create_staging_file("simple.pdf")
        self.assertRaises(
            ConsumerError,
            self.consume_file,
            "double-sided-even.pdf",
        )
        # Only the staging file is checked here: the helper's own assertion
        # on the uploaded file is skipped because the exception propagates
        # out of it first
        self.assertIsNotFile(self.staging_file)

    @override_settings(CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT=True)
    def test_tiff_upload_enabled(self):
        """
        GIVEN:
            - CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT is true
            - No staging file exists
        WHEN:
            - A TIFF file gets uploaded into the double-sided
              consume dir
        THEN:
            - The file is converted into a PDF and moved to
              the staging file
        """
        self.consume_file("simple.tiff", "simple.tiff")
        self.assertIsFile(self.staging_file)
        # Ensure the file is a valid PDF by trying to read it; the context
        # manager closes the handle again instead of leaking it
        with Pdf.open(self.staging_file):
            pass

    @override_settings(CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT=False)
    def test_tiff_upload_disabled(self):
        """
        GIVEN:
            - CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT is false
            - No staging file exists
        WHEN:
            - A TIFF file gets uploaded into the double-sided
              consume dir
        THEN:
            - A ConsumerError is raised
        """
        self.assertRaises(
            ConsumerError,
            self.consume_file,
            "simple.tiff",
            "simple.tiff",
        )

    @override_settings(CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME="quux")
    def test_different_upload_dir_name(self):
        """
        GIVEN:
            - No staging file exists
            - CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME is set to quux
        WHEN:
            - A file is uploaded into the quux dir
        THEN:
            - A staging file is created
        """
        # ".." leaves the default double-sided directory again
        self.consume_file("double-sided-odd.pdf", Path("..") / "quux" / "foo.pdf")
        self.assertIsFile(self.staging_file)

    def test_only_double_sided_dir_is_handled(self):
        """
        GIVEN:
            - No staging file exists
        WHEN:
            - A file is uploaded into the normal consumption dir
        THEN:
            - The file is processed as normal
        """
        # ".." places the file directly into the consumption directory
        msg = self.consume_file("simple.pdf", Path("..") / "simple.pdf")
        self.assertIsNotFile(self.staging_file)
        self.assertRegex(msg, "Success. New document .* created")

    def test_subdirectory_upload(self):
        """
        GIVEN:
            - A staging file exists
        WHEN:
            - A file gets uploaded into foo/bar/double-sided
              or double-sided/foo/bar
        THEN:
            - The collated file gets put into foo/bar
        """
        for path in [
            Path("foo") / "bar" / "double-sided",
            Path("double-sided") / "foo" / "bar",
        ]:
            with self.subTest(path=path):
                # Ensure we get fresh directories for each run
                self.tearDown()
                self.setUp()

                self.create_staging_file()
                self.consume_file("double-sided-odd.pdf", path / "foo.pdf")
                self.assertIsFile(
                    self.dirs.consumption_dir / "foo" / "bar" / "foo-collated.pdf",
                )

    @override_settings(CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED=False)
    def test_disabled_double_sided_dir_upload(self):
        """
        GIVEN:
            - CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED is false
        WHEN:
            - A file is uploaded into the double-sided directory
        THEN:
            - The file is processed like a normal upload
        """
        msg = self.consume_file("simple.pdf")
        self.assertIsNotFile(self.staging_file)
        self.assertRegex(msg, "Success. New document .* created")