mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-17 10:13:56 -05:00

* Feature: collate two single-sided scans Some ADF only support single-sided scans, making scanning double-sided documents a bit annoying. This new feature enables Paperless to do most of the work, by merging two seperate scans into a single one, collating the even and odd numbered pages. * Documentation: clarify that collation is disabled by default * Apply suggestions from code review Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com> * Address code review remarks * Grammar fixes --------- Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
254 lines
8.6 KiB
Python
254 lines
8.6 KiB
Python
import datetime as dt
|
|
import os
|
|
import shutil
|
|
from pathlib import Path
|
|
from typing import Union
|
|
from unittest import mock
|
|
|
|
from django.test import TestCase
|
|
from django.test import override_settings
|
|
from pdfminer.high_level import extract_text
|
|
from pikepdf import Pdf
|
|
|
|
from documents import tasks
|
|
from documents.consumer import ConsumerError
|
|
from documents.data_models import ConsumableDocument
|
|
from documents.data_models import DocumentSource
|
|
from documents.double_sided import STAGING_FILE_NAME
|
|
from documents.double_sided import TIMEOUT_MINUTES
|
|
from documents.tests.utils import DirectoriesMixin
|
|
from documents.tests.utils import FileSystemAssertsMixin
|
|
|
|
|
|
@override_settings(
|
|
CONSUMER_RECURSIVE=True,
|
|
CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED=True,
|
|
)
|
|
class TestDoubleSided(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
|
SAMPLE_DIR = Path(__file__).parent / "samples"
|
|
|
|
def setUp(self):
|
|
super().setUp()
|
|
self.dirs.double_sided_dir = self.dirs.consumption_dir / "double-sided"
|
|
self.dirs.double_sided_dir.mkdir()
|
|
self.staging_file = self.dirs.scratch_dir / STAGING_FILE_NAME
|
|
|
|
def consume_file(self, srcname, dstname: Union[str, Path] = "foo.pdf"):
|
|
"""
|
|
Starts the consume process and also ensures the
|
|
destination file does not exist afterwards
|
|
"""
|
|
src = self.SAMPLE_DIR / srcname
|
|
dst = self.dirs.double_sided_dir / dstname
|
|
dst.parent.mkdir(parents=True, exist_ok=True)
|
|
shutil.copy(src, dst)
|
|
with mock.patch("documents.tasks.async_to_sync"), mock.patch(
|
|
"documents.consumer.async_to_sync",
|
|
):
|
|
msg = tasks.consume_file(
|
|
ConsumableDocument(
|
|
source=DocumentSource.ConsumeFolder,
|
|
original_file=dst,
|
|
),
|
|
None,
|
|
)
|
|
self.assertIsNotFile(dst)
|
|
return msg
|
|
|
|
def create_staging_file(self, src="double-sided-odd.pdf", datetime=None):
|
|
shutil.copy(self.SAMPLE_DIR / src, self.staging_file)
|
|
if datetime is None:
|
|
datetime = dt.datetime.now()
|
|
os.utime(str(self.staging_file), (datetime.timestamp(),) * 2)
|
|
|
|
def test_odd_numbered_moved_to_staging(self):
|
|
"""
|
|
GIVEN:
|
|
- No staging file exists
|
|
WHEN:
|
|
- A file is copied into the double-sided consume directory
|
|
THEN:
|
|
- The file becomes the new staging file
|
|
- The file in the consume directory gets removed
|
|
- The staging file has the st_mtime set to now
|
|
- The user gets informed
|
|
"""
|
|
|
|
msg = self.consume_file("double-sided-odd.pdf")
|
|
|
|
self.assertIsFile(self.staging_file)
|
|
self.assertAlmostEqual(
|
|
dt.datetime.fromtimestamp(self.staging_file.stat().st_mtime),
|
|
dt.datetime.now(),
|
|
delta=dt.timedelta(seconds=5),
|
|
)
|
|
self.assertIn("Received odd numbered pages", msg)
|
|
|
|
def test_collation(self):
|
|
"""
|
|
GIVEN:
|
|
- A staging file not older than TIMEOUT_MINUTES with odd pages exists
|
|
WHEN:
|
|
- A file is copied into the double-sided consume directory
|
|
THEN:
|
|
- A new file containing the collated staging and uploaded file is
|
|
created and put into the consume directory
|
|
- The new file is named "foo-collated.pdf", where foo is the name of
|
|
the second file
|
|
- Both staging and uploaded file get deleted
|
|
- The new file contains the pages in the correct order
|
|
"""
|
|
|
|
self.create_staging_file()
|
|
self.consume_file("double-sided-even.pdf", "some-random-name.pdf")
|
|
|
|
target = self.dirs.consumption_dir / "some-random-name-collated.pdf"
|
|
self.assertIsFile(target)
|
|
self.assertIsNotFile(self.staging_file)
|
|
self.assertRegex(
|
|
extract_text(str(target)),
|
|
r"(?s)"
|
|
r"This is page 1.*This is page 2.*This is page 3.*"
|
|
r"This is page 4.*This is page 5",
|
|
)
|
|
|
|
def test_staging_file_expiration(self):
|
|
"""
|
|
GIVEN:
|
|
- A staging file older than TIMEOUT_MINUTES exists
|
|
WHEN:
|
|
- A file is copied into the double-sided consume directory
|
|
THEN:
|
|
- It becomes the new staging file
|
|
"""
|
|
|
|
self.create_staging_file(
|
|
datetime=dt.datetime.now()
|
|
- dt.timedelta(minutes=TIMEOUT_MINUTES, seconds=1),
|
|
)
|
|
msg = self.consume_file("double-sided-odd.pdf")
|
|
self.assertIsFile(self.staging_file)
|
|
self.assertIn("Received odd numbered pages", msg)
|
|
|
|
def test_less_odd_pages_then_even_fails(self):
|
|
"""
|
|
GIVEN:
|
|
- A valid staging file
|
|
WHEN:
|
|
- A file is copied into the double-sided consume directory
|
|
that has more pages than the staging file
|
|
THEN:
|
|
- Both files get removed
|
|
- A ConsumerError exception is thrown
|
|
"""
|
|
self.create_staging_file("simple.pdf")
|
|
self.assertRaises(
|
|
ConsumerError,
|
|
self.consume_file,
|
|
"double-sided-even.pdf",
|
|
)
|
|
self.assertIsNotFile(self.staging_file)
|
|
|
|
@override_settings(CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT=True)
|
|
def test_tiff_upload_enabled(self):
|
|
"""
|
|
GIVEN:
|
|
- CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT is true
|
|
- No staging file exists
|
|
WHEN:
|
|
- A TIFF file gets uploaded into the double-sided
|
|
consume dir
|
|
THEN:
|
|
- The file is converted into a PDF and moved to
|
|
the staging file
|
|
"""
|
|
self.consume_file("simple.tiff", "simple.tiff")
|
|
self.assertIsFile(self.staging_file)
|
|
# Ensure the file is a valid PDF by trying to read it
|
|
Pdf.open(self.staging_file)
|
|
|
|
@override_settings(CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT=False)
|
|
def test_tiff_upload_disabled(self):
|
|
"""
|
|
GIVEN:
|
|
- CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT is false
|
|
- No staging file exists
|
|
WHEN:
|
|
- A TIFF file gets uploaded into the double-sided
|
|
consume dir
|
|
THEN:
|
|
- A ConsumerError is raised
|
|
"""
|
|
self.assertRaises(
|
|
ConsumerError,
|
|
self.consume_file,
|
|
"simple.tiff",
|
|
"simple.tiff",
|
|
)
|
|
|
|
@override_settings(CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME="quux")
|
|
def test_different_upload_dir_name(self):
|
|
"""
|
|
GIVEN:
|
|
- No staging file exists
|
|
- CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME is set to quux
|
|
WHEN:
|
|
- A file is uploaded into the quux dir
|
|
THEN:
|
|
- A staging file is created
|
|
"""
|
|
self.consume_file("double-sided-odd.pdf", Path("..") / "quux" / "foo.pdf")
|
|
self.assertIsFile(self.staging_file)
|
|
|
|
def test_only_double_sided_dir_is_handled(self):
|
|
"""
|
|
GIVEN:
|
|
- No staging file exists
|
|
WHEN:
|
|
- A file is uploaded into the normal consumption dir
|
|
THEN:
|
|
- The file is processed as normal
|
|
"""
|
|
msg = self.consume_file("simple.pdf", Path("..") / "simple.pdf")
|
|
self.assertIsNotFile(self.staging_file)
|
|
self.assertRegex(msg, "Success. New document .* created")
|
|
|
|
def test_subdirectory_upload(self):
|
|
"""
|
|
GIVEN:
|
|
- A staging file exists
|
|
WHEN:
|
|
- A file gets uploaded into foo/bar/double-sided
|
|
or double-sided/foo/bar
|
|
THEN:
|
|
- The collated file gets put into foo/bar
|
|
"""
|
|
for path in [
|
|
Path("foo") / "bar" / "double-sided",
|
|
Path("double-sided") / "foo" / "bar",
|
|
]:
|
|
with self.subTest(path=path):
|
|
# Ensure we get fresh directories for each run
|
|
self.tearDown()
|
|
self.setUp()
|
|
|
|
self.create_staging_file()
|
|
self.consume_file("double-sided-odd.pdf", path / "foo.pdf")
|
|
self.assertIsFile(
|
|
self.dirs.consumption_dir / "foo" / "bar" / "foo-collated.pdf",
|
|
)
|
|
|
|
@override_settings(CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED=False)
|
|
def test_disabled_double_sided_dir_upload(self):
|
|
"""
|
|
GIVEN:
|
|
- CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED is false
|
|
WHEN:
|
|
- A file is uploaded into the double-sided directory
|
|
THEN:
|
|
- The file is processed like a normal upload
|
|
"""
|
|
msg = self.consume_file("simple.pdf")
|
|
self.assertIsNotFile(self.staging_file)
|
|
self.assertRegex(msg, "Success. New document .* created")
|