From 8c7554e0819e4b5d190cb8678ef9a97bd0b58edc Mon Sep 17 00:00:00 2001 From: Dennis Brakhane Date: Mon, 24 Jul 2023 09:29:04 +0200 Subject: [PATCH] Feature: collate two single-sided multipage scans (#3784) * Feature: collate two single-sided scans Some ADF only support single-sided scans, making scanning double-sided documents a bit annoying. This new feature enables Paperless to do most of the work, by merging two seperate scans into a single one, collating the even and odd numbered pages. * Documentation: clarify that collation is disabled by default * Apply suggestions from code review Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com> * Address code review remarks * Grammar fixes --------- Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com> --- docs/advanced_usage.md | 68 ++++- docs/configuration.md | 37 +++ paperless.conf.example | 3 + src/documents/barcodes.py | 34 +-- src/documents/converters.py | 46 ++++ src/documents/double_sided.py | 131 +++++++++ src/documents/tasks.py | 48 ++-- .../tests/samples/double-sided-even.pdf | Bin 0 -> 3386 bytes .../tests/samples/double-sided-odd.pdf | Bin 0 -> 3708 bytes src/documents/tests/test_double_sided.py | 253 ++++++++++++++++++ src/paperless/settings.py | 12 + 11 files changed, 584 insertions(+), 48 deletions(-) create mode 100644 src/documents/converters.py create mode 100644 src/documents/double_sided.py create mode 100644 src/documents/tests/samples/double-sided-even.pdf create mode 100644 src/documents/tests/samples/double-sided-odd.pdf create mode 100644 src/documents/tests/test_double_sided.py diff --git a/docs/advanced_usage.md b/docs/advanced_usage.md index 094943f3e..199931bb5 100644 --- a/docs/advanced_usage.md +++ b/docs/advanced_usage.md @@ -528,7 +528,7 @@ For how to enable barcode usage, see [the configuration](/configuration#barcodes The two settings may be enabled independently, but do have interactions as explained below. 
-### Document Splitting +### Document Splitting {#document-splitting} When enabled, Paperless will look for a barcode with the configured value and create a new document starting from the next page. The page with the barcode on it will _not_ be retained. It @@ -543,3 +543,69 @@ If document splitting via barcode is also enabled, documents will be split when barcode is located. However, differing from the splitting, the page with the barcode _will_ be retained. This allows application of a barcode to any page, including one which holds data to keep in the document. + +## Automatic collation of double-sided documents {#collate} + +!!! note + + If your scanner supports double-sided scanning natively, you do not need this feature. + +This feature is turned off by default; see [configuration](/configuration#collate) on how to turn it on. + +### Summary + +If you have a scanner with an automatic document feeder (ADF) that only scans a single side, +this feature makes scanning double-sided documents much more convenient by automatically +collating two separate scans into one document, reordering the pages as necessary. + +### Usage example + +Suppose you have a double-sided document with 6 pages (3 sheets of paper). First, +put the stack into your ADF as normal, ensuring that page 1 is scanned first. Your ADF +will now scan pages 1, 3, and 5. Then you (or your scanner, if it supports it) upload +the scan into the correct sub-directory of the consume folder (`double-sided` by default; +keep in mind that Paperless will _not_ automatically create the directory for you.) +Paperless will then process the scan and move it into an internal staging area. + +The next step is to turn your stack upside down (without reordering the sheets of paper), +and scan it once again; your ADF will now scan pages 6, 4, and 2, in that order. 
Once this +scan is copied into the sub-directory, Paperless will collate the previous scan with the +new one, reversing the order of the pages on the second, "even numbered" scan. The +resulting document will have the pages 1-6 in the correct order, and this new file will +then be processed as normal. + +!!! tip + + When scanning the even numbered pages, you can omit the last empty pages, if there are + any. For example, if page 6 is empty, you only need to scan pages 2 and 4. _Do not_ omit + empty pages in the middle of the document. + +### Things that could go wrong + +Paperless will notice when the first, "odd numbered" scan has fewer pages than the second +scan (this can happen when e.g. the ADF skipped a few pages in the first pass). In that +case, Paperless will remove the staging copy as well as the scan, and give you an error +message asking you to restart the process from scratch, by scanning the odd pages again, +followed by the even pages. + +Another thing that might happen is that you start a double-sided scan, but then forget +to upload the second file. To avoid collating the wrong documents if you then come back +a day later to scan a new double-sided document, Paperless will only keep an "odd numbered +pages" file for up to 30 minutes. If more time passes, it will consider the next incoming +scan a completely new "odd numbered pages" one. The old staging file will get discarded. + +### Interaction with "subdirs as tags" + +The collation feature can be used together with the "subdirs as tags" feature (but this is not +a requirement). Just create a correctly named double-sided subdir in the hierarchy and upload +your scans there. For example, both `double-sided/foo/bar` as well as `foo/bar/double-sided` will +cause the collated document to be treated as if it were uploaded into `foo/bar` and receive both +`foo` and `bar` tags, but not `double-sided`. 
+ +### Interaction with document splitting + +You can use the [document splitting](#document-splitting) feature, but if you use a normal +single-sided split marker page, the split document(s) will have an empty page at the front (or +whatever else was on the backside of the split marker page.) You can work around that by having +a split marker page that has the split barcode on _both_ sides. This way, the extra page will +get automatically removed. diff --git a/docs/configuration.md b/docs/configuration.md index 8f587d8ac..0ed2218a6 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1116,6 +1116,43 @@ combination with PAPERLESS_CONSUMER_BARCODE_UPSCALE bigger than 1.0. Defaults to "300" +## Collate Double-Sided Documents {#collate} + +`PAPERLESS_CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED=` + +: Enables automatic collation of two single-sided scans into a double-sided +document. + + This is useful if you have an automatic document feeder that only supports + single-sided scans, but you need to scan a double-sided document. If your + ADF supports double-sided scans natively, you do not need this feature. + + `PAPERLESS_CONSUMER_RECURSIVE` must be enabled for this to work. + + For more information, read the [corresponding section in the advanced + documentation](/advanced_usage#collate). + + Defaults to false. + +`PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME=` + +: The name of the subdirectory in which the collate feature expects documents to +arrive. + + This only has an effect if `PAPERLESS_CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED` + has been enabled. Note that Paperless will not automatically create the + directory. + + Defaults to "double-sided". + +`PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT=` +: Whether TIFF image files should be supported when collating documents. +This will automatically convert any TIFF image(s) to PDFs for later +processing. This only has an effect if +`PAPERLESS_CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED` has been enabled. 
+ + Defaults to false. + ## Binaries There are a few external software packages that Paperless expects to diff --git a/paperless.conf.example b/paperless.conf.example index 9b168db0c..1610dcda9 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -68,6 +68,9 @@ #PAPERLESS_CONSUMER_BARCODE_STRING=PATCHT #PAPERLESS_CONSUMER_BARCODE_UPSCALE=0.0 #PAPERLESS_CONSUMER_BARCODE_DPI=300 +#PAPERLESS_CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED=false +#PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME=double-sided +#PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT=false #PAPERLESS_PRE_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh #PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh #PAPERLESS_FILENAME_DATE_ORDER=YMD diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py index cabc195b3..b64f531d8 100644 --- a/src/documents/barcodes.py +++ b/src/documents/barcodes.py @@ -2,13 +2,11 @@ import logging import tempfile from dataclasses import dataclass from pathlib import Path -from subprocess import run from typing import Dict from typing import Final from typing import List from typing import Optional -import img2pdf from django.conf import settings from pdf2image import convert_from_path from pdf2image.exceptions import PDFPageCountError @@ -16,6 +14,7 @@ from pikepdf import Page from pikepdf import Pdf from PIL import Image +from documents.converters import convert_from_tiff_to_pdf from documents.data_models import DocumentSource from documents.utils import copy_basic_file_stats from documents.utils import copy_file_with_basic_stats @@ -55,7 +54,7 @@ class BarcodeReader: self.mime: Final[str] = mime_type self.pdf_file: Path = self.file self.barcodes: List[Barcode] = [] - self.temp_dir: Optional[Path] = None + self.temp_dir: Optional[tempfile.TemporaryDirectory] = None if settings.CONSUMER_BARCODE_TIFF_SUPPORT: self.SUPPORTED_FILE_MIMES = {"application/pdf", "image/tiff"} @@ -155,34 +154,7 @@ class BarcodeReader: if self.mime != "image/tiff": 
return - with Image.open(self.file) as im: - has_alpha_layer = im.mode in ("RGBA", "LA") - if has_alpha_layer: - # Note the save into the temp folder, so as not to trigger a new - # consume - scratch_image = Path(self.temp_dir.name) / Path(self.file.name) - run( - [ - settings.CONVERT_BINARY, - "-alpha", - "off", - self.file, - scratch_image, - ], - ) - else: - # Not modifying the original, safe to use in place - scratch_image = self.file - - self.pdf_file = Path(self.temp_dir.name) / Path(self.file.name).with_suffix( - ".pdf", - ) - - with scratch_image.open("rb") as img_file, self.pdf_file.open("wb") as pdf_file: - pdf_file.write(img2pdf.convert(img_file)) - - # Copy what file stat is possible - copy_basic_file_stats(self.file, self.pdf_file) + self.pdf_file = convert_from_tiff_to_pdf(self.file, Path(self.temp_dir.name)) def detect(self) -> None: """ diff --git a/src/documents/converters.py b/src/documents/converters.py new file mode 100644 index 000000000..e3a7cb786 --- /dev/null +++ b/src/documents/converters.py @@ -0,0 +1,46 @@ +from pathlib import Path +from subprocess import run + +import img2pdf +from django.conf import settings +from PIL import Image + +from documents.utils import copy_basic_file_stats + + +def convert_from_tiff_to_pdf(tiff_path: Path, target_directory: Path) -> Path: + """ + Converts a TIFF file into a PDF file. + + The PDF will be created in the given target_directory and share the name of + the original TIFF file, as well as its stats (mtime etc.). + + Returns the path of the PDF created. 
+ """ + with Image.open(tiff_path) as im: + has_alpha_layer = im.mode in ("RGBA", "LA") + if has_alpha_layer: + # Note the save into the temp folder, so as not to trigger a new + # consume + scratch_image = target_directory / tiff_path.name + run( + [ + settings.CONVERT_BINARY, + "-alpha", + "off", + tiff_path, + scratch_image, + ], + ) + else: + # Not modifying the original, safe to use in place + scratch_image = tiff_path + + pdf_path = (target_directory / tiff_path.name).with_suffix(".pdf") + + with scratch_image.open("rb") as img_file, pdf_path.open("wb") as pdf_file: + pdf_file.write(img2pdf.convert(img_file)) + + # Copy what file stat is possible + copy_basic_file_stats(tiff_path, pdf_path) + return pdf_path diff --git a/src/documents/double_sided.py b/src/documents/double_sided.py new file mode 100644 index 000000000..4e6b8b7a3 --- /dev/null +++ b/src/documents/double_sided.py @@ -0,0 +1,131 @@ +import datetime as dt +import logging +import os +import shutil +from pathlib import Path + +from django.conf import settings +from pikepdf import Pdf + +from documents.consumer import ConsumerError +from documents.converters import convert_from_tiff_to_pdf +from documents.data_models import ConsumableDocument + +logger = logging.getLogger("paperless.double_sided") + +# Hardcoded for now, could be made a configurable setting if needed +TIMEOUT_MINUTES = 30 + +# Used by test cases +STAGING_FILE_NAME = "double-sided-staging.pdf" + + +def collate(input_doc: ConsumableDocument) -> str: + """ + Tries to collate pages from 2 single sided scans of a double sided + document. + + When called with a file, it checks whether or not a staging file + exists, if not, the current file is turned into that staging file + containing the odd numbered pages. 
+ + If a staging file exists, and it is not too old, the current file is + considered to be the second part (the even numbered pages) and it will + collate the pages of both, the pages of the second file will be added + in reverse order, since the ADF will have scanned the pages from bottom + to top. + + Returns a status message on succcess, or raises a ConsumerError + in case of failure. + """ + + # Make sure scratch dir exists, Consumer might not have run yet + settings.SCRATCH_DIR.mkdir(exist_ok=True) + + if input_doc.mime_type == "application/pdf": + pdf_file = input_doc.original_file + elif ( + input_doc.mime_type == "image/tiff" + and settings.CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT + ): + pdf_file = convert_from_tiff_to_pdf( + input_doc.original_file, + settings.SCRATCH_DIR, + ) + input_doc.original_file.unlink() + else: + raise ConsumerError("Unsupported file type for collation of double-sided scans") + + staging = settings.SCRATCH_DIR / STAGING_FILE_NAME + + valid_staging_exists = False + if staging.exists(): + stats = os.stat(str(staging)) + # if the file is older than the timeout, we don't consider + # it valid + if dt.datetime.now().timestamp() - stats.st_mtime > TIMEOUT_MINUTES * 60: + logger.warning("Outdated double sided staging file exists, deleting it") + os.unlink(str(staging)) + else: + valid_staging_exists = True + + if valid_staging_exists: + try: + # Collate pages from second PDF in reverse order + with Pdf.open(staging) as pdf1, Pdf.open(pdf_file) as pdf2: + pdf2.pages.reverse() + try: + for i, page in enumerate(pdf2.pages): + pdf1.pages.insert(2 * i + 1, page) + except IndexError: + raise ConsumerError( + "This second file (even numbered pages) contains more " + "pages than the first/odd numbered one. This means the " + "two uploaded files don't belong to the same double-" + "sided scan. Please retry, starting with the odd " + "numbered pages again.", + ) + # Merged file has the same path, but without the + # double-sided subdir. 
Therefore, it is also in the + # consumption dir and will be picked up for processing + old_file = input_doc.original_file + new_file = Path( + *( + part + for part in old_file.with_name( + f"{old_file.stem}-collated.pdf", + ).parts + if part != settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME + ), + ) + # If the user didn't create the subdirs yet, do it for them + new_file.parent.mkdir(parents=True, exist_ok=True) + pdf1.save(new_file) + logger.info("Collated documents into new file %s", new_file) + return ( + "Success. Even numbered pages of double sided scan collated " + "with odd pages" + ) + finally: + # Delete staging and recently uploaded file no matter what. + # If any error occurs, the user needs to be able to restart + # the process from scratch; after all, the staging file + # with the odd numbered pages might be the culprit + pdf_file.unlink() + staging.unlink() + + else: + # In Python 3.9 move supports Path objects directly, + # but for now we have to be compatible with 3.8 + shutil.move(str(pdf_file), str(staging)) + # update access to modification time so we know if the file + # is outdated when another file gets uploaded + os.utime(str(staging), (dt.datetime.now().timestamp(),) * 2) + logger.info( + "Got scan with odd numbered pages of double-sided scan, moved it to %s", + staging, + ) + return ( + "Received odd numbered pages of double sided scan, waiting up to " + f"{TIMEOUT_MINUTES} minutes for even numbered pages" + ) diff --git a/src/documents/tasks.py b/src/documents/tasks.py index 97a7791f3..f1b65c45f 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -25,6 +25,7 @@ from documents.consumer import Consumer from documents.consumer import ConsumerError from documents.data_models import ConsumableDocument from documents.data_models import DocumentMetadataOverrides +from documents.double_sided import collate from documents.file_handling import create_source_path_directory from documents.file_handling import 
generate_unique_filename from documents.models import Correspondent @@ -89,10 +90,40 @@ def consume_file( input_doc: ConsumableDocument, overrides: Optional[DocumentMetadataOverrides] = None, ): + def send_progress(status="SUCCESS", message="finished"): + payload = { + "filename": overrides.filename or input_doc.original_file.name, + "task_id": None, + "current_progress": 100, + "max_progress": 100, + "status": status, + "message": message, + } + try: + async_to_sync(get_channel_layer().group_send)( + "status_updates", + {"type": "status_update", "data": payload}, + ) + except ConnectionError as e: + logger.warning(f"ConnectionError on status send: {e!s}") + # Default no overrides if overrides is None: overrides = DocumentMetadataOverrides() + # Handle collation of double-sided documents scanned in two parts + if settings.CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED and ( + settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME + in input_doc.original_file.parts + ): + try: + msg = collate(input_doc) + send_progress(message=msg) + return msg + except ConsumerError as e: + send_progress(status="FAILURE", message=e.args[0]) + raise e + # read all barcodes in the current document if settings.CONSUMER_ENABLE_BARCODES or settings.CONSUMER_ENABLE_ASN_BARCODE: with BarcodeReader(input_doc.original_file, input_doc.mime_type) as reader: @@ -102,24 +133,9 @@ def consume_file( ): # notify the sender, otherwise the progress bar # in the UI stays stuck - payload = { - "filename": overrides.filename or input_doc.original_file.name, - "task_id": None, - "current_progress": 100, - "max_progress": 100, - "status": "SUCCESS", - "message": "finished", - } - try: - async_to_sync(get_channel_layer().group_send)( - "status_updates", - {"type": "status_update", "data": payload}, - ) - except ConnectionError as e: - logger.warning(f"ConnectionError on status send: {e!s}") + send_progress() # consuming stops here, since the original document with # the barcodes has been split and will be consumed 
separately - input_doc.original_file.unlink() return "File successfully split" diff --git a/src/documents/tests/samples/double-sided-even.pdf b/src/documents/tests/samples/double-sided-even.pdf new file mode 100644 index 0000000000000000000000000000000000000000..7caa48a57f13b28d151051b63c39e65f76d613c1 GIT binary patch literal 3386 zcma);c{CJy8^~-nHbB2p)vL)+f8T;%Gmd%C?i`c zOV*I3jLJ?(p)5m^xBK4ry!YPs-qU&id(Q9to^yVm=llGg=PPcZcR>~=rwkBZ&rL4^ zD1qcbcsCqCLj!uj!;66N13@o%xe_pX7&QJ025N%wxk_*c!QcojEr374598_$AilA= zc@EH~!==wq)t8&z>@vfG<2hSRoyz3%KytX&`%M;WM?2F2Uv_2|S;1UaMYUv%L++AMK^rGBhoBmy*>PH9u40fk6o$N>= zgAgInu(S4cqOf#}K7R)9b4loppjv;(p}mukUjP`NEB|@YkLWiD<^K~#>1Qw{d>uEY zOXRG97Q83wTRt{{y2W>8CwR<`t41yDUj6Ack>U2k<{@&w6y{N*Gp&Q!z8K zM0_!|VmsvVsco5rjp+NHc|H+D#^!%vHt6$u%E|+>pA@pOT34$6u6p{8@&&s3Ox#hHj@XW}2$L5fCJ)rMBYG=oX|s+Y zutJy>GAJH+XT-FCs}|$$UV!c#LlF4j^j@}(4)o8 zGQaNADQkn51x`{>&Yoa5YA5<@ zNJWToKF7`e*i&g&A7a=Pm)QAc3}mFIRA6Xr072Mfu*DT7#m4dt4JKqe)Z`UdI7PgJ0 zz4GkwE<7ko_)hDeFfToLAUPh^DM9_z7YEM0*5d}kB;+f`Nf)Y`Z=Eo-66xmz#ZfIa zG=!BVO_o`Wt-j{J06ThZ1g z6nqQL=W2BEuk9_x{nVB`Pa*YY&;4dp-!`D{&R80K<+7sD)DF7Jtm4swuTDqnZNuyn zHJ?ce;ASGlM0VC)Te^uJG%dB&sE*HFO&OD)0{w~?>5%eIax3!&gne+ij2AWO>DU5> za<|5kDgiTltamX`Sn@*qa56CLmZ8z1jGQvMI*;&oYgOqE8d8ZP}A_rQd7Fb znUoVZTAPJa-ABWkAkokgk1Z*i^-SJS+PTsI{bjzY46ErbgUQK<<=xAW^2+pzQX7Y( z;lYbib5e-+-oU$e14rp0c67RW)mtH$>R4WGapp>e%q1o+f!<* zzRNsqx8f&MNIrdA>n;H~&sRVcxap+Kcv?>~pB`9$n_GWTIdf`D8_0;RcLtJhS`_2`}a-cuMQz471h@f8o_;W87Q?;gkYO)SbBzx6~$JxotXbPZ_HsQ4f$M~piaPh&2N zSzOV*kySk_CN?_<7MlxKAKmwqt$Dt{cjO3Ps;}?ScA4X0`8ESDt$QG&qUR?CZgMq7 z+ocNtIFcR@y9z0tUF@%Owsk^YvUGN~yo9`-xzK%Qd8y0*%M(y9@*cZLRSn;F^!IE> zdYWFS7=1Y|nQh*m!dzwBIIB*2vm>^ylmjWbeyrfU?)md+?KezcvcxsiKbx8wJe}MQ zPV)-&KmTDh2?c=WnrP+t@FfDM(pHGIvUkJ{-N60PTNG*MVAUa&Ah#WqgF3G5?7QAT zxl6!<;h2O}@$KH^c8b(&91=y1qtpB zg(jX=ujs~+4pU%RFXr`}xC#k+kg+Z!(CG$D4M%|rT?OfzT{ECJL7OpQL-H@QYB9ev z>1AadYBR9s(eKsEO;D9yABZSEAs$hCsMV0Q#xU;Q5y*HVhL?C!n*$O=Tx`dlyyow6 zB4#&mM@bTW*!Q@kv){Fhj@Di?+a*lWVYTKcLDn_3#2_!{DVZ_ZsMr4b--UzR$h9wK 
zzE1}~J>=*!1tEUi(_3BOTQh*q@2B7;xZ_i`<6XI7#Iq5=ezu;2hbKNev$)x3AX`&J zptxcbEyAs(r_h1vUBm$zQZX;ikOitu2L|+gVdS(yzwcd4+!c!Wfk+*(eLrIXPUPLTWoW% zYXk928rf;-Jhru}OoSK*k2Zi5Wj$^fCG+mKOiWzpR?Q|!p~%}~n=i5Qy1J&O5xnhi z&m@(lbkoHAa=j>#UbP>|N-I@eR+DlCh>&KsDNvnS0dc z2;3!9an45YJutCQXjIVVEbLuN$Am+Y)bLO;yfZhx_KCTn*T9XLB2FW#gbfR!)$7=# zr1G6_%=@Xos{TW1|K!!IgRWsfP;)n&6~P;727>+O3;g^EAcVrt41&^MtKMS$7WY<% zH+0SJ1==Km0rn6-Cr|gKHi#xAOsKeQ)}$kP*&IBJlsdwvPD+}JrhP3Q1unnH8E^b3 zoOFnXzm1nUUN8nkIPwsM zqSYXQUzqCW;OLB1i{O+(zFj#XyHkE6&&J;@Z8x~f_#xHiqjT2KPPl4myKUGx3u?|2 z(#F`u>J}zN8qkllDAd6b{&6{M59gQc=`KTKl#~afcMCXC87(mX%wsDD@n4&nLKI`| z!QXgC`<#L-ey;$xe7nV^qdZ;dkT9LvRq!Q z+-zV-WcT492-15E4YezyA&)e?LZL5r(~igA_9{S)czj59%3t|HcZ&_$|7dvsXbJ5e zzNW`_<`MSUY8JC5GW+TV^ToRFD$h*zLNS5Xss=r7DU{|M^R-Q3Iu!aS)-)nK^$LhJ zEFe;s_i;2tH*c29nmG&doMO+P!l7HF%af=@>VX?Z`7lN5p5VJOUYb=Xx;}O|Uny+% zjP0s{x4#{b3d)|nkE4}+PEH@dPeQ9+5|7>LXliXe?_V@H(N!!tO4&H(EMb5AA37BA z$LIgOL+vdwSg4WD6$}xCk_W+*ouGCg6bz&YLO4OKJV=-yM^H;Vo&bXX$h7=%6N?8a z{5$-~@Ia{id3K;(BBBls?nIo?j)`_r0B4$TNWjGqbuyqK^`(L;9b~5^rhrt5iFoYsNTwLG$0^nZ?I!ck22uKZ~NJj|LHPU;PD!B*((gf)e z0Rib^(9l6ZiYP_bJG-;joqOlbe%$?b=FB-W=Q-~)@B8Dwrl~Ck6Gs62I|b>bfHXt` z;^pWDC@Mm=UEQ%L3(QBVVv=Pj%Y1O}H;Rt9{r7?gtt5IAd|s0MVY zp4U2*(-L3U?>8!A?7=ORiJM)z#=zEKZ)sa`NJCBqcE7w|q2f7zi(6S#U&2~i2%|EV z!f;t^>ZVXoabOsonIBVUcTQuLa~N70B$XLo#=nBE`W({6@mVx(FN$!x&@&?Na6dxe z*_Z93`I7Rz=|w(I&s^(Sl>AT##nFP`B>?5={I4hdSp5!x_1LmJf!61@yFv-7l4j~Eu8LihM^qQGDi^}20 zV4_YcVe775FCn4ug8E0YT9OQyq0-4s|MqBq9VJM#pwXT?J-N(<63FE06-%rq$+JJJ?BB6s-KcUbAL%@+#eX^BjKm^}6{ z#dNE>!0)Q8F(eZ^-mZ7Z2crH(bHT9i*4u!9!8Z}D?h}ReYBzJG5q;^F^P7B*xsBy3 z3JK9p^1S$DB6JuN6hA<(`U{&X4QeT^MW56{NY>4bhbk(JXTch)1n7KKJ|qySc~>70 z&KxbA!QEP6Q;~CRc0SY2MdH1KTN`H2QBNrhXI{?P%R{mn3u`%DI2y27cr|j~!}KKX zWUtfr;>Do&<7ADyl+PFUdJcsZCy$6a^IWnqhTqbCVhj{331o6D#Bju=A(+OPmVfwxq?}n@5D?U_{Cp^ zC`GzB+vK`~WWWbuFIKgPnX$T292}x^A5y1jtIa=gJMD)}9$ua0QV8?WvE8`04sK{I zEh!YNbtNvRBeFRg^Ax7iGE@RW@lUyhgSLrFt*=VvZk|5w7O019Og1T!+KA@1^&vxJ z<;@=jrp42euU(=|>Fpd!d8w;$-Zz-qn3TiR+V>5_{wRN|uKhrn35C 
zE4Eb4@EpA*G*_woTc4P@$Cv%i)&c`Pmf>yyW2qmJ=Ad{*k;Nja*IvunnOTHXZX}XY zOnl>d{PqJ*JiTl(z!t#pl5BQ7k}{(OlcNHFHVJUE4)2p!#j6LCFh+g%T=(V=Ma5J| zHeb(!LG#}H!Aa_d_rphT(~%?Lz0tE5sfaNOWBxP9TnBiE^|K1^>lf$7kDKSpQ~#ihmDfq_VGvg z=ojT-=&p>Tb@QukF`z|KSjbCLg`BL%cIYQ=!Nje0uZ~A+mfH*pMJkEKg(Uk$_BgYU z(aF#Jf=_sB(-^4qN<4-)qId=~S}~thGG!c>Orf5Mb9Vkahg%w`IkHt_OJ!qmRZB}% zu|?C$Zg+Y3{n*t3qu{-T+4D3B+iMO!$%%b74URYGUNVz%<=eP^y{mFl@&0VGzBpO) z%k@B0WI~Pm&J8O3*z~KqC67BpNpw}uCZ4#g4F)wy_0HcO5phwNJe?tRC#!eUQK7!{FPs;pO4Sx@{0e) z^5QP7p~*)_5gryzi~NkfNX5ixfcX#`{Zi1XL3Anmx`u}q%Fhp_<#8KCe2-t-QjQ&v zI`D6xHTC_>rF<|Hm*UZiVTW_LStIWUP2-DVmX~)`I>)CfgE<@fn0rM|g}A`rB7U|D zFPxKhIMsA@YrRyiG+BM!{D4Xx60ykOoX@FKm=_}m3x)ACw4urIa4s}w?wiw|E*&oY zMv@F{9c*dn^d`Fgdav&?-#ZltM+4ORzA5jmKv3zfzA!aKxOH_6$1N!!l}(3w)R>FF z-or-oo@vJ`^x2h$FFgDv+-~kdO39aVae!?ipqM=NLH+4(3I}~4E?b=aF_9&Fq9xM& z9XqII`%?Jv35smY>TSA4Xp@m2&J%B57wgecW`5Jb!!RaaLjaG`ocYq6q|QZezawz& z)QsZ$IJ&R^WWO_AO7 zjKjZ4U6Qi@F#v7x^KLy52HbWi0BvHs>KGjT~WJduS0 zU1U*S=IED0XMAgJ64=?C|IIC;^H-;admimUR3XtbRssMa2|#q2ij6%H&At~3s@?a$ zaeZ!glb5w=As{SpY=f3wG)HdQ^6^V5^+|iBJCV)lT*Y9YGvtJ;anMkBmrb*k9*4?@_2TaPa8z#zyX4;)_(+1ivtok;2^g)meJdjhKJ;H% zLH^c)eHPQDE?3^_&#=p8p4E6I`f9>0%vXm4-U}S^hNAdQ3rlUoP4>rvEciH|Oj^8; zu9LFSxF{Zmo05M!7PS++h4pH(jq~icv-3xX4)m0cw#XOmBa{rku_NolmxcIy%??qw zWQP6iP6(5_A;DIP!(Pvg4ys#7S5{i!Bl+!Sx5qaZTp*pyy#6y4Ei`M=0>|Y0-MSH} z=bRmG)Plv4+M`_>qqN?vOo-w4({Lr*vtNN-T0Av!n` zXTU*hLV8v_nn`7i1!go4X3M{Ob*oQ=OpzJuKhKz;=GSKT3EhGt!pEgL?(fVi;&520 z>erTJe_ln`de{AI8~WOo*8^jALuE~FXEtpUlsbjG3d*TIgEyJ+nhejQ0_mewAt{v3 zfC&D*J`)oyi#u0IA1hmhcSuCiE_ye`RD)Kp3TlYn;Si=qme1KlrDvRwgJkRL|2H z6$r7GfWYKzp_UM97(@ypV+%EP4MP3cf|_`FVIlAzH%)$=M0-JgZz?8#=QKk0XQiF& zBkq3xo)u9FiH@?DR1_06LD6SaLApVTxi*>+Y12HHx0GC3@0Q~$~NNwOBYTUl= literal 0 HcmV?d00001 diff --git a/src/documents/tests/test_double_sided.py b/src/documents/tests/test_double_sided.py new file mode 100644 index 000000000..88cbe7d87 --- /dev/null +++ b/src/documents/tests/test_double_sided.py @@ -0,0 +1,253 @@ +import datetime as dt +import os +import shutil 
+from pathlib import Path +from typing import Union +from unittest import mock + +from django.test import TestCase +from django.test import override_settings +from pdfminer.high_level import extract_text +from pikepdf import Pdf + +from documents import tasks +from documents.consumer import ConsumerError +from documents.data_models import ConsumableDocument +from documents.data_models import DocumentSource +from documents.double_sided import STAGING_FILE_NAME +from documents.double_sided import TIMEOUT_MINUTES +from documents.tests.utils import DirectoriesMixin +from documents.tests.utils import FileSystemAssertsMixin + + +@override_settings( + CONSUMER_RECURSIVE=True, + CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED=True, +) +class TestDoubleSided(DirectoriesMixin, FileSystemAssertsMixin, TestCase): + SAMPLE_DIR = Path(__file__).parent / "samples" + + def setUp(self): + super().setUp() + self.dirs.double_sided_dir = self.dirs.consumption_dir / "double-sided" + self.dirs.double_sided_dir.mkdir() + self.staging_file = self.dirs.scratch_dir / STAGING_FILE_NAME + + def consume_file(self, srcname, dstname: Union[str, Path] = "foo.pdf"): + """ + Starts the consume process and also ensures the + destination file does not exist afterwards + """ + src = self.SAMPLE_DIR / srcname + dst = self.dirs.double_sided_dir / dstname + dst.parent.mkdir(parents=True, exist_ok=True) + shutil.copy(src, dst) + with mock.patch("documents.tasks.async_to_sync"), mock.patch( + "documents.consumer.async_to_sync", + ): + msg = tasks.consume_file( + ConsumableDocument( + source=DocumentSource.ConsumeFolder, + original_file=dst, + ), + None, + ) + self.assertIsNotFile(dst) + return msg + + def create_staging_file(self, src="double-sided-odd.pdf", datetime=None): + shutil.copy(self.SAMPLE_DIR / src, self.staging_file) + if datetime is None: + datetime = dt.datetime.now() + os.utime(str(self.staging_file), (datetime.timestamp(),) * 2) + + def test_odd_numbered_moved_to_staging(self): + """ + GIVEN: + - No 
staging file exists + WHEN: + - A file is copied into the double-sided consume directory + THEN: + - The file becomes the new staging file + - The file in the consume directory gets removed + - The staging file has the st_mtime set to now + - The user gets informed + """ + + msg = self.consume_file("double-sided-odd.pdf") + + self.assertIsFile(self.staging_file) + self.assertAlmostEqual( + dt.datetime.fromtimestamp(self.staging_file.stat().st_mtime), + dt.datetime.now(), + delta=dt.timedelta(seconds=5), + ) + self.assertIn("Received odd numbered pages", msg) + + def test_collation(self): + """ + GIVEN: + - A staging file not older than TIMEOUT_MINUTES with odd pages exists + WHEN: + - A file is copied into the double-sided consume directory + THEN: + - A new file containing the collated staging and uploaded file is + created and put into the consume directory + - The new file is named "foo-collated.pdf", where foo is the name of + the second file + - Both staging and uploaded file get deleted + - The new file contains the pages in the correct order + """ + + self.create_staging_file() + self.consume_file("double-sided-even.pdf", "some-random-name.pdf") + + target = self.dirs.consumption_dir / "some-random-name-collated.pdf" + self.assertIsFile(target) + self.assertIsNotFile(self.staging_file) + self.assertRegex( + extract_text(str(target)), + r"(?s)" + r"This is page 1.*This is page 2.*This is page 3.*" + r"This is page 4.*This is page 5", + ) + + def test_staging_file_expiration(self): + """ + GIVEN: + - A staging file older than TIMEOUT_MINUTES exists + WHEN: + - A file is copied into the double-sided consume directory + THEN: + - It becomes the new staging file + """ + + self.create_staging_file( + datetime=dt.datetime.now() + - dt.timedelta(minutes=TIMEOUT_MINUTES, seconds=1), + ) + msg = self.consume_file("double-sided-odd.pdf") + self.assertIsFile(self.staging_file) + self.assertIn("Received odd numbered pages", msg) + + def 
test_less_odd_pages_then_even_fails(self): + """ + GIVEN: + - A valid staging file + WHEN: + - A file is copied into the double-sided consume directory + that has more pages than the staging file + THEN: + - Both files get removed + - A ConsumerError exception is thrown + """ + self.create_staging_file("simple.pdf") + self.assertRaises( + ConsumerError, + self.consume_file, + "double-sided-even.pdf", + ) + self.assertIsNotFile(self.staging_file) + + @override_settings(CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT=True) + def test_tiff_upload_enabled(self): + """ + GIVEN: + - CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT is true + - No staging file exists + WHEN: + - A TIFF file gets uploaded into the double-sided + consume dir + THEN: + - The file is converted into a PDF and moved to + the staging file + """ + self.consume_file("simple.tiff", "simple.tiff") + self.assertIsFile(self.staging_file) + # Ensure the file is a valid PDF by trying to read it + Pdf.open(self.staging_file) + + @override_settings(CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT=False) + def test_tiff_upload_disabled(self): + """ + GIVEN: + - CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT is false + - No staging file exists + WHEN: + - A TIFF file gets uploaded into the double-sided + consume dir + THEN: + - A ConsumerError is raised + """ + self.assertRaises( + ConsumerError, + self.consume_file, + "simple.tiff", + "simple.tiff", + ) + + @override_settings(CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME="quux") + def test_different_upload_dir_name(self): + """ + GIVEN: + - No staging file exists + - CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME is set to quux + WHEN: + - A file is uploaded into the quux dir + THEN: + - A staging file is created + """ + self.consume_file("double-sided-odd.pdf", Path("..") / "quux" / "foo.pdf") + self.assertIsFile(self.staging_file) + + def test_only_double_sided_dir_is_handled(self): + """ + GIVEN: + - No staging file exists + WHEN: + - A file is uploaded into the normal consumption 
dir + THEN: + - The file is processed as normal + """ + msg = self.consume_file("simple.pdf", Path("..") / "simple.pdf") + self.assertIsNotFile(self.staging_file) + self.assertRegex(msg, "Success. New document .* created") + + def test_subdirectory_upload(self): + """ + GIVEN: + - A staging file exists + WHEN: + - A file gets uploaded into foo/bar/double-sided + or double-sided/foo/bar + THEN: + - The collated file gets put into foo/bar + """ + for path in [ + Path("foo") / "bar" / "double-sided", + Path("double-sided") / "foo" / "bar", + ]: + with self.subTest(path=path): + # Ensure we get fresh directories for each run + self.tearDown() + self.setUp() + + self.create_staging_file() + self.consume_file("double-sided-odd.pdf", path / "foo.pdf") + self.assertIsFile( + self.dirs.consumption_dir / "foo" / "bar" / "foo-collated.pdf", + ) + + @override_settings(CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED=False) + def test_disabled_double_sided_dir_upload(self): + """ + GIVEN: + - CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED is false + WHEN: + - A file is uploaded into the double-sided directory + THEN: + - The file is processed like a normal upload + """ + msg = self.consume_file("simple.pdf") + self.assertIsNotFile(self.staging_file) + self.assertRegex(msg, "Success. 
New document .* created") diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 763cf96fc..39460066e 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -791,6 +791,18 @@ CONSUMER_BARCODE_DPI: Final[str] = int( os.getenv("PAPERLESS_CONSUMER_BARCODE_DPI", 300), ) +CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED: Final[bool] = __get_boolean( + "PAPERLESS_CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED", +) + +CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME: Final[str] = os.getenv( + "PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME", + "double-sided", +) + +CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT: Final[bool] = __get_boolean( + "PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT", +) OCR_PAGES = int(os.getenv("PAPERLESS_OCR_PAGES", 0))