Merge pull request #532 from paperless-ngx/feature-barcode-splitter
Feature barcode splitter
							
								
								
									
										2
									
								
								.github/workflows/ci.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						| @@ -132,7 +132,7 @@ jobs: | |||||||
|         name: Install system dependencies |         name: Install system dependencies | ||||||
|         run: | |         run: | | ||||||
|           sudo apt-get update -qq |           sudo apt-get update -qq | ||||||
|           sudo apt-get install -qq --no-install-recommends unpaper tesseract-ocr imagemagick ghostscript optipng |           sudo apt-get install -qq --no-install-recommends unpaper tesseract-ocr imagemagick ghostscript optipng libzbar0 poppler-utils | ||||||
|       - |       - | ||||||
|         name: Install Python dependencies |         name: Install Python dependencies | ||||||
|         run: | |         run: | | ||||||
|   | |||||||
							
								
								
									
										2
									
								
								Pipfile
									
									
									
									
									
								
							
							
						
						| @@ -51,6 +51,8 @@ concurrent-log-handler = "*" | |||||||
| "backports.zoneinfo" = {version = "*", markers = "python_version < '3.9'"} | "backports.zoneinfo" = {version = "*", markers = "python_version < '3.9'"} | ||||||
| "importlib-resources" = {version = "*", markers = "python_version < '3.9'"} | "importlib-resources" = {version = "*", markers = "python_version < '3.9'"} | ||||||
| zipp = {version = "*", markers = "python_version < '3.9'"} | zipp = {version = "*", markers = "python_version < '3.9'"} | ||||||
|  | pyzbar = "*" | ||||||
|  | pdf2image = "*" | ||||||
|  |  | ||||||
| [dev-packages] | [dev-packages] | ||||||
| coveralls = "*" | coveralls = "*" | ||||||
|   | |||||||
							
								
								
									
										24
									
								
								Pipfile.lock
									
									
									
										generated
									
									
									
								
							
							
						
						| @@ -693,6 +693,14 @@ | |||||||
|             "index": "pypi", |             "index": "pypi", | ||||||
|             "version": "==2.5.0" |             "version": "==2.5.0" | ||||||
|         }, |         }, | ||||||
|  |         "pdf2image": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:84f79f2b8fad943e36323ea4e937fcb05f26ded0caa0a01181df66049e42fb65", | ||||||
|  |                 "sha256:d58ed94d978a70c73c2bb7fdf8acbaf2a7089c29ff8141be5f45433c0c4293bb" | ||||||
|  |             ], | ||||||
|  |             "index": "pypi", | ||||||
|  |             "version": "==1.16.0" | ||||||
|  |         }, | ||||||
|         "pdfminer.six": { |         "pdfminer.six": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:af0630f98a292bad4170f54e80f82ca81b916dd0b2c996437ec45c02f11d8762", |                 "sha256:af0630f98a292bad4170f54e80f82ca81b916dd0b2c996437ec45c02f11d8762", | ||||||
| @@ -960,6 +968,15 @@ | |||||||
|             ], |             ], | ||||||
|             "version": "==6.0" |             "version": "==6.0" | ||||||
|         }, |         }, | ||||||
|  |         "pyzbar": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:13e3ee5a2f3a545204a285f41814d5c0db571967e8d4af8699a03afc55182a9c", | ||||||
|  |                 "sha256:4559628b8192feb25766d954b36a3753baaf5c97c03135aec7e4a026036b475d", | ||||||
|  |                 "sha256:8f4c5264c9c7c6b9f20d01efc52a4eba1ded47d9ba857a94130afe33703eb518" | ||||||
|  |             ], | ||||||
|  |             "index": "pypi", | ||||||
|  |             "version": "==0.1.9" | ||||||
|  |         }, | ||||||
|         "redis": { |         "redis": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:0e7e0cfca8660dea8b7d5cd8c4f6c5e29e11f31158c0b0ae91a397f00e5a05a2", |                 "sha256:0e7e0cfca8660dea8b7d5cd8c4f6c5e29e11f31158c0b0ae91a397f00e5a05a2", | ||||||
| @@ -1784,6 +1801,13 @@ | |||||||
|             ], |             ], | ||||||
|             "version": "==1.6.0" |             "version": "==1.6.0" | ||||||
|         }, |         }, | ||||||
|  |         "mypy-extensions": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d", | ||||||
|  |                 "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8" | ||||||
|  |             ], | ||||||
|  |             "version": "==0.4.3" | ||||||
|  |         }, | ||||||
|         "packaging": { |         "packaging": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb", |                 "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb", | ||||||
|   | |||||||
| @@ -613,6 +613,27 @@ PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS=<bool> | |||||||
|  |  | ||||||
|     Defaults to false. |     Defaults to false. | ||||||
|  |  | ||||||
|  | PAPERLESS_CONSUMER_ENABLE_BARCODES=<bool> | ||||||
|  |     Enables the scanning and page separation based on detected barcodes. | ||||||
|  |     This allows for scanning and adding multiple documents per uploaded | ||||||
|  |     file, which are separated by one or multiple barcode pages. | ||||||
|  |  | ||||||
|  |     For ease of use, it is suggested to use a standardized separation page, | ||||||
|  |     e.g. `here <https://www.alliancegroup.co.uk/patch-codes.htm>`_. | ||||||
|  |  | ||||||
|  |     If no barcodes are detected in the uploaded file, no page separation | ||||||
|  |     will happen. | ||||||
|  |  | ||||||
|  |     Defaults to false. | ||||||
|  |  | ||||||
|  |  | ||||||
|  | PAPERLESS_CONSUMER_BARCODE_STRING=PATCHT | ||||||
|  |     Defines the string to be detected as a separator barcode. | ||||||
|  |     If paperless is used with the PATCH-T separator pages, users | ||||||
|  |     shouldn't change this. | ||||||
|  |  | ||||||
|  |     Defaults to "PATCHT". | ||||||
|  |  | ||||||
|  |  | ||||||
| PAPERLESS_CONVERT_MEMORY_LIMIT=<num> | PAPERLESS_CONVERT_MEMORY_LIMIT=<num> | ||||||
|     On smaller systems, or even in the case of Very Large Documents, the consumer |     On smaller systems, or even in the case of Very Large Documents, the consumer | ||||||
|   | |||||||
| @@ -62,6 +62,8 @@ | |||||||
| #PAPERLESS_CONSUMER_RECURSIVE=false | #PAPERLESS_CONSUMER_RECURSIVE=false | ||||||
| #PAPERLESS_CONSUMER_IGNORE_PATTERNS=[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini"] | #PAPERLESS_CONSUMER_IGNORE_PATTERNS=[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini"] | ||||||
| #PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS=false | #PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS=false | ||||||
|  | #PAPERLESS_CONSUMER_ENABLE_BARCODES=false | ||||||
|  | #PAPERLESS_CONSUMER_BARCODE_STRING=PATCHT | ||||||
| #PAPERLESS_OPTIMIZE_THUMBNAILS=true | #PAPERLESS_OPTIMIZE_THUMBNAILS=true | ||||||
| #PAPERLESS_PRE_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh | #PAPERLESS_PRE_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh | ||||||
| #PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh | #PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh | ||||||
|   | |||||||
| @@ -61,6 +61,7 @@ ocrmypdf==13.4.2 | |||||||
| packaging==21.3; python_version >= '3.6' | packaging==21.3; python_version >= '3.6' | ||||||
| pathvalidate==2.5.0 | pathvalidate==2.5.0 | ||||||
| pdfminer.six==20220319 | pdfminer.six==20220319 | ||||||
|  | pdf2image==1.16.0 | ||||||
| pikepdf==5.1.1 | pikepdf==5.1.1 | ||||||
| pillow==9.1.0 | pillow==9.1.0 | ||||||
| pluggy==1.0.0; python_version >= '3.6' | pluggy==1.0.0; python_version >= '3.6' | ||||||
| @@ -79,6 +80,7 @@ python-magic==0.4.25 | |||||||
| pytz-deprecation-shim==0.1.0.post0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' | pytz-deprecation-shim==0.1.0.post0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' | ||||||
| pytz==2022.1 | pytz==2022.1 | ||||||
| pyyaml==6.0 | pyyaml==6.0 | ||||||
|  | pyzbar==0.1.9 | ||||||
| redis==3.5.3 | redis==3.5.3 | ||||||
| regex==2022.3.2; python_version >= '3.6' | regex==2022.3.2; python_version >= '3.6' | ||||||
| reportlab==3.6.9; python_version >= '3.7' and python_version < '4' | reportlab==3.6.9; python_version >= '3.7' and python_version < '4' | ||||||
|   | |||||||
| @@ -1,6 +1,12 @@ | |||||||
| import logging | import logging | ||||||
|  | import os | ||||||
|  | import shutil | ||||||
|  | import tempfile | ||||||
|  | from typing import List  # for type hinting. Can be removed, if only Python >3.8 is used | ||||||
|  |  | ||||||
| import tqdm | import tqdm | ||||||
|  | from asgiref.sync import async_to_sync | ||||||
|  | from channels.layers import get_channel_layer | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| from django.db.models.signals import post_save | from django.db.models.signals import post_save | ||||||
| from documents import index | from documents import index | ||||||
| @@ -14,8 +20,12 @@ from documents.models import Document | |||||||
| from documents.models import DocumentType | from documents.models import DocumentType | ||||||
| from documents.models import Tag | from documents.models import Tag | ||||||
| from documents.sanity_checker import SanityCheckFailedException | from documents.sanity_checker import SanityCheckFailedException | ||||||
|  | from pdf2image import convert_from_path | ||||||
|  | from pikepdf import Pdf | ||||||
|  | from pyzbar import pyzbar | ||||||
| from whoosh.writing import AsyncWriter | from whoosh.writing import AsyncWriter | ||||||
|  |  | ||||||
|  |  | ||||||
| logger = logging.getLogger("paperless.tasks") | logger = logging.getLogger("paperless.tasks") | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -62,6 +72,115 @@ def train_classifier(): | |||||||
|         logger.warning("Classifier error: " + str(e)) |         logger.warning("Classifier error: " + str(e)) | ||||||
|  |  | ||||||
|  |  | ||||||
def barcode_reader(image) -> List[str]:
    """
    Decode every barcode that pyzbar detects in image.
    Returns the decoded payloads as a list of strings; detections
    with an empty payload are skipped.
    """
    found = []
    # pyzbar.decode yields one result per detected symbol; an image without
    # barcodes simply produces no iterations.
    for symbol in pyzbar.decode(image):
        if not symbol.data:
            continue
        # the payload is delivered as bytes and is decoded as UTF-8
        text = symbol.data.decode("utf-8")
        found.append(text)
        logger.debug(
            f"Barcode of type {str(symbol.type)} found: {text}",
        )
    return found
|  |  | ||||||
|  |  | ||||||
def scan_file_for_separating_barcodes(filepath: str) -> List[int]:
    """
    Look for the configured separator barcode on every page of the file.
    Returns the zero-based page numbers on which the separator was found.
    """
    wanted = str(settings.CONSUMER_BARCODE_STRING)
    # render the pages into a temporary directory in case the file is too big
    # to handle in memory; the directory is removed when the block exits
    with tempfile.TemporaryDirectory() as scratch:
        rendered_pages = convert_from_path(filepath, output_folder=scratch)
        hits = [
            page_index
            for page_index, rendered in enumerate(rendered_pages)
            if wanted in barcode_reader(rendered)
        ]
    return hits
|  |  | ||||||
|  |  | ||||||
def _save_page_range(pdf, first: int, last: int, savepath: str) -> bool:
    """
    Write the pages first (inclusive) to last (exclusive) of pdf to savepath.
    The upper bound is clamped to the number of pages in pdf.
    Returns True if a file was written, False if the range held no pages.
    """
    last = min(last, len(pdf.pages))
    if first >= last:
        return False
    # the destination must be saved while the source pdf is still open,
    # because the appended pages reference the source document
    with Pdf.new() as dst:
        for index in range(first, last):
            dst.pages.append(pdf.pages[index])
        with open(savepath, "wb") as out:
            dst.save(out)
    return True


def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
    """
    Separate the provided file on the pages_to_split_on.
    The pages listed in pages_to_split_on (zero-based) are treated as
    separator pages and are not part of any resulting document.
    Sections that would contain no pages (separator on the first or last
    page, or several separators in a row) are skipped instead of being
    written out as empty PDFs.
    Returns a list of (temporary) filepaths to consume.
    These will need to be deleted later.
    """
    document_paths = []
    if not pages_to_split_on:
        # nothing to do: do not create a scratch directory or open the file
        logger.warning("No pages to split on!")
        return document_paths

    os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
    tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
    logger.debug(f"Temp dir is {str(tempdir)}")
    fname = os.path.splitext(os.path.basename(filepath))[0]

    # the context manager closes the source document even if saving fails
    with Pdf.open(filepath) as pdf:
        # section 0 runs from the first page up to the first separator; every
        # later section starts right after a separator and ends at the next
        # separator or at the end of the file
        starts = [0] + [page_number + 1 for page_number in pages_to_split_on]
        ends = list(pages_to_split_on) + [len(pdf.pages)]
        for count, (first, last) in enumerate(zip(starts, ends)):
            output_filename = "{}_document_{}.pdf".format(fname, str(count))
            savepath = os.path.join(tempdir, output_filename)
            if _save_page_range(pdf, first, last, savepath):
                logger.debug(
                    f"pdf no:{str(count)} covers pages {str(first)} to {str(last)}",
                )
                document_paths.append(savepath)
            else:
                logger.debug(f"pdf no:{str(count)} has no pages, skipping")
    logger.debug(f"Temp files are {str(document_paths)}")
    return document_paths
|  |  | ||||||
|  |  | ||||||
def save_to_dir(
    filepath: str,
    newname: str = None,
    target_dir: str = None,
):
    """
    Copies filepath to target_dir.
    Optionally rename the file.

    filepath: the file to copy.
    newname: if given, the file name to use inside target_dir; otherwise
        the original file name is kept.
    target_dir: destination directory; defaults to settings.CONSUMPTION_DIR,
        looked up at call time so that changed settings are respected.

    If filepath is not a file or target_dir is not a directory, nothing is
    copied and a warning is logged.
    """
    if target_dir is None:
        target_dir = settings.CONSUMPTION_DIR
    if os.path.isfile(filepath) and os.path.isdir(target_dir):
        # copy straight to the final name so the file never shows up in the
        # target directory under an intermediate name
        if newname:
            destination = os.path.join(target_dir, newname)
        else:
            destination = target_dir
        dst = shutil.copy(filepath, destination)
        logger.debug(f"saved {str(filepath)} to {str(dst)}")
    else:
        logger.warning(f"{str(filepath)} or {str(target_dir)} don't exist.")
|  |  | ||||||
|  |  | ||||||
| def consume_file( | def consume_file( | ||||||
|     path, |     path, | ||||||
|     override_filename=None, |     override_filename=None, | ||||||
| @@ -72,6 +191,48 @@ def consume_file( | |||||||
|     task_id=None, |     task_id=None, | ||||||
| ): | ): | ||||||
|  |  | ||||||
|  |     # check for separators in current document | ||||||
|  |     if settings.CONSUMER_ENABLE_BARCODES: | ||||||
|  |         separators = [] | ||||||
|  |         document_list = [] | ||||||
|  |         separators = scan_file_for_separating_barcodes(path) | ||||||
|  |         if separators: | ||||||
|  |             logger.debug(f"Pages with separators found in: {str(path)}") | ||||||
|  |             document_list = separate_pages(path, separators) | ||||||
|  |         if document_list: | ||||||
|  |             for n, document in enumerate(document_list): | ||||||
|  |                 # save to consumption dir | ||||||
|  |                 # rename it to the original filename  with number prefix | ||||||
|  |                 if override_filename: | ||||||
|  |                     newname = f"{str(n)}_" + override_filename | ||||||
|  |                 else: | ||||||
|  |                     newname = None | ||||||
|  |                 save_to_dir(document, newname=newname) | ||||||
|  |             # if we got here, the document was successfully split | ||||||
|  |             # and can safely be deleted | ||||||
|  |             logger.debug("Deleting file {}".format(path)) | ||||||
|  |             os.unlink(path) | ||||||
|  |             # notify the sender, otherwise the progress bar | ||||||
|  |             # in the UI stays stuck | ||||||
|  |             payload = { | ||||||
|  |                 "filename": override_filename, | ||||||
|  |                 "task_id": task_id, | ||||||
|  |                 "current_progress": 100, | ||||||
|  |                 "max_progress": 100, | ||||||
|  |                 "status": "SUCCESS", | ||||||
|  |                 "message": "finished", | ||||||
|  |             } | ||||||
|  |             try: | ||||||
|  |                 async_to_sync(get_channel_layer().group_send)( | ||||||
|  |                     "status_updates", | ||||||
|  |                     {"type": "status_update", "data": payload}, | ||||||
|  |                 ) | ||||||
|  |             except OSError as e: | ||||||
|  |                 logger.warning("OSError. It could be, the broker cannot be reached.") | ||||||
|  |                 logger.warning(str(e)) | ||||||
|  |             return "File successfully split" | ||||||
|  |  | ||||||
|  |     # continue with consumption if no barcode was found | ||||||
|     document = Consumer().try_consume_file( |     document = Consumer().try_consume_file( | ||||||
|         path, |         path, | ||||||
|         override_filename=override_filename, |         override_filename=override_filename, | ||||||
|   | |||||||
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/barcodes/barcode-128-PATCHT.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						| After Width: | Height: | Size: 836 B | 
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/barcodes/barcode-128-custom.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/barcodes/barcode-128-custom.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						| After Width: | Height: | Size: 1.2 KiB | 
| After Width: | Height: | Size: 33 KiB | 
| After Width: | Height: | Size: 39 KiB | 
| After Width: | Height: | Size: 9.5 KiB | 
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/barcodes/barcode-39-PATCHT.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						| After Width: | Height: | Size: 891 B | 
							
								
								
									
										243
									
								
								src/documents/tests/samples/barcodes/barcode-39-custom.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/barcodes/barcode-39-custom.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						| After Width: | Height: | Size: 1.3 KiB | 
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/barcodes/barcode-qr-custom.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/barcodes/barcode-qr-custom.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						| After Width: | Height: | Size: 337 B | 
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/barcodes/patch-code-t-middle.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/barcodes/patch-code-t-qr.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/barcodes/patch-code-t.pbm
									
									
									
									
									
										Normal file
									
								
							
							
						
						
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/barcodes/patch-code-t.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/barcodes/qr-code-PATCHT.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						| After Width: | Height: | Size: 7.4 KiB | 
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/barcodes/several-patcht-codes.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						| @@ -1,7 +1,10 @@ | |||||||
| import os | import os | ||||||
|  | import shutil | ||||||
|  | import tempfile | ||||||
| from unittest import mock | from unittest import mock | ||||||
|  |  | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
|  | from django.test import override_settings | ||||||
| from django.test import TestCase | from django.test import TestCase | ||||||
| from django.utils import timezone | from django.utils import timezone | ||||||
| from documents import tasks | from documents import tasks | ||||||
| @@ -12,6 +15,7 @@ from documents.models import Tag | |||||||
| from documents.sanity_checker import SanityCheckFailedException | from documents.sanity_checker import SanityCheckFailedException | ||||||
| from documents.sanity_checker import SanityCheckMessages | from documents.sanity_checker import SanityCheckMessages | ||||||
| from documents.tests.utils import DirectoriesMixin | from documents.tests.utils import DirectoriesMixin | ||||||
|  | from PIL import Image | ||||||
|  |  | ||||||
|  |  | ||||||
| class TestTasks(DirectoriesMixin, TestCase): | class TestTasks(DirectoriesMixin, TestCase): | ||||||
| @@ -89,6 +93,318 @@ class TestTasks(DirectoriesMixin, TestCase): | |||||||
|         mtime3 = os.stat(settings.MODEL_FILE).st_mtime |         mtime3 = os.stat(settings.MODEL_FILE).st_mtime | ||||||
|         self.assertNotEqual(mtime2, mtime3) |         self.assertNotEqual(mtime2, mtime3) | ||||||
|  |  | ||||||
|  |     def test_barcode_reader(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "barcode-39-PATCHT.png", | ||||||
|  |         ) | ||||||
|  |         img = Image.open(test_file) | ||||||
|  |         separator_barcode = str(settings.CONSUMER_BARCODE_STRING) | ||||||
|  |         self.assertEqual(tasks.barcode_reader(img), [separator_barcode]) | ||||||
|  |  | ||||||
|  |     def test_barcode_reader2(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "patch-code-t.pbm", | ||||||
|  |         ) | ||||||
|  |         img = Image.open(test_file) | ||||||
|  |         separator_barcode = str(settings.CONSUMER_BARCODE_STRING) | ||||||
|  |         self.assertEqual(tasks.barcode_reader(img), [separator_barcode]) | ||||||
|  |  | ||||||
|  |     def test_barcode_reader_distorsion(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "barcode-39-PATCHT-distorsion.png", | ||||||
|  |         ) | ||||||
|  |         img = Image.open(test_file) | ||||||
|  |         separator_barcode = str(settings.CONSUMER_BARCODE_STRING) | ||||||
|  |         self.assertEqual(tasks.barcode_reader(img), [separator_barcode]) | ||||||
|  |  | ||||||
|  |     def test_barcode_reader_distorsion2(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "barcode-39-PATCHT-distorsion2.png", | ||||||
|  |         ) | ||||||
|  |         img = Image.open(test_file) | ||||||
|  |         separator_barcode = str(settings.CONSUMER_BARCODE_STRING) | ||||||
|  |         self.assertEqual(tasks.barcode_reader(img), [separator_barcode]) | ||||||
|  |  | ||||||
|  |     def test_barcode_reader_unreadable(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "barcode-39-PATCHT-unreadable.png", | ||||||
|  |         ) | ||||||
|  |         img = Image.open(test_file) | ||||||
|  |         self.assertEqual(tasks.barcode_reader(img), []) | ||||||
|  |  | ||||||
|  |     def test_barcode_reader_qr(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "qr-code-PATCHT.png", | ||||||
|  |         ) | ||||||
|  |         img = Image.open(test_file) | ||||||
|  |         separator_barcode = str(settings.CONSUMER_BARCODE_STRING) | ||||||
|  |         self.assertEqual(tasks.barcode_reader(img), [separator_barcode]) | ||||||
|  |  | ||||||
|  |     def test_barcode_reader_128(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "barcode-128-PATCHT.png", | ||||||
|  |         ) | ||||||
|  |         img = Image.open(test_file) | ||||||
|  |         separator_barcode = str(settings.CONSUMER_BARCODE_STRING) | ||||||
|  |         self.assertEqual(tasks.barcode_reader(img), [separator_barcode]) | ||||||
|  |  | ||||||
|  |     def test_barcode_reader_no_barcode(self): | ||||||
|  |         test_file = os.path.join(os.path.dirname(__file__), "samples", "simple.png") | ||||||
|  |         img = Image.open(test_file) | ||||||
|  |         self.assertEqual(tasks.barcode_reader(img), []) | ||||||
|  |  | ||||||
|  |     def test_barcode_reader_custom_separator(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "barcode-39-custom.png", | ||||||
|  |         ) | ||||||
|  |         img = Image.open(test_file) | ||||||
|  |         self.assertEqual(tasks.barcode_reader(img), ["CUSTOM BARCODE"]) | ||||||
|  |  | ||||||
|  |     def test_barcode_reader_custom_qr_separator(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "barcode-qr-custom.png", | ||||||
|  |         ) | ||||||
|  |         img = Image.open(test_file) | ||||||
|  |         self.assertEqual(tasks.barcode_reader(img), ["CUSTOM BARCODE"]) | ||||||
|  |  | ||||||
|  |     def test_barcode_reader_custom_128_separator(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "barcode-128-custom.png", | ||||||
|  |         ) | ||||||
|  |         img = Image.open(test_file) | ||||||
|  |         self.assertEqual(tasks.barcode_reader(img), ["CUSTOM BARCODE"]) | ||||||
|  |  | ||||||
|  |     def test_scan_file_for_separating_barcodes(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "patch-code-t.pdf", | ||||||
|  |         ) | ||||||
|  |         pages = tasks.scan_file_for_separating_barcodes(test_file) | ||||||
|  |         self.assertEqual(pages, [0]) | ||||||
|  |  | ||||||
|  |     def test_scan_file_for_separating_barcodes2(self): | ||||||
|  |         test_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf") | ||||||
|  |         pages = tasks.scan_file_for_separating_barcodes(test_file) | ||||||
|  |         self.assertEqual(pages, []) | ||||||
|  |  | ||||||
|  |     def test_scan_file_for_separating_barcodes3(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "patch-code-t-middle.pdf", | ||||||
|  |         ) | ||||||
|  |         pages = tasks.scan_file_for_separating_barcodes(test_file) | ||||||
|  |         self.assertEqual(pages, [1]) | ||||||
|  |  | ||||||
|  |     def test_scan_file_for_separating_barcodes4(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "several-patcht-codes.pdf", | ||||||
|  |         ) | ||||||
|  |         pages = tasks.scan_file_for_separating_barcodes(test_file) | ||||||
|  |         self.assertEqual(pages, [2, 5]) | ||||||
|  |  | ||||||
|  |     def test_scan_file_for_separating_barcodes_upsidedown(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "patch-code-t-middle_reverse.pdf", | ||||||
|  |         ) | ||||||
|  |         pages = tasks.scan_file_for_separating_barcodes(test_file) | ||||||
|  |         self.assertEqual(pages, [1]) | ||||||
|  |  | ||||||
|  |     def test_scan_file_for_separating_qr_barcodes(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "patch-code-t-qr.pdf", | ||||||
|  |         ) | ||||||
|  |         pages = tasks.scan_file_for_separating_barcodes(test_file) | ||||||
|  |         self.assertEqual(pages, [0]) | ||||||
|  |  | ||||||
|  |     @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") | ||||||
|  |     def test_scan_file_for_separating_custom_barcodes(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "barcode-39-custom.pdf", | ||||||
|  |         ) | ||||||
|  |         pages = tasks.scan_file_for_separating_barcodes(test_file) | ||||||
|  |         self.assertEqual(pages, [0]) | ||||||
|  |  | ||||||
|  |     @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") | ||||||
|  |     def test_scan_file_for_separating_custom_qr_barcodes(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "barcode-qr-custom.pdf", | ||||||
|  |         ) | ||||||
|  |         pages = tasks.scan_file_for_separating_barcodes(test_file) | ||||||
|  |         self.assertEqual(pages, [0]) | ||||||
|  |  | ||||||
|  |     @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") | ||||||
|  |     def test_scan_file_for_separating_custom_128_barcodes(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "barcode-128-custom.pdf", | ||||||
|  |         ) | ||||||
|  |         pages = tasks.scan_file_for_separating_barcodes(test_file) | ||||||
|  |         self.assertEqual(pages, [0]) | ||||||
|  |  | ||||||
|  |     def test_scan_file_for_separating_wrong_qr_barcodes(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "barcode-39-custom.pdf", | ||||||
|  |         ) | ||||||
|  |         pages = tasks.scan_file_for_separating_barcodes(test_file) | ||||||
|  |         self.assertEqual(pages, []) | ||||||
|  |  | ||||||
|  |     def test_separate_pages(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "patch-code-t-middle.pdf", | ||||||
|  |         ) | ||||||
|  |         pages = tasks.separate_pages(test_file, [1]) | ||||||
|  |         self.assertEqual(len(pages), 2) | ||||||
|  |  | ||||||
|  |     def test_separate_pages_no_list(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "patch-code-t-middle.pdf", | ||||||
|  |         ) | ||||||
|  |         with self.assertLogs("paperless.tasks", level="WARNING") as cm: | ||||||
|  |             pages = tasks.separate_pages(test_file, []) | ||||||
|  |             self.assertEqual(pages, []) | ||||||
|  |             self.assertEqual( | ||||||
|  |                 cm.output, | ||||||
|  |                 [ | ||||||
|  |                     f"WARNING:paperless.tasks:No pages to split on!", | ||||||
|  |                 ], | ||||||
|  |             ) | ||||||
|  |  | ||||||
|  |     def test_save_to_dir(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "patch-code-t.pdf", | ||||||
|  |         ) | ||||||
|  |         tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) | ||||||
|  |         tasks.save_to_dir(test_file, target_dir=tempdir) | ||||||
|  |         target_file = os.path.join(tempdir, "patch-code-t.pdf") | ||||||
|  |         self.assertTrue(os.path.isfile(target_file)) | ||||||
|  |  | ||||||
|  |     def test_save_to_dir2(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "patch-code-t.pdf", | ||||||
|  |         ) | ||||||
|  |         nonexistingdir = "/nowhere" | ||||||
|  |         if os.path.isdir(nonexistingdir): | ||||||
|  |             self.fail("non-existing dir exists") | ||||||
|  |         else: | ||||||
|  |             with self.assertLogs("paperless.tasks", level="WARNING") as cm: | ||||||
|  |                 tasks.save_to_dir(test_file, target_dir=nonexistingdir) | ||||||
|  |             self.assertEqual( | ||||||
|  |                 cm.output, | ||||||
|  |                 [ | ||||||
|  |                     f"WARNING:paperless.tasks:{str(test_file)} or {str(nonexistingdir)} don't exist.", | ||||||
|  |                 ], | ||||||
|  |             ) | ||||||
|  |  | ||||||
|  |     def test_save_to_dir3(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "patch-code-t.pdf", | ||||||
|  |         ) | ||||||
|  |         tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) | ||||||
|  |         tasks.save_to_dir(test_file, newname="newname.pdf", target_dir=tempdir) | ||||||
|  |         target_file = os.path.join(tempdir, "newname.pdf") | ||||||
|  |         self.assertTrue(os.path.isfile(target_file)) | ||||||
|  |  | ||||||
|  |     def test_barcode_splitter(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "patch-code-t-middle.pdf", | ||||||
|  |         ) | ||||||
|  |         tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) | ||||||
|  |         separators = tasks.scan_file_for_separating_barcodes(test_file) | ||||||
|  |         self.assertTrue(separators) | ||||||
|  |         document_list = tasks.separate_pages(test_file, separators) | ||||||
|  |         self.assertTrue(document_list) | ||||||
|  |         for document in document_list: | ||||||
|  |             tasks.save_to_dir(document, target_dir=tempdir) | ||||||
|  |         target_file1 = os.path.join(tempdir, "patch-code-t-middle_document_0.pdf") | ||||||
|  |         target_file2 = os.path.join(tempdir, "patch-code-t-middle_document_1.pdf") | ||||||
|  |         self.assertTrue(os.path.isfile(target_file1)) | ||||||
|  |         self.assertTrue(os.path.isfile(target_file2)) | ||||||
|  |  | ||||||
|  |     @override_settings(CONSUMER_ENABLE_BARCODES=True) | ||||||
|  |     def test_consume_barcode_file(self): | ||||||
|  |         test_file = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "barcodes", | ||||||
|  |             "patch-code-t-middle.pdf", | ||||||
|  |         ) | ||||||
|  |         dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.pd") | ||||||
|  |         shutil.copy(test_file, dst) | ||||||
|  |  | ||||||
|  |         self.assertEqual(tasks.consume_file(dst), "File successfully split") | ||||||
|  |  | ||||||
|     @mock.patch("documents.tasks.sanity_checker.check_sanity") |     @mock.patch("documents.tasks.sanity_checker.check_sanity") | ||||||
|     def test_sanity_check_success(self, m): |     def test_sanity_check_success(self, m): | ||||||
|         m.return_value = SanityCheckMessages() |         m.return_value = SanityCheckMessages() | ||||||
|   | |||||||
| @@ -498,6 +498,12 @@ CONSUMER_IGNORE_PATTERNS = list( | |||||||
|  |  | ||||||
| CONSUMER_SUBDIRS_AS_TAGS = __get_boolean("PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS") | CONSUMER_SUBDIRS_AS_TAGS = __get_boolean("PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS") | ||||||
|  |  | ||||||
|  | CONSUMER_ENABLE_BARCODES = __get_boolean( | ||||||
|  |     "PAPERLESS_CONSUMER_ENABLE_BARCODES", | ||||||
|  | ) | ||||||
|  |  | ||||||
|  | CONSUMER_BARCODE_STRING = os.getenv("PAPERLESS_CONSUMER_BARCODE_STRING", "PATCHT") | ||||||
|  |  | ||||||
| OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true") | OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true") | ||||||
|  |  | ||||||
| OCR_PAGES = int(os.getenv("PAPERLESS_OCR_PAGES", 0)) | OCR_PAGES = int(os.getenv("PAPERLESS_OCR_PAGES", 0)) | ||||||
|   | |||||||
 Florian
					Florian