No tracking data is collected by the app in any way.
-- + - + {{contentTrimmed}}
diff --git a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts index b43187879..5d24042b9 100644 --- a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts +++ b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts @@ -70,6 +70,22 @@ export class DocumentCardLargeComponent { } } + get searchCommentHighlights() { + let highlights = [] + if ( + this.document['__search_hit__'] && + this.document['__search_hit__'].comment_highlights + ) { + // only show comments with a match + highlights = ( + this.document['__search_hit__'].comment_highlights as string + ) + .split(',') + .filter((higlight) => higlight.includes(' Date: Thu, 26 Jan 2023 08:00:02 -0800 Subject: [PATCH 12/18] Adds setting to Gotenberg API call for outputting the correct PDF/A format --- src/paperless_mail/parsers.py | 10 ++++++++++ src/paperless_tika/parsers.py | 12 +++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py index cc5d4e3c8..f1ee263aa 100644 --- a/src/paperless_mail/parsers.py +++ b/src/paperless_mail/parsers.py @@ -271,6 +271,16 @@ class MailDocumentParser(DocumentParser): "paperHeight": "11.7", "scale": "1.0", } + + # Set the output format of the resulting PDF + # Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno + if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}: + data["pdfFormat"] = "PDF/A-2b" + elif settings.OCR_OUTPUT_TYPE == "pdfa-1": + data["pdfFormat"] = "PDF/A-1a" + elif settings.OCR_OUTPUT_TYPE == "pdfa-3": + data["pdfFormat"] = "PDF/A-3b" + try: response = requests.post( url, diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py index 1cfb1eecb..f34ecbbab 100644 --- a/src/paperless_tika/parsers.py +++ b/src/paperless_tika/parsers.py @@ -95,9 +95,19 
@@ class TikaDocumentParser(DocumentParser): ), } headers = {} + data = {} + + # Set the output format of the resulting PDF + # Valid inputs: https://gotenberg.dev/docs/modules/pdf-engines#uno + if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}: + data["pdfFormat"] = "PDF/A-2b" + elif settings.OCR_OUTPUT_TYPE == "pdfa-1": + data["pdfFormat"] = "PDF/A-1a" + elif settings.OCR_OUTPUT_TYPE == "pdfa-3": + data["pdfFormat"] = "PDF/A-3b" try: - response = requests.post(url, files=files, headers=headers) + response = requests.post(url, files=files, headers=headers, data=data) response.raise_for_status() # ensure we notice bad responses except Exception as err: raise ParseError( From 583f05af2db22325d7fac65b4fd70e604b6bb9e5 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Thu, 26 Jan 2023 08:23:11 -0800 Subject: [PATCH 13/18] Fixes test parameters --- src/paperless_mail/tests/test_parsers.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/paperless_mail/tests/test_parsers.py b/src/paperless_mail/tests/test_parsers.py index e02267970..809a1192f 100644 --- a/src/paperless_mail/tests/test_parsers.py +++ b/src/paperless_mail/tests/test_parsers.py @@ -573,8 +573,8 @@ class TestParser(TestCase): self.parser.gotenberg_server + "/forms/chromium/convert/html", mock_post.call_args.args[0], ) - self.assertEqual({}, mock_post.call_args.kwargs["headers"]) - self.assertEqual( + self.assertDictEqual({}, mock_post.call_args.kwargs["headers"]) + self.assertDictEqual( { "marginTop": "0.1", "marginBottom": "0.1", @@ -583,6 +583,7 @@ class TestParser(TestCase): "paperWidth": "8.27", "paperHeight": "11.7", "scale": "1.0", + "pdfFormat": "PDF/A-2b", }, mock_post.call_args.kwargs["data"], ) @@ -663,8 +664,8 @@ class TestParser(TestCase): self.parser.gotenberg_server + "/forms/chromium/convert/html", mock_post.call_args.args[0], ) - self.assertEqual({}, mock_post.call_args.kwargs["headers"]) - self.assertEqual( + 
self.assertDictEqual({}, mock_post.call_args.kwargs["headers"]) + self.assertDictEqual( { "marginTop": "0.1", "marginBottom": "0.1", From 2ab77fbaf7a42f60c23e8b28cf2af6080d84b919 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Fri, 27 Jan 2023 08:34:00 -0800 Subject: [PATCH 14/18] Removes pikepdf based scanning, fixes up unit testing (+ commenting) --- src/documents/barcodes.py | 73 +-- ...n.png => barcode-39-PATCHT-distortion.png} | Bin ....png => barcode-39-PATCHT-distortion2.png} | Bin src/documents/tests/test_barcodes.py | 448 +++++++++++++----- 4 files changed, 352 insertions(+), 169 deletions(-) rename src/documents/tests/samples/barcodes/{barcode-39-PATCHT-distorsion.png => barcode-39-PATCHT-distortion.png} (100%) rename src/documents/tests/samples/barcodes/{barcode-39-PATCHT-distorsion2.png => barcode-39-PATCHT-distortion2.png} (100%) diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py index 82b8afecc..6e3ecfe05 100644 --- a/src/documents/barcodes.py +++ b/src/documents/barcodes.py @@ -4,7 +4,6 @@ import shutil import tempfile from dataclasses import dataclass from functools import lru_cache -from math import ceil from pathlib import Path from typing import List from typing import Optional @@ -12,10 +11,9 @@ from typing import Optional import magic from django.conf import settings from pdf2image import convert_from_path +from pdf2image.exceptions import PDFPageCountError from pikepdf import Page -from pikepdf import PasswordError from pikepdf import Pdf -from pikepdf import PdfImage from PIL import Image from PIL import ImageSequence from pyzbar import pyzbar @@ -154,52 +152,15 @@ def scan_file_for_barcodes( (page_number, barcode_text) tuples """ - def _pikepdf_barcode_scan(pdf_filepath: str) -> List[Barcode]: - detected_barcodes = [] - with Pdf.open(pdf_filepath) as pdf: - for page_num, page in enumerate(pdf.pages): - for image_key in page.images: - pdfimage = PdfImage(page.images[image_key]) - - # 
This type is known to have issues: - # https://github.com/pikepdf/pikepdf/issues/401 - if "/CCITTFaxDecode" in pdfimage.filters: - raise BarcodeImageFormatError( - "Unable to decode CCITTFaxDecode images", - ) - - # Not all images can be transcoded to a PIL image, which - # is what pyzbar expects to receive, so this may - # raise an exception, triggering fallback - pillow_img = pdfimage.as_pil_image() - - # Scale the image down - # See: https://github.com/paperless-ngx/paperless-ngx/issues/2385 - # TLDR: zbar has issues with larger images - width, height = pillow_img.size - if width > 1024: - scaler = ceil(width / 1024) - new_width = int(width / scaler) - new_height = int(height / scaler) - pillow_img = pillow_img.resize((new_width, new_height)) - - width, height = pillow_img.size - if height > 2048: - scaler = ceil(height / 2048) - new_width = int(width / scaler) - new_height = int(height / scaler) - pillow_img = pillow_img.resize((new_width, new_height)) - - for barcode_value in barcode_reader(pillow_img): - detected_barcodes.append(Barcode(page_num, barcode_value)) - - return detected_barcodes - def _pdf2image_barcode_scan(pdf_filepath: str) -> List[Barcode]: detected_barcodes = [] # use a temporary directory in case the file is too big to handle in memory with tempfile.TemporaryDirectory() as path: - pages_from_path = convert_from_path(pdf_filepath, output_folder=path) + pages_from_path = convert_from_path( + pdf_filepath, + dpi=300, + output_folder=path, + ) for current_page_number, page in enumerate(pages_from_path): for barcode_value in barcode_reader(page): detected_barcodes.append( @@ -219,27 +180,19 @@ def scan_file_for_barcodes( # Always try pikepdf first, it's usually fine, faster and # uses less memory try: - barcodes = _pikepdf_barcode_scan(pdf_filepath) + barcodes = _pdf2image_barcode_scan(pdf_filepath) # Password protected files can't be checked - except PasswordError as e: + # This is the exception raised for those + except PDFPageCountError as e: 
logger.warning( f"File is likely password protected, not checking for barcodes: {e}", ) - # Handle pikepdf related image decoding issues with a fallback to page - # by page conversion to images in a temporary directory - except Exception as e: + # This file is really borked, allow the consumption to continue + # but it may fail further on + except Exception as e: # pragma: no cover logger.warning( - f"Falling back to pdf2image because: {e}", + f"Exception during barcode scanning: {e}", ) - try: - barcodes = _pdf2image_barcode_scan(pdf_filepath) - # This file is really borked, allow the consumption to continue - # but it may fail further on - except Exception as e: # pragma: no cover - logger.warning( - f"Exception during barcode scanning: {e}", - ) - else: logger.warning( f"Unsupported file format for barcode reader: {str(mime_type)}", diff --git a/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distorsion.png b/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distortion.png similarity index 100% rename from src/documents/tests/samples/barcodes/barcode-39-PATCHT-distorsion.png rename to src/documents/tests/samples/barcodes/barcode-39-PATCHT-distortion.png diff --git a/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distorsion2.png b/src/documents/tests/samples/barcodes/barcode-39-PATCHT-distortion2.png similarity index 100% rename from src/documents/tests/samples/barcodes/barcode-39-PATCHT-distorsion2.png rename to src/documents/tests/samples/barcodes/barcode-39-PATCHT-distortion2.png diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py index 7beeee288..8d8b2acfb 100644 --- a/src/documents/tests/test_barcodes.py +++ b/src/documents/tests/test_barcodes.py @@ -3,7 +3,6 @@ import shutil import tempfile from unittest import mock -import pikepdf from django.conf import settings from django.test import override_settings from django.test import TestCase @@ -23,13 +22,29 @@ class TestBarcode(DirectoriesMixin, 
TestCase): BARCODE_SAMPLE_DIR = os.path.join(SAMPLE_DIR, "barcodes") - def test_barcode_reader(self): + def test_barcode_reader_png(self): + """ + GIVEN: + - PNG file with separator barcode + WHEN: + - Image is scanned for codes + THEN: + - The barcode is detected + """ test_file = os.path.join(self.BARCODE_SAMPLE_DIR, "barcode-39-PATCHT.png") img = Image.open(test_file) - separator_barcode = str(settings.CONSUMER_BARCODE_STRING) + separator_barcode = settings.CONSUMER_BARCODE_STRING self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) - def test_barcode_reader2(self): + def test_barcode_reader_pbm(self): + """ + GIVEN: + - Netpbm bitmap file with separator barcode + WHEN: + - Image is scanned for codes + THEN: + - The barcode is detected + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "patch-code-t.pbm", @@ -38,25 +53,49 @@ class TestBarcode(DirectoriesMixin, TestCase): separator_barcode = str(settings.CONSUMER_BARCODE_STRING) self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) - def test_barcode_reader_distorsion(self): + def test_barcode_reader_distortion_scratchy(self): + """ + GIVEN: + - Image containing high noise + WHEN: + - Image is scanned for codes + THEN: + - The barcode is detected + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, - "barcode-39-PATCHT-distorsion.png", + "barcode-39-PATCHT-distortion.png", ) img = Image.open(test_file) separator_barcode = str(settings.CONSUMER_BARCODE_STRING) self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) - def test_barcode_reader_distorsion2(self): + def test_barcode_reader_distortion_stretched(self): + """ + GIVEN: + - Image with a stretched barcode + WHEN: + - Image is scanned for codes + THEN: + - The barcode is detected + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, - "barcode-39-PATCHT-distorsion2.png", + "barcode-39-PATCHT-distortion2.png", ) img = Image.open(test_file) separator_barcode = str(settings.CONSUMER_BARCODE_STRING) 
self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) def test_barcode_reader_unreadable(self): + """ + GIVEN: + - Image with a truly unreadable barcode + WHEN: + - Image is scanned for codes + THEN: + - No barcode is detected + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "barcode-39-PATCHT-unreadable.png", @@ -65,6 +104,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(barcodes.barcode_reader(img), []) def test_barcode_reader_qr(self): + """ + GIVEN: + - Image file with QR separator barcode + WHEN: + - Image is scanned for codes + THEN: + - The barcode is detected + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "qr-code-PATCHT.png", @@ -74,6 +121,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) def test_barcode_reader_128(self): + """ + GIVEN: + - Image file with 128 style separator barcode + WHEN: + - Image is scanned for codes + THEN: + - The barcode is detected + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "barcode-128-PATCHT.png", @@ -83,11 +138,27 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(barcodes.barcode_reader(img), [separator_barcode]) def test_barcode_reader_no_barcode(self): + """ + GIVEN: + - Image file with no barcode + WHEN: + - Image is scanned for codes + THEN: + - No barcode is detected + """ test_file = os.path.join(self.SAMPLE_DIR, "simple.png") img = Image.open(test_file) - self.assertEqual(barcodes.barcode_reader(img), []) + self.assertListEqual(barcodes.barcode_reader(img), []) def test_barcode_reader_custom_separator(self): + """ + GIVEN: + - Image file with custom separator barcode value + WHEN: + - Image is scanned for codes + THEN: + - The barcode is detected + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "barcode-39-custom.png", @@ -96,6 +167,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM BARCODE"]) def 
test_barcode_reader_custom_qr_separator(self): + """ + GIVEN: + - Image file with custom separator barcode value as a QR code + WHEN: + - Image is scanned for codes + THEN: + - The barcode is detected + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "barcode-qr-custom.png", @@ -104,6 +183,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM BARCODE"]) def test_barcode_reader_custom_128_separator(self): + """ + GIVEN: + - Image file with custom separator 128 barcode value + WHEN: + - Image is scanned for codes + THEN: + - The barcode is detected + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "barcode-128-custom.png", @@ -164,6 +251,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM-PREFIX-00123"]) def test_get_mime_type(self): + """ + GIVEN: + - + WHEN: + - + THEN: + - + """ tiff_file = os.path.join( self.SAMPLE_DIR, "simple.tiff", @@ -194,6 +289,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(barcodes.get_file_mime_type(png_file), "image/png") def test_convert_from_tiff_to_pdf(self): + """ + GIVEN: + - + WHEN: + - + THEN: + - + """ test_file = os.path.join( os.path.dirname(__file__), "samples", @@ -207,6 +310,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(file_extension, ".pdf") def test_convert_error_from_pdf_to_pdf(self): + """ + GIVEN: + - + WHEN: + - + THEN: + - + """ test_file = os.path.join( self.SAMPLE_DIR, "simple.pdf", @@ -216,6 +327,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertIsNone(barcodes.convert_from_tiff_to_pdf(dst)) def test_scan_file_for_separating_barcodes(self): + """ + GIVEN: + - + WHEN: + - + THEN: + - + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "patch-code-t.pdf", @@ -231,6 +350,14 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertListEqual(separator_page_numbers, [0]) def 
test_scan_file_for_separating_barcodes_none_present(self): + """ + GIVEN: + - + WHEN: + - + THEN: + - + """ test_file = os.path.join(self.SAMPLE_DIR, "simple.pdf") doc_barcode_info = barcodes.scan_file_for_barcodes( test_file, @@ -242,7 +369,15 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertListEqual(separator_page_numbers, []) - def test_scan_file_for_separating_barcodes3(self): + def test_scan_file_for_separating_barcodes_middle_page(self): + """ + GIVEN: + - PDF file containing a separator on page 1 (zero indexed) + WHEN: + - File is scanned for barcodes + THEN: + - Barcode is detected on page 1 (zero indexed) + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "patch-code-t-middle.pdf", @@ -257,7 +392,15 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertListEqual(separator_page_numbers, [1]) - def test_scan_file_for_separating_barcodes4(self): + def test_scan_file_for_separating_barcodes_multiple_pages(self): + """ + GIVEN: + - PDF file containing a separator on pages 2 and 5 (zero indexed) + WHEN: + - File is scanned for barcodes + THEN: + - Barcode is detected on pages 2 and 5 (zero indexed) + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "several-patcht-codes.pdf", @@ -272,7 +415,16 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertListEqual(separator_page_numbers, [2, 5]) - def test_scan_file_for_separating_barcodes_upsidedown(self): + def test_scan_file_for_separating_barcodes_upside_down(self): + """ + GIVEN: + - PDF file containing a separator on page 1 (zero indexed) + - The barcode is upside down + WHEN: + - File is scanned for barcodes + THEN: + - Barcode is detected on page 1 (zero indexed) + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "patch-code-t-middle_reverse.pdf", @@ -287,66 +439,6 @@ class TestBarcode(DirectoriesMixin, 
TestCase): self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertListEqual(separator_page_numbers, [1]) - def test_scan_file_for_barcodes_pillow_transcode_error(self): - """ - GIVEN: - - A PDF containing an image which cannot be transcoded to a PIL image - WHEN: - - The image tries to be transcoded to a PIL image, but fails - THEN: - - The barcode reader is still called - """ - - def _build_device_n_pdf(self, save_path: str): - # Based on the pikepdf tests - # https://github.com/pikepdf/pikepdf/blob/abb35ebe17d579d76abe08265e00cf8890a12a95/tests/test_image_access.py - pdf = pikepdf.new() - pdf.add_blank_page(page_size=(72, 72)) - imobj = pikepdf.Stream( - pdf, - bytes(range(0, 256)), - BitsPerComponent=8, - ColorSpace=pikepdf.Array( - [ - pikepdf.Name.DeviceN, - pikepdf.Array([pikepdf.Name.Black]), - pikepdf.Name.DeviceCMYK, - pikepdf.Stream( - pdf, - b"{0 0 0 4 -1 roll}", # Colorspace conversion function - FunctionType=4, - Domain=[0.0, 1.0], - Range=[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0], - ), - ], - ), - Width=16, - Height=16, - Type=pikepdf.Name.XObject, - Subtype=pikepdf.Name.Image, - ) - pim = pikepdf.PdfImage(imobj) - self.assertEqual(pim.mode, "DeviceN") - self.assertTrue(pim.is_device_n) - - pdf.pages[0].Contents = pikepdf.Stream(pdf, b"72 0 0 72 0 0 cm /Im0 Do") - pdf.pages[0].Resources = pikepdf.Dictionary( - XObject=pikepdf.Dictionary(Im0=imobj), - ) - pdf.save(save_path) - - with tempfile.NamedTemporaryFile(suffix="pdf") as device_n_pdf: - # Build an offending file - _build_device_n_pdf(self, str(device_n_pdf.name)) - with mock.patch("documents.barcodes.barcode_reader") as reader: - reader.return_value = list() - - _ = barcodes.scan_file_for_barcodes( - str(device_n_pdf.name), - ) - - reader.assert_called() - def test_scan_file_for_separating_barcodes_fax_decode(self): """ GIVEN: @@ -371,6 +463,15 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertListEqual(separator_page_numbers, [1]) def 
test_scan_file_for_separating_qr_barcodes(self): + """ + GIVEN: + - PDF file containing a separator on page 0 (zero indexed) + - The barcode is a QR code + WHEN: + - File is scanned for barcodes + THEN: + - Barcode is detected on page 0 (zero indexed) + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "patch-code-t-qr.pdf", @@ -387,6 +488,15 @@ class TestBarcode(DirectoriesMixin, TestCase): @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") def test_scan_file_for_separating_custom_barcodes(self): + """ + GIVEN: + - PDF file containing a separator on page 0 (zero indexed) + - The barcode separation value is customized + WHEN: + - File is scanned for barcodes + THEN: + - Barcode is detected on page 0 (zero indexed) + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "barcode-39-custom.pdf", @@ -403,6 +513,16 @@ class TestBarcode(DirectoriesMixin, TestCase): @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") def test_scan_file_for_separating_custom_qr_barcodes(self): + """ + GIVEN: + - PDF file containing a separator on page 0 (zero indexed) + - The barcode separation value is customized + - The barcode is a QR code + WHEN: + - File is scanned for barcodes + THEN: + - Barcode is detected on page 0 (zero indexed) + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "barcode-qr-custom.pdf", @@ -419,6 +539,16 @@ class TestBarcode(DirectoriesMixin, TestCase): @override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE") def test_scan_file_for_separating_custom_128_barcodes(self): + """ + GIVEN: + - PDF file containing a separator on page 0 (zero indexed) + - The barcode separation value is customized + - The barcode is a 128 code + WHEN: + - File is scanned for barcodes + THEN: + - Barcode is detected on page 0 (zero indexed) + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "barcode-128-custom.pdf", @@ -434,6 +564,16 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertListEqual(separator_page_numbers, [0]) def 
test_scan_file_for_separating_wrong_qr_barcodes(self): + """ + GIVEN: + - PDF file containing a separator on page 0 (zero indexed) + - The barcode value is customized + - The separation value is NOT customized + WHEN: + - File is scanned for barcodes + THEN: + - No split pages are detected + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "barcode-39-custom.pdf", @@ -474,13 +614,21 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertListEqual(separator_page_numbers, [1]) def test_separate_pages(self): + """ + GIVEN: + - Input PDF 2 pages after separation + WHEN: + - The input file separated at the barcode + THEN: + - Two new documents are produced + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "patch-code-t-middle.pdf", ) - pages = barcodes.separate_pages(test_file, [1]) + documents = barcodes.separate_pages(test_file, [1]) - self.assertEqual(len(pages), 2) + self.assertEqual(len(documents), 2) def test_separate_pages_double_code(self): """ @@ -493,8 +641,7 @@ class TestBarcode(DirectoriesMixin, TestCase): """ test_file = os.path.join( os.path.dirname(__file__), - "samples", - "barcodes", + self.BARCODE_SAMPLE_DIR, "patch-code-t-double.pdf", ) pages = barcodes.separate_pages(test_file, [1, 2]) @@ -502,6 +649,15 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(len(pages), 2) def test_separate_pages_no_list(self): + """ + GIVEN: + - Input file to separate + WHEN: + - No separation pages are provided + THEN: + - No new documents are produced + - A warning is logged + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "patch-code-t-middle.pdf", @@ -517,16 +673,32 @@ class TestBarcode(DirectoriesMixin, TestCase): ) def test_save_to_dir(self): + """ + GIVEN: + - File to save to a directory + WHEN: + - The file is saved + THEN: + - The file exists + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "patch-code-t.pdf", ) - tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) - 
barcodes.save_to_dir(test_file, target_dir=tempdir) - target_file = os.path.join(tempdir, "patch-code-t.pdf") + barcodes.save_to_dir(test_file, target_dir=settings.SCRATCH_DIR) + target_file = os.path.join(settings.SCRATCH_DIR, "patch-code-t.pdf") self.assertTrue(os.path.isfile(target_file)) - def test_save_to_dir2(self): + def test_save_to_dir_not_existing(self): + """ + GIVEN: + - File to save to a directory + - The directory doesn't exist + WHEN: + - The file is saved + THEN: + - The file exists + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "patch-code-t.pdf", @@ -534,32 +706,51 @@ class TestBarcode(DirectoriesMixin, TestCase): nonexistingdir = "/nowhere" if os.path.isdir(nonexistingdir): self.fail("non-existing dir exists") - else: - with self.assertLogs("paperless.barcodes", level="WARNING") as cm: - barcodes.save_to_dir(test_file, target_dir=nonexistingdir) - self.assertEqual( - cm.output, - [ - f"WARNING:paperless.barcodes:{str(test_file)} or {str(nonexistingdir)} don't exist.", - ], - ) - def test_save_to_dir3(self): + with self.assertLogs("paperless.barcodes", level="WARNING") as cm: + barcodes.save_to_dir(test_file, target_dir=nonexistingdir) + self.assertEqual( + cm.output, + [ + f"WARNING:paperless.barcodes:{str(test_file)} or {str(nonexistingdir)} don't exist.", + ], + ) + + def test_save_to_dir_given_name(self): + """ + GIVEN: + - File to save to a directory + - There is a name override + WHEN: + - The file is saved + THEN: + - The file exists + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "patch-code-t.pdf", ) - tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) - barcodes.save_to_dir(test_file, newname="newname.pdf", target_dir=tempdir) - target_file = os.path.join(tempdir, "newname.pdf") + barcodes.save_to_dir( + test_file, + newname="newname.pdf", + target_dir=settings.SCRATCH_DIR, + ) + target_file = os.path.join(settings.SCRATCH_DIR, "newname.pdf") self.assertTrue(os.path.isfile(target_file)) def 
test_barcode_splitter(self): + """ + GIVEN: + - Input file containing barcodes + WHEN: + - Input file is split on barcodes + THEN: + - Correct number of files produced + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "patch-code-t-middle.pdf", ) - tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) doc_barcode_info = barcodes.scan_file_for_barcodes( test_file, @@ -572,18 +763,33 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertTrue(len(separator_page_numbers) > 0) document_list = barcodes.separate_pages(test_file, separator_page_numbers) - self.assertTrue(document_list) - for document in document_list: - barcodes.save_to_dir(document, target_dir=tempdir) + self.assertGreater(len(document_list), 0) - target_file1 = os.path.join(tempdir, "patch-code-t-middle_document_0.pdf") - target_file2 = os.path.join(tempdir, "patch-code-t-middle_document_1.pdf") + for document in document_list: + barcodes.save_to_dir(document, target_dir=settings.SCRATCH_DIR) + + target_file1 = os.path.join( + settings.SCRATCH_DIR, + "patch-code-t-middle_document_0.pdf", + ) + target_file2 = os.path.join( + settings.SCRATCH_DIR, + "patch-code-t-middle_document_1.pdf", + ) self.assertTrue(os.path.isfile(target_file1)) self.assertTrue(os.path.isfile(target_file2)) @override_settings(CONSUMER_ENABLE_BARCODES=True) def test_consume_barcode_file(self): + """ + GIVEN: + - Input file with barcodes given to consume task + WHEN: + - Consume task returns + THEN: + - The file was split + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "patch-code-t-middle.pdf", @@ -600,6 +806,14 @@ class TestBarcode(DirectoriesMixin, TestCase): CONSUMER_BARCODE_TIFF_SUPPORT=True, ) def test_consume_barcode_tiff_file(self): + """ + GIVEN: + - TIFF image containing barcodes + WHEN: + - Consume task returns + THEN: + - The file was split + """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, "patch-code-t-middle.tiff", @@ -617,11 +831,13 @@ class TestBarcode(DirectoriesMixin, 
TestCase): @mock.patch("documents.consumer.Consumer.try_consume_file") def test_consume_barcode_unsupported_jpg_file(self, m): """ - This test assumes barcode and TIFF support are enabled and - the user uploads an unsupported image file (e.g. jpg) - - The function shouldn't try to scan for separating barcodes - and continue archiving the file as is. + GIVEN: + - JPEG image as input + WHEN: + - Consume task returns + THEN: + - Barcode reader reported warning + - Consumption continued with the file """ test_file = os.path.join( self.SAMPLE_DIR, @@ -629,8 +845,10 @@ class TestBarcode(DirectoriesMixin, TestCase): ) dst = os.path.join(settings.SCRATCH_DIR, "simple.jpg") shutil.copy(test_file, dst) + with self.assertLogs("paperless.barcodes", level="WARNING") as cm: self.assertIn("Success", tasks.consume_file(dst)) + self.assertListEqual( cm.output, [ @@ -652,8 +870,13 @@ class TestBarcode(DirectoriesMixin, TestCase): ) def test_consume_barcode_supported_no_extension_file(self): """ - This test assumes barcode and TIFF support are enabled and - the user uploads a supported image file, but without extension + GIVEN: + - TIFF image containing barcodes + - TIFF file is given without extension + WHEN: + - Consume task returns + THEN: + - The file was split """ test_file = os.path.join( self.BARCODE_SAMPLE_DIR, @@ -669,11 +892,10 @@ class TestBarcode(DirectoriesMixin, TestCase): """ GIVEN: - Password protected PDF - - pikepdf based scanning WHEN: - File is scanned for barcode THEN: - - Scanning handles the exception without exception + - Scanning handles the exception without crashing """ test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf") doc_barcode_info = barcodes.scan_file_for_barcodes( @@ -808,7 +1030,15 @@ class TestBarcode(DirectoriesMixin, TestCase): @override_settings(CONSUMER_ENABLE_ASN_BARCODE=True) def test_asn_too_large(self): - + """ + GIVEN: + - ASN from barcode enabled + - Barcode contains too large an ASN value + WHEN: + - ASN from barcode 
checked for correctness + THEN: + - Exception is raised regarding size limits + """ src = os.path.join( os.path.dirname(__file__), "samples", From 4fce5aba63aab92f3f2346304f9e8e3eb9335006 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Fri, 27 Jan 2023 08:37:00 -0800 Subject: [PATCH 15/18] Moves ASN barcode testing into a dedicated class --- src/documents/tests/test_barcodes.py | 267 ++++++++++++++------------- src/documents/tests/utils.py | 25 +++ 2 files changed, 163 insertions(+), 129 deletions(-) diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py index 8d8b2acfb..1ff698858 100644 --- a/src/documents/tests/test_barcodes.py +++ b/src/documents/tests/test_barcodes.py @@ -1,6 +1,5 @@ import os import shutil -import tempfile from unittest import mock from django.conf import settings @@ -198,58 +197,6 @@ class TestBarcode(DirectoriesMixin, TestCase): img = Image.open(test_file) self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM BARCODE"]) - def test_barcode_reader_asn_normal(self): - """ - GIVEN: - - Image containing standard ASNxxxxx barcode - WHEN: - - Image is scanned for barcodes - THEN: - - The barcode is located - - The barcode value is correct - """ - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "barcode-39-asn-123.png", - ) - img = Image.open(test_file) - self.assertEqual(barcodes.barcode_reader(img), ["ASN00123"]) - - def test_barcode_reader_asn_invalid(self): - """ - GIVEN: - - Image containing invalid ASNxxxxx barcode - - The number portion of the ASN is not a number - WHEN: - - Image is scanned for barcodes - THEN: - - The barcode is located - - The barcode value is correct - """ - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "barcode-39-asn-invalid.png", - ) - img = Image.open(test_file) - self.assertEqual(barcodes.barcode_reader(img), ["ASNXYZXYZ"]) - - def test_barcode_reader_asn_custom_prefix(self): - """ - GIVEN: - - Image containing custom 
prefix barcode - WHEN: - - Image is scanned for barcodes - THEN: - - The barcode is located - - The barcode value is correct - """ - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "barcode-39-asn-custom-prefix.png", - ) - img = Image.open(test_file) - self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM-PREFIX-00123"]) - def test_get_mime_type(self): """ GIVEN: @@ -908,6 +855,144 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertListEqual(separator_page_numbers, []) + +class TestAsnBarcodes(DirectoriesMixin, TestCase): + + SAMPLE_DIR = os.path.join( + os.path.dirname(__file__), + "samples", + ) + + BARCODE_SAMPLE_DIR = os.path.join(SAMPLE_DIR, "barcodes") + + def test_barcode_reader_asn_normal(self): + """ + GIVEN: + - Image containing standard ASNxxxxx barcode + WHEN: + - Image is scanned for barcodes + THEN: + - The barcode is located + - The barcode value is correct + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "barcode-39-asn-123.png", + ) + img = Image.open(test_file) + self.assertEqual(barcodes.barcode_reader(img), ["ASN00123"]) + + def test_barcode_reader_asn_invalid(self): + """ + GIVEN: + - Image containing invalid ASNxxxxx barcode + - The number portion of the ASN is not a number + WHEN: + - Image is scanned for barcodes + THEN: + - The barcode is located + - The barcode value is correct + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "barcode-39-asn-invalid.png", + ) + img = Image.open(test_file) + self.assertEqual(barcodes.barcode_reader(img), ["ASNXYZXYZ"]) + + def test_barcode_reader_asn_custom_prefix(self): + """ + GIVEN: + - Image containing custom prefix barcode + WHEN: + - Image is scanned for barcodes + THEN: + - The barcode is located + - The barcode value is correct + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "barcode-39-asn-custom-prefix.png", + ) + img = Image.open(test_file) + 
self.assertEqual(barcodes.barcode_reader(img), ["CUSTOM-PREFIX-00123"]) + + @override_settings(CONSUMER_ASN_BARCODE_PREFIX="CUSTOM-PREFIX-") + def test_scan_file_for_asn_custom_prefix(self): + """ + GIVEN: + - PDF containing an ASN barcode with custom prefix + - The ASN value is 123 + WHEN: + - File is scanned for barcodes + THEN: + - The ASN is located + - The ASN integer value is correct + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "barcode-39-asn-custom-prefix.pdf", + ) + doc_barcode_info = barcodes.scan_file_for_barcodes( + test_file, + ) + asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes) + + self.assertEqual(doc_barcode_info.pdf_path, test_file) + self.assertEqual(asn, 123) + + def test_scan_file_for_asn_barcode_invalid(self): + """ + GIVEN: + - PDF containing an ASN barcode + - The ASN value is XYZXYZ + WHEN: + - File is scanned for barcodes + THEN: + - The ASN is located + - The ASN value is not used + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "barcode-39-asn-invalid.pdf", + ) + doc_barcode_info = barcodes.scan_file_for_barcodes( + test_file, + ) + + asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes) + + self.assertEqual(doc_barcode_info.pdf_path, test_file) + self.assertEqual(asn, None) + + @override_settings(CONSUMER_ENABLE_ASN_BARCODE=True) + def test_consume_barcode_file_asn_assignment(self): + """ + GIVEN: + - PDF containing an ASN barcode + - The ASN value is 123 + WHEN: + - File is scanned for barcodes + THEN: + - The ASN is located + - The ASN integer value is correct + - The ASN is provided as the override value to the consumer + """ + test_file = os.path.join( + self.BARCODE_SAMPLE_DIR, + "barcode-39-asn-123.pdf", + ) + + dst = os.path.join(settings.SCRATCH_DIR, "barcode-39-asn-123.pdf") + shutil.copy(test_file, dst) + + with mock.patch("documents.consumer.Consumer.try_consume_file") as mocked_call: + tasks.consume_file(dst) + + args, kwargs = mocked_call.call_args + + 
self.assertEqual(kwargs["override_asn"], 123) + def test_scan_file_for_asn_barcode(self): """ GIVEN: @@ -952,82 +1037,6 @@ class TestBarcode(DirectoriesMixin, TestCase): self.assertEqual(doc_barcode_info.pdf_path, test_file) self.assertEqual(asn, None) - def test_scan_file_for_asn_barcode_invalid(self): - """ - GIVEN: - - PDF containing an ASN barcode - - The ASN value is XYZXYZ - WHEN: - - File is scanned for barcodes - THEN: - - The ASN is located - - The ASN value is not used - """ - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "barcode-39-asn-invalid.pdf", - ) - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - ) - - asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes) - - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertEqual(asn, None) - - @override_settings(CONSUMER_ASN_BARCODE_PREFIX="CUSTOM-PREFIX-") - def test_scan_file_for_asn_custom_prefix(self): - """ - GIVEN: - - PDF containing an ASN barcode with custom prefix - - The ASN value is 123 - WHEN: - - File is scanned for barcodes - THEN: - - The ASN is located - - The ASN integer value is correct - """ - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "barcode-39-asn-custom-prefix.pdf", - ) - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - ) - asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes) - - self.assertEqual(doc_barcode_info.pdf_path, test_file) - self.assertEqual(asn, 123) - - @override_settings(CONSUMER_ENABLE_ASN_BARCODE=True) - def test_consume_barcode_file_asn_assignment(self): - """ - GIVEN: - - PDF containing an ASN barcode - - The ASN value is 123 - WHEN: - - File is scanned for barcodes - THEN: - - The ASN is located - - The ASN integer value is correct - - The ASN is provided as the override value to the consumer - """ - test_file = os.path.join( - self.BARCODE_SAMPLE_DIR, - "barcode-39-asn-123.pdf", - ) - - dst = os.path.join(settings.SCRATCH_DIR, "barcode-39-asn-123.pdf") - 
shutil.copy(test_file, dst) - - with mock.patch("documents.consumer.Consumer.try_consume_file") as mocked_call: - tasks.consume_file(dst) - - args, kwargs = mocked_call.call_args - - self.assertEqual(kwargs["override_asn"], 123) - @override_settings(CONSUMER_ENABLE_ASN_BARCODE=True) def test_asn_too_large(self): """ diff --git a/src/documents/tests/utils.py b/src/documents/tests/utils.py index c52c9be92..b2ec0d024 100644 --- a/src/documents/tests/utils.py +++ b/src/documents/tests/utils.py @@ -3,6 +3,7 @@ import shutil import tempfile from collections import namedtuple from contextlib import contextmanager +from unittest import mock from django.apps import apps from django.db import connection @@ -86,6 +87,30 @@ class DirectoriesMixin: remove_dirs(self.dirs) +class ConsumerProgressMixin: + def setUp(self) -> None: + self.send_progress_patcher = mock.patch( + "documents.consumer.Consumer._send_progress", + ) + self.send_progress_mock = self.send_progress_patcher.start() + super().setUp() + + def tearDown(self) -> None: + super().tearDown() + self.send_progress_patcher.stop() + + +class DocumentConsumeDelayMixin: + def setUp(self) -> None: + self.consume_file_patcher = mock.patch("documents.tasks.consume_file.delay") + self.consume_file_mock = self.consume_file_patcher.start() + super().setUp() + + def tearDown(self) -> None: + super().tearDown() + self.consume_file_patcher.stop() + + class TestMigrations(TransactionTestCase): @property def app(self): From 9784ea4a602df9f8cb1d627b2d370d472f3a6a48 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Fri, 27 Jan 2023 11:11:09 -0800 Subject: [PATCH 16/18] Minor tweak to password test to ensure the right lines were hit --- src/documents/tests/test_barcodes.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/documents/tests/test_barcodes.py b/src/documents/tests/test_barcodes.py index 1ff698858..4f7f1278a 100644 --- 
a/src/documents/tests/test_barcodes.py +++ b/src/documents/tests/test_barcodes.py @@ -845,9 +845,14 @@ class TestBarcode(DirectoriesMixin, TestCase): - Scanning handles the exception without crashing """ test_file = os.path.join(self.SAMPLE_DIR, "password-is-test.pdf") - doc_barcode_info = barcodes.scan_file_for_barcodes( - test_file, - ) + with self.assertLogs("paperless.barcodes", level="WARNING") as cm: + doc_barcode_info = barcodes.scan_file_for_barcodes( + test_file, + ) + warning = cm.output[0] + expected_str = "WARNING:paperless.barcodes:File is likely password protected, not checking for barcodes" + self.assertTrue(warning.startswith(expected_str)) + separator_page_numbers = barcodes.get_separating_barcodes( doc_barcode_info.barcodes, ) From 7dd9a4e089dd5fcc68b887045e92f8c563a06828 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Sat, 28 Jan 2023 09:32:40 -0800 Subject: [PATCH 17/18] Changes the consumer to work on a temporary copy and provides that copy to the pre-consume script for modifications --- src/documents/consumer.py | 35 +++++++++++++++++++++------- src/documents/tests/test_consumer.py | 26 +++++++++++++++++++-- 2 files changed, 50 insertions(+), 11 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index bc344abb9..8c80304d3 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -1,7 +1,10 @@ import datetime import hashlib import os +import shutil +import tempfile import uuid +from pathlib import Path from subprocess import CompletedProcess from subprocess import run from typing import Optional @@ -94,7 +97,8 @@ class Consumer(LoggingMixin): def __init__(self): super().__init__() - self.path = None + self.path: Optional[Path] = None + self.original_path: Optional[Path] = None self.filename = None self.override_title = None self.override_correspondent_id = None @@ -167,16 +171,18 @@ class Consumer(LoggingMixin): self.log("info", f"Executing pre-consume script
{settings.PRE_CONSUME_SCRIPT}") - filepath_arg = os.path.normpath(self.path) + working_file_path = str(self.path) + original_file_path = str(self.original_path) script_env = os.environ.copy() - script_env["DOCUMENT_SOURCE_PATH"] = filepath_arg + script_env["DOCUMENT_SOURCE_PATH"] = original_file_path + script_env["DOCUMENT_WORKING_PATH"] = working_file_path try: completed_proc = run( args=[ settings.PRE_CONSUME_SCRIPT, - filepath_arg, + original_file_path, ], env=script_env, capture_output=True, @@ -195,7 +201,7 @@ class Consumer(LoggingMixin): exception=e, ) - def run_post_consume_script(self, document): + def run_post_consume_script(self, document: Document): if not settings.POST_CONSUME_SCRIPT: return @@ -285,8 +291,8 @@ class Consumer(LoggingMixin): Return the document object if it was successfully created. """ - self.path = path - self.filename = override_filename or os.path.basename(path) + self.path = Path(path).resolve() + self.filename = override_filename or self.path.name self.override_title = override_title self.override_correspondent_id = override_correspondent_id self.override_document_type_id = override_document_type_id @@ -311,6 +317,15 @@ class Consumer(LoggingMixin): self.log("info", f"Consuming {self.filename}") + # For the actual work, copy the file into a tempdir + self.original_path = self.path + tempdir = tempfile.TemporaryDirectory( + prefix="paperless-ngx", + dir=settings.SCRATCH_DIR, + ) + self.path = Path(tempdir.name) / Path(self.filename) + shutil.copy(self.original_path, self.path) + # Determine the parser class. 
mime_type = magic.from_file(self.path, mime=True) @@ -453,11 +468,12 @@ class Consumer(LoggingMixin): # Delete the file only if it was successfully consumed self.log("debug", f"Deleting file {self.path}") os.unlink(self.path) + self.original_path.unlink() # https://github.com/jonaswinkler/paperless-ng/discussions/1037 shadow_file = os.path.join( - os.path.dirname(self.path), - "._" + os.path.basename(self.path), + os.path.dirname(self.original_path), + "._" + os.path.basename(self.original_path), ) if os.path.isfile(shadow_file): @@ -474,6 +490,7 @@ class Consumer(LoggingMixin): ) finally: document_parser.cleanup() + tempdir.cleanup() self.run_post_consume_script(document) diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index dc86de331..de368018f 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -833,7 +833,8 @@ class PreConsumeTestCase(TestCase): with tempfile.NamedTemporaryFile() as script: with override_settings(PRE_CONSUME_SCRIPT=script.name): c = Consumer() - c.path = "path-to-file" + c.original_path = "path-to-file" + c.path = "/tmp/somewhere/path-to-file" c.run_pre_consume_script() m.assert_called_once() @@ -841,10 +842,19 @@ class PreConsumeTestCase(TestCase): args, kwargs = m.call_args command = kwargs["args"] + environment = kwargs["env"] self.assertEqual(command[0], script.name) self.assertEqual(command[1], "path-to-file") + self.assertDictContainsSubset( + { + "DOCUMENT_SOURCE_PATH": c.original_path, + "DOCUMENT_WORKING_PATH": c.path, + }, + environment, + ) + @mock.patch("documents.consumer.Consumer.log") def test_script_with_output(self, mocked_log): """ @@ -961,9 +971,10 @@ class PostConsumeTestCase(TestCase): m.assert_called_once() - args, kwargs = m.call_args + _, kwargs = m.call_args command = kwargs["args"] + environment = kwargs["env"] self.assertEqual(command[0], script.name) self.assertEqual(command[1], str(doc.pk)) @@ -972,6 +983,17 @@ class 
PostConsumeTestCase(TestCase): self.assertEqual(command[7], "my_bank") self.assertCountEqual(command[8].split(","), ["a", "b"]) + self.assertDictContainsSubset( + { + "DOCUMENT_ID": str(doc.pk), + "DOCUMENT_DOWNLOAD_URL": f"/api/documents/{doc.pk}/download/", + "DOCUMENT_THUMBNAIL_URL": f"/api/documents/{doc.pk}/thumb/", + "DOCUMENT_CORRESPONDENT": "my_bank", + "DOCUMENT_TAGS": "a,b", + }, + environment, + ) + def test_script_exit_non_zero(self): """ GIVEN: From 7b9c0d65b99c55c7227f3cee6a6dcd2f829e7d67 Mon Sep 17 00:00:00 2001 From: Trenton Holmes <797416+stumpylog@users.noreply.github.com> Date: Sat, 28 Jan 2023 10:25:21 -0800 Subject: [PATCH 18/18] Documents the change to pre-consume script and improves the readability --- docs/advanced_usage.md | 60 ++++++++++++++++++++++++++++-------------- mkdocs.yml | 1 + 2 files changed, 41 insertions(+), 20 deletions(-) diff --git a/docs/advanced_usage.md b/docs/advanced_usage.md index 61b1c072e..9a1abcfff 100644 --- a/docs/advanced_usage.md +++ b/docs/advanced_usage.md @@ -121,7 +121,17 @@ Executed after the consumer sees a new document in the consumption folder, but before any processing of the document is performed. This script can access the following relevant environment variables set: -- `DOCUMENT_SOURCE_PATH` +| Environment Variable | Description | +| ----------------------- | ------------------------------------------------------------ | +| `DOCUMENT_SOURCE_PATH` | Original path of the consumed document | +| `DOCUMENT_WORKING_PATH` | Path to a copy of the original that consumption will work on | + +!!! 
note + + Pre-consume scripts which modify the document should only change + the `DOCUMENT_WORKING_PATH` file or a second consume task may + be triggered, leading to failures as two tasks work on the + same document path. A simple but common example for this would be creating a simple script like this: @@ -130,7 +140,7 @@ like this: ```bash #!/usr/bin/env bash -pdf2pdfocr.py -i ${DOCUMENT_SOURCE_PATH} +pdf2pdfocr.py -i ${DOCUMENT_WORKING_PATH} ``` `/etc/paperless.conf` @@ -157,26 +167,36 @@ Executed after the consumer has successfully processed a document and has moved it into paperless. It receives the following environment variables: -- `DOCUMENT_ID` -- `DOCUMENT_FILE_NAME` -- `DOCUMENT_CREATED` -- `DOCUMENT_MODIFIED` -- `DOCUMENT_ADDED` -- `DOCUMENT_SOURCE_PATH` -- `DOCUMENT_ARCHIVE_PATH` -- `DOCUMENT_THUMBNAIL_PATH` -- `DOCUMENT_DOWNLOAD_URL` -- `DOCUMENT_THUMBNAIL_URL` -- `DOCUMENT_CORRESPONDENT` -- `DOCUMENT_TAGS` -- `DOCUMENT_ORIGINAL_FILENAME` +| Environment Variable | Description | +| ---------------------------- | --------------------------------------------- | +| `DOCUMENT_ID` | Database primary key of the document | +| `DOCUMENT_FILE_NAME` | Formatted filename, not including paths | +| `DOCUMENT_CREATED` | Date & time when document was created | +| `DOCUMENT_MODIFIED` | Date & time when document was last modified | +| `DOCUMENT_ADDED` | Date & time when document was added | +| `DOCUMENT_SOURCE_PATH` | Path to the original document file | +| `DOCUMENT_ARCHIVE_PATH` | Path to the generated archive file (if any) | +| `DOCUMENT_THUMBNAIL_PATH` | Path to the generated thumbnail | +| `DOCUMENT_DOWNLOAD_URL` | URL for document download | +| `DOCUMENT_THUMBNAIL_URL` | URL for the document thumbnail | +| `DOCUMENT_CORRESPONDENT` | Assigned correspondent (if any) | +| `DOCUMENT_TAGS` | Comma separated list of tags applied (if any) | +| `DOCUMENT_ORIGINAL_FILENAME` | Filename of original document | -The script can be in any language, but for a simple shell script
-example, you can take a look at -[post-consumption-example.sh](https://github.com/paperless-ngx/paperless-ngx/blob/main/scripts/post-consumption-example.sh) -in this project. +The script can be in any language. A simple shell script example: -The post consumption script cannot cancel the consumption process. +```bash title="post-consumption-example" +--8<-- "./scripts/post-consumption-example.sh" +``` + +!!! note + + The post consumption script cannot cancel the consumption process. + +!!! warning + + The post consumption script should not modify the document files + directly. The script's stdout and stderr will be logged line by line to the webserver log, along with the exit code of the script. diff --git a/mkdocs.yml b/mkdocs.yml index 6314a44d3..03f24c4f3 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -41,6 +41,7 @@ markdown_extensions: anchor_linenums: true - pymdownx.superfences - pymdownx.inlinehilite + - pymdownx.snippets strict: true nav: - index.md