Updates handling of barcodes to encapsulate logic, moving it out of tasks and into barcodes
| @@ -1,12 +1,11 @@ | ||||
| import logging | ||||
| import os | ||||
| import shutil | ||||
| import tempfile | ||||
| from dataclasses import dataclass | ||||
| from functools import lru_cache | ||||
| from pathlib import Path | ||||
| from subprocess import run | ||||
| from typing import Dict | ||||
| from typing import Final | ||||
| from typing import List | ||||
| from typing import Optional | ||||
|  | ||||
| @@ -18,13 +17,11 @@ from pikepdf import Page | ||||
| from pikepdf import Pdf | ||||
| from PIL import Image | ||||
|  | ||||
| from documents.data_models import DocumentSource | ||||
|  | ||||
| logger = logging.getLogger("paperless.barcodes") | ||||
|  | ||||
|  | ||||
| class BarcodeImageFormatError(Exception): | ||||
|     pass | ||||
|  | ||||
|  | ||||
| @dataclass(frozen=True) | ||||
| class Barcode: | ||||
|     """ | ||||
| @@ -51,56 +48,72 @@ class Barcode: | ||||
|         return self.value.startswith(settings.CONSUMER_ASN_BARCODE_PREFIX) | ||||
|  | ||||
|  | ||||
| @dataclass | ||||
| class DocumentBarcodeInfo: | ||||
|     """ | ||||
|     Describes a single document's barcode status | ||||
|     """ | ||||
| class BarcodeReader: | ||||
|     def __init__(self, filepath: Path, mime_type: str) -> None: | ||||
|         self.file: Final[Path] = filepath | ||||
|         self.mime: Final[str] = mime_type | ||||
|         self.pdf_file: Path = self.file | ||||
|         self.barcodes: List[Barcode] = [] | ||||
|         self.temp_dir: Optional[Path] = None | ||||
|  | ||||
|     pdf_path: Path | ||||
|     barcodes: List[Barcode] | ||||
|         if settings.CONSUMER_BARCODE_TIFF_SUPPORT: | ||||
|             self.SUPPORTED_FILE_MIMES = {"application/pdf", "image/tiff"} | ||||
|         else: | ||||
|             self.SUPPORTED_FILE_MIMES = {"application/pdf"} | ||||
|  | ||||
|     def __enter__(self): | ||||
|         if self.supported_mime_type: | ||||
|             self.temp_dir = tempfile.TemporaryDirectory(prefix="paperless-barcodes") | ||||
|         return self | ||||
|  | ||||
| @lru_cache(maxsize=8) | ||||
| def supported_file_type(mime_type: str) -> bool: | ||||
|     """ | ||||
|     Determines if the file is valid for barcode | ||||
|     processing, based on MIME type and settings | ||||
|     def __exit__(self, exc_type, exc_val, exc_tb): | ||||
|         if self.temp_dir is not None: | ||||
|             self.temp_dir.cleanup() | ||||
|             self.temp_dir = None | ||||
|  | ||||
|     :return: True if the file is supported, False otherwise | ||||
|     """ | ||||
|     supported_mime = ["application/pdf"] | ||||
|     if settings.CONSUMER_BARCODE_TIFF_SUPPORT: | ||||
|         supported_mime += ["image/tiff"] | ||||
|     @property | ||||
|     def supported_mime_type(self) -> bool: | ||||
|         """ | ||||
|         Return True if the given mime type is supported for barcodes, false otherwise | ||||
|         """ | ||||
|         return self.mime in self.SUPPORTED_FILE_MIMES | ||||
|  | ||||
|     return mime_type in supported_mime | ||||
|     @property | ||||
|     def asn(self) -> Optional[int]: | ||||
|         """ | ||||
|         Search the parsed barcodes for any ASNs. | ||||
|         The first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX | ||||
|         is considered the ASN to be used. | ||||
|         Returns the detected ASN (or None) | ||||
|         """ | ||||
|         asn = None | ||||
|  | ||||
|         # Ensure the barcodes have been read | ||||
|         self.detect() | ||||
|  | ||||
| def barcode_reader(image: Image) -> List[str]: | ||||
|     """ | ||||
|     Read any barcodes contained in image | ||||
|     Returns a list containing all found barcodes | ||||
|     """ | ||||
|     barcodes = [] | ||||
|         # get the first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX | ||||
|         asn_text = next( | ||||
|             (x.value for x in self.barcodes if x.is_asn), | ||||
|             None, | ||||
|         ) | ||||
|  | ||||
|     if settings.CONSUMER_BARCODE_SCANNER == "PYZBAR": | ||||
|         logger.debug("Scanning for barcodes using PYZBAR") | ||||
|         from pyzbar import pyzbar | ||||
|         if asn_text: | ||||
|             logger.debug(f"Found ASN Barcode: {asn_text}") | ||||
|             # remove the prefix and remove whitespace | ||||
|             asn_text = asn_text[len(settings.CONSUMER_ASN_BARCODE_PREFIX) :].strip() | ||||
|  | ||||
|         # Decode the barcode image | ||||
|         detected_barcodes = pyzbar.decode(image) | ||||
|             # now, try parsing the ASN number | ||||
|             try: | ||||
|                 asn = int(asn_text) | ||||
|             except ValueError as e: | ||||
|                 logger.warning(f"Failed to parse ASN number because: {e}") | ||||
|  | ||||
|         return asn | ||||
|  | ||||
|     @staticmethod | ||||
|     def read_barcodes_zxing(image: Image) -> List[str]: | ||||
|         barcodes = [] | ||||
|  | ||||
|         if detected_barcodes: | ||||
|             # Traverse through all the detected barcodes in image | ||||
|             for barcode in detected_barcodes: | ||||
|                 if barcode.data: | ||||
|                     decoded_barcode = barcode.data.decode("utf-8") | ||||
|                     barcodes.append(decoded_barcode) | ||||
|                     logger.debug( | ||||
|                         f"Barcode of type {str(barcode.type)} found: {decoded_barcode}", | ||||
|                     ) | ||||
|     elif settings.CONSUMER_BARCODE_SCANNER == "ZXING": | ||||
|         logger.debug("Scanning for barcodes using ZXING") | ||||
|         import zxingcpp | ||||
|  | ||||
|         detected_barcodes = zxingcpp.read_barcodes(image) | ||||
| @@ -111,74 +124,92 @@ def barcode_reader(image: Image) -> List[str]: | ||||
|                     f"Barcode of type {str(barcode.format)} found: {barcode.text}", | ||||
|                 ) | ||||
|  | ||||
|     return barcodes | ||||
|         return barcodes | ||||
|  | ||||
|     @staticmethod | ||||
|     def read_barcodes_pyzbar(image: Image) -> List[str]: | ||||
|         barcodes = [] | ||||
|  | ||||
| def convert_from_tiff_to_pdf(filepath: Path) -> Path: | ||||
|     """ | ||||
|     converts a given TIFF image file to pdf into a temporary directory. | ||||
|         from pyzbar import pyzbar | ||||
|  | ||||
|     Returns the new pdf file. | ||||
|     """ | ||||
|     tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) | ||||
|     # use old file name with pdf extension | ||||
|     newpath = Path(tempdir) / Path(filepath.name).with_suffix(".pdf") | ||||
|         # Decode the barcode image | ||||
|         detected_barcodes = pyzbar.decode(image) | ||||
|  | ||||
|     with Image.open(filepath) as im: | ||||
|         has_alpha_layer = im.mode in ("RGBA", "LA") | ||||
|     if has_alpha_layer: | ||||
|         run( | ||||
|             [ | ||||
|                 settings.CONVERT_BINARY, | ||||
|                 "-alpha", | ||||
|                 "off", | ||||
|                 filepath, | ||||
|                 filepath, | ||||
|             ], | ||||
|         ) | ||||
|     with filepath.open("rb") as img_file, newpath.open("wb") as pdf_file: | ||||
|         pdf_file.write(img2pdf.convert(img_file)) | ||||
|     return newpath | ||||
|         # Traverse through all the detected barcodes in image | ||||
|         for barcode in detected_barcodes: | ||||
|             if barcode.data: | ||||
|                 decoded_barcode = barcode.data.decode("utf-8") | ||||
|                 barcodes.append(decoded_barcode) | ||||
|                 logger.debug( | ||||
|                     f"Barcode of type {str(barcode.type)} found: {decoded_barcode}", | ||||
|                 ) | ||||
|  | ||||
|         return barcodes | ||||
|  | ||||
| def scan_file_for_barcodes( | ||||
|     filepath: Path, | ||||
|     mime_type: str, | ||||
| ) -> DocumentBarcodeInfo: | ||||
|     """ | ||||
|     Scan the provided pdf file for any barcodes | ||||
|     Returns a PDF filepath and a list of | ||||
|     (page_number, barcode_text) tuples | ||||
|     """ | ||||
|     def convert_from_tiff_to_pdf(self): | ||||
|         """ | ||||
|         May convert a TIFF image into a PDF, if the input is a TIFF | ||||
|         """ | ||||
|         # Nothing to do, pdf_file is already assigned correctly | ||||
|         if self.mime != "image/tiff": | ||||
|             return | ||||
|  | ||||
|     def _pdf2image_barcode_scan(pdf_filepath: str) -> List[Barcode]: | ||||
|         detected_barcodes = [] | ||||
|         # use a temporary directory in case the file is too big to handle in memory | ||||
|         with tempfile.TemporaryDirectory() as path: | ||||
|             pages_from_path = convert_from_path( | ||||
|                 pdf_filepath, | ||||
|                 dpi=300, | ||||
|                 output_folder=path, | ||||
|         with Image.open(self.file) as im: | ||||
|             has_alpha_layer = im.mode in ("RGBA", "LA") | ||||
|         if has_alpha_layer: | ||||
|             # Note the save into the temp folder, so as not to trigger a new | ||||
|             # consume | ||||
|             scratch_image = Path(self.temp_dir.name) / Path(self.file.name) | ||||
|             run( | ||||
|                 [ | ||||
|                     settings.CONVERT_BINARY, | ||||
|                     "-alpha", | ||||
|                     "off", | ||||
|                     self.file, | ||||
|                     scratch_image, | ||||
|                 ], | ||||
|             ) | ||||
|         else: | ||||
|             # Not modifying the original, safe to use in place | ||||
|             scratch_image = self.file | ||||
|  | ||||
|         self.pdf_file = Path(self.temp_dir.name) / Path(self.file.name).with_suffix( | ||||
|             ".pdf", | ||||
|         ) | ||||
|  | ||||
|         with scratch_image.open("rb") as img_file, self.pdf_file.open("wb") as pdf_file: | ||||
|             pdf_file.write(img2pdf.convert(img_file)) | ||||
|  | ||||
|     def detect(self) -> None: | ||||
|         """ | ||||
|         Scan all pages of the PDF as images, updating barcodes and the pages | ||||
|         found on as we go | ||||
|         """ | ||||
|         # Bail if barcodes already exist | ||||
|         if self.barcodes: | ||||
|             return | ||||
|  | ||||
|         # Choose the library for reading | ||||
|         if settings.CONSUMER_BARCODE_SCANNER == "PYZBAR": | ||||
|             reader = self.read_barcodes_pyzbar | ||||
|             logger.debug("Scanning for barcodes using PYZBAR") | ||||
|         else: | ||||
|             reader = self.read_barcodes_zxing | ||||
|             logger.debug("Scanning for barcodes using ZXING") | ||||
|  | ||||
|         try: | ||||
|             pages_from_path = convert_from_path( | ||||
|                 self.pdf_file, | ||||
|                 dpi=300, | ||||
|                 output_folder=self.temp_dir.name, | ||||
|             ) | ||||
|  | ||||
|             for current_page_number, page in enumerate(pages_from_path): | ||||
|                 for barcode_value in barcode_reader(page): | ||||
|                     detected_barcodes.append( | ||||
|                 for barcode_value in reader(page): | ||||
|                     self.barcodes.append( | ||||
|                         Barcode(current_page_number, barcode_value), | ||||
|                     ) | ||||
|         return detected_barcodes | ||||
|  | ||||
|     pdf_filepath = None | ||||
|     barcodes = [] | ||||
|  | ||||
|     if supported_file_type(mime_type): | ||||
|         pdf_filepath = filepath | ||||
|         if mime_type == "image/tiff": | ||||
|             pdf_filepath = convert_from_tiff_to_pdf(filepath) | ||||
|  | ||||
|         # Always try pikepdf first, it's usually fine, faster and | ||||
|         # uses less memory | ||||
|         try: | ||||
|             barcodes = _pdf2image_barcode_scan(pdf_filepath) | ||||
|         # Password protected files can't be checked | ||||
|         # This is the exception raised for those | ||||
|         except PDFPageCountError as e: | ||||
| @@ -191,141 +222,130 @@ def scan_file_for_barcodes( | ||||
|             logger.warning( | ||||
|                 f"Exception during barcode scanning: {e}", | ||||
|             ) | ||||
|     else: | ||||
|         logger.warning( | ||||
|             f"Unsupported file format for barcode reader: {str(mime_type)}", | ||||
|         ) | ||||
|  | ||||
|     return DocumentBarcodeInfo(pdf_filepath, barcodes) | ||||
|     def get_separation_pages(self) -> Dict[int, bool]: | ||||
|         """ | ||||
|         Search the parsed barcodes for separators and returns a dict of page | ||||
|         numbers, which separate the file into new files, together with the | ||||
|         information whether to keep the page. | ||||
|         """ | ||||
|         # filter all barcodes for the separator string | ||||
|         # get the page numbers of the separating barcodes | ||||
|         separator_pages = {bc.page: False for bc in self.barcodes if bc.is_separator} | ||||
|         if not settings.CONSUMER_ENABLE_ASN_BARCODE: | ||||
|             return separator_pages | ||||
|  | ||||
|         # add the page numbers of the ASN barcodes | ||||
|         # (except for first page, that might lead to infinite loops). | ||||
|         return { | ||||
|             **separator_pages, | ||||
|             **{bc.page: True for bc in self.barcodes if bc.is_asn and bc.page != 0}, | ||||
|         } | ||||
|  | ||||
| def get_separating_barcodes(barcodes: List[Barcode]) -> Dict[int, bool]: | ||||
|     """ | ||||
|     Search the parsed barcodes for separators | ||||
|     and returns a dict of page numbers, which | ||||
|     separate the file into new files, together | ||||
|     with the information whether to keep the page. | ||||
|     """ | ||||
|     # filter all barcodes for the separator string | ||||
|     # get the page numbers of the separating barcodes | ||||
|     separator_pages = {bc.page: False for bc in barcodes if bc.is_separator} | ||||
|     if not settings.CONSUMER_ENABLE_ASN_BARCODE: | ||||
|         return separator_pages | ||||
|     def separate_pages(self, pages_to_split_on: Dict[int, bool]) -> List[Path]: | ||||
|         """ | ||||
|         Separate the provided pdf file on the pages_to_split_on. | ||||
|         The pages which are defined by the keys in page_numbers | ||||
|         will be removed if the corresponding value is false. | ||||
|         Returns a list of (temporary) filepaths to consume. | ||||
|         These will need to be deleted later. | ||||
|         """ | ||||
|  | ||||
|     # add the page numbers of the ASN barcodes | ||||
|     # (except for first page, that might lead to infinite loops). | ||||
|     return { | ||||
|         **separator_pages, | ||||
|         **{bc.page: True for bc in barcodes if bc.is_asn and bc.page != 0}, | ||||
|     } | ||||
|         document_paths = [] | ||||
|         fname = self.file.with_suffix("").name | ||||
|         with Pdf.open(self.pdf_file) as input_pdf: | ||||
|             # Start with an empty document | ||||
|             current_document: List[Page] = [] | ||||
|             # A list of documents, ie a list of lists of pages | ||||
|             documents: List[List[Page]] = [current_document] | ||||
|  | ||||
|             for idx, page in enumerate(input_pdf.pages): | ||||
|                 # Keep building the new PDF as long as it is not a | ||||
|                 # separator index | ||||
|                 if idx not in pages_to_split_on: | ||||
|                     current_document.append(page) | ||||
|                     continue | ||||
|  | ||||
| def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]: | ||||
|     """ | ||||
|     Search the parsed barcodes for any ASNs. | ||||
|     The first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX | ||||
|     is considered the ASN to be used. | ||||
|     Returns the detected ASN (or None) | ||||
|     """ | ||||
|     asn = None | ||||
|                 # This is a split index | ||||
|                 # Start a new destination page listing | ||||
|                 logger.debug(f"Starting new document at idx {idx}") | ||||
|                 current_document = [] | ||||
|                 documents.append(current_document) | ||||
|                 keep_page = pages_to_split_on[idx] | ||||
|                 if keep_page: | ||||
|                     # Keep the page | ||||
|                     # (new document is started by asn barcode) | ||||
|                     current_document.append(page) | ||||
|  | ||||
|     # get the first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX | ||||
|     asn_text = next( | ||||
|         (x.value for x in barcodes if x.is_asn), | ||||
|         None, | ||||
|     ) | ||||
|             documents = [x for x in documents if len(x)] | ||||
|  | ||||
|     if asn_text: | ||||
|         logger.debug(f"Found ASN Barcode: {asn_text}") | ||||
|         # remove the prefix and remove whitespace | ||||
|         asn_text = asn_text[len(settings.CONSUMER_ASN_BARCODE_PREFIX) :].strip() | ||||
|             logger.debug(f"Split into {len(documents)} new documents") | ||||
|  | ||||
|         # now, try parsing the ASN number | ||||
|         try: | ||||
|             asn = int(asn_text) | ||||
|         except ValueError as e: | ||||
|             logger.warning(f"Failed to parse ASN number because: {e}") | ||||
|             # Write the new documents out | ||||
|             for doc_idx, document in enumerate(documents): | ||||
|                 dst = Pdf.new() | ||||
|                 dst.pages.extend(document) | ||||
|  | ||||
|     return asn | ||||
|                 output_filename = f"{fname}_document_{doc_idx}.pdf" | ||||
|  | ||||
|                 logger.debug(f"pdf no:{doc_idx} has {len(dst.pages)} pages") | ||||
|                 savepath = Path(self.temp_dir.name) / output_filename | ||||
|                 with open(savepath, "wb") as out: | ||||
|                     dst.save(out) | ||||
|                 document_paths.append(savepath) | ||||
|  | ||||
| def separate_pages(filepath: Path, pages_to_split_on: Dict[int, bool]) -> List[Path]: | ||||
|     """ | ||||
|     Separate the provided pdf file on the pages_to_split_on. | ||||
|     The pages which are defined by the keys in page_numbers | ||||
|     will be removed if the corresponding value is false. | ||||
|     Returns a list of (temporary) filepaths to consume. | ||||
|     These will need to be deleted later. | ||||
|     """ | ||||
|             return document_paths | ||||
|  | ||||
|     document_paths = [] | ||||
|     def separate( | ||||
|         self, | ||||
|         source: DocumentSource, | ||||
|         override_name: Optional[str] = None, | ||||
|     ) -> bool: | ||||
|         """ | ||||
|         Separates the document, based on barcodes and configuration, creating new | ||||
|         documents as required in the appropriate location. | ||||
|  | ||||
|     if not pages_to_split_on: | ||||
|         logger.warning("No pages to split on!") | ||||
|         return document_paths | ||||
|         Returns True if a split happened, False otherwise | ||||
|         """ | ||||
|         # Do nothing | ||||
|         if not self.supported_mime_type: | ||||
|             logger.warning(f"Unsupported file format for barcode reader: {self.mime}") | ||||
|             return False | ||||
|  | ||||
|     os.makedirs(settings.SCRATCH_DIR, exist_ok=True) | ||||
|     tempdir = Path(tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)) | ||||
|     fname = filepath.with_suffix("").name | ||||
|     pdf = Pdf.open(filepath) | ||||
|         # Does nothing unless needed | ||||
|         self.convert_from_tiff_to_pdf() | ||||
|  | ||||
|     # Start with an empty document | ||||
|     current_document: List[Page] = [] | ||||
|     # A list of documents, ie a list of lists of pages | ||||
|     documents: List[List[Page]] = [current_document] | ||||
|         # Actually read the codes, if any | ||||
|         self.detect() | ||||
|  | ||||
|     for idx, page in enumerate(pdf.pages): | ||||
|         # Keep building the new PDF as long as it is not a | ||||
|         # separator index | ||||
|         if idx not in pages_to_split_on: | ||||
|             current_document.append(page) | ||||
|             continue | ||||
|         separator_pages = self.get_separation_pages() | ||||
|  | ||||
|         # This is a split index | ||||
|         # Start a new destination page listing | ||||
|         logger.debug(f"Starting new document at idx {idx}") | ||||
|         current_document = [] | ||||
|         documents.append(current_document) | ||||
|         keep_page = pages_to_split_on[idx] | ||||
|         if keep_page: | ||||
|             # Keep the page | ||||
|             # (new document is started by asn barcode) | ||||
|             current_document.append(page) | ||||
|         # Also do nothing | ||||
|         if not separator_pages: | ||||
|             logger.warning("No pages to split on!") | ||||
|             return False | ||||
|  | ||||
|     documents = [x for x in documents if len(x)] | ||||
|         # Create the split documents | ||||
|         doc_paths = self.separate_pages(separator_pages) | ||||
|  | ||||
|     logger.debug(f"Split into {len(documents)} new documents") | ||||
|         # Save the new documents to correct folder | ||||
|         if source != DocumentSource.ConsumeFolder: | ||||
|             # The given file is somewhere in SCRATCH_DIR, | ||||
|             # and new documents must be moved to the CONSUMPTION_DIR | ||||
|             # for the consumer to notice them | ||||
|             save_to_dir = settings.CONSUMPTION_DIR | ||||
|         else: | ||||
|             # The given file is somewhere in CONSUMPTION_DIR, | ||||
|             # and may be some levels down for recursive tagging | ||||
|             # so use the file's parent to preserve any metadata | ||||
|             save_to_dir = self.file.parent | ||||
|  | ||||
|     # Write the new documents out | ||||
|     for doc_idx, document in enumerate(documents): | ||||
|         dst = Pdf.new() | ||||
|         dst.pages.extend(document) | ||||
|  | ||||
|         output_filename = f"{fname}_document_{doc_idx}.pdf" | ||||
|  | ||||
|         logger.debug(f"pdf no:{doc_idx} has {len(dst.pages)} pages") | ||||
|         savepath = tempdir / output_filename | ||||
|         with open(savepath, "wb") as out: | ||||
|             dst.save(out) | ||||
|         document_paths.append(savepath) | ||||
|  | ||||
|     return document_paths | ||||
|  | ||||
|  | ||||
| def save_to_dir( | ||||
|     filepath: Path, | ||||
|     newname: str = None, | ||||
|     target_dir: Path = settings.CONSUMPTION_DIR, | ||||
| ): | ||||
|     """ | ||||
|     Copies filepath to target_dir. | ||||
|     Optionally rename the file. | ||||
|     """ | ||||
|     if filepath.is_file() and target_dir.is_dir(): | ||||
|         dest = target_dir | ||||
|         if newname is not None: | ||||
|             dest = dest / newname | ||||
|         shutil.copy(filepath, dest) | ||||
|         logging.debug(f"saved {str(filepath)} to {str(dest)}") | ||||
|     else: | ||||
|         logger.warning(f"{str(filepath)} or {str(target_dir)} don't exist.") | ||||
|         for idx, document_path in enumerate(doc_paths): | ||||
|             if override_name is not None: | ||||
|                 newname = f"{str(idx)}_{override_name}" | ||||
|                 dest = save_to_dir / newname | ||||
|             else: | ||||
|                 dest = save_to_dir | ||||
|             logger.info(f"Saving {document_path} to {dest}") | ||||
|             shutil.copy2(document_path, dest) | ||||
|         return True | ||||
|   | ||||
| @@ -16,16 +16,15 @@ from filelock import FileLock | ||||
| from redis.exceptions import ConnectionError | ||||
| from whoosh.writing import AsyncWriter | ||||
|  | ||||
| from documents import barcodes | ||||
| from documents import index | ||||
| from documents import sanity_checker | ||||
| from documents.barcodes import BarcodeReader | ||||
| from documents.classifier import DocumentClassifier | ||||
| from documents.classifier import load_classifier | ||||
| from documents.consumer import Consumer | ||||
| from documents.consumer import ConsumerError | ||||
| from documents.data_models import ConsumableDocument | ||||
| from documents.data_models import DocumentMetadataOverrides | ||||
| from documents.data_models import DocumentSource | ||||
| from documents.file_handling import create_source_path_directory | ||||
| from documents.file_handling import generate_unique_filename | ||||
| from documents.models import Correspondent | ||||
| @@ -96,95 +95,39 @@ def consume_file( | ||||
|  | ||||
|     # read all barcodes in the current document | ||||
|     if settings.CONSUMER_ENABLE_BARCODES or settings.CONSUMER_ENABLE_ASN_BARCODE: | ||||
|         doc_barcode_info = barcodes.scan_file_for_barcodes( | ||||
|             input_doc.original_file, | ||||
|             input_doc.mime_type, | ||||
|         ) | ||||
|         with BarcodeReader(input_doc.original_file, input_doc.mime_type) as reader: | ||||
|             if settings.CONSUMER_ENABLE_BARCODES and reader.separate( | ||||
|                 input_doc.source, | ||||
|                 overrides.filename, | ||||
|             ): | ||||
|                 # notify the sender, otherwise the progress bar | ||||
|                 # in the UI stays stuck | ||||
|                 payload = { | ||||
|                     "filename": overrides.filename or input_doc.original_file.name, | ||||
|                     "task_id": None, | ||||
|                     "current_progress": 100, | ||||
|                     "max_progress": 100, | ||||
|                     "status": "SUCCESS", | ||||
|                     "message": "finished", | ||||
|                 } | ||||
|                 try: | ||||
|                     async_to_sync(get_channel_layer().group_send)( | ||||
|                         "status_updates", | ||||
|                         {"type": "status_update", "data": payload}, | ||||
|                     ) | ||||
|                 except ConnectionError as e: | ||||
|                     logger.warning(f"ConnectionError on status send: {str(e)}") | ||||
|                 # consuming stops here, since the original document with | ||||
|                 # the barcodes has been split and will be consumed separately | ||||
|  | ||||
|         # split document by separator pages, if enabled | ||||
|         if settings.CONSUMER_ENABLE_BARCODES: | ||||
|             separators = barcodes.get_separating_barcodes(doc_barcode_info.barcodes) | ||||
|                 input_doc.original_file.unlink() | ||||
|                 return "File successfully split" | ||||
|  | ||||
|             if len(separators) > 0: | ||||
|                 logger.debug( | ||||
|                     f"Pages with separators found in: {input_doc.original_file}", | ||||
|                 ) | ||||
|                 document_list = barcodes.separate_pages( | ||||
|                     doc_barcode_info.pdf_path, | ||||
|                     separators, | ||||
|                 ) | ||||
|  | ||||
|                 if document_list: | ||||
|                     # If the file is an upload, it's in the scratch directory | ||||
|                     # Move it to consume directory to be picked up | ||||
|                     # Otherwise, use the current parent to keep possible tags | ||||
|                     # from subdirectories | ||||
|                     if input_doc.source != DocumentSource.ConsumeFolder: | ||||
|                         save_to_dir = settings.CONSUMPTION_DIR | ||||
|                     else: | ||||
|                         # Note this uses the original file, because it's in the | ||||
|                         # consume folder already and may include additional path | ||||
|                         # components for tagging | ||||
|                         # the .path is somewhere in scratch in this case | ||||
|                         save_to_dir = input_doc.original_file.parent | ||||
|  | ||||
|                     for n, document in enumerate(document_list): | ||||
|                         # save to consumption dir | ||||
|                         # rename it to the original filename  with number prefix | ||||
|                         if overrides.filename is not None: | ||||
|                             newname = f"{str(n)}_{overrides.filename}" | ||||
|                         else: | ||||
|                             newname = None | ||||
|  | ||||
|                         barcodes.save_to_dir( | ||||
|                             document, | ||||
|                             newname=newname, | ||||
|                             target_dir=save_to_dir, | ||||
|                         ) | ||||
|  | ||||
|                         # Split file has been copied safely, remove it | ||||
|                         document.unlink() | ||||
|  | ||||
|                     # And clean up the directory as well, now it's empty | ||||
|                     shutil.rmtree(document_list[0].parent) | ||||
|  | ||||
|                     # This file has been split into multiple files without issue | ||||
|                     # remove the original and working copy | ||||
|                     input_doc.original_file.unlink() | ||||
|  | ||||
|                     # If the original file was a TIFF, remove the PDF generated from it | ||||
|                     if input_doc.mime_type == "image/tiff": | ||||
|                         logger.debug( | ||||
|                             f"Deleting file {doc_barcode_info.pdf_path}", | ||||
|                         ) | ||||
|                         doc_barcode_info.pdf_path.unlink() | ||||
|  | ||||
|                     # notify the sender, otherwise the progress bar | ||||
|                     # in the UI stays stuck | ||||
|                     payload = { | ||||
|                         "filename": overrides.filename or input_doc.original_file.name, | ||||
|                         "task_id": None, | ||||
|                         "current_progress": 100, | ||||
|                         "max_progress": 100, | ||||
|                         "status": "SUCCESS", | ||||
|                         "message": "finished", | ||||
|                     } | ||||
|                     try: | ||||
|                         async_to_sync(get_channel_layer().group_send)( | ||||
|                             "status_updates", | ||||
|                             {"type": "status_update", "data": payload}, | ||||
|                         ) | ||||
|                     except ConnectionError as e: | ||||
|                         logger.warning(f"ConnectionError on status send: {str(e)}") | ||||
|                     # consuming stops here, since the original document with | ||||
|                     # the barcodes has been split and will be consumed separately | ||||
|                     return "File successfully split" | ||||
|  | ||||
|         # try reading the ASN from barcode | ||||
|         if settings.CONSUMER_ENABLE_ASN_BARCODE: | ||||
|             overrides.asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes) | ||||
|             if overrides.asn: | ||||
|                 logger.info(f"Found ASN in barcode: {overrides.asn}") | ||||
|             # try reading the ASN from barcode | ||||
|             if settings.CONSUMER_ENABLE_ASN_BARCODE: | ||||
|                 overrides.asn = reader.asn | ||||
|                 if overrides.asn: | ||||
|                     logger.info(f"Found ASN in barcode: {overrides.asn}") | ||||
|  | ||||
|     # continue with consumption if no barcode was found | ||||
|     document = Consumer().try_consume_file( | ||||
|   | ||||
| Before Width: | Height: | Size: 836 B | 
| Before Width: | Height: | Size: 1.2 KiB | 
| Before Width: | Height: | Size: 33 KiB | 
| Before Width: | Height: | Size: 39 KiB | 
| Before Width: | Height: | Size: 9.5 KiB | 
| Before Width: | Height: | Size: 891 B | 
| Before Width: | Height: | Size: 12 KiB | 
| Before Width: | Height: | Size: 21 KiB | 
| Before Width: | Height: | Size: 11 KiB | 
| Before Width: | Height: | Size: 1.3 KiB | 
| Before Width: | Height: | Size: 337 B | 
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/barcodes/patch-code-t-middle-distorted.pdf
									
									
									
									
									
										Executable file
									
								
							
							
						
						
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/barcodes/patch-code-t-middle-fuzzy.pdf
									
									
									
									
									
										Executable file
									
								
							
							
						
						
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/barcodes/patch-code-t-middle-unreadable.pdf
									
									
									
									
									
										Executable file
									
								
							
							
						
						| Before Width: | Height: | Size: 7.4 KiB | 
 Trenton H
					Trenton H