From 5004771d79247fd0b01a9b1dd98f158ca306b7d1 Mon Sep 17 00:00:00 2001 From: Peter Kappelt Date: Sun, 15 Jan 2023 16:15:06 +0100 Subject: [PATCH] Unified separator and ASN barcode parsing so that barcode parsing won't run twice --- src/documents/barcodes.py | 79 ++++++++++------------ src/documents/tasks.py | 137 +++++++++++++++++++------------------- 2 files changed, 106 insertions(+), 110 deletions(-) diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py index 638dfed6e..1bc0075ce 100644 --- a/src/documents/barcodes.py +++ b/src/documents/barcodes.py @@ -194,25 +194,51 @@ def scan_file_for_barcodes( return pdf_filepath, barcodes -def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], List[int]]: +def get_separating_barcodes(barcodes: List[Tuple[int, str]]) -> List[int]: """ - Scan the provided pdf file for page separating barcodes - Returns a PDF filepath and a list of pagenumbers, - which separate the file into new files + Search the parsed barcodes for separators + and returns a list of pagenumbers, which + separate the file into new files """ - separator_page_numbers = [] - - pdf_filepath, barcodes = scan_file_for_barcodes(filepath) - # filter all barcodes for the separator string separator_barcodes = list( filter(lambda bc: bc[1] == settings.CONSUMER_BARCODE_STRING, barcodes), ) - # get the page numbers of the separating barcodes separator_page_numbers = [page for page, _ in separator_barcodes] - return pdf_filepath, separator_page_numbers + return separator_page_numbers + + +def get_asn_from_barcodes(barcodes: List[Tuple[int, str]]) -> Optional[int]: + """ + Search the parsed barcodes for any ASNs. + The first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX + is considered the ASN to be used. 
+ Returns the detected ASN (or None) + """ + asn = None + + # only the barcode text is important here -> discard the page number + barcodes = [text for _, text in barcodes] + # get the first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX + asn_text = next( + (x for x in barcodes if x.startswith(settings.CONSUMER_ASN_BARCODE_PREFIX)), + None, + ) + + if asn_text: + logger.debug(f"Found ASN Barcode: {asn_text}") + # remove the prefix and remove whitespace + asn_text = asn_text[len(settings.CONSUMER_ASN_BARCODE_PREFIX) :].strip() + + # now, try parsing the ASN number + try: + asn = int(asn_text) + except ValueError as e: + logger.warn(f"Failed to parse ASN number because: {e}") + + return asn def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]: @@ -293,36 +319,3 @@ def save_to_dir( os.rename(dst, dst_new) else: logger.warning(f"{str(filepath)} or {str(target_dir)} don't exist.") - - -def scan_file_for_asn_barcode(filepath: str) -> Tuple[Optional[str], Optional[int]]: - """ - Scan the provided pdf file for barcodes that contain the ASN - for this document. - The first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX - is considered the ASN to be used. 
- Returns a PDF filepath and the detected ASN (or None) - """ - asn = None - - pdf_filepath, barcodes = scan_file_for_barcodes(filepath) - # only the barcode text is important here -> discard the page number - barcodes = [text for _, text in barcodes] - # get the first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX - asn_text = next( - (x for x in barcodes if x.startswith(settings.CONSUMER_ASN_BARCODE_PREFIX)) - ) - - logger.debug(f"Found ASN Barcode: {asn_text}") - - if asn_text: - # remove the prefix and remove whitespace - asn_text = asn_text[len(settings.CONSUMER_ASN_BARCODE_PREFIX) :].strip() - - # now, try parsing the ASN number - try: - asn = int(asn_text) - except ValueError as e: - logger.warn(f"Failed to parse ASN number because: {e}") - - return pdf_filepath, asn diff --git a/src/documents/tasks.py b/src/documents/tasks.py index 1b7f15d5a..7f4c8e125 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -98,6 +98,7 @@ def consume_file( ): path = Path(path).resolve() + asn = None # Celery converts this to a string, but everything expects a datetime # Long term solution is to not use JSON for the serializer but pickle instead @@ -109,78 +110,80 @@ def consume_file( except Exception: pass - # check for separators in current document - if settings.CONSUMER_ENABLE_BARCODES: + # read all barcodes in the current document + if settings.CONSUMER_ENABLE_BARCODES or settings.CONSUMER_ENABLE_ASN_BARCODE: + pdf_filepath, parsed_barcodes = barcodes.scan_file_for_barcodes(path) - pdf_filepath, separators = barcodes.scan_file_for_separating_barcodes(path) + # split document by separator pages, if enabled + if settings.CONSUMER_ENABLE_BARCODES: + separators = barcodes.get_separating_barcodes(parsed_barcodes) - if separators: - logger.debug( - f"Pages with separators found in: {str(path)}", - ) - document_list = barcodes.separate_pages(pdf_filepath, separators) + if len(separators) > 0: + logger.debug( + f"Pages with separators found in: {str(path)}", + 
) + document_list = barcodes.separate_pages(pdf_filepath, separators) - if document_list: - for n, document in enumerate(document_list): - # save to consumption dir - # rename it to the original filename with number prefix - if override_filename: - newname = f"{str(n)}_" + override_filename - else: - newname = None + if document_list: + for n, document in enumerate(document_list): + # save to consumption dir + # rename it to the original filename with number prefix + if override_filename: + newname = f"{str(n)}_" + override_filename + else: + newname = None - # If the file is an upload, it's in the scratch directory - # Move it to consume directory to be picked up - # Otherwise, use the current parent to keep possible tags - # from subdirectories + # If the file is an upload, it's in the scratch directory + # Move it to consume directory to be picked up + # Otherwise, use the current parent to keep possible tags + # from subdirectories + try: + # is_relative_to would be nicer, but new in 3.9 + _ = path.relative_to(settings.SCRATCH_DIR) + save_to_dir = settings.CONSUMPTION_DIR + except ValueError: + save_to_dir = path.parent + + barcodes.save_to_dir( + document, + newname=newname, + target_dir=save_to_dir, + ) + + # Delete the PDF file which was split + os.remove(pdf_filepath) + + # If the original was a TIFF, remove the original file as well + if str(pdf_filepath) != str(path): + logger.debug(f"Deleting file {path}") + os.unlink(path) + + # notify the sender, otherwise the progress bar + # in the UI stays stuck + payload = { + "filename": override_filename, + "task_id": task_id, + "current_progress": 100, + "max_progress": 100, + "status": "SUCCESS", + "message": "finished", + } try: - # is_relative_to would be nicer, but new in 3.9 - _ = path.relative_to(settings.SCRATCH_DIR) - save_to_dir = settings.CONSUMPTION_DIR - except ValueError: - save_to_dir = path.parent + async_to_sync(get_channel_layer().group_send)( + "status_updates", + {"type": "status_update", 
"data": payload}, + ) + except ConnectionError as e: + logger.warning(f"ConnectionError on status send: {str(e)}") + # consuming stops here, since the original document with + # the barcodes has been split and will be consumed separately + return "File successfully split" - barcodes.save_to_dir( - document, - newname=newname, - target_dir=save_to_dir, - ) - - # Delete the PDF file which was split - os.remove(pdf_filepath) - - # If the original was a TIFF, remove the original file as well - if str(pdf_filepath) != str(path): - logger.debug(f"Deleting file {path}") - os.unlink(path) - - # notify the sender, otherwise the progress bar - # in the UI stays stuck - payload = { - "filename": override_filename, - "task_id": task_id, - "current_progress": 100, - "max_progress": 100, - "status": "SUCCESS", - "message": "finished", - } - try: - async_to_sync(get_channel_layer().group_send)( - "status_updates", - {"type": "status_update", "data": payload}, - ) - except ConnectionError as e: - logger.warning(f"ConnectionError on status send: {str(e)}") - # consuming stops here, since the original document with - # the barcodes has been split and will be consumed separately - return "File successfully split" - - # try reading ASN barcodes - asn = None - if settings.CONSUMER_ENABLE_ASN_BARCODE: - _, asn = barcodes.scan_file_for_asn_barcode(path) - if asn: - logger.info(f"Using ASN {asn} from barcode") + # try reading the ASN from barcode + if settings.CONSUMER_ENABLE_ASN_BARCODE: + asn = barcodes.get_asn_from_barcodes(parsed_barcodes) + if asn: + logger.info(f"Found ASN in barcode: {asn}") # continue with consumption if no barcode was found document = Consumer().try_consume_file( @@ -192,7 +195,7 @@ def consume_file( override_tag_ids=override_tag_ids, task_id=task_id, override_created=override_created, - override_asn=asn + override_asn=asn, ) if document: