From 5004771d79247fd0b01a9b1dd98f158ca306b7d1 Mon Sep 17 00:00:00 2001
From: Peter Kappelt <kappelt.peter@gmail.com>
Date: Sun, 15 Jan 2023 16:15:06 +0100
Subject: [PATCH] Unified separator and ASN barcode parsing so that barcode
 parsing won't run twice

---
 src/documents/barcodes.py |  79 ++++++++++------------
 src/documents/tasks.py    | 137 +++++++++++++++++++-------------------
 2 files changed, 106 insertions(+), 110 deletions(-)

diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py
index 638dfed6e..1bc0075ce 100644
--- a/src/documents/barcodes.py
+++ b/src/documents/barcodes.py
@@ -194,25 +194,51 @@ def scan_file_for_barcodes(
     return pdf_filepath, barcodes
 
 
-def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], List[int]]:
+def get_separating_barcodes(barcodes: List[Tuple[int, str]]) -> List[int]:
     """
-    Scan the provided pdf file for page separating barcodes
-    Returns a PDF filepath and a list of pagenumbers,
-    which separate the file into new files
+    Search the parsed (page number, text) barcodes for separators, i.e.
+    texts equal to CONSUMER_BARCODE_STRING, and return the list of page
+    numbers which separate the file into new files
     """
-    separator_page_numbers = []
-
-    pdf_filepath, barcodes = scan_file_for_barcodes(filepath)
-
     # filter all barcodes for the separator string
     separator_barcodes = list(
         filter(lambda bc: bc[1] == settings.CONSUMER_BARCODE_STRING, barcodes),
     )
-
     # get the page numbers of the separating barcodes
     separator_page_numbers = [page for page, _ in separator_barcodes]
 
-    return pdf_filepath, separator_page_numbers
+    return separator_page_numbers
+
+
+def get_asn_from_barcodes(barcodes: List[Tuple[int, str]]) -> Optional[int]:
+    """
+    Search the parsed (page number, text) barcodes for an ASN.
+    The first barcode whose text starts with CONSUMER_ASN_BARCODE_PREFIX
+    is considered the ASN to be used; later matches are ignored.
+    Returns the ASN as int, or None if none is found or parsing fails
+    """
+    asn = None
+
+    # only the barcode text is important here -> discard the page number
+    barcodes = [text for _, text in barcodes]
+    # get the first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
+    asn_text = next(
+        (x for x in barcodes if x.startswith(settings.CONSUMER_ASN_BARCODE_PREFIX)),
+        None,
+    )
+
+    if asn_text:
+        logger.debug(f"Found ASN Barcode: {asn_text}")
+        # remove the prefix and remove whitespace
+        asn_text = asn_text[len(settings.CONSUMER_ASN_BARCODE_PREFIX) :].strip()
+
+        # a malformed ASN is not fatal: warn and fall through with asn = None
+        try:
+            asn = int(asn_text)
+        except ValueError as e:
+            logger.warning(f"Failed to parse ASN number because: {e}")
+
+    return asn
 
 
 def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
@@ -293,36 +319,3 @@ def save_to_dir(
             os.rename(dst, dst_new)
     else:
         logger.warning(f"{str(filepath)} or {str(target_dir)} don't exist.")
-
-
-def scan_file_for_asn_barcode(filepath: str) -> Tuple[Optional[str], Optional[int]]:
-    """
-    Scan the provided pdf file for barcodes that contain the ASN
-    for this document.
-    The first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
-    is considered the ASN to be used.
-    Returns a PDF filepath and the detected ASN (or None)
-    """
-    asn = None
-
-    pdf_filepath, barcodes = scan_file_for_barcodes(filepath)
-    # only the barcode text is important here -> discard the page number
-    barcodes = [text for _, text in barcodes]
-    # get the first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
-    asn_text = next(
-        (x for x in barcodes if x.startswith(settings.CONSUMER_ASN_BARCODE_PREFIX))
-    )
-
-    logger.debug(f"Found ASN Barcode: {asn_text}")
-
-    if asn_text:
-        # remove the prefix and remove whitespace
-        asn_text = asn_text[len(settings.CONSUMER_ASN_BARCODE_PREFIX) :].strip()
-
-        # now, try parsing the ASN number
-        try:
-            asn = int(asn_text)
-        except ValueError as e:
-            logger.warn(f"Failed to parse ASN number because: {e}")
-
-    return pdf_filepath, asn
diff --git a/src/documents/tasks.py b/src/documents/tasks.py
index 1b7f15d5a..7f4c8e125 100644
--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@@ -98,6 +98,7 @@ def consume_file(
 ):
 
     path = Path(path).resolve()
+    asn = None
 
     # Celery converts this to a string, but everything expects a datetime
     # Long term solution is to not use JSON for the serializer but pickle instead
@@ -109,78 +110,80 @@ def consume_file(
         except Exception:
             pass
 
-    # check for separators in current document
-    if settings.CONSUMER_ENABLE_BARCODES:
+    # read all barcodes in the current document
+    if settings.CONSUMER_ENABLE_BARCODES or settings.CONSUMER_ENABLE_ASN_BARCODE:
+        pdf_filepath, parsed_barcodes = barcodes.scan_file_for_barcodes(path)
 
-        pdf_filepath, separators = barcodes.scan_file_for_separating_barcodes(path)
+        # split document by separator pages, if enabled
+        if settings.CONSUMER_ENABLE_BARCODES:
+            separators = barcodes.get_separating_barcodes(parsed_barcodes)
 
-        if separators:
-            logger.debug(
-                f"Pages with separators found in: {str(path)}",
-            )
-            document_list = barcodes.separate_pages(pdf_filepath, separators)
+            if len(separators) > 0:
+                logger.debug(
+                    f"Pages with separators found in: {str(path)}",
+                )
+                document_list = barcodes.separate_pages(pdf_filepath, separators)
 
-            if document_list:
-                for n, document in enumerate(document_list):
-                    # save to consumption dir
-                    # rename it to the original filename  with number prefix
-                    if override_filename:
-                        newname = f"{str(n)}_" + override_filename
-                    else:
-                        newname = None
+                if document_list:
+                    for n, document in enumerate(document_list):
+                        # save to consumption dir
+                        # rename it to the original filename  with number prefix
+                        if override_filename:
+                            newname = f"{str(n)}_" + override_filename
+                        else:
+                            newname = None
 
-                    # If the file is an upload, it's in the scratch directory
-                    # Move it to consume directory to be picked up
-                    # Otherwise, use the current parent to keep possible tags
-                    # from subdirectories
+                        # If the file is an upload, it's in the scratch directory
+                        # Move it to consume directory to be picked up
+                        # Otherwise, use the current parent to keep possible tags
+                        # from subdirectories
+                        try:
+                            # is_relative_to would be nicer, but new in 3.9
+                            _ = path.relative_to(settings.SCRATCH_DIR)
+                            save_to_dir = settings.CONSUMPTION_DIR
+                        except ValueError:
+                            save_to_dir = path.parent
+
+                        barcodes.save_to_dir(
+                            document,
+                            newname=newname,
+                            target_dir=save_to_dir,
+                        )
+
+                    # Delete the PDF file which was split
+                    os.remove(pdf_filepath)
+
+                    # If the original was a TIFF, remove the original file as well
+                    if str(pdf_filepath) != str(path):
+                        logger.debug(f"Deleting file {path}")
+                        os.unlink(path)
+
+                    # notify the sender, otherwise the progress bar
+                    # in the UI stays stuck
+                    payload = {
+                        "filename": override_filename,
+                        "task_id": task_id,
+                        "current_progress": 100,
+                        "max_progress": 100,
+                        "status": "SUCCESS",
+                        "message": "finished",
+                    }
                     try:
-                        # is_relative_to would be nicer, but new in 3.9
-                        _ = path.relative_to(settings.SCRATCH_DIR)
-                        save_to_dir = settings.CONSUMPTION_DIR
-                    except ValueError:
-                        save_to_dir = path.parent
+                        async_to_sync(get_channel_layer().group_send)(
+                            "status_updates",
+                            {"type": "status_update", "data": payload},
+                        )
+                    except ConnectionError as e:
+                        logger.warning(f"ConnectionError on status send: {str(e)}")
+                    # consuming stops here, since the original document with
+                    # the barcodes has been split and will be consumed separately
+                    return "File successfully split"
 
-                    barcodes.save_to_dir(
-                        document,
-                        newname=newname,
-                        target_dir=save_to_dir,
-                    )
-
-                # Delete the PDF file which was split
-                os.remove(pdf_filepath)
-
-                # If the original was a TIFF, remove the original file as well
-                if str(pdf_filepath) != str(path):
-                    logger.debug(f"Deleting file {path}")
-                    os.unlink(path)
-
-                # notify the sender, otherwise the progress bar
-                # in the UI stays stuck
-                payload = {
-                    "filename": override_filename,
-                    "task_id": task_id,
-                    "current_progress": 100,
-                    "max_progress": 100,
-                    "status": "SUCCESS",
-                    "message": "finished",
-                }
-                try:
-                    async_to_sync(get_channel_layer().group_send)(
-                        "status_updates",
-                        {"type": "status_update", "data": payload},
-                    )
-                except ConnectionError as e:
-                    logger.warning(f"ConnectionError on status send: {str(e)}")
-                # consuming stops here, since the original document with
-                # the barcodes has been split and will be consumed separately
-                return "File successfully split"
-
-    # try reading ASN barcodes
-    asn = None
-    if settings.CONSUMER_ENABLE_ASN_BARCODE:
-        _, asn = barcodes.scan_file_for_asn_barcode(path)
-        if asn:
-            logger.info(f"Using ASN {asn} from barcode")
+        # try reading the ASN from barcode
+        if settings.CONSUMER_ENABLE_ASN_BARCODE:
+            asn = barcodes.get_asn_from_barcodes(parsed_barcodes)
+            if asn:
+                logger.info(f"Found ASN in barcode: {asn}")
 
     # continue with consumption if no barcode was found
     document = Consumer().try_consume_file(
@@ -192,7 +195,7 @@ def consume_file(
         override_tag_ids=override_tag_ids,
         task_id=task_id,
         override_created=override_created,
-        override_asn=asn
+        override_asn=asn,
     )
 
     if document: