Unified separator and ASN barcode parsing

so that barcode parsing won't run twice
This commit is contained in:
Peter Kappelt 2023-01-15 16:15:06 +01:00 committed by Trenton H
parent 92b9fc1ba9
commit 5004771d79
2 changed files with 106 additions and 110 deletions

View File

@ -194,25 +194,51 @@ def scan_file_for_barcodes(
return pdf_filepath, barcodes
def get_separating_barcodes(barcodes: List[Tuple[int, str]]) -> List[int]:
    """
    Search already-parsed barcodes for page separators.

    ``barcodes`` is a list of ``(page_number, barcode_text)`` tuples.
    A barcode counts as a separator when its text is exactly equal to
    ``settings.CONSUMER_BARCODE_STRING``.

    Returns the page numbers of all separator barcodes, in the order
    they appear in ``barcodes``. The list is empty if there are none.
    """
    return [
        page
        for page, text in barcodes
        if text == settings.CONSUMER_BARCODE_STRING
    ]


def scan_file_for_separating_barcodes(
    filepath: str,
) -> Tuple[Optional[str], List[int]]:
    """
    Scan the provided pdf file for page separating barcodes.

    Thin wrapper kept for existing callers: it scans the file with
    ``scan_file_for_barcodes`` and then filters the result with
    ``get_separating_barcodes``.

    Returns a PDF filepath and a list of pagenumbers,
    which separate the file into new files.
    """
    # NOTE: this scans the file on every call; callers that need more than
    # the separators should scan once and filter the parsed barcodes instead.
    pdf_filepath, barcodes = scan_file_for_barcodes(filepath)
    return pdf_filepath, get_separating_barcodes(barcodes)
def get_asn_from_barcodes(barcodes: List[Tuple[int, str]]) -> Optional[int]:
    """
    Search the parsed barcodes for an ASN.

    ``barcodes`` is a list of ``(page_number, barcode_text)`` tuples.
    The first barcode whose text starts with
    ``settings.CONSUMER_ASN_BARCODE_PREFIX`` is considered the ASN to be used.

    Returns the detected ASN as an int, or None when no barcode carries the
    prefix or when the text after the prefix is not a valid integer.
    """
    prefix = settings.CONSUMER_ASN_BARCODE_PREFIX

    # only the barcode text is important here -> the page number is discarded
    asn_text = next(
        (text for _, text in barcodes if text.startswith(prefix)),
        None,
    )
    if not asn_text:
        return None

    logger.debug(f"Found ASN Barcode: {asn_text}")

    # remove the prefix and remove whitespace
    asn_text = asn_text[len(prefix) :].strip()

    # now, try parsing the ASN number
    try:
        return int(asn_text)
    except ValueError as e:
        # a malformed ASN is not fatal: log it and carry on without an ASN
        logger.warning(f"Failed to parse ASN number because: {e}")
        return None
def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
@ -293,36 +319,3 @@ def save_to_dir(
os.rename(dst, dst_new)
else:
logger.warning(f"{str(filepath)} or {str(target_dir)} don't exist.")
def scan_file_for_asn_barcode(filepath: str) -> Tuple[Optional[str], Optional[int]]:
    """
    Scan the provided pdf file for barcodes that contain the ASN
    for this document.

    The first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
    is considered the ASN to be used.

    Returns a PDF filepath and the detected ASN. The ASN is None when no
    barcode carries the prefix or when the text after the prefix is not a
    valid integer.
    """
    asn = None

    pdf_filepath, barcodes = scan_file_for_barcodes(filepath)
    prefix = settings.CONSUMER_ASN_BARCODE_PREFIX

    # only the barcode text is important here -> the page number is discarded.
    # The explicit None default matters: without it, a document without any
    # ASN barcode would raise StopIteration instead of yielding "no ASN".
    asn_text = next(
        (text for _, text in barcodes if text.startswith(prefix)),
        None,
    )

    if asn_text:
        logger.debug(f"Found ASN Barcode: {asn_text}")

        # remove the prefix and remove whitespace
        asn_text = asn_text[len(prefix) :].strip()

        # now, try parsing the ASN number
        try:
            asn = int(asn_text)
        except ValueError as e:
            # a malformed ASN is not fatal: log it and carry on without an ASN
            logger.warning(f"Failed to parse ASN number because: {e}")

    return pdf_filepath, asn

View File

@ -98,6 +98,7 @@ def consume_file(
):
path = Path(path).resolve()
asn = None
# Celery converts this to a string, but everything expects a datetime
# Long term solution is to not use JSON for the serializer but pickle instead
@ -109,12 +110,15 @@ def consume_file(
except Exception:
pass
# check for separators in current document
# read all barcodes in the current document
if settings.CONSUMER_ENABLE_BARCODES or settings.CONSUMER_ENABLE_ASN_BARCODE:
pdf_filepath, parsed_barcodes = barcodes.scan_file_for_barcodes(path)
# split document by separator pages, if enabled
if settings.CONSUMER_ENABLE_BARCODES:
separators = barcodes.get_separating_barcodes(parsed_barcodes)
pdf_filepath, separators = barcodes.scan_file_for_separating_barcodes(path)
if separators:
if len(separators) > 0:
logger.debug(
f"Pages with separators found in: {str(path)}",
)
@ -175,12 +179,11 @@ def consume_file(
# the barcodes has been split and will be consumed separately
return "File successfully split"
# try reading ASN barcodes
asn = None
# try reading the ASN from barcode
if settings.CONSUMER_ENABLE_ASN_BARCODE:
_, asn = barcodes.scan_file_for_asn_barcode(path)
asn = barcodes.get_asn_from_barcodes(parsed_barcodes)
if asn:
logger.info(f"Using ASN {asn} from barcode")
logger.info(f"Found ASN in barcode: {asn}")
# continue with consumption if no barcode was found
document = Consumer().try_consume_file(
@ -192,7 +195,7 @@ def consume_file(
override_tag_ids=override_tag_ids,
task_id=task_id,
override_created=override_created,
override_asn=asn
override_asn=asn,
)
if document: