Unified separator and ASN barcode parsing

so that barcode parsing won't run twice
This commit is contained in:
Peter Kappelt 2023-01-15 16:15:06 +01:00 committed by Trenton H
parent 92b9fc1ba9
commit 5004771d79
2 changed files with 106 additions and 110 deletions

View File

@ -194,25 +194,51 @@ def scan_file_for_barcodes(
return pdf_filepath, barcodes return pdf_filepath, barcodes
def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], List[int]]: def get_separating_barcodes(barcodes: List[Tuple[int, str]]) -> List[int]:
""" """
Scan the provided pdf file for page separating barcodes Search the parsed barcodes for separators
Returns a PDF filepath and a list of pagenumbers, and returns a list of pagenumbers, which
which separate the file into new files separate the file into new files
""" """
separator_page_numbers = []
pdf_filepath, barcodes = scan_file_for_barcodes(filepath)
# filter all barcodes for the separator string # filter all barcodes for the separator string
separator_barcodes = list( separator_barcodes = list(
filter(lambda bc: bc[1] == settings.CONSUMER_BARCODE_STRING, barcodes), filter(lambda bc: bc[1] == settings.CONSUMER_BARCODE_STRING, barcodes),
) )
# get the page numbers of the separating barcodes # get the page numbers of the separating barcodes
separator_page_numbers = [page for page, _ in separator_barcodes] separator_page_numbers = [page for page, _ in separator_barcodes]
return pdf_filepath, separator_page_numbers return separator_page_numbers
def get_asn_from_barcodes(barcodes: List[Tuple[int, str]]) -> Optional[int]:
    """
    Search the parsed barcodes for any ASNs.

    The first barcode whose text starts with CONSUMER_ASN_BARCODE_PREFIX
    is considered the ASN to be used; later matches are ignored.

    :param barcodes: (page number, barcode text) pairs
    :return: the detected ASN, or None if no barcode carries the prefix
             or the text after the prefix is not a valid integer
    """
    prefix = settings.CONSUMER_ASN_BARCODE_PREFIX

    # only the barcode text is important here -> the page number is discarded.
    # The explicit None default keeps next() from raising StopIteration
    # when no barcode matches.
    asn_text = next(
        (text for _, text in barcodes if text.startswith(prefix)),
        None,
    )

    if not asn_text:
        return None

    logger.debug(f"Found ASN Barcode: {asn_text}")

    # remove the prefix and surrounding whitespace
    asn_text = asn_text[len(prefix):].strip()

    # now, try parsing the ASN number
    try:
        return int(asn_text)
    except ValueError as e:
        # logger.warn is a deprecated alias; use logger.warning
        logger.warning(f"Failed to parse ASN number because: {e}")
        return None
def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]: def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
@ -293,36 +319,3 @@ def save_to_dir(
os.rename(dst, dst_new) os.rename(dst, dst_new)
else: else:
logger.warning(f"{str(filepath)} or {str(target_dir)} don't exist.") logger.warning(f"{str(filepath)} or {str(target_dir)} don't exist.")
def scan_file_for_asn_barcode(filepath: str) -> Tuple[Optional[str], Optional[int]]:
    """
    Scan the provided pdf file for barcodes that contain the ASN
    for this document.

    The first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
    is considered the ASN to be used.

    :param filepath: path of the file handed to scan_file_for_barcodes
    :return: the PDF filepath reported by scan_file_for_barcodes and the
             detected ASN (None if no barcode carries the prefix or the
             text after the prefix is not a valid integer)
    """
    asn = None

    pdf_filepath, barcodes = scan_file_for_barcodes(filepath)

    prefix = settings.CONSUMER_ASN_BARCODE_PREFIX

    # only the barcode text is important here -> the page number is discarded.
    # The None default is required: without it next() raises StopIteration
    # for every document that has no ASN barcode.
    asn_text = next(
        (text for _, text in barcodes if text.startswith(prefix)),
        None,
    )

    if asn_text:
        # only log once a matching barcode actually exists
        logger.debug(f"Found ASN Barcode: {asn_text}")

        # remove the prefix and surrounding whitespace
        asn_text = asn_text[len(prefix):].strip()

        # now, try parsing the ASN number
        try:
            asn = int(asn_text)
        except ValueError as e:
            # logger.warn is a deprecated alias; use logger.warning
            logger.warning(f"Failed to parse ASN number because: {e}")

    return pdf_filepath, asn

View File

@ -98,6 +98,7 @@ def consume_file(
): ):
path = Path(path).resolve() path = Path(path).resolve()
asn = None
# Celery converts this to a string, but everything expects a datetime # Celery converts this to a string, but everything expects a datetime
# Long term solution is to not use JSON for the serializer but pickle instead # Long term solution is to not use JSON for the serializer but pickle instead
@ -109,78 +110,80 @@ def consume_file(
except Exception: except Exception:
pass pass
# check for separators in current document # read all barcodes in the current document
if settings.CONSUMER_ENABLE_BARCODES: if settings.CONSUMER_ENABLE_BARCODES or settings.CONSUMER_ENABLE_ASN_BARCODE:
pdf_filepath, parsed_barcodes = barcodes.scan_file_for_barcodes(path)
pdf_filepath, separators = barcodes.scan_file_for_separating_barcodes(path) # split document by separator pages, if enabled
if settings.CONSUMER_ENABLE_BARCODES:
separators = barcodes.get_separating_barcodes(parsed_barcodes)
if separators: if len(separators) > 0:
logger.debug( logger.debug(
f"Pages with separators found in: {str(path)}", f"Pages with separators found in: {str(path)}",
) )
document_list = barcodes.separate_pages(pdf_filepath, separators) document_list = barcodes.separate_pages(pdf_filepath, separators)
if document_list: if document_list:
for n, document in enumerate(document_list): for n, document in enumerate(document_list):
# save to consumption dir # save to consumption dir
# rename it to the original filename with number prefix # rename it to the original filename with number prefix
if override_filename: if override_filename:
newname = f"{str(n)}_" + override_filename newname = f"{str(n)}_" + override_filename
else: else:
newname = None newname = None
# If the file is an upload, it's in the scratch directory # If the file is an upload, it's in the scratch directory
# Move it to consume directory to be picked up # Move it to consume directory to be picked up
# Otherwise, use the current parent to keep possible tags # Otherwise, use the current parent to keep possible tags
# from subdirectories # from subdirectories
try:
# is_relative_to would be nicer, but new in 3.9
_ = path.relative_to(settings.SCRATCH_DIR)
save_to_dir = settings.CONSUMPTION_DIR
except ValueError:
save_to_dir = path.parent
barcodes.save_to_dir(
document,
newname=newname,
target_dir=save_to_dir,
)
# Delete the PDF file which was split
os.remove(pdf_filepath)
# If the original was a TIFF, remove the original file as well
if str(pdf_filepath) != str(path):
logger.debug(f"Deleting file {path}")
os.unlink(path)
# notify the sender, otherwise the progress bar
# in the UI stays stuck
payload = {
"filename": override_filename,
"task_id": task_id,
"current_progress": 100,
"max_progress": 100,
"status": "SUCCESS",
"message": "finished",
}
try: try:
# is_relative_to would be nicer, but new in 3.9 async_to_sync(get_channel_layer().group_send)(
_ = path.relative_to(settings.SCRATCH_DIR) "status_updates",
save_to_dir = settings.CONSUMPTION_DIR {"type": "status_update", "data": payload},
except ValueError: )
save_to_dir = path.parent except ConnectionError as e:
logger.warning(f"ConnectionError on status send: {str(e)}")
# consuming stops here, since the original document with
# the barcodes has been split and will be consumed separately
return "File successfully split"
barcodes.save_to_dir( # try reading the ASN from barcode
document, if settings.CONSUMER_ENABLE_ASN_BARCODE:
newname=newname, asn = barcodes.get_asn_from_barcodes(parsed_barcodes)
target_dir=save_to_dir, if asn:
) logger.info(f"Found ASN in barcode: {asn}")
# Delete the PDF file which was split
os.remove(pdf_filepath)
# If the original was a TIFF, remove the original file as well
if str(pdf_filepath) != str(path):
logger.debug(f"Deleting file {path}")
os.unlink(path)
# notify the sender, otherwise the progress bar
# in the UI stays stuck
payload = {
"filename": override_filename,
"task_id": task_id,
"current_progress": 100,
"max_progress": 100,
"status": "SUCCESS",
"message": "finished",
}
try:
async_to_sync(get_channel_layer().group_send)(
"status_updates",
{"type": "status_update", "data": payload},
)
except ConnectionError as e:
logger.warning(f"ConnectionError on status send: {str(e)}")
# consuming stops here, since the original document with
# the barcodes has been split and will be consumed separately
return "File successfully split"
# try reading ASN barcodes
asn = None
if settings.CONSUMER_ENABLE_ASN_BARCODE:
_, asn = barcodes.scan_file_for_asn_barcode(path)
if asn:
logger.info(f"Using ASN {asn} from barcode")
# continue with consumption if no barcode was found # continue with consumption if no barcode was found
document = Consumer().try_consume_file( document = Consumer().try_consume_file(
@ -192,7 +195,7 @@ def consume_file(
override_tag_ids=override_tag_ids, override_tag_ids=override_tag_ids,
task_id=task_id, task_id=task_id,
override_created=override_created, override_created=override_created,
override_asn=asn override_asn=asn,
) )
if document: if document: