mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Unified separator and ASN barcode parsing
so that barcode parsing won't run twice
This commit is contained in:
parent
92b9fc1ba9
commit
5004771d79
@ -194,25 +194,51 @@ def scan_file_for_barcodes(
|
|||||||
return pdf_filepath, barcodes
|
return pdf_filepath, barcodes
|
||||||
|
|
||||||
|
|
||||||
def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], List[int]]:
|
def get_separating_barcodes(barcodes: List[Tuple[int, str]]) -> List[int]:
|
||||||
"""
|
"""
|
||||||
Scan the provided pdf file for page separating barcodes
|
Search the parsed barcodes for separators
|
||||||
Returns a PDF filepath and a list of pagenumbers,
|
and returns a list of pagenumbers, which
|
||||||
which separate the file into new files
|
separate the file into new files
|
||||||
"""
|
"""
|
||||||
separator_page_numbers = []
|
|
||||||
|
|
||||||
pdf_filepath, barcodes = scan_file_for_barcodes(filepath)
|
|
||||||
|
|
||||||
# filter all barcodes for the separator string
|
# filter all barcodes for the separator string
|
||||||
separator_barcodes = list(
|
separator_barcodes = list(
|
||||||
filter(lambda bc: bc[1] == settings.CONSUMER_BARCODE_STRING, barcodes),
|
filter(lambda bc: bc[1] == settings.CONSUMER_BARCODE_STRING, barcodes),
|
||||||
)
|
)
|
||||||
|
|
||||||
# get the page numbers of the separating barcodes
|
# get the page numbers of the separating barcodes
|
||||||
separator_page_numbers = [page for page, _ in separator_barcodes]
|
separator_page_numbers = [page for page, _ in separator_barcodes]
|
||||||
|
|
||||||
return pdf_filepath, separator_page_numbers
|
return separator_page_numbers
|
||||||
|
|
||||||
|
|
||||||
|
def get_asn_from_barcodes(barcodes: List[Tuple[int, str]]) -> Optional[int]:
    """
    Search the already parsed barcodes for an ASN.

    The first barcode whose text starts with
    settings.CONSUMER_ASN_BARCODE_PREFIX is considered the ASN to be used;
    any later matching barcode is ignored.

    :param barcodes: (page number, barcode text) tuples
    :return: the detected ASN as an int, or None if no barcode carries the
        prefix or the text following the prefix is not a valid integer
    """
    prefix = settings.CONSUMER_ASN_BARCODE_PREFIX

    # only the barcode text is important here -> the page number is ignored
    asn_text = next(
        (text for _, text in barcodes if text.startswith(prefix)),
        None,
    )
    if not asn_text:
        return None

    logger.debug(f"Found ASN Barcode: {asn_text}")

    # remove the prefix and any surrounding whitespace
    asn_text = asn_text[len(prefix) :].strip()

    # now, try parsing the ASN number
    try:
        return int(asn_text)
    except ValueError as e:
        # Logger.warn() is a deprecated alias; use warning() like the rest
        # of this module does
        logger.warning(f"Failed to parse ASN number because: {e}")
        return None
|
||||||
|
|
||||||
|
|
||||||
def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
|
def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
|
||||||
@ -293,36 +319,3 @@ def save_to_dir(
|
|||||||
os.rename(dst, dst_new)
|
os.rename(dst, dst_new)
|
||||||
else:
|
else:
|
||||||
logger.warning(f"{str(filepath)} or {str(target_dir)} don't exist.")
|
logger.warning(f"{str(filepath)} or {str(target_dir)} don't exist.")
|
||||||
|
|
||||||
|
|
||||||
def scan_file_for_asn_barcode(filepath: str) -> Tuple[Optional[str], Optional[int]]:
    """
    Scan the provided pdf file for barcodes that contain the ASN
    for this document.

    The first barcode whose text starts with
    settings.CONSUMER_ASN_BARCODE_PREFIX is considered the ASN to be used.

    :param filepath: path of the file handed to scan_file_for_barcodes
    :return: the PDF filepath as returned by scan_file_for_barcodes and the
        detected ASN, or None in place of the ASN if no barcode carries the
        prefix or the text following the prefix is not a valid integer
    """
    pdf_filepath, barcodes = scan_file_for_barcodes(filepath)
    prefix = settings.CONSUMER_ASN_BARCODE_PREFIX

    # only the barcode text is important here -> the page number is ignored.
    # The explicit None default matters: without it next() raises
    # StopIteration for every document that has no ASN barcode.
    asn_text = next(
        (text for _, text in barcodes if text.startswith(prefix)),
        None,
    )
    if not asn_text:
        return pdf_filepath, None

    logger.debug(f"Found ASN Barcode: {asn_text}")

    # remove the prefix and any surrounding whitespace
    asn_text = asn_text[len(prefix) :].strip()

    # now, try parsing the ASN number
    asn = None
    try:
        asn = int(asn_text)
    except ValueError as e:
        # Logger.warn() is a deprecated alias; use warning() like the rest
        # of this module does
        logger.warning(f"Failed to parse ASN number because: {e}")

    return pdf_filepath, asn
|
|
||||||
|
@ -98,6 +98,7 @@ def consume_file(
|
|||||||
):
|
):
|
||||||
|
|
||||||
path = Path(path).resolve()
|
path = Path(path).resolve()
|
||||||
|
asn = None
|
||||||
|
|
||||||
# Celery converts this to a string, but everything expects a datetime
|
# Celery converts this to a string, but everything expects a datetime
|
||||||
# Long term solution is to not use JSON for the serializer but pickle instead
|
# Long term solution is to not use JSON for the serializer but pickle instead
|
||||||
@ -109,78 +110,80 @@ def consume_file(
|
|||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# check for separators in current document
|
# read all barcodes in the current document
|
||||||
if settings.CONSUMER_ENABLE_BARCODES:
|
if settings.CONSUMER_ENABLE_BARCODES or settings.CONSUMER_ENABLE_ASN_BARCODE:
|
||||||
|
pdf_filepath, parsed_barcodes = barcodes.scan_file_for_barcodes(path)
|
||||||
|
|
||||||
pdf_filepath, separators = barcodes.scan_file_for_separating_barcodes(path)
|
# split document by separator pages, if enabled
|
||||||
|
if settings.CONSUMER_ENABLE_BARCODES:
|
||||||
|
separators = barcodes.get_separating_barcodes(parsed_barcodes)
|
||||||
|
|
||||||
if separators:
|
if len(separators) > 0:
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"Pages with separators found in: {str(path)}",
|
f"Pages with separators found in: {str(path)}",
|
||||||
)
|
)
|
||||||
document_list = barcodes.separate_pages(pdf_filepath, separators)
|
document_list = barcodes.separate_pages(pdf_filepath, separators)
|
||||||
|
|
||||||
if document_list:
|
if document_list:
|
||||||
for n, document in enumerate(document_list):
|
for n, document in enumerate(document_list):
|
||||||
# save to consumption dir
|
# save to consumption dir
|
||||||
# rename it to the original filename with number prefix
|
# rename it to the original filename with number prefix
|
||||||
if override_filename:
|
if override_filename:
|
||||||
newname = f"{str(n)}_" + override_filename
|
newname = f"{str(n)}_" + override_filename
|
||||||
else:
|
else:
|
||||||
newname = None
|
newname = None
|
||||||
|
|
||||||
# If the file is an upload, it's in the scratch directory
|
# If the file is an upload, it's in the scratch directory
|
||||||
# Move it to consume directory to be picked up
|
# Move it to consume directory to be picked up
|
||||||
# Otherwise, use the current parent to keep possible tags
|
# Otherwise, use the current parent to keep possible tags
|
||||||
# from subdirectories
|
# from subdirectories
|
||||||
|
try:
|
||||||
|
# is_relative_to would be nicer, but new in 3.9
|
||||||
|
_ = path.relative_to(settings.SCRATCH_DIR)
|
||||||
|
save_to_dir = settings.CONSUMPTION_DIR
|
||||||
|
except ValueError:
|
||||||
|
save_to_dir = path.parent
|
||||||
|
|
||||||
|
barcodes.save_to_dir(
|
||||||
|
document,
|
||||||
|
newname=newname,
|
||||||
|
target_dir=save_to_dir,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Delete the PDF file which was split
|
||||||
|
os.remove(pdf_filepath)
|
||||||
|
|
||||||
|
# If the original was a TIFF, remove the original file as well
|
||||||
|
if str(pdf_filepath) != str(path):
|
||||||
|
logger.debug(f"Deleting file {path}")
|
||||||
|
os.unlink(path)
|
||||||
|
|
||||||
|
# notify the sender, otherwise the progress bar
|
||||||
|
# in the UI stays stuck
|
||||||
|
payload = {
|
||||||
|
"filename": override_filename,
|
||||||
|
"task_id": task_id,
|
||||||
|
"current_progress": 100,
|
||||||
|
"max_progress": 100,
|
||||||
|
"status": "SUCCESS",
|
||||||
|
"message": "finished",
|
||||||
|
}
|
||||||
try:
|
try:
|
||||||
# is_relative_to would be nicer, but new in 3.9
|
async_to_sync(get_channel_layer().group_send)(
|
||||||
_ = path.relative_to(settings.SCRATCH_DIR)
|
"status_updates",
|
||||||
save_to_dir = settings.CONSUMPTION_DIR
|
{"type": "status_update", "data": payload},
|
||||||
except ValueError:
|
)
|
||||||
save_to_dir = path.parent
|
except ConnectionError as e:
|
||||||
|
logger.warning(f"ConnectionError on status send: {str(e)}")
|
||||||
|
# consuming stops here, since the original document with
|
||||||
|
# the barcodes has been split and will be consumed separately
|
||||||
|
return "File successfully split"
|
||||||
|
|
||||||
barcodes.save_to_dir(
|
# try reading the ASN from barcode
|
||||||
document,
|
if settings.CONSUMER_ENABLE_ASN_BARCODE:
|
||||||
newname=newname,
|
asn = barcodes.get_asn_from_barcodes(parsed_barcodes)
|
||||||
target_dir=save_to_dir,
|
if asn:
|
||||||
)
|
logger.info(f"Found ASN in barcode: {asn}")
|
||||||
|
|
||||||
# Delete the PDF file which was split
|
|
||||||
os.remove(pdf_filepath)
|
|
||||||
|
|
||||||
# If the original was a TIFF, remove the original file as well
|
|
||||||
if str(pdf_filepath) != str(path):
|
|
||||||
logger.debug(f"Deleting file {path}")
|
|
||||||
os.unlink(path)
|
|
||||||
|
|
||||||
# notify the sender, otherwise the progress bar
|
|
||||||
# in the UI stays stuck
|
|
||||||
payload = {
|
|
||||||
"filename": override_filename,
|
|
||||||
"task_id": task_id,
|
|
||||||
"current_progress": 100,
|
|
||||||
"max_progress": 100,
|
|
||||||
"status": "SUCCESS",
|
|
||||||
"message": "finished",
|
|
||||||
}
|
|
||||||
try:
|
|
||||||
async_to_sync(get_channel_layer().group_send)(
|
|
||||||
"status_updates",
|
|
||||||
{"type": "status_update", "data": payload},
|
|
||||||
)
|
|
||||||
except ConnectionError as e:
|
|
||||||
logger.warning(f"ConnectionError on status send: {str(e)}")
|
|
||||||
# consuming stops here, since the original document with
|
|
||||||
# the barcodes has been split and will be consumed separately
|
|
||||||
return "File successfully split"
|
|
||||||
|
|
||||||
# try reading ASN barcodes
|
|
||||||
asn = None
|
|
||||||
if settings.CONSUMER_ENABLE_ASN_BARCODE:
|
|
||||||
_, asn = barcodes.scan_file_for_asn_barcode(path)
|
|
||||||
if asn:
|
|
||||||
logger.info(f"Using ASN {asn} from barcode")
|
|
||||||
|
|
||||||
# continue with consumption if no barcode was found
|
# continue with consumption if no barcode was found
|
||||||
document = Consumer().try_consume_file(
|
document = Consumer().try_consume_file(
|
||||||
@ -192,7 +195,7 @@ def consume_file(
|
|||||||
override_tag_ids=override_tag_ids,
|
override_tag_ids=override_tag_ids,
|
||||||
task_id=task_id,
|
task_id=task_id,
|
||||||
override_created=override_created,
|
override_created=override_created,
|
||||||
override_asn=asn
|
override_asn=asn,
|
||||||
)
|
)
|
||||||
|
|
||||||
if document:
|
if document:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user